I want to train a model per key, but I have a lot of keys. I am trying an RDD-plus-DataFrame approach: bundling the MLlib steps into one function and calling it with rdd.map, so that a model is trained for each key and the corresponding test results are returned. I have seen this post, but it did not help me.
def metrics_per_key(key):
    # Imports live inside the function so the closure is self-contained
    # when it is shipped to the workers.
    import pyspark.sql.functions as F
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import (StringIndexer, OneHotEncoder, VectorAssembler,
                                    VectorIndexer, IndexToString)
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

    # `spark` is the global SparkSession created on the driver.
    df = spark.read.csv('path to csv', header=True, inferSchema=True)
    df = df.withColumn('label', df['rank'] - 1)
    # Bucket the hour of day into eight 3-hour parts.
    df = df.withColumn('day_part',
                       F.when(df.hour < 3, 'g1').when(df.hour < 6, 'g2')
                        .when(df.hour < 9, 'g3').when(df.hour < 12, 'g4')
                        .when(df.hour < 15, 'g5').when(df.hour < 18, 'g6')
                        .when(df.hour < 21, 'g7').otherwise('g8'))
    # Keep only the rows for this key; everything below uses the filtered data.
    df_filtered = df.filter(F.col('key') == key).drop('key')

    # Index the categorical columns.
    stringIndexer_day = StringIndexer(inputCol="day", outputCol="dayIndex")
    stringIndexer_day_hr = StringIndexer(inputCol="day_hour", outputCol="day_hourIndex")
    stringIndexer_day_part = StringIndexer(inputCol="day_part", outputCol="day_partIndex")
    model_day = stringIndexer_day.fit(df_filtered)
    indexed_day = model_day.transform(df_filtered)
    model_day_hour = stringIndexer_day_hr.fit(indexed_day)
    indexed_all = model_day_hour.transform(indexed_day)
    model_day_part = stringIndexer_day_part.fit(indexed_all)
    indexed_all_including_day_part = model_day_part.transform(indexed_all)

    # One-hot encode the indexed columns (plus the raw hour).
    encoder_day = OneHotEncoder(inputCol="dayIndex", outputCol="dayIndexVec")
    encoder_dayHour = OneHotEncoder(inputCol="day_hourIndex", outputCol="day_hourIndexVec")
    encoder_hour = OneHotEncoder(inputCol="hour", outputCol="hourIndexVec")
    encoder_day_part = OneHotEncoder(inputCol="day_partIndex", outputCol="day_partIndexVec")
    encoded_day = encoder_day.transform(indexed_all_including_day_part)
    encode_day_dayHour = encoder_dayHour.transform(encoded_day)
    encoded_all = encoder_hour.transform(encode_day_dayHour)
    encoded_all_with_day_part = encoder_day_part.transform(encoded_all)

    # Assemble all features into a single vector column.
    assembler = VectorAssembler(
        inputCols=["hourIndexVec", "dayIndexVec", "day_hourIndexVec",
                   "day_partIndexVec", "bid"],
        outputCol="features")
    assembled = assembler.transform(encoded_all_with_day_part)
    assembled = assembled.select(["label", "features"])  # 'key' was dropped above
    assembled.persist()

    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(assembled)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4).fit(assembled)

    (trainingData, testData) = assembled.randomSplit([0.8, 0.2], seed=0)

    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=labelIndexer.labels)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Grid search over the forest hyperparameters with 5-fold cross-validation.
    paramGrid_rf = (ParamGridBuilder()
                    .addGrid(rf.maxDepth, [10, 20, 25, 30])
                    .addGrid(rf.numTrees, [10, 20, 30, 40, 50])
                    .addGrid(rf.maxBins, [16, 32, 48, 64])
                    .build())
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid_rf,
                              evaluator=MulticlassClassificationEvaluator(),
                              numFolds=5,
                              parallelism=10)
    model = crossval.fit(trainingData)

    # Evaluate the best model on the held-out test split.
    predictions = model.transform(testData)
    precision = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="weightedPrecision").evaluate(predictions)
    recall = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="weightedRecall").evaluate(predictions)
    accuracy = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="accuracy").evaluate(predictions)
    f1 = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="f1").evaluate(predictions)
    return {'f1_test': f1, 'precision_test': precision,
            'accuracy_test': accuracy, 'recall_test': recall}
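Called on the driver for a single key, the function should work, since the spark session is available there; a quick sketch, with 'some_key' as a placeholder for a real key value:

# Driver-side call for one key; 'some_key' is a placeholder value.
metrics = metrics_per_key('some_key')
print(metrics)  # {'f1_test': ..., 'precision_test': ..., 'accuracy_test': ..., 'recall_test': ...}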
I want to use the above function with rdd.map:
df = spark.read.csv('path to csv', header=True, inferSchema=True)
# One entry per distinct key; extract the raw key value from each Row.
keys = df.select('key').distinct().rdd.map(lambda row: row['key'])
results = keys.map(metrics_per_key).collect()
Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
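The only workaround I can see is to collect the distinct keys to the driver and loop over them there, where metrics_per_key can use spark directly; a rough, untested sketch:

# Sequential driver-side loop: avoids SPARK-5063 because metrics_per_key
# is only ever executed on the driver, where the SparkSession lives.
distinct_keys = [row['key'] for row in df.select('key').distinct().collect()]
results = {k: metrics_per_key(k) for k in distinct_keys}

But with many keys this trains the models one at a time. Is there a way to run the per-key training in parallel?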