我想为每个键分别训练一个模型,但键的数量很多。是否可以按键并行构建模型?我有一个数据帧,其中每个键都有对应的响应变量和预测变量。下面是为每个键训练随机森林的示例代码。由于它是逐个循环处理每个键的,因此各键之间无法并行执行。
def _evaluate_key(key, data):
    """Tune and score a random forest on the rows of *data* for one key.

    Rows whose ``key`` column equals *key* are selected, split 80/20 into
    training and test sets (fixed seed), and a random-forest pipeline is
    tuned with 5-fold cross-validation on the training part.

    Returns a dict with the test-set ``f1``, ``weightedPrecision``,
    ``weightedRecall`` and ``accuracy`` of the best model.
    """
    key_df = data.filter(F.col('key') == key).drop('key')

    # Index labels, adding metadata to the label column.
    # Fit on the whole per-key dataset to include all labels in the index.
    label_indexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel"
    ).fit(key_df)
    # Automatically identify categorical features, and index them.
    # maxCategories=4: features with > 4 distinct values are treated as
    # continuous.
    feature_indexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures", maxCategories=4
    ).fit(key_df)

    # Split the data into training and test sets (20% held out for testing).
    training_data, test_data = key_df.randomSplit([0.8, 0.2], seed=0)

    rf = RandomForestClassifier(
        labelCol="indexedLabel", featuresCol="indexedFeatures"
    )
    # Convert indexed predictions back to the original label values.
    label_converter = IndexToString(
        inputCol="prediction",
        outputCol="predictedLabel",
        labels=label_indexer.labels,
    )
    pipeline = Pipeline(
        stages=[label_indexer, feature_indexer, rf, label_converter]
    )

    param_grid = (
        ParamGridBuilder()
        .addGrid(rf.maxDepth, [10, 20, 25, 30])
        .addGrid(rf.numTrees, [10, 20, 30, 40, 50])
        .addGrid(rf.maxBins, [16, 32, 48, 64])
        .build()
    )
    crossval = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        # The forest is trained on "indexedLabel" and predicts in that index
        # space, so model selection must score against the same column; the
        # evaluator's default label column is the raw "label". The metric is
        # left at the evaluator's default, as before.
        evaluator=MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction"
        ),
        numFolds=5,
        parallelism=10,
    )

    # Train model. This also runs the indexers.
    model = crossval.fit(training_data)
    predictions = model.transform(test_data)

    scores = {}
    for metric in ("f1", "weightedPrecision", "weightedRecall", "accuracy"):
        scores[metric] = MulticlassClassificationEvaluator(
            labelCol="indexedLabel",
            predictionCol="prediction",
            metricName=metric,
        ).evaluate(predictions)
    return scores


def get_metrics(keys_list, data=None, *, key_parallelism=1):
    """Train one cross-validated random forest per key and collect test metrics.

    Args:
        keys_list: Iterable of values of the ``key`` column to model.
        data: DataFrame with ``key``, ``label`` and ``features`` columns.
            When omitted, the module-level ``df`` is looked up at call time.
        key_parallelism: Number of keys to process concurrently from driver
            threads. The default of 1 processes keys one after another.
            Each key's CrossValidator already fits up to 10 models in
            parallel, so the number of concurrent fits can reach
            ``key_parallelism * 10``; size it to the cluster.

    Returns:
        A dict with keys ``'f1_test'``, ``'precision_test'``,
        ``'accuracy_test'`` and ``'recall_test'``; each value is a list of
        metric values in the same order as ``keys_list``.
    """
    if data is None:
        # Resolved here rather than in the signature so that defining this
        # function does not require ``df`` to exist yet.
        data = df

    keys = list(keys_list)

    if key_parallelism > 1 and len(keys) > 1:
        # Threads (not processes): the work runs on the Spark cluster and the
        # driver only submits jobs. NOTE(review): relies on Spark accepting
        # jobs submitted concurrently from separate driver threads - confirm
        # scheduler settings (e.g. FAIR pools) suit the workload.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=key_parallelism) as pool:
            per_key = list(pool.map(lambda k: _evaluate_key(k, data), keys))
    else:
        per_key = [_evaluate_key(k, data) for k in keys]

    return {
        'f1_test': [m['f1'] for m in per_key],
        'precision_test': [m['weightedPrecision'] for m in per_key],
        'accuracy_test': [m['accuracy'] for m in per_key],
        'recall_test': [m['weightedRecall'] for m in per_key],
    }