使用PySpark并行训练多个模型

时间:2019-09-25 19:53:06

标签: machine-learning pyspark

我想为每个键训练一个模型,但是键的数量很多。是否可以按键并行构建模型?我有一个数据帧,其中每个键都有对应的响应变量和预测变量。下面是为每个键训练随机森林的示例代码。由于它依次循环处理每个键,因此无法并行化。

def _evaluate_key(data, key):
    """Train and score a cross-validated random forest for a single key.

    Filters ``data`` down to the rows whose ``key`` column equals ``key``,
    fits label/feature indexers on that subset, tunes a
    ``RandomForestClassifier`` with 5-fold cross-validation on an 80% split
    and evaluates the selected model on the remaining 20%.

    Args:
        data: Spark DataFrame with ``key``, ``label`` and ``features`` columns.
        key: Value of the ``key`` column selecting the rows to model.

    Returns:
        dict mapping ``'f1'``, ``'precision'``, ``'recall'`` and
        ``'accuracy'`` to the corresponding held-out test metric.
    """
    key_df = data.filter(F.col('key') == key).drop('key')

    # Index labels, adding metadata to the label column.
    # Fit on the whole per-key dataset so every label is present in the index.
    label_indexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel"
    ).fit(key_df)

    # Automatically identify categorical features and index them.
    # Features with more than 4 distinct values are treated as continuous.
    feature_indexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures", maxCategories=4
    ).fit(key_df)

    # Hold out 20% for testing; the fixed seed keeps the split reproducible.
    training_data, test_data = key_df.randomSplit([0.8, 0.2], seed=0)

    rf = RandomForestClassifier(
        labelCol="indexedLabel", featuresCol="indexedFeatures"
    )

    # Convert indexed predictions back to the original label values.
    label_converter = IndexToString(
        inputCol="prediction",
        outputCol="predictedLabel",
        labels=label_indexer.labels,
    )

    # Chain indexers and forest in a Pipeline.
    pipeline = Pipeline(
        stages=[label_indexer, feature_indexer, rf, label_converter]
    )

    param_grid = (
        ParamGridBuilder()
        .addGrid(rf.maxDepth, [10, 20, 25, 30])
        .addGrid(rf.numTrees, [10, 20, 30, 40, 50])
        .addGrid(rf.maxBins, [16, 32, 48, 64])
        .build()
    )

    # The classifier predicts in the indexed label space, so model selection
    # must score against "indexedLabel". The evaluator's default label column
    # is "label", which would compare predictions with un-indexed labels.
    cv_evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction"
    )
    crossval = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=cv_evaluator,
        numFolds=5,
        parallelism=10,
    )

    # Train the model. This also runs the indexers.
    model = crossval.fit(training_data)

    # Make predictions on the held-out split.
    predictions = model.transform(test_data)

    # Result name -> Spark metric name.
    spark_metric_names = {
        'precision': "weightedPrecision",
        'recall': "weightedRecall",
        'accuracy': "accuracy",
        'f1': "f1",
    }
    return {
        name: MulticlassClassificationEvaluator(
            labelCol="indexedLabel",
            predictionCol="prediction",
            metricName=spark_metric,
        ).evaluate(predictions)
        for name, spark_metric in spark_metric_names.items()
    }


def get_metrics(keys_list, data = df):
    """Train one tuned random forest per key and collect test-set metrics.

    Keys are processed one after another, in the order given.

    Args:
        keys_list: Iterable of values of the ``key`` column to model.
        data: Spark DataFrame holding all keys. Defaults to the module-level
            ``df``, which is bound when this function is defined.

    Returns:
        dict with the entries ``'f1_test'``, ``'precision_test'``,
        ``'accuracy_test'`` and ``'recall_test'``; each is a list with one
        value per key, in the order of ``keys_list``.
    """
    # Use the ``data`` argument rather than the global ``df`` so that a
    # caller-supplied DataFrame is actually honoured.
    per_key = [_evaluate_key(data, key) for key in keys_list]

    return {
        'f1_test': [m['f1'] for m in per_key],
        'precision_test': [m['precision'] for m in per_key],
        'accuracy_test': [m['accuracy'] for m in per_key],
        'recall_test': [m['recall'] for m in per_key],
    }

0 个答案:

没有答案