rest_data =
+--------------------+-----+---------+
|            features|label|old_label|
+--------------------+-----+---------+
|[1.0,0.0,1.0,0.0,...| 1.0| 0.0|
|[1.0,1.0,1.0,1.0,...| 1.0| 0.0|
|[0.42857143282890...| 0.0| 0.0|
|(10,[0,2,5,7,9],[...| 0.0| 0.0|
|[1.0,1.0,0.0,0.0,...| 1.0| 0.0|
|[1.0,1.0,0.800000...| 1.0| 0.0|
|[0.40000000596046...| 0.0| 0.0|
|(10,[0,2,5,9],[0....| 0.0| 0.0|
|[1.0,1.0,1.0,1.0,...| 1.0| 0.0|
|[0.40000000596046...| 0.0| 0.0|
|[1.0,1.0,1.0,1.0,...| 1.0| 0.0|
|[1.0,1.0,0.888888...| 1.0| 0.0|
|[0.0,0.0,1.0,1.0,...| 1.0| 0.0|
|[0.16666667163372...| 0.0| 0.0|
|[0.375,0.0,0.1428...| 0.0| 0.0|
|(10,[0,2,5,7],[0....| 0.0| 0.0|
|[1.0,1.0,1.0,1.0,...| 1.0| 0.0|
|[1.0,1.0,1.0,1.0,...| 1.0| 0.0|
|[0.25,0.0,0.5,0.0...| 0.0| 0.0|
Every time I run this code, I get a different count of rows where old_label == prediction, and different precision and recall values:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

precision_list = list()
recall_list = list()
i = 0
for model in model_list:
    x = model.transform(rest_data)
    t = x.select(col('old_label').cast('int'), col('prediction').cast('int'))
    t.createOrReplaceTempView('t')
    # Count the rows where the prediction matches old_label
    spark.sql('select count(*) from t where old_label = prediction').show()
    # Build an RDD of (prediction, label) pairs for MulticlassMetrics
    prediction_and_labels = x.select(
        col('prediction').cast('float'),
        col('label').cast('float')
    ).rdd.map(lambda pl: (pl.prediction, pl.label))
    metrics = MulticlassMetrics(prediction_and_labels)
    print(metrics.confusionMatrix())
    recall_list.append(metrics.recall(label=1.0))
    precision_list.append(metrics.precision(label=1.0))
    x = x.withColumn('old_label', col('prediction').cast('int'))
    # display(x)
    i += 1
Even if I use the filter command instead of Spark SQL, the results are inconsistent from run to run.
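For reference, this is what I mean by the filter-based version (a minimal sketch, using the same transformed DataFrame x from inside the loop above):

# Filter-based equivalent of the SQL count above, applied to the
# DataFrame x produced by model.transform(rest_data)
matching = x.filter(
    col('old_label').cast('int') == col('prediction').cast('int')
).count()
print(matching)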