我在此网站上找到以下代码: https://spark.apache.org/docs/2.3.1/ml-tuning.html
// Cross-validated model selection over the candidate settings in `paramGrid`.
// The evaluator is a BinaryClassificationEvaluator left at its default metric,
// which is areaUnderROC.
val cv = {
  val evaluator = new BinaryClassificationEvaluator
  val validator = new CrossValidator()
  validator.setEstimator(pipeline)
  validator.setEvaluator(evaluator)
  validator.setEstimatorParamMaps(paramGrid)
  validator.setNumFolds(2) // Use 3+ in practice
  validator.setParallelism(2) // Evaluate up to 2 parameter settings in parallel
  validator
}
正如文档所说,BinaryClassificationEvaluator 的默认指标是 "areaUnderROC"(即 AUC)。如何将评估指标改为 F1 分数?
我尝试过:
// Attempt to switch the evaluator's metric to F1 (reported by the author as failing).
// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
// is areaUnderROC.
// NOTE(review): `new BinaryClassificationEvaluator.setMetricName("f1")` has no parentheses
// after the class name, so Scala presumably parses it as instantiating a type called
// `BinaryClassificationEvaluator.setMetricName` rather than calling the setter on a new
// instance - confirm against the actual compiler error.
// NOTE(review): "f1" is presumably not a metric name this evaluator accepts - verify
// against the BinaryClassificationEvaluator API documentation.
val cv = new CrossValidator()
.setEstimator(pipeline)
.setEvaluator(new BinaryClassificationEvaluator.setMetricName("f1"))
.setEstimatorParamMaps(paramGrid)
.setNumFolds(2) // Use 3+ in practice
.setParallelism(2) // Evaluate up to 2 parameter settings in parallel
但是我遇到了一些错误... 我在许多站点上搜索,但找不到解决方案...
答案 0 :(得分:0)
`setMetricName` 仅接受 "areaUnderPR" 或 "areaUnderROC"。您需要编写自己的 Evaluator,例如:
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.{Dataset, functions => F}
/**
 * Evaluator that scores binary predictions with the F1 measure, i.e. the
 * harmonic mean of precision and recall, treating the value `1` as the
 * positive class in both the label and the prediction column.
 *
 * @param uid unique identifier of this evaluator instance
 */
class FScoreEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasLabelCol {

  /** Creates an evaluator with a randomly generated `uid`. */
  def this() = this(Identifiable.randomUID("FScoreEvaluator"))

  /** Sets the name of the column that holds the predicted class. */
  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  /** Sets the name of the column that holds the true class. */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  /**
   * Computes the F1 score of `dataset` in a single aggregation.
   *
   * @param dataset rows containing the configured label and prediction columns
   * @return the F1 score, or `0.0` when the aggregation yields null because
   *         the score is undefined (for example no positive labels or no
   *         positive predictions)
   */
  override def evaluate(dataset: Dataset[_]): Double = {
    val isActualPositive = F.col(getLabelCol) === 1
    val isPredictedPositive = F.col(getPredictionCol) === 1

    // Casting the boolean conditions to integers lets `sum` count matching rows.
    val truePositive = F.sum((isActualPositive && isPredictedPositive).cast(IntegerType))
    val predictedPositive = F.sum(isPredictedPositive.cast(IntegerType))
    val actualPositive = F.sum(isActualPositive.cast(IntegerType))

    val precision = truePositive / predictedPositive
    val recall = truePositive / actualPositive
    val fScore = F.lit(2) * (precision * recall) / (precision + recall)

    // A global aggregate always produces exactly one row.
    val result = dataset.select(fScore).head()
    // An undefined score comes back as SQL null; report it as 0.0 explicitly
    // instead of relying on how a null reference is unboxed.
    // NOTE(review): assumes division by zero yields null (non-ANSI arithmetic);
    // confirm the behavior if ANSI mode is enabled.
    if (result.isNullAt(0)) 0.0 else result.getDouble(0)
  }

  /** Returns a copy of this evaluator with `extra` params applied. */
  override def copy(extra: ParamMap): Evaluator = defaultCopy(extra)

  /** A higher F1 score means a better model. */
  override def isLargerBetter: Boolean = true
}
答案 1 :(得分:0)
基于 @gmds 的答案。请确保 Spark 版本 >= 2.3。
您还可以参照 Spark 中 RegressionEvaluator 的实现来编写其他自定义评估器。
我还添加了 `isLargerBetter`,以便实例化的评估器可以用于模型选择(例如交叉验证)。
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol, HasWeightCol}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.{Dataset, functions => F}
/**
 * Evaluator that computes the weighted root-mean-squared error:
 * `sqrt(sum(weight * (label - prediction)^2) / sum(weight))`.
 *
 * @param uid unique identifier of this evaluator instance
 */
class WRmseEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasLabelCol with HasWeightCol {

  /** Creates an evaluator with a randomly generated `uid`. */
  def this() = this(Identifiable.randomUID("wrmseEval"))

  /** Sets the name of the column that holds the predicted value. */
  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  /** Sets the name of the column that holds the true value. */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  /** Sets the name of the column that holds the per-row weight. */
  def setWeightCol(value: String): this.type = set(weightCol, value)

  /**
   * Computes the weighted RMSE of `dataset` in a single aggregation.
   *
   * @param dataset rows containing the configured label, prediction and weight columns
   * @return the weighted RMSE, or `Double.NaN` when the aggregation yields
   *         null (for example an empty dataset), so that an undefined error is
   *         never reported as a perfect score of `0.0`
   */
  override def evaluate(dataset: Dataset[_]): Double = {
    val weight = F.col(getWeightCol)
    // Computed inline rather than through a temporary "residual" column so an
    // input column that happens to carry that name is never overwritten.
    val residual = F.col(getLabelCol) - F.col(getPredictionCol)
    val weightedRmse = F.sqrt(F.sum(weight * F.pow(residual, 2)) / F.sum(weight))

    // A global aggregate always produces exactly one row.
    val result = dataset.select(weightedRmse).head()
    // NOTE(review): assumes a zero total weight yields null (non-ANSI
    // arithmetic); confirm the behavior if ANSI mode is enabled.
    if (result.isNullAt(0)) Double.NaN else result.getDouble(0)
  }

  /** Returns a copy of this evaluator with `extra` params applied. */
  override def copy(extra: ParamMap): Evaluator = defaultCopy(extra)

  /** RMSE is an error measure, so smaller values indicate a better model. */
  override def isLargerBetter: Boolean = false
}
以下是使用方法。
// Build the weighted-RMSE evaluator and point it at the label, prediction
// and weight columns of the data to be scored.
val wrmseEvaluator = {
  val evaluator = new WRmseEvaluator()
  evaluator.setLabelCol(labelColName)
  evaluator.setPredictionCol(predColName)
  evaluator.setWeightCol(weightColName)
  evaluator
}