我是在Scala上使用Spark的新用户,这是我的代码,但是我不知道如何计算预测结果和准确率。我必须先把CSV文件转换成LIBSVM格式吗,还是可以直接加载CSV文件?
/**
 * Trains a multilayer-perceptron classifier on the red-wine quality CSV and
 * prints its accuracy on a held-out test split.
 *
 * The CSV file is loaded directly through the DataFrame reader; the numeric
 * columns are combined into a feature vector by `VectorAssembler`, so the
 * data does not have to be converted to LIBSVM format first.
 */
object Test2 {

  /**
   * Entry point: loads the data, fits the pipeline on a training split and
   * prints the accuracy measured on the remaining rows.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Local import: the evaluator is only needed inside this method.
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

    val spark = SparkSession
      .builder
      .appName("WineQualityDecisionTreeRegressorPMML")
      .master("local")
      .getOrCreate()

    try {
      // Load the CSV file; without a schema every column is read as a string.
      val df = spark.read
        .format("csv")
        .option("header", "true")
        .option("mode", "DROPMALFORMED")
        .option("delimiter", ",")
        .load("file:///c:/tmp/spark-warehouse/winequality_red_names.csv")

      // Feature columns, in the order they are placed into the feature vector.
      val inputFields = List("fixed acidity", "volatile acidity", "citric acid", "residual sugar",
        "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates",
        "alcohol")

      // Cast every feature column to double with the built-in cast instead of a
      // hand-written UDF. Cells that cannot be parsed become null, so rows with
      // a missing feature or label are dropped before they reach the pipeline.
      val dff = inputFields
        .foldLeft(df)((acc, name) => acc.withColumn(name, acc(name).cast("double")))
        .na.drop(inputFields :+ "quality")

      val assembler = new VectorAssembler()
        .setInputCols(inputFields.toArray)
        .setOutputCol("features")

      // Fit on the whole dataset so that every label is present in the index,
      // including labels that only occur in the test split.
      val labelIndexer = new StringIndexer()
        .setInputCol("quality")
        .setOutputCol("indexedLabel")
        .fit(dff)

      // Network layout: one input node per feature, two hidden layers of size
      // 10 and 20, and one output node per distinct label found in the data.
      val layers = Array[Int](inputFields.length, 10, 20, labelIndexer.labels.length)

      // Configure the multilayer-perceptron classifier.
      val mlp = new MultilayerPerceptronClassifier()
        .setLayers(layers)
        .setBlockSize(128)
        .setSeed(1234L)
        .setMaxIter(100)
        .setLabelCol("indexedLabel")
        .setFeaturesCol("features")

      // Convert indexed predictions back to the original label values.
      val labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(labelIndexer.labels)

      // Create the pipeline.
      val pipeline = new Pipeline()
        .setStages(Array(assembler, labelIndexer, mlp, labelConverter))

      // Hold out 20% of the rows so accuracy is measured on data the model has
      // not seen during training; the seed keeps the split reproducible.
      val Array(training, test) = dff.randomSplit(Array(0.8, 0.2), seed = 1234L)

      // Train the model on the training split only.
      val model = pipeline.fit(training)

      // Predict the test rows and compare predictions with the indexed labels.
      val predictions = model.transform(test)
      val evaluator = new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
      val accuracy = evaluator.evaluate(predictions)
      println(s"Test set accuracy = $accuracy")
    } finally {
      // Always release the Spark session, even when loading or training fails.
      spark.stop()
    }
  }
}
请问有什么建议吗? 我找不到任何使用 pipeline 并以 CSV 文件作为输入来训练神经网络的示例。
答案 0 :(得分:1)
训练完模型(`val model = pipeline.fit(dff)`)后,需要使用 `model.transform` 方法为每个测试样本预测标签。对于每个预测,您都必须检查它是否与真实标签匹配。那么准确率就是被正确分类的样本数与参与评估的样本总数的比率。
如果要使用训练时所用的同一个 `DataFrame`,则只需 `val predictions = model.transform(dff)`,然后遍历 `predictions` 并检查它们是否与相应的标签匹配。但是,我不建议重用该 `DataFrame`——最好将其拆分为训练子集和测试子集。