很抱歉问一个可能很基础的问题,但我在线性回归上遇到了困难,一直卡在这里。你能帮我吗?
这是我的主要代码。我目前正在使用一些外部库来绘制数据。
import com.fundtrml.config.ConfigSetUp
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession
/**
 * Minimal Spark ML linear-regression demo.
 *
 * Builds a small DataFrame of `LabeledPoint`s from a hard-coded list of integers,
 * randomly splits it 60/40 into a training and a test set, fits a
 * `LinearRegression` model on the training part, prints the fitted parameters
 * and training metrics, and shows the predictions for the test part.
 *
 * Note that every sample uses the same number as both its label and its single
 * feature (see the `(n, n)` pairing below), i.e. the regression target is
 * identical to its input.
 */
object SimpleLinearRegression {

  /**
   * Runs the demo end to end and always stops the Spark session afterwards.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Project-specific setup; presumably configures the Hadoop binaries for a
    // local run -- confirm in ConfigSetUp.
    ConfigSetUp.HadoopBinariesConfig()

    val ss = SparkSession.builder()
      .appName("DataSet Test")
      .master("local[*]")
      .getOrCreate()

    // try/finally so the session is released even if training or plotting fails.
    try {
      import ss.implicits._

      val listOfData = List(40, 41, 45, 43, 42, 60, 61, 59, 50, 49, 47, 39, 41, 37, 36, 34, 33, 37)

      val data = listOfData
        .map(n => (n, n)) // pair each value with itself: (label, feature)
        .map { case (label, features) =>
          LabeledPoint(label, Vectors.dense(features)) // one-dimensional dense feature vector
        }
        .toDF // make it a DataFrame

      // No seed is passed, so the split (and therefore all numbers printed
      // below) can differ from run to run.
      val Array(trainingData, testSetData) = data.randomSplit(Array(0.6, 0.4))
      trainingData.show()

      val lr = new LinearRegression()
        .setMaxIter(10)
        .setRegParam(0.3)
        .setElasticNetParam(0.8)

      // Train
      val model = lr.fit(trainingData)
      println(s"model.intercept: ${model.intercept}")
      println(s"model.coefficients : ${model.coefficients}")

      // Summarize the model over the training set and print out some metrics
      val trainingSummary = model.summary
      println(s"numIterations: ${trainingSummary.totalIterations}")
      println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
      trainingSummary.residuals.show()
      println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
      println(s"r2: ${trainingSummary.r2}")

      val predictions = model.transform(testSetData)
      predictions.show()

      // Display the data
      import com.quantifind.charts.Highcharts._
      regression(listOfData) // external charting library with built-in regression plotting

      // Read the prediction column as Double instead of Any, so the values can
      // be used numerically without a toString/toDouble round trip.
      val currentPredictions: List[Double] =
        predictions.select("prediction").rdd.map(_.getDouble(0)).collect().toList
      println(currentPredictions)
      // regression(currentPredictions)
    } finally {
      ss.stop()
    }
  }
}
我的训练集如下。label 列是需要预测的值,features 列是用来做预测的特征值:
+-----+--------+
|label|features|
+-----+--------+
| 43.0| [43.0]|
| 45.0| [45.0]|
| 42.0| [42.0]|
| 60.0| [60.0]|
| 50.0| [50.0]|
| 59.0| [59.0]|
| 61.0| [61.0]|
| 47.0| [47.0]|
| 49.0| [49.0]|
| 41.0| [41.0]|
| 34.0| [34.0]|
+-----+--------+
评估回归模型,我得到以下数据:
model.intercept: 1.7363839862169372
model.coefficients : [0.9640297102666925]
numIterations: 3
objectiveHistory: [0.5,0.406233822167566,0.031956224821402285]
RMSE: 0.29784178261548705
r2: 0.9987061382565019 --> Extremely High Close to 1
最后,我得到了以下预测:
+-----+--------+------------------+
|label|features| prediction|
+-----+--------+------------------+
| 40.0| [40.0]| 40.29757239688463|
| 41.0| [41.0]|41.261602107151326|
| 39.0| [39.0]|39.333542686617946|
| 36.0| [36.0]|36.441453555817866|
| 37.0| [37.0]| 37.40548326608456|
| 33.0| [33.0]| 33.54936442501779|
| 37.0| [37.0]| 37.40548326608456|
+-----+--------+------------------+
很容易看出,这些预测值并不落在同一条直线上,而且也不可能落在一条直线上。下面是使用 Scala 库 WISP 绘制的完整数据集: