我一直在开发pyspark中的线性回归函数,并使用交叉验证来验证准确性。但这会引发错误,因为“ llegalArgumentException:“字段“标签”不存在”。但是我已经给它分配了响应变量。在Google中看不到几个示例,并观察到它对其他人有用,除非我做错了。
Herewith, I have given the code and it would of great help if u can point out my mistake .Thanks in Advance
# Linear regression - SPARK
# Libraries to be loaded
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
def spark_linear_regression(dsname, target, ratio):
spark = SparkSession.builder.appName('ml-bank').getOrCreate()
data = spark.read.csv(dsname, header=True, inferSchema=True)
data = data.dropna()
print("Completed : 10 % ")
cat_col = [t[0] for t in data.dtypes if t[1] == 'string']
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(data) for column in
cat_col]
index_colname = [t[0] + "_index" for t in data.dtypes if t[1] == 'string']
selected_column = list(set(data.columns) - set(cat_col))
selected_column.extend(index_colname)
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(data).transform(data)
df_r = df_r.select(selected_column)
print("Completed : 30 %")
selected_column.remove(target)
assembler = VectorAssembler(inputCols=selected_column, outputCol="features")
final_data = assembler.transform(df_r)
final_data = final_data.select(['features', target])
train_ratio = ratio / 100
train_ratio = round(train_ratio, 2)
val_ratio = round(1 - train_ratio, 2)
train_data, test_data = final_data.randomSplit([train_ratio, val_ratio], seed=500)
print("Completed : 50 %")
linear_regression = LinearRegression(featuresCol='features', labelCol=target, maxIter=10)
param_grid = ParamGridBuilder() \
.addGrid(linear_regression.regParam, [0.1, 0.01]) \
.addGrid(linear_regression.fitIntercept, [False, True]) \
.addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0]) \
.build()
linear_regression = CrossValidator(estimator=linear_regression,
estimatorParamMaps=param_grid,
evaluator=RegressionEvaluator(),
numFolds=5)
print("Completed : 60 %")
linear_regression_model = linear_regression.fit(train_data)
linear_regression_summary = linear_regression_model.summary
print("Completed : 80 %")
print("Training RMSE: %f" % linear_regression_summary.rootMeanSquaredError)
print("Training R^2: %f" % linear_regression_summary.r2)
linear_pred = linear_regression_model.transform(test_data)
linear_pred.select("prediction", target, "features").show(5)
linear_evaluator = RegressionEvaluator(predictionCol="prediction",
labelCol=target, metricName="r2")
print("Completed : 95 %")
print("R Squared (R2) on test data = %g" % linear_evaluator.evaluate(linear_pred))
spark_linear_regression(Mart_Sales.csv", "Item_MRP",
80)