PySpark 中的 IllegalArgumentException:“Field "label" does not exist”(字段 "label" 不存在)

时间:2018-09-27 10:34:07

标签: pyspark linear-regression apache-spark-mllib

我一直在开发 pyspark 中的线性回归函数,并使用交叉验证来验证准确性。但它会抛出错误:“IllegalArgumentException: Field "label" does not exist”(字段 "label" 不存在)。可是我已经为它指定了响应变量。我在 Google 上看到过几个示例,它们对其他人都有效,所以可能是我哪里做错了。

I have included the code below, and it would be of great help if you could point out my mistake. Thanks in advance.
# Linear regression - SPARK

# Libraries to be loaded
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


def spark_linear_regression(dsname, target, ratio):
    """Train and evaluate a cross-validated linear regression on a CSV file.

    The CSV at ``dsname`` is read with a header row and inferred schema, rows
    containing nulls are dropped, every string column is replaced by a
    ``<column>_index`` column produced by ``StringIndexer``, and all columns
    except ``target`` are assembled into a single ``features`` vector. The
    data is split into train/test parts, a ``LinearRegression`` is tuned with
    5-fold cross-validation over a small parameter grid, and training/test
    metrics are printed.

    Args:
        dsname: Path of the CSV file to load.
        target: Name of the non-string column to predict.
        ratio: Percentage (0-100) of the rows used for training; the
            remainder is used as the test set.

    Raises:
        ValueError: If ``target`` is not among the usable (non-string)
            columns of the dataset.
    """
    spark = SparkSession.builder.appName('ml-bank').getOrCreate()
    data = spark.read.csv(dsname, header=True, inferSchema=True)
    data = data.dropna()
    print("Completed : 10 % ")

    cat_col = [name for name, dtype in data.dtypes if dtype == 'string']
    index_colname = [name + "_index" for name in cat_col]
    # Unfitted indexers: the pipeline below fits each of them exactly once.
    indexers = [StringIndexer(inputCol=name, outputCol=indexed)
                for name, indexed in zip(cat_col, index_colname)]

    # Keep the original column order (a set difference would shuffle the
    # feature order from run to run), then append the indexed columns.
    selected_column = [name for name in data.columns if name not in cat_col]
    selected_column.extend(index_colname)

    pipeline = Pipeline(stages=indexers)
    df_r = pipeline.fit(data).transform(data)
    df_r = df_r.select(selected_column)

    print("Completed : 30 %")

    if target not in selected_column:
        raise ValueError(
            "target column %r is not a non-string column of %r"
            % (target, dsname))
    feature_columns = [name for name in selected_column if name != target]
    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol="features")
    final_data = assembler.transform(df_r)
    final_data = final_data.select(['features', target])

    # Divide by a float so the ratio is not truncated by integer division.
    train_ratio = round(ratio / 100.0, 2)
    val_ratio = round(1 - train_ratio, 2)

    train_data, test_data = final_data.randomSplit([train_ratio, val_ratio],
                                                   seed=500)
    print("Completed : 50 %")
    linear_regression = LinearRegression(featuresCol='features',
                                         labelCol=target, maxIter=10)
    param_grid = ParamGridBuilder() \
        .addGrid(linear_regression.regParam, [0.1, 0.01]) \
        .addGrid(linear_regression.fitIntercept, [False, True]) \
        .addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()

    # The evaluator must be told which column holds the label. Its default
    # labelCol is "label", which does not exist in this DataFrame and is the
    # cause of 'IllegalArgumentException: Field "label" does not exist'.
    cross_validator = CrossValidator(
        estimator=linear_regression,
        estimatorParamMaps=param_grid,
        evaluator=RegressionEvaluator(predictionCol="prediction",
                                      labelCol=target),
        numFolds=5)

    print("Completed : 60 %")
    cv_model = cross_validator.fit(train_data)
    # CrossValidatorModel itself has no training summary; the summary
    # belongs to the best LinearRegressionModel it selected.
    linear_regression_summary = cv_model.bestModel.summary

    print("Completed : 80 %")
    print("Training RMSE: %f" % linear_regression_summary.rootMeanSquaredError)
    print("Training R^2: %f" % linear_regression_summary.r2)

    linear_pred = cv_model.transform(test_data)
    linear_pred.select("prediction", target, "features").show(5)
    linear_evaluator = RegressionEvaluator(predictionCol="prediction",
                                           labelCol=target, metricName="r2")
    print("Completed : 95 %")
    print("R Squared (R2) on test data = %g" % linear_evaluator.evaluate(linear_pred))

# Run the regression on Mart_Sales.csv, predicting Item_MRP with an 80/20
# train/test split.
spark_linear_regression("Mart_Sales.csv", "Item_MRP",
                        80)

0 个答案:

没有答案