I have the code below. It comes from this blog post:
http://www.sparktutorials.net/Spark+MLLib+-+Predict+Store+Sales+with+ML+Pipelines
I'm trying to predict store sales with linear regression: I do some data cleaning and then build an ML pipeline. I'm new to Scala. When I run the script, spark-shell prints some messages to the screen; I've pasted them below starting from the point where the errors begin. Can someone tell me what the problem is and how to fix it? Any hints would be much appreciated.
Code:
// loading packages
// from example: http://www.sparktutorials.net/Spark+MLLib+-+Predict+Store+Sales+with+ML+Pipelines
import org.apache.log4j.{Logger}
//core and SparkSQL
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
// ML Feature Creation, Tuning, Models, and Model Evaluation
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.{RegressionEvaluator}
import org.apache.spark.ml.regression.{LinearRegression}
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.evaluation.RegressionMetrics
// preprocessing & preparing Pipelines
// Indexers & Encoders
val stateHolidayIndexer = new StringIndexer()
.setInputCol("StateHoliday")
.setOutputCol("StateHolidayIndex")
val schoolHolidayIndexer = new StringIndexer()
.setInputCol("SchoolHoliday")
.setOutputCol("SchoolHolidayIndex")
val stateHolidayEncoder = new OneHotEncoder()
.setInputCol("StateHolidayIndex")
.setOutputCol("StateHolidayVec")
val schoolHolidayEncoder = new OneHotEncoder()
.setInputCol("SchoolHolidayIndex")
.setOutputCol("SchoolHolidayVec")
val dayOfMonthEncoder = new OneHotEncoder()
.setInputCol("DayOfMonth")
.setOutputCol("DayOfMonthVec")
val dayOfWeekEncoder = new OneHotEncoder()
.setInputCol("DayOfWeek")
.setOutputCol("DayOfWeekVec")
val storeEncoder = new OneHotEncoder()
.setInputCol("Store")
.setOutputCol("StoreVec")
// assemble all the feature vectors into one vector to feed to the model
val assembler = new VectorAssembler()
.setInputCols(Array("StoreVec", "DayOfWeekVec", "Open", "DayOfMonthVec", "StateHolidayVec", "SchoolHolidayVec"))
.setOutputCol("features")
// Pipeline
def preppedLRPipeline():TrainValidationSplit = {
val lr = new LinearRegression()
val paramGrid = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.1, 0.01))
.addGrid(lr.fitIntercept)
.addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
.build()
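// note: this grid expands to 2 x 2 x 5 = 20 parameter combinations;
// addGrid on the boolean fitIntercept param adds both true and false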
val pipeline = new Pipeline()
.setStages(Array(stateHolidayIndexer, schoolHolidayIndexer, stateHolidayEncoder, schoolHolidayEncoder, storeEncoder, dayOfWeekEncoder, dayOfMonthEncoder, assembler, lr))
val tvs = new TrainValidationSplit()
.setEstimator(pipeline)
.setEvaluator(new RegressionEvaluator)
.setEstimatorParamMaps(paramGrid)
.setTrainRatio(0.75)
tvs
}
// bringing in data and removing null values
def loadTrainingData(sqlContext:HiveContext):DataFrame = {
val trainRaw = sqlContext
.read.format("com.databricks.spark.csv")
.option("header", "true")
// path to training data
// .load("../mlproject/rossman/train.csv")
.load("/Users/username/Desktop/stuff/comp/clint/store_forecast/input/train.csv")
.repartition(6)
trainRaw.registerTempTable("raw_training_data")
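// HiveQL supports function-style casts such as double(col) and int(col);
// regexp_extract pulls the day component (capture group 1) out of the Date
// string, assuming it is formatted yyyy-MM-dd as in the Rossmann data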
sqlContext.sql("""SELECT
double(Sales) label, double(Store) Store, int(Open) Open, double(DayOfWeek) DayOfWeek,
StateHoliday, SchoolHoliday, (double(regexp_extract(Date, '\\d+-\\d+-(\\d+)', 1))) DayOfMonth
FROM raw_training_data
""").na.drop()
}
def loadKaggleTestData(sqlContext:HiveContext) = {
val testRaw = sqlContext
.read.format("com.databricks.spark.csv")
.option("header", "true")
// load test data
// .load("../mlproject/rossman/test.csv")
.load("/Users/username/Desktop/stuff/comp/clint/store_forecast/input/test.csv")
.repartition(6)
testRaw.registerTempTable("raw_test_data")
val testData = sqlContext.sql("""SELECT
Id, double(Store) Store, int(Open) Open, double(DayOfWeek) DayOfWeek, StateHoliday,
SchoolHoliday, (double(regexp_extract(Date, '\\d+-\\d+-(\\d+)', 1))) DayOfMonth
FROM raw_test_data
WHERE !(ISNULL(Id) OR ISNULL(Store) OR ISNULL(Open) OR ISNULL(DayOfWeek)
OR ISNULL(StateHoliday) OR ISNULL(SchoolHoliday))
""").na.drop() // weird things happen if you don't filter out the null values manually
Array(testRaw, testData) // got to hold onto testRaw so we can make sure
// to have all the prediction IDs to submit to kaggle
}
// save predictions
def savePredictions(predictions:DataFrame, testRaw:DataFrame) = {
val tdOut = testRaw
.select("Id")
.distinct()
.join(predictions, testRaw("Id") === predictions("PredId"), "outer")
.select("Id", "Sales")
.na.fill(0:Double) // some of our inputs were null so we have to
// fill these with something
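// (the outer join leaves Sales null for any Id without a matching
// prediction, e.g. rows dropped earlier, which is why they are filled with 0)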
tdOut
.coalesce(1)
.write.format("com.databricks.spark.csv")
.option("header", "true")
// save predictions
.save("linear_regression_predictions.csv")
//.save("/Users/username/Desktop/stuff/comp/clint/store_forecast/Output/linear_regression_predictions.csv")
}
// fitting and testing
def fitModel(tvs:TrainValidationSplit, data:DataFrame) = {
val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 12345)
logger.info("Fitting data")
val model = tvs.fit(training)
logger.info("Now performing test on hold out set")
val holdout = model.transform(test).select("prediction","label")
// have to do a type conversion for RegressionMetrics
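// (model.transform returns Rows whose fields come back typed as Any, while
// RegressionMetrics needs an RDD[(Double, Double)])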
val rm = new RegressionMetrics(holdout.rdd.map(x =>(x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))
logger.info("Test Metrics")
logger.info("Test Explained Variance:")
logger.info(rm.explainedVariance)
logger.info("Test R^2 Coef:")
logger.info(rm.r2)
logger.info("Test MSE:")
logger.info(rm.meanSquaredError)
logger.info("Test RMSE:")
logger.info(rm.rootMeanSquaredError)
model
}
// linear Regression
val data = loadTrainingData(sqlContext)
val Array(testRaw, testData) = loadKaggleTestData(sqlContext)
// The linear Regression Pipeline
val linearTvs = preppedLRPipeline()
logger.info("evaluating linear regression")
val lrModel = fitModel(linearTvs, data)
logger.info("Generating kaggle predictions")
val lrOut = lrModel.transform(testData)
.withColumnRenamed("prediction","Sales")
.withColumnRenamed("Id","PredId")
.select("PredId", "Sales")
savePredictions(lrOut, testRaw)
spark-shell output:
warning: there were two deprecation warnings; re-run with -deprecation for details
loadKaggleTestData: (sqlContext: org.apache.spark.sql.hive.HiveContext)Array[org.apache.spark.sql.DataFrame]
savePredictions: (predictions: org.apache.spark.sql.DataFrame, testRaw: org.apache.spark.sql.DataFrame)Unit
<console>:146: error: not found: value logger
logger.info("Fitting data")
^
<console>:148: error: not found: value logger
logger.info("Now performing test on hold out set")
Update:
//package net.sparktutorials.examples
import org.apache.log4j.{Logger}
//core and SparkSQL
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
// ML Feature Creation, Tuning, Models, and Model Evaluation
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.{RegressionEvaluator}
import org.apache.spark.ml.regression.{RandomForestRegressor, LinearRegression}
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.evaluation.RegressionMetrics
object RossmannRegression extends Serializable {
@transient lazy val logger = Logger.getLogger(getClass.getName)
val stateHolidayIndexer = new StringIndexer()
.setInputCol("StateHoliday")
.setOutputCol("StateHolidayIndex")
val schoolHolidayIndexer = new StringIndexer()
.setInputCol("SchoolHoliday")
.setOutputCol("SchoolHolidayIndex")
val stateHolidayEncoder = new OneHotEncoder()
.setInputCol("StateHolidayIndex")
.setOutputCol("StateHolidayVec")
val schoolHolidayEncoder = new OneHotEncoder()
.setInputCol("SchoolHolidayIndex")
.setOutputCol("SchoolHolidayVec")
val dayOfMonthEncoder = new OneHotEncoder()
.setInputCol("DayOfMonth")
.setOutputCol("DayOfMonthVec")
val dayOfWeekEncoder = new OneHotEncoder()
.setInputCol("DayOfWeek")
.setOutputCol("DayOfWeekVec")
val storeEncoder = new OneHotEncoder()
.setInputCol("Store")
.setOutputCol("StoreVec")
val assembler = new VectorAssembler()
.setInputCols(Array("StoreVec", "DayOfWeekVec", "Open",
"DayOfMonthVec", "StateHolidayVec", "SchoolHolidayVec"))
.setOutputCol("features")
def preppedLRPipeline():TrainValidationSplit = {
val lr = new LinearRegression()
val paramGrid = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.1, 0.01))
.addGrid(lr.fitIntercept)
.addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
.build()
val pipeline = new Pipeline()
.setStages(Array(stateHolidayIndexer, schoolHolidayIndexer,
stateHolidayEncoder, schoolHolidayEncoder, storeEncoder,
dayOfWeekEncoder, dayOfMonthEncoder,
assembler, lr))
val tvs = new TrainValidationSplit()
.setEstimator(pipeline)
.setEvaluator(new RegressionEvaluator)
.setEstimatorParamMaps(paramGrid)
.setTrainRatio(0.75)
tvs
}
def preppedRFPipeline():TrainValidationSplit = {
val dfr = new RandomForestRegressor()
val paramGrid = new ParamGridBuilder()
.addGrid(dfr.minInstancesPerNode, Array(1, 5, 15))
.addGrid(dfr.maxDepth, Array(2, 4, 8))
.addGrid(dfr.numTrees, Array(20, 50, 100))
.build()
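// 3 x 3 x 3 = 27 random-forest candidates; the larger settings (100 trees,
// depth 8) can make this grid slow to train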
val pipeline = new Pipeline()
.setStages(Array(stateHolidayIndexer, schoolHolidayIndexer,
stateHolidayEncoder, schoolHolidayEncoder, storeEncoder,
dayOfWeekEncoder, dayOfMonthEncoder,
assembler, dfr))
val tvs = new TrainValidationSplit()
.setEstimator(pipeline)
.setEvaluator(new RegressionEvaluator)
.setEstimatorParamMaps(paramGrid)
.setTrainRatio(0.55)
tvs
}
def fitModel(tvs:TrainValidationSplit, data:DataFrame) = {
val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 12345)
logger.info("Fitting data")
val model = tvs.fit(training)
logger.info("Now performing test on hold out set")
val holdout = model.transform(test).select("prediction","label")
// have to do a type conversion for RegressionMetrics
val rm = new RegressionMetrics(holdout.rdd.map(x =>
(x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))
logger.info("Test Metrics")
logger.info("Test Explained Variance:")
logger.info(rm.explainedVariance)
logger.info("Test R^2 Coef:")
logger.info(rm.r2)
logger.info("Test MSE:")
logger.info(rm.meanSquaredError)
logger.info("Test RMSE:")
logger.info(rm.rootMeanSquaredError)
model
}
def savePredictions(predictions:DataFrame, testRaw:DataFrame, filePath:String) = {
val tdOut = testRaw
.select("Id")
.distinct()
.join(predictions, testRaw("Id") === predictions("PredId"), "outer")
.select("Id", "Sales")
.na.fill(0:Double) // some of our inputs were null so we have to
// fill these with something
tdOut
.coalesce(1)
.write.format("com.databricks.spark.csv")
.option("header", "true")
.save(filePath)
}
def loadTrainingData(sqlContext:HiveContext, filePath:String):DataFrame = {
val trainRaw = sqlContext
.read.format("com.databricks.spark.csv")
.option("header", "true")
//.load(filePath)
.load("/Users/username/Desktop/stuff/comp/clnt/store_forecast/input/train.csv")
.repartition(30)
trainRaw.registerTempTable("raw_training_data")
sqlContext.sql("""SELECT
double(Sales) label, double(Store) Store, int(Open) Open, double(DayOfWeek) DayOfWeek,
StateHoliday, SchoolHoliday, (double(regexp_extract(Date, '\\d+-\\d+-(\\d+)', 1))) DayOfMonth
FROM raw_training_data
""").na.drop()
}
def loadKaggleTestData(sqlContext:HiveContext, filePath:String) = {
val testRaw = sqlContext
.read.format("com.databricks.spark.csv")
.option("header", "true")
//.load(filePath)
.load("/Users/username/Desktop/stuff/comp/clnt/store_forecast/input/test.csv")
.repartition(30)
testRaw.registerTempTable("raw_test_data")
val testData = sqlContext.sql("""SELECT
Id, double(Store) Store, int(Open) Open, double(DayOfWeek) DayOfWeek, StateHoliday,
SchoolHoliday, (double(regexp_extract(Date, '\\d+-\\d+-(\\d+)', 1))) DayOfMonth
FROM raw_test_data
WHERE !(ISNULL(Id) OR ISNULL(Store) OR ISNULL(Open) OR ISNULL(DayOfWeek)
OR ISNULL(StateHoliday) OR ISNULL(SchoolHoliday))
""").na.drop() // weird things happen if you don't filter out the null values manually
Array(testRaw, testData) // got to hold onto testRaw so we can make sure
// to have all the prediction IDs to submit to kaggle
}
def main(args:Array[String]) = {
val name = "Linear Regression Application"
logger.info(s"Starting up $name")
val conf = new SparkConf().setAppName(name)
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
// sc.setLogLevel("INFO")
logger.info("Set Up Complete")
val data = loadTrainingData(sqlContext, args(0))
val Array(testRaw, testData) = loadKaggleTestData(sqlContext, args(1))
// The linear Regression Pipeline
val linearTvs = preppedLRPipeline()
logger.info("evaluating linear regression")
val lrModel = fitModel(linearTvs, data)
logger.info("Generating kaggle predictions")
val lrOut = lrModel.transform(testData)
.withColumnRenamed("prediction","Sales")
.withColumnRenamed("Id","PredId")
.select("PredId", "Sales")
//savePredictions(lrOut, testRaw, "linear_predictions.csv")
savePredictions(lrOut, testRaw, "/Users/username/Desktop/stuff/comp/clnt/store_forecast/Output/linear_predictions.csv")
}
}
Command run in spark-shell:
:load /Users/username/Desktop/stuff/comp/clnt/simple_spark_regression
Output:
Loading /Users/username/Desktop/stuff/comp/clnt/simple_spark_regression...
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.{RandomForestRegressor, LinearRegression}
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.evaluation.RegressionMetrics
warning: there were 10 deprecation warnings; re-run with -deprecation for details
defined object RossmannRegression
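As far as I understand, :load only compiles the file and defines the object (hence "defined object RossmannRegression"), so main still has to be invoked explicitly. A sketch of the call, with placeholder paths for the two CSV arguments:
RossmannRegression.main(Array(
  "/path/to/train.csv", // args(0): training data
  "/path/to/test.csv")) // args(1): Kaggle test data
Note that main constructs its own SparkContext while spark-shell already provides sc, so invoking it inside the shell may fail with "Only one SparkContext may be running in this JVM"; running the class via spark-submit avoids that.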