How to use LinearRegression in Spark based on text files

Date: 2017-07-12 11:31:29

Tags: java apache-spark rdd

I'm quite new to programming with Spark. I want to set up a linear regression model in Spark based on log files whose columns are separated by a "-" delimiter. All the tutorials and examples I have found start with something like JavaRDD&lt;LabeledPoint&gt; data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();. However, I have a bunch of log files that I want to use instead. So what I have tried so far is the following:

public static void main(String... args)
{
    if(!new File("LogisticRegressionModel").exists())
    {
        buildTrainingModel();
    }
    else
    {
        testModel();
    }
}

private static void testModel()
{
    SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();
    Dataset<Row> dataSet = sc.read().option("delimiter", "-").option("header", "false").csv("EI/eyeliteidemo/TAP01.log");

    PipelineModel model = PipelineModel.load("LogisticRegressionModel");
    Dataset<Row> predictions = model.transform(dataSet);

}

private static void buildTrainingModel()
{
    SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            DataTypes.createStructField("features", DataTypes.StringType, false),
    });

    Dataset<Row> input = sc.read().option("delimiter", "-").option("header", "false").csv("foo/bar/Foo_*.log");
    input = input.drop("_c1", "_c3", "_c4");
    input = input.select(functions.concat(input.col("_c0"), input.col("_c2"), input.col("_c5")));
    input = input.withColumnRenamed("concat(_c0, _c2, _c5)", "features");
    input.show(30, false);
    Dataset<Row> dataSet = sc.createDataFrame(input.collectAsList(), schema);

    Tokenizer tokenizer = new Tokenizer()
            .setInputCol("features")
            .setOutputCol("rawTokens");
    StopWordsRemover swRemover = new StopWordsRemover().setInputCol(tokenizer.getOutputCol()).setOutputCol("cleanedTerms").setStopWords(readStopwords());
    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(swRemover.getOutputCol())
            .setOutputCol("hashedTerms");
    IDF idf = new IDF().setInputCol(hashingTF.getOutputCol()).setOutputCol("featuresIDF");
    LogisticRegression lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[] {tokenizer, swRemover, hashingTF, idf, lr});

    // Fit the pipeline to training documents.
    PipelineModel model = pipeline.fit(dataSet);
    try
    {
        model.save("LogisticRegressionModel");
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
}

private static String[] readStopwords()
{
    List<String> words = new ArrayList<>();
    try (Stream<String> stream = Files.lines(Paths.get(LogisticRegressionTest.class.getResource("stopwords_en.txt").toURI()))) {

        words = stream
                .map(String::toLowerCase)
                .collect(Collectors.toList());

    } catch (IOException e) {
        e.printStackTrace();
    }
    catch (URISyntaxException e)
    {
        e.printStackTrace();
    }
    String[] retWords = new String[words.size()];
    return words.toArray(retWords);
}

Unfortunately, I'm getting the following exception:

Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually StringType.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:42)
at org.apache.spark.ml.PredictorParams$class.validateAndTransformSchema(Predictor.scala:51)
at org.apache.spark.ml.classification.Classifier.org$apache$spark$ml$classification$ClassifierParams$$super$validateAndTransformSchema(Classifier.scala:58)
at org.apache.spark.ml.classification.ClassifierParams$class.validateAndTransformSchema(Classifier.scala:42)
at org.apache.spark.ml.classification.ProbabilisticClassifier.org$apache$spark$ml$classification$ProbabilisticClassifierParams$$super$validateAndTransformSchema(ProbabilisticClassifier.scala:53)
at org.apache.spark.ml.classification.ProbabilisticClassifierParams$class.validateAndTransformSchema(ProbabilisticClassifier.scala:37)
at org.apache.spark.ml.classification.LogisticRegression.org$apache$spark$ml$classification$LogisticRegressionParams$$super$validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.classification.LogisticRegressionParams$class.validateAndTransformSchema(LogisticRegression.scala:184)
at org.apache.spark.ml.classification.LogisticRegression.validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.Predictor.transformSchema(Predictor.scala:122)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
at org.apache.spark.ml.Pipeline.transformSchema(Pipeline.scala:184)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:136)
at LogisticRegressionTest.buildTrainingModel(LogisticRegressionTest.java:92)
at LogisticRegressionTest.main(LogisticRegressionTest.java:40)

Now my question is: how do I get this data type issue right? And, beyond that, does my code make sense to Spark experts in the first place?
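
Reading the stack trace, my guess is that LogisticRegression looks for a vector-typed column named "features" by default, while in my DataFrame "features" is the raw log string; the TF-IDF vectors produced by the pipeline end up in "featuresIDF". Would something along these lines (an untested sketch, assuming the labels really end up in a numeric "label" column) be the right way to point the estimator at the vector column?

// Untested sketch: point LogisticRegression at the TF-IDF output column
// ("featuresIDF", a VectorUDT) instead of the raw string column "features",
// and make the label column explicit.
LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.001)
        .setLabelCol("label")
        .setFeaturesCol(idf.getOutputCol()); // idf was configured with setOutputCol("featuresIDF")

Or is there a cleaner way to map raw log lines to label/features pairs?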

Thanks!

0 Answers:

No answers yet.