我正在尝试在 Spark 中构建一个简单的 TF-IDF 向量化器,并把它编译成 jar 以便在本地测试。但是,编译时一直报错 "No TypeTag available for (Int, String)"。这是我的代码:
package com.valiant.ml
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer, StopWordsRemover}
/**
 * Minimal TF-IDF example: a three-row in-memory data set is pushed through
 * Tokenizer -> StopWordsRemover -> HashingTF -> IDF and the resulting
 * "features"/"label" columns are printed.
 */
object TextClassification {

  /**
   * Program entry point.
   *
   * Obtains a SparkSession named "TextClassification", runs the feature
   * pipeline on the hard-coded sample sentences, prints up to three result
   * rows, and stops the session.
   *
   * @param args command-line arguments; not read by this program
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TextClassification")
      .getOrCreate()

    // try/finally guarantees the session is stopped even when one of the
    // pipeline stages throws; previously stop() ran only on the success path.
    try {
      // (label, text) sample rows; the tuple columns are renamed by toDF.
      val sentenceData = spark.createDataFrame(Seq(
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
      )).toDF("label", "text")

      // "text" -> "words"
      val tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)

      // "words" -> "filtered_words"
      val remover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("filtered_words")
      val filteredWordsData = remover.transform(wordsData)

      // "filtered_words" -> "rawFeatures", hashed into 20 features
      val hashingTF = new HashingTF()
        .setInputCol("filtered_words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(20)
      val featurizedData = hashingTF.transform(filteredWordsData)
      // alternatively, CountVectorizer can also be used to get term frequency vectors

      // "rawFeatures" -> "features": IDF is fitted on, then applied to, the same data
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)

      rescaledData.select("features", "label").take(3).foreach(println)
    } finally {
      spark.stop()
    }
  }
}
这是构建文件:
// sbt build definition for the text-classification example.
name := "text-classification"

version := "0.0.1"

scalaVersion := "2.11.8"

// Spark release shared by every Spark module listed below.
val sparkVersion = "2.0.0"

// Only spark-core is scoped as "provided"; the other modules use the default scope.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion,
  "org.apache.spark" %% "spark-catalyst" % sparkVersion
)
以下是错误输出的开头部分:
[info] Set current project to text-classification (in build file:/arete/repos/ml/classification/text/)
[info] Compiling 1 Scala source to /arete/repos/ml/classification/text/target/scala-2.11/classes...
[error] /arete/repos/ml/classification/text/text.scala:15: No TypeTag available for (Int, String)
[error]  val sentenceData = spark.createDataFrame(Seq(
[error]  ^
[error]
[error]  while compiling: /arete/repos/ml/classification/text/text.scala
[error]  during phase: typer
[error]  library version: version 2.10.4
[error]  compiler version: version 2.10.4