我密切关注了Spark ML网站上的example,看不到哪里出了问题。尝试对新闻文章语料库进行TF-IDF。这是我的代码,在最后一行中断:
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// load in the data
val data = spark.read.
option("header", "true").
format("csv").
load("data/articles.csv")
// select article content column
val content = data.select("content")
//tokenize content
val tokenizer = new Tokenizer().
setInputCol("content").
setOutputCol("words")
val wordsDF = tokenizer.transform(content)
// calculate TF
val hashingTF = new HashingTF().
setInputCol("words").
setOutputCol("rawFeatures").
setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsDF)
// calculate IDF
val idf = new IDF().
setInputCol("rawFeatures").
setOutputCol("features")
val idfModel = idf.fit(featurizedData)
这是我的堆栈跟踪的顶部:
2019-02-25 13:24:58 ERROR Executor:91 - Exception in task 0.0 in stage 3.0 (TID 5)
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
at scala.collection.AbstractIterator.aggregate(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$23.apply(RDD.scala:1145)
at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$23.apply(RDD.scala:1145)
at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1146)
at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1146)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)