当我想使用spark2.0.1来计算文本相似性时,我遇到了问题,代码如下:
val sparkConf = new SparkConf().setAppName("cos").setMaster("local")
val sc = new SparkContext(sparkConf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
Logger.getRootLogger.setLevel(Level.ERROR)
var srcRDD = sc.textFile("D:\\test").map {
x =>
var data = x.split(",")
RawDataRecord(data(0), data(1))
}
var trainingDF = srcRDD.toDF()
trainingDF.take(2).foreach(println);
var tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
var wordsData = tokenizer.transform(trainingDF)
//===============hashing tf =============
var hashingTF = new HashingTF().setNumFeatures(500).setInputCol("words").setOutputCol("rawFeatures")
var featurizedData = hashingTF.transform(wordsData)
var idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
var idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
val content_list = rescaledData.select($"category", $"features").collect().toList
val docSims = rescaledData.select($"category",$"features").map {
case Row(id1, idf1) =>
import breeze.linalg._
val sv1 = idf1.asInstanceOf[SV]
println(sv1)
val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
content_list.filter(_(0) != id1).map {
case Row(id2, idf2) =>
val sv2 = idf2.asInstanceOf[SV]
val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
val cosSim = bsv1.dot(bsv2).asInstanceOf[Double] / (norm(bsv1) * norm(bsv2))
(id1, id2, cosSim)
}
}
错误的细节是:
Exception in thread "main" java.lang.UnsupportedOperationException: No Encoder found for Any
- field (class: "java.lang.Object", name: "_1")
- array element class: "scala.Tuple3"
- root class: "scala.collection.immutable.List"
at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:598)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$9.apply(ScalaReflection.scala:592)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$9.apply(ScalaReflection.scala:583)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:344)
at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:583)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$toCatalystArray$1$1.apply(ScalaReflection.scala:442)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$toCatalystArray$1$1.apply(ScalaReflection.scala:442)
at org.apache.spark.sql.catalyst.expressions.objects.MapObjects$.apply(objects.scala:383)
at org.apache.spark.sql.catalyst.ScalaReflection$.toCatalystArray$1(ScalaReflection.scala:442)
at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:467)
at org.apache.spark.sql.catalyst.ScalaReflection$.serializerFor(ScalaReflection.scala:425)
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$.apply(ExpressionEncoder.scala:61)
at org.apache.spark.sql.Encoders$.product(Encoders.scala:274)
at org.apache.spark.sql.SQLImplicits.newProductEncoder(SQLImplicits.scala:47)
at com.model.Cossimilarty$.main(Cossimilarty.scala:59)
这是因为火花版本?或者关于什么?