UnsupportedOperationException: No Encoder found for Any

Date: 2017-01-03 04:23:48

Tags: scala apache-spark apache-spark-sql apache-spark-mllib

I ran into a problem while using Spark 2.0.1 to compute text similarity (cosine similarity over TF-IDF vectors). The code is as follows:

// Imports reconstructed from usage; the original post omits them.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.linalg.{SparseVector => SV}
import org.apache.spark.sql.Row

// Assumed record shape (the definition is omitted in the post):
// each input line is "category,text".
case class RawDataRecord(category: String, text: String)

val sparkConf = new SparkConf().setAppName("cos").setMaster("local")
val sc = new SparkContext(sparkConf)

val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
Logger.getRootLogger.setLevel(Level.ERROR)

// Parse each line into a (category, text) record.
val srcRDD = sc.textFile("D:\\test").map { x =>
  val data = x.split(",")
  RawDataRecord(data(0), data(1))
}

val trainingDF = srcRDD.toDF()
trainingDF.take(2).foreach(println)

// Tokenize the raw text into words.
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData = tokenizer.transform(trainingDF)

// =============== hashing tf ===============
val hashingTF = new HashingTF().setNumFeatures(500).setInputCol("words").setOutputCol("rawFeatures")
val featurizedData = hashingTF.transform(wordsData)

// Weight the raw term frequencies by inverse document frequency.
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)

val rescaledData = idfModel.transform(featurizedData)
val content_list = rescaledData.select($"category", $"features").collect().toList

// Pairwise cosine similarity between every document and all the others.
// The exception below is thrown here, when Spark tries to derive an
// Encoder for the result type of this map.
val docSims = rescaledData.select($"category", $"features").map {
  case Row(id1, idf1) =>
    import breeze.linalg._
    val sv1 = idf1.asInstanceOf[SV]
    println(sv1)
    val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
    content_list.filter(_(0) != id1).map {
      case Row(id2, idf2) =>
        val sv2 = idf2.asInstanceOf[SV]
        val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
        val cosSim = bsv1.dot(bsv2).asInstanceOf[Double] / (norm(bsv1) * norm(bsv2))
        (id1, id2, cosSim)
    }
}
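
For context on the failure below: in the case Row(id1, idf1) pattern both bindings are statically typed Any, so the function passed to Dataset.map returns a List[(Any, Any, Double)], and Spark must derive an Encoder for that element type. A minimal illustration of the inference, using a hypothetical Row rather than the data above:

import org.apache.spark.sql.Row

val r = Row("doc1", 0.5)
val Row(id, v) = r     // id and v are both statically Any
val t = (id, id, 1.0)  // t: (Any, Any, Double) -- Spark SQL has no Encoder for Any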

The error details are:

 Exception in thread "main" java.lang.UnsupportedOperationException: No Encoder found for Any
- field (class: "java.lang.Object", name: "_1")
- array element class: "scala.Tuple3"
- root class: "scala.collection.immutable.List"
    at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:598)
    at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$9.apply(ScalaReflection.scala:592)
    at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$9.apply(ScalaReflection.scala:583)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.immutable.List.foreach(List.scala:381)
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
    at scala.collection.immutable.List.flatMap(List.scala:344)
    at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:583)
    at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$toCatalystArray$1$1.apply(ScalaReflection.scala:442)
    at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$toCatalystArray$1$1.apply(ScalaReflection.scala:442)
    at org.apache.spark.sql.catalyst.expressions.objects.MapObjects$.apply(objects.scala:383)
    at org.apache.spark.sql.catalyst.ScalaReflection$.toCatalystArray$1(ScalaReflection.scala:442)
    at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:467)
    at org.apache.spark.sql.catalyst.ScalaReflection$.serializerFor(ScalaReflection.scala:425)
    at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$.apply(ExpressionEncoder.scala:61)
    at org.apache.spark.sql.Encoders$.product(Encoders.scala:274)
    at org.apache.spark.sql.SQLImplicits.newProductEncoder(SQLImplicits.scala:47)
    at com.model.Cossimilarty$.main(Cossimilarty.scala:59)

Is this caused by the Spark version, or by something else?
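
The Spark version is unlikely to be the culprit; any 2.x release fails the same way, because the untyped Row patterns make the mapped element type Any. A minimal sketch of one fix, reusing rescaledData and content_list from above and assuming category is a String and features an org.apache.spark.ml.linalg.SparseVector: switch to the RDD API (so no Encoder is required) and give the patterns explicit types.

import breeze.linalg.{norm, SparseVector => BSV}
import org.apache.spark.ml.linalg.{SparseVector => SV}
import org.apache.spark.sql.Row

// .rdd sidesteps the Encoder machinery entirely; the typed patterns
// also remove the Any bindings that caused the original failure.
val docSims = rescaledData.select($"category", $"features").rdd.map {
  case Row(id1: String, idf1: SV) =>
    val bsv1 = new BSV[Double](idf1.indices, idf1.values, idf1.size)
    content_list.filter(_.getString(0) != id1).map {
      case Row(id2: String, idf2: SV) =>
        val bsv2 = new BSV[Double](idf2.indices, idf2.values, idf2.size)
        // Cosine similarity of the two TF-IDF vectors.
        val cosSim = (bsv1 dot bsv2) / (norm(bsv1) * norm(bsv2))
        (id1, id2, cosSim)
    }
}
docSims.take(1).foreach(println)

Staying on the Dataset API should also work once the patterns are typed, since the element type then becomes List[(String, String, Double)], for which Spark can derive an encoder.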
