scala中的错误不匹配

时间:2015-11-16 14:40:36

标签: scala apache-spark memsql

我正在尝试返回RDD[(String, String, String)],但无法用flatMap做到。我在flatMap中直接返回了(tweetId, tweetBody, gender),但它给了我一个类型不匹配的错误。你能告诉我如何从flatMap返回RDD[(String, String, String)]吗?
override def transform(sqlContext: SQLContext, rdd: RDD[Array[Byte]], config: UserTransformConfig, logger: PhaseLogger): DataFrame = {
    // NOTE(review): all three lookups use the same key "column_name", so the three
    // column names can never be configured independently — looks like a copy-paste
    // bug; confirm the intended config keys before changing the strings.
    val idColumnName = config.getConfigString("column_name").getOrElse("id")
    val bodyColumnName = config.getConfigString("column_name").getOrElse("body")
    val genderColumnName = config.getConfigString("column_name").getOrElse("gender")

    // Decode each raw payload into a UTF-8 JSON string.
    val jsonRDD = rdd.map(r => byteUtils.bytesToUTF8String(r))

    val hashtagsRDD: RDD[(String, String, String)] = jsonRDD.mapPartitions { iter =>
      // Register the Jackson mapper once per partition (it is not serializable,
      // so it cannot be created on the driver and shipped to executors).
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)

      // FIX: each tweet yields exactly one tuple, so `map` is the right
      // combinator. The original `flatMap` with a `case _ :: tweet` cons
      // pattern on a String was the source of the type-mismatch error —
      // `::` deconstructs Lists, not Strings, and flatMap expects the
      // function to return a collection, not a bare tuple.
      iter.map { tweet =>
        val rootNode = mapper.readTree(tweet)
        // Tweet ids arrive as "prefix:prefix:id"; keep the third segment.
        val tweetId = rootNode.path("id").asText.split(":")(2)
        val tweetBody = rootNode.path("body").asText
        val tweetVector = new HashingTF().transform(tweetBody.split(" "))
        val result = genderModel.predict(tweetVector)
        // presumably 1.0 encodes "Male" in the trained model — TODO confirm.
        val gender = if (result == 1.0) {"Male"} else {"Female"}
        (tweetId, tweetBody, gender)
      }
    }

    // Lift tuples into Rows and attach a three-string-column schema.
    val rowRDD: RDD[Row] = hashtagsRDD.map(x => Row(x._1, x._2, x._3))
    val schema = StructType(Array(StructField(idColumnName, StringType, true), StructField(bodyColumnName, StringType, true), StructField(genderColumnName, StringType, true)))
    sqlContext.createDataFrame(rowRDD, schema)
  }
}

1 个答案:

答案 0 :(得分:0)

尝试使用map代替flatMap。当参数函数的结果类型是集合或RDD时——也就是当前集合的每个元素映射到零个或多个元素时——使用flatMap;当当前集合的每个元素恰好映射到一个元素时,使用map。

map在functorial types中只是把类型中的符号A替换为符号B:给定类型为A => B的参数函数,它将RDD[A]转换为RDD[B]。

flatMap在monadic types中可以读作"先map再flatten"。例如,你有RDD[A],且参数函数的类型为A => RDD[B];简单map的结果将是RDD[RDD[B]],再通过flatten即可把这层嵌套简化为RDD[B]。

这里是成功编译代码的示例。

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Minimal stand-in for the real config type: looks up a string value by key.
class UserTransformConfig {
  // Returns the configured value for `name`, or None when absent.
  // `???` — intentionally unimplemented; this stub exists only so the
  // example compiles.
  def getConfigString(name: String): Option[String] = ???
}

class PhaseLogger
// Stub of the byte-decoding helper referenced by `transform`.
object byteUtils {
  // Decodes a raw byte payload into a UTF-8 String (unimplemented stub).
  def bytesToUTF8String(r: Array[Byte]): String = ???
}

// Stub mimicking MLlib's HashingTF: turns tokens into a numeric feature vector.
class HashingTF {
  // Maps word tokens to a feature vector (unimplemented stub).
  def transform(strs: Array[String]): Array[Double] = ???
}

// Stub of the trained classifier used to predict gender from a feature vector.
object genderModel {
  // Returns the predicted class label for `v` (unimplemented stub).
  def predict(v: Array[Double]): Double = ???
}

/** Builds a DataFrame of (id, body, gender) rows from raw UTF-8 tweet JSON bytes. */
def transform(sqlContext: SQLContext, rdd: RDD[Array[Byte]], config: UserTransformConfig, logger: PhaseLogger): DataFrame = {
  // NOTE(review): all three lookups share the key "column_name" — the defaults
  // differ, but the configured value (if any) is identical; confirm the keys.
  val idColumnName = config.getConfigString("column_name").getOrElse("id")
  val bodyColumnName = config.getConfigString("column_name").getOrElse("body")
  val genderColumnName = config.getConfigString("column_name").getOrElse("gender")

  // Decode every raw payload into a UTF-8 JSON string.
  val jsonRDD = rdd.map(byteUtils.bytesToUTF8String)

  // Each tweet yields exactly one tuple, hence `map` rather than `flatMap`.
  val hashtagsRDD: RDD[(String, String, String)] = jsonRDD.mapPartitions { tweets =>
    // Jackson's ObjectMapper is not serializable, so build one per partition.
    val mapper = new ObjectMapper
    mapper.registerModule(DefaultScalaModule)

    tweets.map { raw =>
      val root = mapper.readTree(raw)
      // Ids arrive as "prefix:prefix:id"; keep the third colon-separated segment.
      val id = root.path("id").asText.split(":")(2)
      val body = root.path("body").asText
      val features = new HashingTF().transform(body.split(" "))
      val gender = if (genderModel.predict(features) == 1.0) "Male" else "Female"
      (id, body, gender)
    }
  }

  // Lift the tuples into Rows and pair them with a three-string-column schema.
  val rowRDD: RDD[Row] = hashtagsRDD.map { case (id, body, gender) => Row(id, body, gender) }
  val schema = StructType(Array(
    StructField(idColumnName, StringType, true),
    StructField(bodyColumnName, StringType, true),
    StructField(genderColumnName, StringType, true)))
  sqlContext.createDataFrame(rowRDD, schema)
}

请注意,由于你没有提供minimum example(最小可复现示例),我不得不凭想象补充了大量代码。一般来说,这样的问题是不值得回答的。