在案例类上使用updateStateWithKey

时间:2018-04-18 12:49:11

标签: apache-spark spark-streaming

我正在尝试编写一个updateStateWithKey来获取输入的第一个值。当我尝试在案例类中使用它时,我收到错误

import org.apache.log4j.{Level, Logger}
import org.apache.spark._
import org.apache.spark.streaming._

case class Persons(name : String, school : String)

object StatefulNetworkWordCount {

  def getPerson (str : String) : Persons = {
    val splitArray = str.split(",")
    val name = splitArray(0)
    val school = splitArray(1)
    Persons(name, school)
  }

  //Now, newValues is the new set of values
  //runningCount is the existing values for each key

  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {

    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }


  def updateFunctionFrist(newValues: Seq[String], runningCount: Option[String]): Option[String] =
  {
    val newWord = if (runningCount.getOrElse("") == "") 
    {
      val str = newValues.head.toString //Use existing values
      Some(str)
    }
    else
    {
      val str = runningCount.getOrElse(newValues.head.toString) 
      Some(str)
    }
    newWord
  }

  def updateFunctionFirstPerson(newValues: Seq[Person], state: Option[Person]): Option[Person] =
  {
    val newWord = if (state.getOrElse("") == "") //If running count is empty
    {
      val str = newValues.head.asInstanceOf[Person]
      Some(str)
    }
    else
    {
      val str = state.getOrElse(newValues.head.asInstanceOf[Person])
      Some(str)
    }
    newWord
  }


  def main(args: Array[String]) {

    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setMaster("local[8]").setAppName("StatefulNetworkWordCount")


    val ssc = new StreamingContext(conf, Seconds(10))

    // Set checkpoint directory
    ssc.checkpoint(".")

    // Create a DStream that will connect to hostname:port, like localhost:9999
    val lines = ssc.socketTextStream("localhost", 9999)

    // Split each line into words
    val words = lines.flatMap(_.split(" "))

    // Count each word in each batch
    val pairs = words.map(word => (word.hashCode, word))

    val runningCounts = pairs.updateStateByKey[Persons](updateFunctionFirstPerson _)

    runningCounts.print()

    ssc.start() // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }

}

该行

val runningCounts = pairs.updateStateByKey[Persons](updateFunctionFirstPerson _) 

抛出错误,但如果我使用

val runningCounts = pairs.updateStateByKey[String](updateFunctionFirst _)

获取密钥的第一个值,它可以正常工作。我们可以在updateStateBykey中使用自定义类吗?我怎么用呢?

0 个答案:

没有答案