Spark Streaming中的单元测试用例

时间:2018-09-19 19:32:44

标签: apache-spark spark-streaming

我正在尝试使用DStreams在Spark Streaming中编写单元测试。这是我的测试用例。

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{SparkConf, SparkContext}
import org.specs2.Specification
import org.specs2.specification.core.SpecStructure

import scala.collection.mutable

case class Visit(userId: Long, page: String, duration: Long) extends Serializable

class MapWithStateTest extends Specification with Serializable {
  override def is: SpecStructure = "testing aggregation of Hcom_ClickstreamHit DStream"


  val slideDuration = Milliseconds(100)
  val conf = new SparkConf()
  @transient val sc = new SparkContext(master = "local[*]", appName = "test1", conf = conf)
  sc.setLogLevel("Error")
  @transient val sparkSession: SparkSession = SparkSession.builder.config(conf).getOrCreate()

  val sparkContext: SparkContext = sparkSession.sparkContext
  val queueOfRDDs = mutable.Queue[RDD[Visit]]()

  @transient val streamingContext: StreamingContext = new StreamingContext(sparkContext, slideDuration)


  val rddFloodOfQueuesDS: DStream[Visit] = streamingContext.queueStream(queueOfRDDs, oneAtATime = false)

  //rddFloodOfQueuesDS.foreachRDD(rdd => rdd.foreach(println))

  val visits = Seq(
    Visit(1, "home.html", 10),
    Visit(2, "cart.html", 5),
    Visit(1, "home.html", 10),
    Visit(2, "address/shipping.html", 10),
    Visit(2, "address/billing.html", 10)
  )

  visits.foreach(visit => queueOfRDDs += streamingContext.sparkContext.makeRDD(Seq(visit)))

  val output = streamingContext.queueStream(queueOfRDDs)
    .map(visit => (visit.userId, visit))
    .mapWithState(StateSpec.function(TransformFunctions.handleVisit _)
      .timeout(Durations.seconds(1)))

  println(output.count())
  output.count() must_==(2)

  streamingContext.checkpoint("/tmp/")
  streamingContext.start()
  streamingContext.awaitTerminationOrTimeout(10000)

}

帮助对象在这里(单独创建)

import org.apache.spark.streaming.State

object TransformFunctions  extends Serializable  {

  def handleVisit(key: Long, visit: Option[Visit], state: State[Long]): Option[Any] = {
    (visit, state.getOption()) match {
      case (Some(newVisit), None) => {
        // the 1st visit
        state.update(newVisit.duration)
        None
      }
      case (Some(newVisit), Some(totalDuration)) => {
        // next visit
        state.update(totalDuration + newVisit.duration)
        None
      }
      case (None, Some(totalDuration)) => {
        // last state - timeout occurred and passed
        // value is None in this case
        Some(key, totalDuration)
      }
      case _ => None
    }
  }

}

我没有收到任何错误,但是我收到了以下消息:空测试套件。如果我有任何遗漏,请告诉我。另外,请使用DStreams发布一个简单的示例单元测试用例。或将我指向任何博客。

0 个答案:

没有答案