Spark Kafka Streaming multi-partition commitAsync issue

Date: 2018-03-20 02:23:28

Tags: scala apache-spark spark-streaming rdd scala-streams

I am reading messages from a Kafka topic that has multiple partitions. Reading the messages works fine, but when I commit the offset ranges back to Kafka I get an error. I have tried my best but cannot resolve this issue.

Code



object ParallelStreamJob {

  def main(args: Array[String]): Unit = {
    val spark = SparkHelper.getOrCreateSparkSession()
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    spark.sparkContext.setLogLevel("WARN")
    val kafkaStream = {
      val kafkaParams = Map[String, Object](
        "bootstrap.servers" -> "localhost:9092",
        "key.deserializer" -> classOf[StringDeserializer],
        "value.deserializer" -> classOf[StringDeserializer],
        "group.id" -> "welcome3",

        "auto.offset.reset" -> "latest",
        "enable.auto.commit" -> (false: java.lang.Boolean)
      )

      val topics = Array("test2")
      val numPartitionsOfInputTopic = 2
      val streams = (1 to numPartitionsOfInputTopic) map { _ =>
        KafkaUtils.createDirectStream[String, String](
          ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
      }
      streams
    }

    // var offsetRanges = Array[OffsetRange]()
    kafkaStream.foreach(rdd => {
      rdd.foreachRDD(conRec => {
        val offsetRanges = conRec.asInstanceOf[HasOffsetRanges].offsetRanges
        conRec.foreach(str => {
          println(str.value())
          for (o <- offsetRanges) {
            println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
          }
        })

        kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      })
    })

    println(" Spark parallel reader is ready !!!")


    ssc.start()
    ssc.awaitTermination()
  }
}

2 answers:

Answer 0 (score: 0)

You can commit the offsets like this:
stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

  // some time later, after outputs have completed
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}

In your case, kafkaStream is a Seq of streams, so change your commit line accordingly. Reference: https://spark.apache.org/docs/latest/streaming-kafka-0-10-integration.html
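
Applied to the question's setup, where kafkaStream is a Seq of direct streams, each stream has to commit the offsets of the batches it produced. A minimal sketch (same imports and kafkaParams as in the question; the offset printing is omitted for brevity):

kafkaStream.foreach { stream =>
  stream.foreachRDD { rdd =>
    val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

    // process the batch first, then commit its offsets
    rdd.foreach(record => println(record.value()))

    // commit on the individual DStream, which implements CanCommitOffsets;
    // the Seq wrapping all the streams does not
    stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
  }
}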

Answer 1 (score: 0)

Change the line kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) to rdd.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
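
Note that in the question's loop the outer variable named rdd is actually one of the direct streams taken from the Seq, so the cast to CanCommitOffsets succeeds on it. A sketch of the changed block, keeping the question's variable names and assuming the rest of the setup is unchanged:

kafkaStream.foreach(rdd => {      // rdd here is really a DStream from the Seq
  rdd.foreachRDD(conRec => {      // conRec is the RDD for the current batch
    val offsetRanges = conRec.asInstanceOf[HasOffsetRanges].offsetRanges
    conRec.foreach(str => println(str.value()))
    // commit on the stream (rdd), not on kafkaStream, which is a Seq
    rdd.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
  })
})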