使用 Azure 事件中心(Event Hubs)的 Spark Streaming

时间:2016-05-24 14:08:36

标签: azure apache-spark azure-eventhub

我按照给定的教程(https://azure.microsoft.com/en-in/documentation/articles/hdinsight-apache-spark-eventhub-streaming/)一步一步地进行了尝试,只是根据自己的需求修改了 Spark 接收端代码。通过 spark-submit 提交作业后,Spark Streaming 消费者 API 会以 DStream[Array[Byte]] 的形式从 EventHub 获取数据,我对其执行 foreachRDD 并转换为 RDD[String]。我遇到的问题是:创建流的那一行之后的语句一直不会执行,直到我按 Ctrl+C 停止程序为止。

package com.onerm.spark

import org.apache.spark.streaming.{Seconds, StreamingContext} 
import org.apache.spark.streaming.eventhubs.EventHubsUtils 
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark._
import org.apache.spark.sql.hive.HiveContext
import java.util.concurrent.{Executors, ExecutorService}

/**
 * Spark Streaming job that reads events from an Azure Event Hub, parses every
 * non-empty micro-batch as JSON, projects a fixed set of fields with Spark SQL
 * and appends the projection to a Parquet directory.
 */
object HiveEvents {

  /**
   * Decodes a raw event payload into a String.
   *
   * The payload is decoded as UTF-8 explicitly instead of relying on the JVM's
   * platform-default charset, so the result is the same on every node.
   *
   * @param a raw bytes of one event body
   * @return the bytes decoded as UTF-8 text
   */
  def b2s(a: Array[Byte]): String =
    new String(a, java.nio.charset.StandardCharsets.UTF_8)

  /**
   * Entry point: builds the streaming pipeline, starts it and blocks until the
   * streaming context terminates.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {

    // NOTE(review): the policy key is a secret committed in source; it should
    // be supplied via configuration or the environment instead.
    val ehParams = Map[String, String](
      "eventhubs.policyname" -> "myreceivepolicy",
      "eventhubs.policykey" -> "jgrH/5yjdMjajQ1WUAQsKAVGTu34=",
      "eventhubs.namespace" -> "SparkeventHubTest-ns",
      "eventhubs.name" -> "SparkeventHubTest",
      "eventhubs.partition.count" -> "4",
      "eventhubs.consumergroup" -> "$default",
      "eventhubs.checkpoint.dir" -> "/EventCheckpoint_0.1",
      "eventhubs.checkpoint.interval" -> "10"
    )

    val conf = new SparkConf().setAppName("Eventhubs Onerm")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // Output operations below run once per batch, i.e. every 120 seconds.
    val ssc = new StreamingContext(sc, Seconds(120))

    val stream = EventHubsUtils.createUnionStream(ssc, ehParams)

    // Reported symptom: the lines below do not execute until the job is stopped.
    // NOTE(review): presumably each Event Hub partition gets its own receiver,
    // and every receiver permanently occupies one core. If the application has
    // no more cores than receivers (4 partitions here), no batch can ever be
    // processed -- confirm the executor core count given to spark-submit.
    stream.print()

    stream.foreachRDD { rdd =>
      if (rdd.isEmpty()) {
        println("RDD IS EMPTY ")
      } else {
        // The decoded batch is scanned several times (count, JSON schema
        // inference, show, Parquet write), so keep it cached for the batch.
        val dataString = rdd.map(line => b2s(line)).cache()
        try {
          println("COUNT" + dataString.count())
          sqlContext.read.json(dataString).registerTempTable("jsoneventdata")
          val filterData = sqlContext.sql("SELECT    id,ClientProperties.PID,ClientProperties.Program,ClientProperties.Platform,ClientProperties.Version,ClientProperties.HWType,ClientProperties.OffVer,ContentID,Data,Locale,MappedSources,MarketingMessageContext.ActivityInstanceID,MarketingMessageContext.CampaignID,MarketingMessageContext.SegmentName,MarketingMessageContext.OneRMInstanceID,MarketingMessageContext.DateTimeSegmented,Source,Timestamp.Date,Timestamp.Epoch,TransactionID,UserAction,EventProcessedUtcTime,PartitionId,EventEnqueuedUtcTime from jsoneventdata")

          filterData.show(10)

          // Append mode: the deprecated saveAsParquetFile fails with
          // "path already exists" from the second non-empty batch onwards.
          // NOTE(review): this path is relative, unlike the absolute
          // checkpoint dir above -- confirm that is intended.
          filterData.write.mode("append").parquet("EventCheckpoint_0.1/ParquetEvent")
        } finally {
          dataString.unpersist()
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

0 个答案:

没有答案