I have been trying to stream Avro data from a Kafka topic using Spark Streaming written in Scala (I followed this link). The main goal is to parse the Avro data coming off the Kafka topic, build a DataFrame from the DStreams, and persist it to HDFS. The code runs without throwing any exceptions, but it also produces no output.
case class mobileData(action: String, tenantid: Int, lat: Float, lon: Float, memberid: Int, event_name: String, productUpccd: Int, device_type: String, device_os_ver: Float, item_name: String)
object AvroConsumer {

  val eventSchema = SchemaBuilder.record("eventRecord").fields
    .name("action").`type`().stringType().noDefault()
    .name("tenantid").`type`().intType().noDefault()
    .name("lat").`type`().doubleType().noDefault()
    .name("lon").`type`().doubleType().noDefault()
    .name("memberid").`type`().intType().noDefault()
    .name("event_name").`type`().stringType().noDefault()
    .name("productUpccd").`type`().intType().noDefault()
    .name("device_type").`type`().stringType().noDefault()
    .name("device_os_ver").`type`().stringType().noDefault()
    .name("item_name").`type`().stringType().noDefault().endRecord
  def main(args: Array[String]): Unit = {
    Logger.getLogger("kafka.consumer.AvroConsumer").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.storage.BlockManager").setLevel(Level.ERROR)
    val logger: Logger = Logger.getLogger("kafka.consumer.AvroConsumer")

    val sparkConf = new SparkConf().setAppName("Avro Consumer")
      .setMaster("dev-mapr-node2.fishbowl.com").set("spark.driver.allowMultipleContexts", "true")
    sparkConf.set("spark.cores.max", "2")
    sparkConf.set("spark.serializer", classOf[KryoSerializer].getName)
    sparkConf.set("spark.sql.tungsten.enabled", "true")
    sparkConf.set("spark.eventLog.enabled", "true")
    sparkConf.set("spark.app.id", "KafkaConsumer")
    sparkConf.set("spark.io.compression.codec", "snappy")
    sparkConf.set("spark.rdd.compress", "true")
    sparkConf.set("spark.streaming.backpressure.enabled", "true")
    sparkConf.set("spark.sql.avro.compression.codec", "snappy")
    sparkConf.set("spark.sql.avro.mergeSchema", "true")
    sparkConf.set("spark.sql.avro.binaryAsString", "true")

    val sc = new SparkContext(sparkConf)
    sc.hadoopConfiguration.set("avro.enable.summary-metadata", "false")
    val ssc = new StreamingContext(sc, Seconds(2))

    try {
      val kafkaConf = Map[String, String](
        "metadata.broker.list" -> "#################:9092",
        "zookeeper.connect" -> "###################:5181",
        "group.id" -> "KafkaConsumer",
        "zookeeper.connection.timeout.ms" -> "1000000")
      val topicMaps = Map("fishbowl" -> 1)
      val messages: ReceiverInputDStream[(String, Array[Byte])] =
        KafkaUtils.createStream[String, Array[Byte], DefaultDecoder, DefaultDecoder](ssc, kafkaConf, topicMaps, StorageLevel.MEMORY_ONLY_SER)
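      // The values arrive as the raw Avro-encoded bytes sent by the producer
      // (DefaultDecoder passes them through unchanged); they are turned back
      // into records below via parseAVROToString / deserializeEvent.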
      try {
        messages.foreachRDD((rdd, time) => {
          if (rdd != null) {
            try {
              import com.databricks.spark.avro._
              import org.apache.spark.sql.SQLContext
              val sqlContext = new SQLContext(sc)
              import sqlContext.implicits._

              val rdd2 = rdd.map { case (k, v) => parseAVROToString(v) }
              try {
                val result = rdd2.mapPartitions(records => {
                  val mapper = new ObjectMapper()
                  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
                  mapper.registerModule(DefaultScalaModule)
                  records.flatMap(record => {
                    try {
                      Some(mapper.readValue(record, classOf[mobileData]))
                    } catch {
                      case e: Exception => None
                    }
                  })
                }, true)

                println(result)
                val df1 = result.toDF()
                df1.printSchema()
                df1.show()
                logger.error("Registered Events: " + df1.count())
                df1.registerTempTable("mdl_events")
                df1.write.format("com.databricks.spark.avro").mode(org.apache.spark.sql.SaveMode.Append).save("/tmp/output")
                //df1.write.format("org.apache.sql").mode(SaveMode.Append).save("/tmp/output")
                //df1.write.format("avro").mode(org.apache.spark.sql.SaveMode.Append).avro("/avroResults")
                //sqlContext.read.format("com.databricks.spark.avro").load("/avroResults")
              } catch {
                case e: Exception => None
              }
            } catch {
              case e: Exception => None
            }
          }
        })
      } catch {
        case e: Exception =>
          println("Writing files after job. Exception:" + e.getMessage)
          e.printStackTrace()
      }
    } catch {
      case e: Exception =>
        println("Kafka Stream. Writing files after job. Exception:" + e.getMessage)
        e.printStackTrace()
    }

    ssc.start()
    ssc.awaitTermination()
  }
  def parseAVROToString(rawEvent: Array[Byte]): String = {
    try {
      if (rawEvent.isEmpty) {
        println("Rejected Event")
        "Empty"
      } else {
        deserializeEvent(rawEvent).get("eventRecord").toString
      }
    } catch {
      case e: Exception =>
        println("Exception:" + e.getMessage)
        "Empty"
    }
  }

  def deserializeEvent(rawEvent: Array[Byte]): GenericRecord = {
    try {
      val reader = new GenericDatumReader[GenericRecord](eventSchema)
      val decoder = DecoderFactory.get.binaryDecoder(rawEvent, null)
      reader.read(null, decoder)
    } catch {
      case e: Exception =>
        None
        null
    }
  }
}
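To convince myself that the Avro encode/decode itself behaves the way I expect, here is a minimal standalone round-trip sketch I run outside Spark. The object name AvroRoundTripCheck and the sample field values are just for illustration; it reuses the eventSchema built above and assumes the producer's Bijection codec emits plain (schema-less) Avro binary.

import java.io.ByteArrayOutputStream
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object AvroRoundTripCheck {
  def main(args: Array[String]): Unit = {
    // Build one record against the consumer-side schema (all fields are required).
    val record = new GenericData.Record(AvroConsumer.eventSchema)
    record.put("action", "AppEvent")
    record.put("tenantid", 1173)
    record.put("lat", 0.0)
    record.put("lon", 0.0)
    record.put("memberid", 55)
    record.put("event_name", "CATEGORY_CLICK")
    record.put("productUpccd", 0)
    record.put("device_type", "iPhone")
    record.put("device_os_ver", "10.1")
    record.put("item_name", "CHICKEN")

    // Serialize with the plain Avro binary encoder (my assumption: the same framing
    // as the bytes the producer sends to Kafka).
    val out = new ByteArrayOutputStream()
    val writer = new GenericDatumWriter[GenericRecord](AvroConsumer.eventSchema)
    val encoder = EncoderFactory.get.binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val bytes = out.toByteArray

    // Deserialize the bytes the same way deserializeEvent does.
    val reader = new GenericDatumReader[GenericRecord](AvroConsumer.eventSchema)
    val decoder = DecoderFactory.get.binaryDecoder(bytes, null)
    val decoded = reader.read(null, decoder)

    println(decoded)               // the whole record
    println(decoded.get("action")) // a single field
  }
}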
Here is the pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>kafka</groupId>
<artifactId>consumer</artifactId>
<version>1.0.0.0-SNAPSHOT</version>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.10.6</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<scope>provided</scope>
<version>1.6.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka_2.10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.10</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.10</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
<version>1.6.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-examples_2.10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-examples_2.10</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.11</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>0.9.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.8.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.databricks/spark-avro_2.10 -->
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-avro_2.10</artifactId>
<version>2.0.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.10</artifactId>
<version>2.8.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.8.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml/jackson-xml-databind -->
<dependency>
<groupId>com.fasterxml</groupId>
<artifactId>jackson-xml-databind</artifactId>
<version>0.6.2</version>
</dependency>
</dependencies>
<build>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>1.7</version>
<executions>
<execution>
<id>add-source</id>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/java</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<!-- mixed scala/java compile -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>compile</id>
<goals>
<goal>compile</goal>
</goals>
<phase>compile</phase>
</execution>
<execution>
<id>test-compile</id>
<goals>
<goal>testCompile</goal>
</goals>
<phase>test-compile</phase>
</execution>
<execution>
<phase>process-resources</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
</configuration>
</plugin>
<!-- for fatjar -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.4</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>assemble-all</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>kafka.consumer.AvroConsumer1</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>
Here is my console log:
root
 |-- action: string (nullable = true)
 |-- tenantid: integer (nullable = false)
 |-- lat: float (nullable = false)
 |-- lon: float (nullable = false)
 |-- memberid: integer (nullable = false)
 |-- event_name: string (nullable = true)
 |-- productUpccd: integer (nullable = false)
 |-- device_type: string (nullable = true)
 |-- device_os_ver: float (nullable = false)
 |-- item_name: string (nullable = true)
+------+--------+---+---+--------+----------+------------+-----------+-------------+---------+
|action|tenantid|lat|lon|memberid|event_name|productUpccd|device_type|device_os_ver|item_name|
+------+--------+---+---+--------+----------+------------+-----------+-------------+---------+
+------+--------+---+---+--------+----------+------------+-----------+-------------+---------+

+------+
|action|
+------+
+------+
16/12/13 18:22:08 INFO scheduler.JobScheduler: Finished job streaming job 1481633528000 ms.0 from job set of time 1481633528000 ms
16/12/13 18:22:08 INFO scheduler.JobScheduler: Total delay: 0.097 s for time 1481633528000 ms (execution: 0.094 s)
16/12/13 18:22:08 INFO rdd.BlockRDD: Removing RDD 67 from persistence list
16/12/13 18:22:08 INFO kafka.KafkaInputDStream: Removing blocks of RDD BlockRDD[67] at createStream at AvroConsumer.scala:63 of time 1481633528000 ms
16/12/13 18:22:08 INFO scheduler.ReceivedBlockTracker: Deleting batches ArrayBuffer(1481633524000 ms)
16/12/13 18:22:08 INFO scheduler.InputInfoTracker: remove old batch metadata: 1481633524000 ms
^C16/12/13 18:22:09 INFO streaming.StreamingContext: Invoking stop(stopGracefully=false) from shutdown hook
16/12/13 18:22:09 INFO scheduler.ReceiverTracker: Sent stop signal to all 1 receivers
16/12/13 18:22:10 INFO scheduler.JobScheduler: Added jobs for time 1481633530000 ms
16/12/13 18:22:10 INFO scheduler.JobScheduler: Starting job streaming job 1481633530000 ms.0 from job set of time 1481633530000 ms
And here is my Avro producer program in Java:
public class KafkaAvroProducer {

    /* sample record:
       "action":"AppEvent","tenantid":1173,"lat":0.0,"lon":0.0,"memberid":55,
       "event_name":"CATEGORY_CLICK",
       "productUpccd":0,"device_type":"iPhone","device_os_ver":"10.1","item_name":"CHICKEN" */
    public static final String EVENT_SCHEMA = "{" + "\"type\":\"record\","
            + "\"name\":\"eventrecord\"," + "\"fields\":["
            + " { \"name\":\"action\", \"type\":\"string\" },"
            + " { \"name\":\"tenantid\", \"type\":\"int\" },"
            + " { \"name\":\"lat\", \"type\":\"double\" },"
            + " { \"name\":\"lon\", \"type\":\"double\" },"
            + " { \"name\":\"memberid\", \"type\":\"int\" },"
            + " { \"name\":\"event_name\", \"type\":\"string\" },"
            + " { \"name\":\"productUpccd\", \"type\":\"int\" },"
            + " { \"name\":\"device_type\", \"type\":\"string\" },"
            + " { \"name\":\"device_os_ver\", \"type\":\"string\" },"
            + "{ \"name\":\"item_name\", \"type\":\"string\" }" + "]}";

    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "##########:9092");
        props.put("key.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer",
                "org.apache.kafka.common.serialization.ByteArraySerializer");
        props.put("producer.type", "async");

        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(EVENT_SCHEMA);
        Injection<GenericRecord, byte[]> avroRecords = GenericAvroCodecs.toBinary(schema);
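        // As far as I understand it, GenericAvroCodecs.toBinary gives a Bijection Injection
        // that serializes a GenericRecord to schema-less Avro binary, i.e. the bytes the
        // Spark side decodes with GenericDatumReader / binaryDecoder.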
        KafkaProducer<String, byte[]> producer = new KafkaProducer<>(props);
        for (int i = 0; i < 300; i++) {
            GenericData.Record avroRecord = new GenericData.Record(schema);
            setEventValues(i, avroRecord);
            byte[] bytes = avroRecords.apply(avroRecord);
            ProducerRecord<String, byte[]> producerRecord = new ProducerRecord<String, byte[]>("fishbowl", bytes);
            System.out.println(producerRecord);
            producer.send(producerRecord);
        }
        producer.close();
    }

    private static void setEventValues(int i, Record avroRecord) {
        avroRecord.put("action", "AppEvent");
        avroRecord.put("tenantid", i);
        avroRecord.put("lat", 0.0);
        avroRecord.put("lon", 0.0);
        avroRecord.put("memberid", 55);
        avroRecord.put("event_name", "CATEGORY_CLICK");
        avroRecord.put("productUpccd", 0);
        avroRecord.put("device_type", "iPhone");
        avroRecord.put("device_os_ver", "10.1");
        avroRecord.put("item_name", "CHICKEN");
    }
}