Streaming Avro data from a Kafka topic with Spark using Scala

Asked: 2016-12-13 14:54:19

Tags: scala spark-streaming kafka-consumer-api

I have been trying to stream Avro data from a Kafka topic using Spark Streaming written in Scala (I followed this link). The main goal is to parse the Avro data coming from the Kafka topic, build a DataFrame from the DStream, and persist it to HDFS. The code runs without throwing any exception, but it also produces no output.

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import kafka.serializer.DefaultDecoder
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.log4j.{Level, Logger}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

case class mobileData(action: String, tenantid: Int, lat: Float, lon: Float, memberid: Int, event_name: String, productUpccd: Int, device_type: String, device_os_ver: Float, item_name: String)
object AvroConsumer {
  val eventSchema = SchemaBuilder.record("eventRecord").fields
    .name("action").`type`().stringType().noDefault()
    .name("tenantid").`type`().intType().noDefault()
    .name("lat").`type`().doubleType().noDefault()
    .name("lon").`type`().doubleType().noDefault()
    .name("memberid").`type`().intType().noDefault()
    .name("event_name").`type`().stringType().noDefault()
    .name("productUpccd").`type`().intType().noDefault()
    .name("device_type").`type`().stringType().noDefault()
    .name("device_os_ver").`type`().stringType().noDefault()
    .name("item_name").`type`().stringType().noDefault().endRecord
  def main(args: Array[String]): Unit = {
    Logger.getLogger("kafka.consumer.AvroConsumer").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.storage.BlockManager").setLevel(Level.ERROR)
    val logger: Logger = Logger.getLogger("kafka.consumer.AvroConsumer")
    val sparkConf = new SparkConf().setAppName("Avro Consumer").
      setMaster("dev-mapr-node2.fishbowl.com").set("spark.driver.allowMultipleContexts", "true")
    sparkConf.set("spark.cores.max", "2")
    sparkConf.set("spark.serializer", classOf[KryoSerializer].getName)
    sparkConf.set("spark.sql.tungsten.enabled", "true")
    sparkConf.set("spark.eventLog.enabled", "true")
    sparkConf.set("spark.app.id", "KafkaConsumer")
    sparkConf.set("spark.io.compression.codec", "snappy")
    sparkConf.set("spark.rdd.compress", "true")
    sparkConf.set("spark.streaming.backpressure.enabled", "true")
    sparkConf.set("spark.sql.avro.compression.codec", "snappy")
    sparkConf.set("spark.sql.avro.mergeSchema", "true")
    sparkConf.set("spark.sql.avro.binaryAsString", "true")
    val sc = new SparkContext(sparkConf)
    sc.hadoopConfiguration.set("avro.enable.summary-metadata", "false")
    val ssc = new StreamingContext(sc, Seconds(2))
    try {
      val kafkaConf = Map[String, String]("metadata.broker.list" -> "#################:9092",
        "zookeeper.connect" -> "###################:5181",
        "group.id" -> "KafkaConsumer",
        "zookeeper.connection.timeout.ms" -> "1000000")
      val topicMaps = Map("fishbowl" -> 1)
      val messages: ReceiverInputDStream[(String, Array[Byte])] = KafkaUtils.createStream[String, Array[Byte], DefaultDecoder, DefaultDecoder](ssc, kafkaConf, topicMaps, StorageLevel.MEMORY_ONLY_SER)
      try {
        messages.foreachRDD((rdd, time) => {
          if (rdd != null) {
            try {
              import com.databricks.spark.avro._
              import org.apache.spark.sql.SQLContext
              val sqlContext = new SQLContext(sc)
              import sqlContext.implicits._

              val rdd2 = rdd.map { case (k, v) => parseAVROToString(v) }

              try {
                val result = rdd2.mapPartitions(records => {
                  val mapper = new ObjectMapper()
                  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
                  mapper.registerModule(DefaultScalaModule)
                  records.flatMap(record => {
                    try {
                      Some(mapper.readValue(record, classOf[mobileData]))
                    } catch {
                      case e: Exception => None;
                    }

                  })
                }, true)
                println(result)

                val df1 = result.toDF()
                df1.printSchema()
                df1.show()
                logger.error("Registered Events: " + df1.count())
                df1.registerTempTable("mdl_events")

                df1.write.format("com.databricks.spark.avro").mode(org.apache.spark.sql.SaveMode.Append).save("/tmp/output")
                //df1.write.format("org.apache.sql").mode(SaveMode.Append).save("/tmp/output")
                //df1.write.format("avro").mode(org.apache.spark.sql.SaveMode.Append).avro("/avroResults")
              //  sqlContext.read.format("com.databricks.spark.avro").load("/avroResults")
              } catch {
                case e: Exception => None;
              }
            } catch {
              case e: Exception => None;
            }
          }
        })
      } catch {
        case e: Exception =>
          println("Writing files after job. Exception:" + e.getMessage);
          e.printStackTrace();
      }
    } catch {
      case e: Exception =>
        println("Kafka Stream. Writing files after job. Exception:" + e.getMessage);
        e.printStackTrace();
    }
    ssc.start()
    ssc.awaitTermination()
  }

  def parseAVROToString(rawEvent: Array[Byte]): String = {
    try {
      if (rawEvent.isEmpty) {
        println("Rejected Event")
        "Empty"
      } else {
        deserializeEvent(rawEvent).get("eventRecord").toString
      }
    } catch {
      case e: Exception =>
        println("Exception:" + e.getMessage);
        "Empty"
    }
  }

  def deserializeEvent(rawEvent: Array[Byte]): GenericRecord = {
    try {
      val reader = new GenericDatumReader[GenericRecord](eventSchema)
      val decoder = DecoderFactory.get.binaryDecoder(rawEvent, null)
      reader.read(null, decoder)
    } catch {
      case e: Exception =>
        None;
        null;
    }
  }
}
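Since the job runs but shows no rows, one minimal sanity check (a sketch, not part of the posted code) is to print a per-batch record count on the raw stream before any parsing, e.g. right after createStream inside main:

// count raw (key, bytes) records per 2-second batch, before any Avro/JSON parsing is attempted
messages.count().print()

If this prints zeros, the problem is upstream of the Avro decoding; if it prints non-zero counts, records are arriving and are being dropped later by one of the silent catch blocks.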

Here is the pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>kafka</groupId>
  <artifactId>consumer</artifactId>
  <version>1.0.0.0-SNAPSHOT</version>
  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.10.6</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <scope>provided</scope>
      <version>1.6.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.6.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.10</artifactId>
      <version>2.0.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.6.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-examples_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-examples_2.10</artifactId>
      <version>1.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.11</artifactId>
      <version>1.6.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.11</artifactId>
      <version>0.9.0.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.avro</groupId>
      <artifactId>avro</artifactId>
      <version>1.8.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.databricks/spark-avro_2.10 -->
    <dependency>
      <groupId>com.databricks</groupId>
      <artifactId>spark-avro_2.10</artifactId>
      <version>2.0.1</version>
    </dependency>

    <dependency>
      <groupId>com.fasterxml.jackson.module</groupId>
      <artifactId>jackson-module-scala_2.10</artifactId>
      <version>2.8.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.fasterxml/jackson-xml-databind -->
    <dependency>
      <groupId>com.fasterxml</groupId>
      <artifactId>jackson-xml-databind</artifactId>
      <version>0.6.2</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <groupId>org.codehaus.mojo</groupId>
          <artifactId>build-helper-maven-plugin</artifactId>
          <version>1.7</version>
          <executions>
            <execution>
              <id>add-source</id>
              <phase>generate-sources</phase>
              <goals>
                <goal>add-source</goal>
              </goals>
              <configuration>
                <sources>
                  <source>src/main/java</source>
                </sources>
              </configuration>
            </execution>
          </executions>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>

      <!-- mixed scala/java compile -->
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.2</version>
        <executions>
          <execution>
            <id>compile</id>
            <goals>
              <goal>compile</goal>
            </goals>
            <phase>compile</phase>
          </execution>
          <execution>
            <id>test-compile</id>
            <goals>
              <goal>testCompile</goal>
            </goals>
            <phase>test-compile</phase>
          </execution>
          <execution>
            <phase>process-resources</phase>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.6.0</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
      <!-- for fatjar -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.4</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>assemble-all</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>3.0.2</version>
        <configuration>
          <archive>
            <manifest>
              <addClasspath>true</addClasspath>
              <mainClass>kafka.consumer.AvroConsumer1</mainClass>
            </manifest>
          </archive>
        </configuration>
      </plugin>
    </plugins>


  </build>

</project>

Here is my console log:

  

root
 |-- action: string (nullable = true)
 |-- tenantid: integer (nullable = false)
 |-- lat: float (nullable = false)
 |-- lon: float (nullable = false)
 |-- memberid: integer (nullable = false)
 |-- event_name: string (nullable = true)
 |-- productUpccd: integer (nullable = false)
 |-- device_type: string (nullable = true)
 |-- device_os_ver: float (nullable = false)
 |-- item_name: string (nullable = true)

+------+--------+---+---+--------+----------+------------+-----------+-------------+---------+
|action|tenantid|lat|lon|memberid|event_name|productUpccd|device_type|device_os_ver|item_name|
+------+--------+---+---+--------+----------+------------+-----------+-------------+---------+
+------+--------+---+---+--------+----------+------------+-----------+-------------+---------+

+------+
|action|
+------+
+------+

16/12/13 18:22:08 INFO scheduler.JobScheduler: Finished job streaming job 1481633528000 ms.0 from job set of time 1481633528000 ms
16/12/13 18:22:08 INFO scheduler.JobScheduler: Total delay: 0.097 s for time 1481633528000 ms (execution: 0.094 s)
16/12/13 18:22:08 INFO rdd.BlockRDD: Removing RDD 67 from persistence list
16/12/13 18:22:08 INFO kafka.KafkaInputDStream: Removing blocks of RDD BlockRDD[67] at createStream at AvroConsumer.scala:63 of time 1481633528000 ms
16/12/13 18:22:08 INFO scheduler.ReceivedBlockTracker: Deleting batches ArrayBuffer(1481633524000 ms)
16/12/13 18:22:08 INFO scheduler.InputInfoTracker: remove old batch metadata: 1481633524000 ms
^C16/12/13 18:22:09 INFO streaming.StreamingContext: Invoking stop(stopGracefully=false) from shutdown hook
16/12/13 18:22:09 INFO scheduler.ReceiverTracker: Sent stop signal to all 1 receivers
16/12/13 18:22:10 INFO scheduler.JobScheduler: Added jobs for time 1481633530000 ms
16/12/13 18:22:10 INFO scheduler.JobScheduler: Starting job streaming job 1481633530000 ms.0 from job set of time 1481633530000 ms

And here is my Avro producer program in Java:

import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import com.twitter.bijection.Injection;
import com.twitter.bijection.avro.GenericAvroCodecs;

public class KafkaAvroProducer {

    /* case class
     TopicData("action":"AppEvent","tenantid":1173,"lat":0.0,"lon":0.0,"memberid":55,
     "event_name":"CATEGORY_CLICK",
     "productUpccd":0,"device_type":"iPhone","device_os_ver":"10.1","item_name":"CHICKEN",*/

    public static final String EVENT_SCHEMA = "{" + "\"type\":\"record\","
            + "\"name\":\"eventrecord\"," + "\"fields\":["
            + "  { \"name\":\"action\", \"type\":\"string\" },"
            + "  { \"name\":\"tenantid\", \"type\":\"int\" },"
            + "  { \"name\":\"lat\", \"type\":\"double\" },"
            + "  { \"name\":\"lon\", \"type\":\"double\" },"
            + "  { \"name\":\"memberid\", \"type\":\"int\" },"
            + "  { \"name\":\"event_name\", \"type\":\"string\" },"
            + "  { \"name\":\"productUpccd\", \"type\":\"int\" },"
            + "  { \"name\":\"device_type\", \"type\":\"string\" },"
            + "  { \"name\":\"device_os_ver\", \"type\":\"string\" },"
            + "{ \"name\":\"item_name\", \"type\":\"string\" }" + "]}";

    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "##########:9092");
        props.put("key.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer",
                "org.apache.kafka.common.serialization.ByteArraySerializer");
        props.put("producer.type", "async");
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(EVENT_SCHEMA);
        Injection<GenericRecord,byte[]> avroRecords = GenericAvroCodecs.toBinary(schema);
        KafkaProducer<String, byte[]> producer = new KafkaProducer<>(props);
        for(int i = 0; i<300;i++){
            GenericData.Record avroRecord = new GenericData.Record(schema);
            setEventValues(i, avroRecord);
            byte[] bytes = avroRecords.apply(avroRecord);
            ProducerRecord<String, byte[]> producerRecord = new ProducerRecord<String, byte[]>("fishbowl",bytes);
            System.out.println(producerRecord);
            producer.send(producerRecord);

        }
        producer.close();
    }

    private static void setEventValues(int i, Record avroRecord) {

        avroRecord.put("action", "AppEvent");
        avroRecord.put("tenantid", i);
        avroRecord.put("lat", 0.0);
        avroRecord.put("lon", 0.0);
        avroRecord.put("memberid", 55);
        avroRecord.put("event_name", "CATEGORY_CLICK");
        avroRecord.put("productUpccd", 0);
        avroRecord.put("device_type", "iPhone");
        avroRecord.put("device_os_ver", "10.1");
        avroRecord.put("item_name", "CHICKEN");
    }

}
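For completeness, here is a small standalone round-trip check (a sketch, not part of the posted code; it assumes the same Twitter bijection-avro library the producer already uses is on the classpath, and the object name AvroRoundTripCheck plus the abridged three-field schema are made up for illustration). It encodes one record the way KafkaAvroProducer does and decodes it the way AvroConsumer.deserializeEvent does, with no Kafka or Spark in between, so any schema or field-lookup mismatch shows up immediately:

import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory

object AvroRoundTripCheck {
  def main(args: Array[String]): Unit = {
    // Writer schema: an abridged copy of the producer's EVENT_SCHEMA (three fields only)
    val writerSchema = new Schema.Parser().parse(
      """{"type":"record","name":"eventrecord","fields":[
        |  {"name":"action","type":"string"},
        |  {"name":"lat","type":"double"},
        |  {"name":"device_os_ver","type":"string"}]}""".stripMargin)

    // Encode one record exactly as KafkaAvroProducer does
    val toBytes: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary[GenericRecord](writerSchema)
    val rec = new GenericData.Record(writerSchema)
    rec.put("action", "AppEvent")
    rec.put("lat", 0.0)
    rec.put("device_os_ver", "10.1")
    val bytes = toBytes(rec)

    // Decode exactly as AvroConsumer.deserializeEvent does (binary decoder plus a reader schema)
    val reader = new GenericDatumReader[GenericRecord](writerSchema)
    val decoded = reader.read(null, DecoderFactory.get.binaryDecoder(bytes, null))
    println(decoded)                    // the whole decoded record
    println(decoded.get("eventRecord")) // the field parseAVROToString looks up; null if no such field exists
  }
}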

0 Answers

There are no answers yet.