Converting real-time Spark Streaming to batch processing with Kafka

Date: 2017-10-13 09:33:29

Tags: apache-spark apache-kafka spark-streaming kafka-consumer-api

I have written Kafka Producer and Consumer code for Spark Streaming, but it only picks up real-time updates from the database. Can you tell me how to convert it to batch processing so that it executes at least once?
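For reference, the Kafka 0.8 direct integration used in the consumer code further below also exposes KafkaUtils.createRDD, which reads a fixed range of offsets as a one-shot batch RDD instead of a DStream. The following is only a rough sketch of that idea; the class name, partition id, and offset values are placeholders and are not part of the original code.

package com.test.anna.KafkaSpark;

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import kafka.serializer.StringDecoder;

// Hypothetical example class, not part of the original code.
public class KafkaBatchReadSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("kafka-batch-read").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");

        // Placeholder offsets: a real job would load the last processed offsets from
        // wherever they were persisted and read up to the current end of the partition.
        OffsetRange[] ranges = { OffsetRange.create("test7", 0, 0L, 1000L) };

        // One-shot batch read of the given offset range; no StreamingContext is involved.
        JavaPairRDD<String, String> batch = KafkaUtils.createRDD(
                sc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                kafkaParams, ranges);

        System.out.println("Batch read " + batch.count() + " records");

        sc.stop();
    }
}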

Producer code:

package com.test.anna.KafkaSpark;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

public class ProducerTest {

    public static void main(String[] args) {

        // System.setProperty("java.security.auth.login.config", "/Users/srikanth_kopparthy/Documents/keystore/client_jaas.conf");
//new
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "localhost:9092");

        properties.put("acks", "all");
        properties.put("retries", "0");
        properties.put("batch.size", "26384");
        properties.put("linger.ms", "1");
        properties.put("request.timeout.ms", "150000");
        properties.put("buffer.memory", "33554432");
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        //properties.put("partitioner.class", "SimplePartitioner");
       // properties.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
        properties.put("value.serializer", ProductsSerializer.class.getName());
        Producer<String, Products> producer = new KafkaProducer<>(properties);

        //JDBC Connection
     // JDBC driver name and database URL
         String JDBC_DRIVER = "oracle.jdbc.driver.OracleDriver";  
         String DB_URL = "jdbc:oracle:thin:@xyz:1976:cd";

        //  Database credentials
        String USER = "xyzzy";
         String PASS = "pwd";

        Connection conn = null;
        Statement stmt = null;
        try{
           //STEP 2: Register JDBC driver
           Class.forName(JDBC_DRIVER);

           //STEP 3: Open a connection
           System.out.println("Connecting to database...");
           conn = DriverManager.getConnection(DB_URL,USER,PASS);

           //STEP 4: Execute a query
           System.out.println("Creating statement...");
           stmt = conn.createStatement();
           String sql;
           sql = "select * from test_mdm";
           ResultSet rs = stmt.executeQuery(sql);
           System.out.println("RS-->"+rs.getRow()); 
           //STEP 5: Extract data from result set
           while(rs.next()){
              //Retrieve by column name
              String bo_id = rs.getString("bo");
              String profile_cm_seq = rs.getString("profile");
              String bo_cm_start_dt = rs.getString("cm");
              Products product = new Products();
              product.setId(bo_id);
              product.setName(profile_cm_seq);
              product.setParents(bo_cm_start_dt);
              //Display values
              System.out.print("BO_ID: " + bo_id);
              Future<RecordMetadata> response = producer.send(new ProducerRecord<>("test7", bo_id, product));
                try {
                    // Block until the broker acknowledges the record (or 60 seconds elapse).
                    RecordMetadata metadata = response.get(60, TimeUnit.SECONDS);
                    System.out.println(metadata.toString());
                    System.out.println("Sent: " + bo_id);
                } catch (InterruptedException e1) {
                    System.out.println("interrupted exception..");
                    e1.printStackTrace();
                } catch (ExecutionException e1) {
                    System.out.println("execution exception..");
                    e1.printStackTrace();
                } catch (TimeoutException e1) {
                    System.out.println("TimeoutException..");
                    e1.printStackTrace();
                }



           }
           //STEP 6: Clean-up environment
           rs.close();
           stmt.close();
           conn.close();
        }catch(SQLException se){
           //Handle errors for JDBC
           se.printStackTrace();
        }catch(Exception e){
           //Handle errors for Class.forName
           e.printStackTrace();
        }finally{
           //finally block used to close resources
           try{
              if(stmt!=null)
                 stmt.close();
           }catch(SQLException se2){
           }// nothing we can do
           try{
              if(conn!=null)
                 conn.close();
           }catch(SQLException se){
              se.printStackTrace();
           }//end finally try
           // Close the producer so any buffered records are flushed before exit.
           producer.close();
        }//end try
        System.out.println("Goodbye!");

        //JDBC Connection END






    }

}
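The producer above refers to a Products bean and a ProductsSerializer class that are not shown in the question. A minimal sketch of what they might look like is given below, assuming the serializer writes each record as a UTF-8 JSON string; the field names bo_id, profile_cm_seq and bo_cm_start_dt are taken from the columns used elsewhere in the code, but the implementation itself is an assumption, not the original classes.

// Products.java -- assumed to be a simple bean matching the setters used above.
package com.test.anna.KafkaSpark;

public class Products {
    private String id;
    private String name;
    private String parents;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getParents() { return parents; }
    public void setParents(String parents) { this.parents = parents; }
}

// ProductsSerializer.java -- assumed to serialize a Products record to JSON bytes.
package com.test.anna.KafkaSpark;

import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.apache.kafka.common.serialization.Serializer;
import org.codehaus.jettison.json.JSONObject;

public class ProductsSerializer implements Serializer<Products> {

    @Override
    public void configure(Map<String, ?> configs, boolean isKey) {
        // no configuration needed
    }

    @Override
    public byte[] serialize(String topic, Products product) {
        try {
            JSONObject json = new JSONObject();
            json.put("bo_id", product.getId());
            json.put("profile_cm_seq", product.getName());
            json.put("bo_cm_start_dt", product.getParents());
            return json.toString().getBytes(StandardCharsets.UTF_8);
        } catch (Exception e) {
            throw new RuntimeException("Failed to serialize Products record", e);
        }
    }

    @Override
    public void close() {
        // nothing to release
    }
}

If the value really is a JSON string per record, that would also explain why the consumer below can parse it directly with sqlContext.read().json(...).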

Consumer code:

package com.test.anna.KafkaSpark;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import com.databricks.spark.avro.*;
import kafka.serializer.StringDecoder;  


public class SparkStreamingConsumer {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("kafka-sandbox")
                .setMaster("local[*]")
                .set("spark.cassandra.connection.host","localhost"); //for cassandra
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(30000));

        // TODO: processing pipeline
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        kafkaParams.put("zookeeper.connect","localhost:2181");
        Set<String> topics = Collections.singleton("test7");
        JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
                String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

        directKafkaStream.foreachRDD(rdd -> {
            System.out.println("Message Received "+rdd.values().take(1));
            System.out.println("--- New RDD with " + rdd.partitions().size()
                + " partitions and " + rdd.count() + " records");
            rdd.foreach(record -> System.out.println(record));
           JavaRDD<String> rdd2 = rdd.map(x -> x._2);

           System.out.println("writing to cassandra");

           SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

           DataFrame df = sqlContext.read().json(rdd2);
           df.show();
           //producer-consumer-spark-streaming-dataframe-validation-transformation(avro)-kafka and cassandra    
           //Convert the dataframe to AVRO after validation

           if(df.count()>0)
           {
               //   df.write().mode("append").format("com.databricks.spark.avro").partitionby
               df.repartition(1).write().mode("append").format("com.databricks.spark.avro").save("/Users/vipul.tripathi/AVRO Files/fifth");
               //DataFrame df5=sqlContext.read().format("com.databricks.spark.avro").load("/Users/xyz/AVRO Files/fifth/part-r-00000-d63f8878-20c3-4eea-9ee5-cdd62876c153.avro");
             // System.out.println("printing df5");
              //df5.show();
//          
//           df.select("bo_id").show();
//           df.registerTempTable("test");
//           DataFrame tempTest= sqlContext.sql("SELECT bo_id, profile_cm_seq,bo_cm_start_dt from test");
//           System.out.println("List temp table---->"+ tempTest);
//           tempTest.write().format("org.apache.spark.sql.cassandra").mode("append").options(new HashMap<String, String>() {
//               {
//                   put("keyspace", "java_api3");
//                   put("table", "prod");
//               }
//           }).save();
           }



        });

        directKafkaStream.foreachRDD(rdd -> {
              OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
              rdd.foreachPartition(consumerRecords -> {
                OffsetRange o = offsetRanges[TaskContext.get().partitionId()];
                System.out.println(
                  o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset());
              });
            });

        ssc.start();
        ssc.awaitTermination();  

    }       
}

The above is the Producer and Consumer code. I want to run them and pick up the data that is already stored in the Oracle database.

How can I keep loading records incrementally in this scenario?
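One common way to make the producer-side load incremental is to track a high-water mark, e.g. the largest value of a modification timestamp or sequence column seen so far, and on the next run select only rows above it. The sketch below illustrates the idea only; the class name, the last_updated_ts column, and the watermark file path are assumptions, since the original table definition is not shown.

package com.test.anna.KafkaSpark;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Timestamp;

// Hypothetical example class, not part of the original code.
public class IncrementalLoadSketch {

    public static void main(String[] args) throws Exception {
        // File that remembers how far the previous run got (the "watermark").
        Path watermarkFile = Paths.get("/tmp/test_mdm.watermark");
        Timestamp lastSeen = Files.exists(watermarkFile)
                ? Timestamp.valueOf(new String(Files.readAllBytes(watermarkFile), StandardCharsets.UTF_8).trim())
                : new Timestamp(0L);

        Class.forName("oracle.jdbc.driver.OracleDriver");
        try (Connection conn = DriverManager.getConnection("jdbc:oracle:thin:@xyz:1976:cd", "xyzzy", "pwd");
             PreparedStatement ps = conn.prepareStatement(
                 // last_updated_ts is an assumed audit column; any monotonically
                 // increasing column (timestamp or sequence) can play this role.
                 "SELECT * FROM test_mdm WHERE last_updated_ts > ? ORDER BY last_updated_ts")) {

            ps.setTimestamp(1, lastSeen);
            Timestamp newWatermark = lastSeen;
            try (ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    // ...build a Products object and send it to Kafka, as in ProducerTest...
                    newWatermark = rs.getTimestamp("last_updated_ts");
                }
            }

            // Persist the new watermark only after the batch has been sent successfully,
            // so a failed run is simply retried from the previous watermark.
            Files.write(watermarkFile, newWatermark.toString().getBytes(StandardCharsets.UTF_8));
        }
    }
}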

Thanks in advance.

0 Answers:

No answers yet.