Saving a large number of columns from a Spark RDD (JSON) to a Cassandra table

Date: 2017-10-10 06:57:00

Tags: java apache-kafka spark-dataframe spark-streaming

We are trying to build an Apache Kafka / Spark Streaming application and save the RDD data to Cassandra. We send JSON objects to a Kafka producer topic, and the consumer is able to consume them.

However, we are running into a problem when saving this JSON data from the RDD to a Cassandra table.

Producer code:

package com.test.anna.KafkaSpark;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

public class ProducerTest {

    public static void main(String[] args) {

        // System.setProperty("java.security.auth.login.config", "/Users/srikanth_kopparthy/Documents/keystore/client_jaas.conf");

        Properties properties = new Properties();
        properties.put("bootstrap.servers", "localhost:9092");

        properties.put("acks", "all");
        properties.put("retries", "0");
        properties.put("batch.size", "16384");
        properties.put("linger.ms", "1");
        properties.put("request.timeout.ms", "150000");
        properties.put("buffer.memory", "33554432");
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        //properties.put("partitioner.class", "SimplePartitioner");
       // properties.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
        properties.put("value.serializer", ProductsSerializer.class.getName());
        Producer<String, Products> producer = new KafkaProducer<>(properties);

        //JDBC Connection
     // JDBC driver name and database URL
         String JDBC_DRIVER = "oracle.jdbc.driver.OracleDriver";  
         String DB_URL = "jdbc:oracle:thin:@oogway.corp.xyz.com:1976:cd";

        //  Database credentials
        String USER = "xyzzy";
         String PASS = "pwd";

        Connection conn = null;
        Statement stmt = null;
        try{
           //STEP 2: Register JDBC driver
           Class.forName("oracle.jdbc.driver.OracleDriver");

           //STEP 3: Open a connection
           System.out.println("Connecting to database...");
           conn = DriverManager.getConnection(DB_URL,USER,PASS);

           //STEP 4: Execute a query
           System.out.println("Creating statement...");
           stmt = conn.createStatement();
           String sql;
           sql = "select * from AP_SALESWS_COMP_CM WHERE ROWNUM <= 10";
           ResultSet rs = stmt.executeQuery(sql);

           //STEP 5: Extract data from result set
           while(rs.next()){
              //Retrieve by column name
             String bo_id  = rs.getString("BO_ID");
              String profile_cm_seq  = rs.getString("PROFILE_CM_SEQ");
              String bo_cm_start_dt  = rs.getString("BO_CM_START_DT");


//              List<String> listColumn=new ArrayList<>();
//              listColumn.add(bo_id);
//              listColumn.add(profile_cm_seq);
//              listColumn.add(bo_cm_start_dt);
              Products product=new Products();
              product.setId(bo_id);
              product.setName(profile_cm_seq);
              product.setParents(bo_cm_start_dt);

              String bo_cm_end_dt = rs.getString("END_DT");
              String cm_id = rs.getString("ID");

              // Display values
              System.out.print("ID: " + bo_id);

              // Send the Products object (serialized to JSON by ProductsSerializer)
              // to the "test7" topic, keyed by BO_ID.
              Future<RecordMetadata> response = producer.send(new ProducerRecord<>("test7", bo_id, product));
                  try {
                    response.get(60, TimeUnit.SECONDS);
                    while(response.isDone()) {
                        System.out.println(response.get().toString());
                        break;
                    }
                    System.out.println("Sent:" + bo_id);
                } catch (InterruptedException e1) {
                    System.out.println("interrupted exception..");
                    e1.printStackTrace();
                } catch (ExecutionException e1) {
                    System.out.println("execution exception..");
                    e1.printStackTrace();
                } catch (TimeoutException e1) {
                    System.out.println("TimeoutException..");
                    e1.printStackTrace();
                }



           }
           //STEP 6: Clean-up environment
           rs.close();
           stmt.close();
           conn.close();
        }catch(SQLException se){
           //Handle errors for JDBC
           se.printStackTrace();
        }catch(Exception e){
           //Handle errors for Class.forName
           e.printStackTrace();
        }finally{
           //finally block used to close resources
           try{
              if(stmt!=null)
                 stmt.close();
           }catch(SQLException se2){
           }// nothing we can do
           try{
              if(conn!=null)
                 conn.close();
           }catch(SQLException se){
              se.printStackTrace();
           }//end finally try
        }//end try
        System.out.println("Goodbye!");

        //JDBC Connection END






    }

}
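The Products bean and the ProductsSerializer referenced above are not included in the question. For completeness, a minimal sketch of what they might look like is below; the field names id/name/parents are inferred from the setters used in the producer, and the use of Jackson for the JSON serialization is our assumption, not working code from the project.

// Products.java -- plain bean matching the setters used in the producer (sketch).
package com.test.anna.KafkaSpark;

import java.io.Serializable;

public class Products implements Serializable {
    private String id;
    private String name;
    private String parents;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getParents() { return parents; }
    public void setParents(String parents) { this.parents = parents; }
}

// ProductsSerializer.java -- Kafka value serializer that writes the bean as JSON bytes
// (sketch, assuming Jackson 1.x; any JSON library would do).
package com.test.anna.KafkaSpark;

import java.util.Map;

import org.apache.kafka.common.serialization.Serializer;
import org.codehaus.jackson.map.ObjectMapper;

public class ProductsSerializer implements Serializer<Products> {
    private final ObjectMapper mapper = new ObjectMapper();

    @Override
    public void configure(Map<String, ?> configs, boolean isKey) { }

    @Override
    public byte[] serialize(String topic, Products data) {
        try {
            // Serialize the bean to its JSON byte representation.
            return mapper.writeValueAsBytes(data);
        } catch (Exception e) {
            throw new RuntimeException("Failed to serialize Products as JSON", e);
        }
    }

    @Override
    public void close() { }
}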

Consumer code:

package com.test.anna.KafkaSpark;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.codehaus.jackson.map.ObjectMapper;

import com.datastax.spark.connector.japi.CassandraJavaUtil;

import kafka.serializer.StringDecoder;
import scala.Tuple2;

public class SparkStreamingConsumer {
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        SparkConf conf = new SparkConf()
                .setAppName("kafka-sandbox")
                .setMaster("local[*]")
                .set("spark.cassandra.connection.host","localhost"); //for cassandra
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(20000));

        // TODO: processing pipeline
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        kafkaParams.put("zookeeper.connect","localhost:2181");
        Set<String> topics = Collections.singleton("test7");
        JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
                String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);




        directKafkaStream.foreachRDD(rdd -> {
            System.out.println("Message Received "+rdd.values().take(1));
            System.out.println("--- New RDD with " + rdd.partitions().size()
                + " partitions and " + rdd.count() + " records");
            rdd.foreach(record -> System.out.println(record));
            // Extract the message values (the JSON strings) from the (key, value) pairs.
            JavaRDD<String> rdd2 = rdd.map(x -> x._2());
            // JavaPairRDD<String, List<String>> c1 = rdd2.mapToPair(i -> new Tuple2(i._1()));
            System.out.println("rdd output----" + rdd.collect());
            System.out.println("rdd2 output--" + rdd2.collect());
            // System.out.println("rdd2 output after update--" + c1.collect());
            System.out.println("writing to cassandra");

         // JavaPairRDD<String,String> newRdd2= JavaPairRDD.fromJavaRDD(rdd2) ;

          // CassandraJavaUtil.javaFunctions(rdd2).writerBuilder("java_api3", "products", CassandraJavaUtil.mapTupleToRow(String.class,String.class,String.class)).saveToCassandra();

       //  CassandraJavaUtil.javaFunctions(rdd,Products.class).writerBuilder("java_api3", "products",CassandraJavaUtil.mapToRow(Products.class)).saveToCassandra();

         //  CassandraJavaUtil.javaFunctions(rdd2).writerBuilder("java_api3", "products", CassandraJavaUtil.mapTupleToRow(String.class,String.class,String.class)).withColumnSelector(CassandraJavaUtil.someColumns("id", "name","parents")).saveToCassandra(); 


            // Current attempt: write the (key, value) tuples straight to the java_api3.products
            // table -- this is the step that is failing for us.
            CassandraJavaUtil.javaFunctions(rdd)
                    .writerBuilder("java_api3", "products",
                            CassandraJavaUtil.mapTupleToRow(String.class, String.class))
                    .withColumnSelector(CassandraJavaUtil.someColumns("id", "name"))
                    .saveToCassandra();




        });



        ssc.start();
        ssc.awaitTermination();



    }   
}

Please tell us how to handle JSON in Spark Streaming so that we can save it directly to a Cassandra table.
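For context, a rough sketch of the direction we have in mind inside foreachRDD is shown below, where rdd2 is the JavaRDD<String> of message values from the code above. It assumes the message value is the JSON form of the Products bean and that the java_api3.products table has columns id, name and parents; the Jackson parsing and the mapToRow mapping here are our assumptions, not verified code.

            // Sketch: parse each JSON value back into a Products bean...
            JavaRDD<Products> productsRdd = rdd2.map(json -> {
                ObjectMapper mapper = new ObjectMapper();
                return mapper.readValue(json, Products.class);
            });

            // ...and let the connector map the bean's fields to the table columns.
            CassandraJavaUtil.javaFunctions(productsRdd)
                    .writerBuilder("java_api3", "products", CassandraJavaUtil.mapToRow(Products.class))
                    .saveToCassandra();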

Also, how can we convert a JavaRDD into a JavaPairRDD in Java?
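As far as we understand, the conversion is done with mapToPair, as in the sketch below (using the productsRdd from the previous sketch; the choice of getId() as the key is only an illustrative assumption):

            // mapToPair returns a Tuple2 per element, turning a JavaRDD<T> into a JavaPairRDD<K, V>.
            JavaPairRDD<String, Products> pairRdd =
                    productsRdd.mapToPair(p -> new Tuple2<String, Products>(p.getId(), p));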

Note: We are writing the application code in Java, not Scala.

Thanks in advance.

0 Answers:

There are no answers yet.