I wrote Kafka producer and consumer code for Spark Streaming, but it only picks up real-time updates from the database. Could you tell me how to convert this to a batch job so that it runs at least once over the existing data? (A rough sketch of the kind of batch consumer I have in mind is included after the consumer code below.)
Producer code:
package com.test.anna.KafkaSpark;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

public class ProducerTest {

    public static void main(String[] args) {
        // Kafka producer configuration
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "localhost:9092");
        properties.put("acks", "all");
        properties.put("retries", "0");
        properties.put("batch.size", "26384");
        properties.put("linger.ms", "1");
        properties.put("request.timeout.ms", "150000");
        properties.put("buffer.memory", "33554432");
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.put("value.serializer", ProductsSerializer.class.getName());

        Producer<String, Products> producer = new KafkaProducer<>(properties);

        // JDBC driver name and database URL
        String JDBC_DRIVER = "oracle.jdbc.driver.OracleDriver";
        String DB_URL = "jdbc:oracle:thin:@xyz:1976:cd";
        // Database credentials
        String USER = "xyzzy";
        String PASS = "pwd";

        Connection conn = null;
        Statement stmt = null;
        try {
            // STEP 2: Register the JDBC driver
            Class.forName(JDBC_DRIVER);

            // STEP 3: Open a connection
            System.out.println("Connecting to database...");
            conn = DriverManager.getConnection(DB_URL, USER, PASS);

            // STEP 4: Execute a query
            System.out.println("Creating statement...");
            stmt = conn.createStatement();
            String sql = "select * from test_mdm";
            ResultSet rs = stmt.executeQuery(sql);
            System.out.println("RS-->" + rs.getRow());

            // STEP 5: Extract data from the result set and publish each row to Kafka
            while (rs.next()) {
                // Retrieve by column name
                String bo_id = rs.getString("bo");
                String profile_cm_seq = rs.getString("profile");
                String bo_cm_start_dt = rs.getString("cm");

                Products product = new Products();
                product.setId(bo_id);
                product.setName(profile_cm_seq);
                product.setParents(bo_cm_start_dt);

                // Display values and send the record
                System.out.print("BO_ID: " + bo_id);
                Future<RecordMetadata> response =
                        producer.send(new ProducerRecord<>("test7", bo_id, product));
                try {
                    // Block until the broker acknowledges the record (or the wait times out)
                    RecordMetadata metadata = response.get(60, TimeUnit.SECONDS);
                    System.out.println(metadata.toString());
                    System.out.println("Sent: " + bo_id);
                } catch (InterruptedException e1) {
                    System.out.println("interrupted exception..");
                    e1.printStackTrace();
                } catch (ExecutionException e1) {
                    System.out.println("execution exception..");
                    e1.printStackTrace();
                } catch (TimeoutException e1) {
                    System.out.println("TimeoutException..");
                    e1.printStackTrace();
                }
            }

            // STEP 6: Clean up the environment
            rs.close();
            stmt.close();
            conn.close();
        } catch (SQLException se) {
            // Handle errors for JDBC
            se.printStackTrace();
        } catch (Exception e) {
            // Handle errors for Class.forName
            e.printStackTrace();
        } finally {
            // finally block used to close resources
            try {
                if (stmt != null) stmt.close();
            } catch (SQLException se2) {
                // nothing we can do
            }
            try {
                if (conn != null) conn.close();
            } catch (SQLException se) {
                se.printStackTrace();
            }
        }
        System.out.println("Goodbye!");
    }
}
Consumer code:
package com.test.anna.KafkaSpark;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import kafka.serializer.StringDecoder;

public class SparkStreamingConsumer {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("kafka-sandbox")
                .setMaster("local[*]")
                .set("spark.cassandra.connection.host", "localhost"); // for Cassandra
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(30000));

        // Direct (receiver-less) Kafka stream
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");
        kafkaParams.put("zookeeper.connect", "localhost:2181");
        Set<String> topics = Collections.singleton("test7");

        JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
                String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

        // Pipeline: producer -> Kafka -> Spark Streaming -> DataFrame validation/transformation -> Avro (and Cassandra)
        directKafkaStream.foreachRDD(rdd -> {
            System.out.println("Message Received " + rdd.values().take(1));
            System.out.println("--- New RDD with " + rdd.partitions().size()
                    + " partitions and " + rdd.count() + " records");
            rdd.foreach(record -> System.out.println(record));

            // Keep only the message values and load them as JSON into a DataFrame
            JavaRDD<String> rdd2 = rdd.map(x -> x._2);
            System.out.println("writing to cassandra");
            SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
            DataFrame df = sqlContext.read().json(rdd2);
            df.show();

            // Convert the DataFrame to Avro after validation
            if (df.count() > 0) {
                df.repartition(1).write().mode("append").format("com.databricks.spark.avro")
                        .save("/Users/vipul.tripathi/AVRO Files/fifth");
            }
        });

        // Log the Kafka offset range processed by each partition
        directKafkaStream.foreachRDD(rdd -> {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
            rdd.foreachPartition(consumerRecords -> {
                OffsetRange o = offsetRanges[TaskContext.get().partitionId()];
                System.out.println(
                        o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset());
            });
        });

        ssc.start();
        ssc.awaitTermination();
    }
}
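For reference, the kind of batch (run-once) consumer I have in mind is roughly the sketch below, using KafkaUtils.createRDD from the same spark-streaming-kafka 0.8 connector. The class name, the single partition, and the hard-coded offset range are placeholders of mine, not working values; figuring out the real offsets to read is part of what I am asking.

package com.test.anna.KafkaSpark;

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import kafka.serializer.StringDecoder;

public class SparkBatchConsumer {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("kafka-batch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", "localhost:9092");

        // Placeholder: read partition 0 of topic "test7" from offset 0 up to offset 100.
        // In reality I would need the actual earliest/latest offsets for every partition.
        OffsetRange[] offsetRanges = { OffsetRange.create("test7", 0, 0L, 100L) };

        // createRDD reads a fixed offset range exactly once, instead of an endless DStream
        JavaPairRDD<String, String> batchRdd = KafkaUtils.createRDD(sc,
                String.class, String.class, StringDecoder.class, StringDecoder.class,
                kafkaParams, offsetRanges);

        JavaRDD<String> values = batchRdd.map(tuple -> tuple._2);
        DataFrame df = sqlContext.read().json(values);
        df.show();

        sc.stop();
    }
}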
Those are the producer and consumer code. I want to run them and pick up the data that is already stored in the Oracle database.
How do I keep the loading of records incremental in this scenario? (A sketch of the kind of watermark-based approach I was wondering about is at the end of this post.)
Thanks in advance.
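For the incremental load itself, this is the watermark idea I was wondering about on the producer side: remember the highest value of a monotonically increasing column that has already been published, and on the next run only select rows newer than that. It is only a sketch under the assumption that test_mdm has (or could be given) such a column; the column name LAST_UPDATED_TS and the checkpoint file are made up for illustration.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;

import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class IncrementalLoader {

    // Publishes only rows newer than the stored watermark, then advances the watermark.
    // LAST_UPDATED_TS is a hypothetical monotonically increasing column on test_mdm.
    static void loadNewRows(Connection conn, Producer<String, Products> producer)
            throws SQLException, IOException {
        Path checkpoint = Paths.get("last_loaded_ts.txt");
        long lastMillis = Files.exists(checkpoint)
                ? Long.parseLong(new String(Files.readAllBytes(checkpoint), StandardCharsets.UTF_8).trim())
                : 0L;

        String sql = "select * from test_mdm where LAST_UPDATED_TS > ? order by LAST_UPDATED_TS";
        try (PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setTimestamp(1, new Timestamp(lastMillis));
            try (ResultSet rs = ps.executeQuery()) {
                long maxMillis = lastMillis;
                while (rs.next()) {
                    String bo_id = rs.getString("bo");
                    Products product = new Products();
                    product.setId(bo_id);
                    product.setName(rs.getString("profile"));
                    product.setParents(rs.getString("cm"));
                    producer.send(new ProducerRecord<>("test7", bo_id, product));
                    maxMillis = Math.max(maxMillis, rs.getTimestamp("LAST_UPDATED_TS").getTime());
                }
                // Persist the new watermark only after all rows were handed to the producer
                Files.write(checkpoint, Long.toString(maxMillis).getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}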