I am running a Spark Streaming job with Spark 1.4.1, Cassandra, and Kafka. The job reads JSON events from a Kafka topic and stores them in a Cassandra table.
My sample job class:
import com.datastax.spark.connector.japi.CassandraJavaUtil;
import com.datastax.spark.connector.japi.CassandraStreamingJavaUtil;
import com.google.gson.JsonParser;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;

import java.util.*;

public class CassandraTest {

    public static void main(String[] args) {
        // Spark configuration: Cassandra contact point, app name, local master for testing
        SparkConf sparkConf = new SparkConf();
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");
        sparkConf.setAppName("test");
        sparkConf.setMaster("local[*]");

        // Streaming context with a 1-second batch interval
        JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(1000));

        // Topic to consume and the number of consumer threads for it
        Map<String, Integer> kafkaTopics = new HashMap<String, Integer>();
        kafkaTopics.put("topic_test", 1);

        // Kafka high-level consumer configuration (ZooKeeper-based)
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("zookeeper.connect", "127.0.0.1:2181");
        kafkaParams.put("group.id", "consumer_group_test");

        Integer kafkaSparkReceivers = 1;
        StorageLevel messagesStorageLevel = StorageLevel.MEMORY_ONLY();

        // Create one Kafka stream per receiver
        List<JavaPairDStream<String, String>> kafkaStreams = new ArrayList<>(kafkaSparkReceivers);
        for (int i = 0; i < kafkaSparkReceivers; i++) {
            JavaPairInputDStream<String, String> messagesStream = KafkaUtils.createStream(ssc,
                    String.class, String.class, StringDecoder.class, StringDecoder.class,
                    kafkaParams, kafkaTopics, messagesStorageLevel);
            kafkaStreams.add(messagesStream);
        }

        // Unify the receiver streams when there is more than one
        JavaPairDStream<String, String> unifiedKafkaStream = kafkaStreams.get(0);
        if (kafkaSparkReceivers > 1) {
            unifiedKafkaStream = ssc.union(kafkaStreams.get(0), kafkaStreams.subList(1, kafkaStreams.size()));
        }

        // Parse the "uuid" field out of each JSON message and map it to a TestTable bean
        JavaDStream<TestTable> contentPayloadElementsStream = unifiedKafkaStream.map(
                new Function<Tuple2<String, String>, TestTable>() {
                    @Override
                    public TestTable call(Tuple2<String, String> stringStringTuple2) throws Exception {
                        return new TestTable(new JsonParser().parse(stringStringTuple2._2())
                                .getAsJsonObject().get("uuid").getAsString());
                    }
                });

        // Write the stream to the Cassandra table test.table1
        CassandraStreamingJavaUtil.javaFunctions(contentPayloadElementsStream)
                .writerBuilder("test", "table1", CassandraJavaUtil.mapToRow(TestTable.class))
                .saveToCassandra();

        ssc.start();
        ssc.awaitTermination();
    }

    // Bean mapped by the connector; the "id" property maps to the "id" column
    static public class TestTable {
        private UUID id;

        public TestTable(String id) {
            this.id = UUID.fromString(id);
        }

        public UUID getId() {
            return id;
        }

        public void setId(UUID id) {
            this.id = id;
        }
    }
}
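For reference, the target table only needs the single id column that the TestTable bean exposes. A minimal sketch of the schema setup using the DataStax Java driver (the replication settings are placeholders for a single-node local cluster; not taken from my real setup):

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Session;

public class SchemaSetup {
    public static void main(String[] args) {
        // Connect to the same local node the streaming job uses
        try (Cluster cluster = Cluster.builder().addContactPoint("127.0.0.1").build();
             Session session = cluster.connect()) {
            // Placeholder replication for a single-node test cluster
            session.execute("CREATE KEYSPACE IF NOT EXISTS test "
                    + "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}");
            // One uuid column, matching the TestTable bean's "id" property
            session.execute("CREATE TABLE IF NOT EXISTS test.table1 (id uuid PRIMARY KEY)");
        }
    }
}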
When Cassandra is busy or unreachable, I get the following error:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 20.0 failed 1 times, most recent failure: Lost task 0.0 in stage 20.0 (TID 37, localhost): java.io.IOException: Failed to open native connection to Cassandra at {127.0.0.1}:9042
at com.datastax.spark.connector.cql.CassandraConnector$.com$datastax$spark$connector$cql$CassandraConnector$$createSession(CassandraConnector.scala:164)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$2.apply(CassandraConnector.scala:150)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$2.apply(CassandraConnector.scala:150)
at com.datastax.spark.connector.cql.RefCountedCache.createNewValueAndKeys(RefCountedCache.scala:31)
at com.datastax.spark.connector.cql.RefCountedCache.acquire(RefCountedCache.scala:56)
at com.datastax.spark.connector.cql.CassandraConnector.openSession(CassandraConnector.scala:81)
at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:109)
at com.datastax.spark.connector.writer.TableWriter.write(TableWriter.scala:139)
at com.datastax.spark.connector.streaming.DStreamFunctions$$anonfun$saveToCassandra$1$$anonfun$apply$1.apply(DStreamFunctions.scala:34)
at com.datastax.spark.connector.streaming.DStreamFunctions$$anonfun$saveToCassandra$1$$anonfun$apply$1.apply(DStreamFunctions.scala:34)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: com.datastax.driver.core.exceptions.NoHostAvailableException: All host(s) tried for query failed (tried: /127.0.0.1:9042 (com.datastax.driver.core.TransportException: [/127.0.0.1:9042] Channel has been closed))
at com.datastax.driver.core.ControlConnection.reconnectInternal(ControlConnection.java:223)
at com.datastax.driver.core.ControlConnection.connect(ControlConnection.java:78)
at com.datastax.driver.core.Cluster$Manager.init(Cluster.java:1272)
at com.datastax.driver.core.Cluster.getMetadata(Cluster.java:336)
at com.datastax.spark.connector.cql.CassandraConnector$.com$datastax$spark$connector$cql$CassandraConnector$$createSession(CassandraConnector.scala:157)
... 15 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
I need to be notified of this problem by email. What is the best way to catch the above exception?
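Since the stack trace shows the exception reaching the main thread, the only approach I can think of is to wrap the blocking calls in the driver in a try/catch and send the mail from there. A minimal sketch of what I have in mind (the sendAlertEmail helper, the SMTP host, and the addresses are made-up placeholders, not part of my real code):

    // In main(), replacing the plain start/await calls:
    try {
        ssc.start();
        ssc.awaitTermination(); // the error that aborted the streaming context surfaces here
    } catch (Exception e) {
        sendAlertEmail("Spark Streaming job failed", e.toString());
        throw new RuntimeException(e); // still fail the driver after alerting
    }

    // Hypothetical JavaMail helper; needs javax.mail on the classpath.
    // Fully-qualified names avoid import clashes in the class above.
    static void sendAlertEmail(String subject, String body) {
        try {
            java.util.Properties props = new java.util.Properties();
            props.put("mail.smtp.host", "smtp.example.com"); // placeholder SMTP host
            javax.mail.Session session = javax.mail.Session.getInstance(props);
            javax.mail.internet.MimeMessage msg = new javax.mail.internet.MimeMessage(session);
            msg.setFrom(new javax.mail.internet.InternetAddress("spark-alerts@example.com"));
            msg.setRecipient(javax.mail.Message.RecipientType.TO,
                    new javax.mail.internet.InternetAddress("ops@example.com"));
            msg.setSubject(subject);
            msg.setText(body);
            javax.mail.Transport.send(msg);
        } catch (javax.mail.MessagingException me) {
            me.printStackTrace(); // don't let a mail failure mask the original error
        }
    }

Is this the right pattern, or is there a better hook for reacting to this kind of failure?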