public static void main(String[] args) {
    Map<String, Object> kafkaParams = new HashMap<String, Object>();

    // Expect the keystore/truststore directory and the target environment as arguments.
    if (args.length < 2) {
        logger.error("Please provide the SSL key location and the environment to connect to");
        return;
    }
    ReadAndRelay.path = args[0];
    ReadAndRelay.env = args[1];

    kafkaParams.put("security.protocol", "SSL");
    // dev environment configuration (other environments not shown)
    if (ReadAndRelay.env.equals("dev")) {
        kafkaParams.put("group.id", "group_id");
        // The keystore and truststore are read from the local filesystem by the Kafka client.
        kafkaParams.put("ssl.keystore.location", ReadAndRelay.path + "/keystore.jks");
        kafkaParams.put("ssl.truststore.location", ReadAndRelay.path + "/truststore.jks");
        kafkaParams.put("bootstrap.servers", "bootstrap_servers");
    }
    kafkaParams.put("ssl.truststore.password", "truststore_password");
    kafkaParams.put("ssl.keystore.password", "keystore_password");
    kafkaParams.put("ssl.key.password", "key_password");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("topic1", "topic2");

    SparkConf sparkConf = new SparkConf().setAppName("kafka-stream").setMaster("local[4]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, new Duration(2000));

    // Direct stream: Kafka consumers are created on the driver and on the executors.
    final JavaInputDStream<ConsumerRecord<String, String>> stream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
            );

    streamingContext.start();
    streamingContext.awaitTermination();
}
The code above works as long as the keystore file paths are local. How can I keep the keystore files in a common location such as HDFS and use them in the Spark application, both for creating the direct stream (i.e. the Kafka consumer) and for creating a Kafka producer per RDD, given that these run on the worker nodes/executors?
When I simply put the HDFS file locations into the Kafka client properties, an error is thrown saying the file cannot be found. What is the correct way to supply files stored in HDFS to the Kafka client properties?
17/03/20 16:18:00 ERROR StreamingContext: Error starting the context, marking it as stopped
org.apache.kafka.common.KafkaException: Failed to construct kafka consumer
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:702)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:557)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:540)
at org.apache.spark.streaming.kafka010.Subscribe.onStart(ConsumerStrategy.scala:83)
at org.apache.spark.streaming.kafka010.DirectKafkaInputDStream.consumer(DirectKafkaInputDStream.scala:75)
at org.apache.spark.streaming.kafka010.DirectKafkaInputDStream.start(DirectKafkaInputDStream.scala:243)
at org.apache.spark.streaming.DStreamGraph$$anonfun$start$5.apply(DStreamGraph.scala:49)
at org.apache.spark.streaming.DStreamGraph$$anonfun$start$5.apply(DStreamGraph.scala:49)
at scala.collection.parallel.mutable.ParArray$ParArrayIterator.foreach_quick(ParArray.scala:143)
at scala.collection.parallel.mutable.ParArray$ParArrayIterator.foreach(ParArray.scala:136)
at scala.collection.parallel.ParIterableLike$Foreach.leaf(ParIterableLike.scala:972)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply$mcV$sp(Tasks.scala:49)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$class.tryLeaf(Tasks.scala:51)
at scala.collection.parallel.ParIterableLike$Foreach.tryLeaf(ParIterableLike.scala:969)
at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask$class.compute(Tasks.scala:152)
at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:443)
at scala.concurrent.forkjoin.RecursiveAction.exec(RecursiveAction.java:160)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
at ... run in separate thread using org.apache.spark.util.ThreadUtils ... ()
at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:578)
at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:572)
at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:556)
at it.gis.servicemanagement.dcap.dsvs.spark.kafka_stream.ReadAndRelay.main(ReadAndRelay.java:168)
Caused by: org.apache.kafka.common.KafkaException: org.apache.kafka.common.KafkaException: java.io.FileNotFoundException: hdfs:/namenode:9000/tmp/kafka_dev_certs/keystore.jks (No such file or directory)
at org.apache.kafka.common.network.SslChannelBuilder.configure(SslChannelBuilder.java:44)
at org.apache.kafka.common.network.ChannelBuilders.create(ChannelBuilders.java:70)
at org.apache.kafka.clients.ClientUtils.createChannelBuilder(ClientUtils.java:83)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:623)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:557)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:540)
at org.apache.spark.streaming.kafka010.Subscribe.onStart(ConsumerStrategy.scala:83)
at org.apache.spark.streaming.kafka010.DirectKafkaInputDStream.consumer(DirectKafkaInputDStream.scala:75)
at org.apache.spark.streaming.kafka010.DirectKafkaInputDStream.start(DirectKafkaInputDStream.scala:243)
at org.apache.spark.streaming.DStreamGraph$$anonfun$start$5.apply(DStreamGraph.scala:49)
at org.apache.spark.streaming.DStreamGraph$$anonfun$start$5.apply(DStreamGraph.scala:49)
at scala.collection.parallel.mutable.ParArray$ParArrayIterator.foreach_quick(ParArray.scala:143)
at scala.collection.parallel.mutable.ParArray$ParArrayIterator.foreach(ParArray.scala:136)
at scala.collection.parallel.ParIterableLike$Foreach.leaf(ParIterableLike.scala:972)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply$mcV$sp(Tasks.scala:49)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$class.tryLeaf(Tasks.scala:51)
at scala.collection.parallel.ParIterableLike$Foreach.tryLeaf(ParIterableLike.scala:969)
at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask$class.compute(Tasks.scala:152)
at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:443)
at scala.concurrent.forkjoin.RecursiveAction.exec(RecursiveAction.java:160)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: org.apache.kafka.common.KafkaException: java.io.FileNotFoundException: hdfs:/namenode:9000/tmp/kafka_dev_certs/keystore.jks (No such file or directory)
at org.apache.kafka.common.security.ssl.SslFactory.configure(SslFactory.java:110)
at org.apache.kafka.common.network.SslChannelBuilder.configure(SslChannelBuilder.java:41)
... 25 more
Caused by: java.io.FileNotFoundException: hdfs://namenode:9000/tmp/kafka_dev_certs/keystore.jks (No such file or directory)
at java.io.FileInputStream.open0(Native Method)
at java.io.FileInputStream.open(FileInputStream.java:195)
at java.io.FileInputStream.<init>(FileInputStream.java:138)
at java.io.FileInputStream.<init>(FileInputStream.java:93)
at org.apache.kafka.common.security.ssl.SslFactory$SecurityStore.load(SslFactory.java:205)
at org.apache.kafka.common.security.ssl.SslFactory$SecurityStore.access$000(SslFactory.java:190)
at org.apache.kafka.common.security.ssl.SslFactory.createSSLContext(SslFactory.java:126)
at org.apache.kafka.common.security.ssl.SslFactory.configure(SslFactory.java:108)
... 26 more
17/03/20 16:18:00 INFO ReceiverTracker: ReceiverTracker stopped
17/03/20 16:18:00 INFO JobGenerator: Stopping JobGenerator immediately
17/03/20 16:18:00 INFO RecurringTimer: Stopped timer for JobGenerator after time -1
17/03/20 16:18:00 INFO JobGenerator: Stopped JobGenerator
17/03/20 16:18:00 INFO JobScheduler: Stopped JobScheduler
Answer 0 (score: 0)
Late to the party, as they say. But here is what I learned:
spark-submit --files <commaSeparatedList> ...
will copy the listed files to the working directory of all the workers.
For example: spark-submit --files keystore.jks,truststore.jks ...
These can then be used in Spark (Scala) as:
val df = spark
  .readStream
  .format("kafka")
  ...
  .option("kafka.ssl.truststore.location", "truststore.jks")
  .option("kafka.ssl.truststore.password", "")
  .option("kafka.ssl.keystore.location", "keystore.jks")
  .option("kafka.ssl.keystore.password", "")
  ...
  .load()
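Applied to the DStream code in the question, the same idea is to point the SSL properties at the bare file names. The snippet below is only a sketch, assuming the job is submitted with --files keystore.jks,truststore.jks and runs in a mode where Spark localizes those files into the working directory of the driver and of every executor (e.g. YARN cluster mode); the passwords and server values remain the placeholders from the question:

// Sketch: keystores shipped with --files can be referenced by bare file name,
// because Spark copies them into each container's working directory.
kafkaParams.put("security.protocol", "SSL");
kafkaParams.put("ssl.keystore.location", "keystore.jks");        // localized by --files
kafkaParams.put("ssl.truststore.location", "truststore.jks");    // localized by --files
kafkaParams.put("ssl.keystore.password", "keystore_password");
kafkaParams.put("ssl.truststore.password", "truststore_password");
kafkaParams.put("ssl.key.password", "key_password");

The corresponding submit command would look something like the following (the jar name is a placeholder; --files should also accept hdfs:// URIs, which is what the question asks about, but verify that for your Spark version):
spark-submit --files /local/path/keystore.jks,/local/path/truststore.jks --class it.gis.servicemanagement.dcap.dsvs.spark.kafka_stream.ReadAndRelay read-and-relay.jar ...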
Other posts on SO (I don't have the links handy) suggested using org.apache.spark.SparkFiles.get("keystore.jks") and passing that location to .option("kafka.ssl.truststore.location", ...), but that still resulted in an FNFE (FileNotFoundException).
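For reference, the SparkFiles variant those posts describe would look roughly like this sketch (same file names as above; as noted, it still ended in a FileNotFoundException here):

// Sketch: SparkFiles.get returns the local absolute path, on the node where it is
// called, of a file distributed with --files or SparkContext.addFile.
String keystorePath = org.apache.spark.SparkFiles.get("keystore.jks");
String truststorePath = org.apache.spark.SparkFiles.get("truststore.jks");
kafkaParams.put("ssl.keystore.location", keystorePath);
kafkaParams.put("ssl.truststore.location", truststorePath);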
Some similarities I have observed: a file in the resources folder, for example resources/input.txt, can be read with Source.fromFile("/input.txt"), which is similar to what --files achieves. A small sketch of that parallel follows below.
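To make that parallel concrete, here is a minimal sketch (input.txt is just an illustrative name): a file shipped with spark-submit --files input.txt can be opened by its bare name from the working directory, much like a bundled resource can be opened by its resource path:

// Sketch: open a file that spark-submit --files placed in the container's working directory.
java.io.File localized = new java.io.File("input.txt");
try (java.io.BufferedReader reader = new java.io.BufferedReader(new java.io.FileReader(localized))) {
    System.out.println(reader.readLine());   // first line of the distributed file
} catch (java.io.IOException e) {
    e.printStackTrace();
}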