I am new to Spark Streaming and am trying to implement a Kafka-MongoDB integration, where my code pulls JSON data from a Kafka topic and inserts it into MongoDB. Below is my code:
package com.streams.sparkmongo

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import com.mongodb.spark._

object MongoStream {

  def main(args: Array[String]) {
    // Initializing Spark
    val conf = SparkSession.builder()
      .master("spark://localhost:7077")
      .appName("MongoStream")
      .config("spark.mongodb.output.uri", "mongodb://localhost/test.myCollection")
      .getOrCreate()

    // Defining Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "mongostream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (true: java.lang.Boolean),
      "zookeeper.connect" -> "localhost:2181",
      "zookeeper.connection.timeout.ms" -> "10000",
      "zookeeper.session.timeout.ms" -> "10000")

    // Create a StreamingContext with a 10-second batch interval from the SparkSession's SparkContext
    val ssc = new StreamingContext(conf.sparkContext, Seconds(10))

    // Defining the Kafka topic
    val topics = Array("mongostreamtest")
    val stream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

    // Extract the message value from each record and save every batch to MongoDB
    val elementDstream = stream.map(v => v.value).foreachRDD { rdd =>
      import conf.implicits._
      // val payload = conf.read.json(rdd)
      rdd.saveToMongoDB()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
I am looping over each RDD and saving it to MongoDB using the helper method saveToMongoDB(). I am getting the error below (a sketch of how I understand the write step is included after the stack trace). Any input would be greatly appreciated.
WARN TaskSetManager: Lost task 0.2 in stage 9.0 (TID 11, <<ip>>, executor 0): com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting for a server that matches WritableServerSelector. Client view of cluster state is {type=REPLICA_SET, servers=[{address=localhost:27017, type=REPLICA_SET_SECONDARY, roundTripTime=0.6 ms, state=CONNECTED}]
at com.mongodb.connection.BaseCluster.createTimeoutException(BaseCluster.java:377)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:104)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:411)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:168)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:74)
at com.mongodb.Mongo.execute(Mongo.java:845)
at com.mongodb.Mongo$2.execute(Mongo.java:828)
at com.mongodb.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:338)
at com.mongodb.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:322)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:119)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:118)
at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:186)
at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:184)
at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:184)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:118)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:117)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
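For reference, this is how I understand the write step would look if each JSON string is first parsed into an org.bson.Document before calling saveToMongoDB(). This is only a minimal sketch of my intent; the Document.parse step is my assumption about what the connector expects, and I have not confirmed it is related to the timeout above.

    // Sketch: parse each JSON message into a BSON Document before saving
    import org.bson.Document
    import com.mongodb.spark._

    stream.map(record => Document.parse(record.value()))
      .foreachRDD { rdd =>
        if (!rdd.isEmpty()) {
          // Writes to the collection configured via spark.mongodb.output.uri
          rdd.saveToMongoDB()
        }
      }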