逐行迭代数据帧并将其保存到 Cassandra

时间:2018-05-30 13:47:06

标签: cassandra apache-kafka apache-spark-sql spark-streaming

我有以下代码:

 // Builds the per-partition sink writer used by the streaming query.
 // NOTE(review): despite its name, this writer only prints each row — the
 // `cassandraConnector` parameter is never used, so nothing is actually
 // persisted to Cassandra yet. The real insert (e.g. via withSessionDo)
 // still needs to be implemented in process().
 def writeToCassandra(cassandraConnector: CassandraConnector) = new ForeachWriter[Row] {

   // Always accept the partition/epoch; no per-partition setup is required.
   override def open(partitionId: Long, version: Long): Boolean = true

   // Called once per output row.
   override def process(row: Row): Unit =
     println("row is " + row.toString())

   // Nothing to release.
   override def close(errorOrNull: Throwable): Unit = ()
 }

// Spark configuration, built by folding a declarative settings list onto
// a base SparkConf. Values are identical to the original chained .set calls.
// NOTE(review): "spark.submit.deployMode" -> "cluster" is hard-coded here,
// but the SparkSession below forces .master("local[2]") — these conflict.
// Deploy mode is normally supplied via spark-submit, not set in code;
// confirm which environment is actually intended.
val conf = Seq(
  "spark.cassandra.connection.host"          -> "192.168.0.40,192.168.0.106,192.168.0.113",
  "spark.cassandra.connection.keep_alive_ms" -> "20000",
  "spark.executor.memory"                    -> "1g",
  "spark.driver.memory"                      -> "2g",
  "spark.submit.deployMode"                  -> "cluster",
  "spark.executor.instances"                 -> "9",
  "spark.executor.cores"                     -> "1",
  "spark.cores.max"                          -> "9",
  "spark.driver.cores"                       -> "3",
  "spark.ui.port"                            -> "4040",
  "spark.streaming.backpressure.enabled"     -> "true",
  "spark.speculation"                        -> "true"
).foldLeft(new SparkConf().setAppName("Data")) { case (c, (key, value)) =>
  c.set(key, value)
}

println("Spark Configuration Done")
// Create (or reuse) the SparkSession for this job.
// NOTE(review): .master("local[2]") overrides the cluster-oriented settings
// carried in `conf` (9 executors, cluster deploy mode) — confirm whether this
// is meant to run locally or on the cluster.
val spark =
  SparkSession.builder
    .appName("Data")
    .config(conf)
    .master("local[2]")
    .getOrCreate()
println("Spark Session Config Done")

// Connector handle built from the same SparkConf; passed to the ForeachWriter.
val cassandraConnector = CassandraConnector(conf)
import spark.implicits._
import org.apache.spark.sql.streaming.OutputMode
// Streaming source: the Kafka topic "historyfleet" read from a three-broker cluster.
// NOTE(review): the NoClassDefFoundError in the attached trace
// (o.a.s.sql.execution.streaming.Source$class) indicates the
// spark-sql-kafka-0-10 artifact on the classpath was built against a
// different Spark/Scala version than the runtime — align those versions
// in the build; this is a dependency problem, not a logic problem here.
val dataStream = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "192.168.0.78:9092,192.168.0.78:9093,192.168.0.78:9094")
  .option("subscribe", "historyfleet")
  .load()

// Start the streaming query with the custom ForeachWriter as its sink.
// Fix: the original chained .format("console") AFTER .foreach(...). On
// DataStreamWriter the last sink specification wins, so the console sink
// silently replaced the ForeachWriter and process() was never invoked.
// Removing .format("console") makes the foreach sink take effect.
val query =
  dataStream
    .writeStream
    .outputMode(OutputMode.Append())
    .foreach(writeToCassandra(cassandraConnector))
    .start()

// Blocks until the query is stopped externally or fails with an exception.
query.awaitTermination()
// Reached only after termination; stopping an already-terminated query is a no-op.
query.stop()

运行时出现如下错误:

Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/sql/execution/streaming/Source$class
at org.apache.spark.sql.kafka010.KafkaSource.<init>(KafkaSource.scala:80)
at org.apache.spark.sql.kafka010.KafkaSourceProvider.createSource(KafkaSourceProvider.scala:94)
at org.apache.spark.sql.execution.datasources.DataSource.createSource(DataSource.scala:240)
at org.apache.spark.sql.streaming.StreamingQueryManager$$anonfun$1.applyOrElse(StreamingQueryManager.scala:245)
at org.apache.spark.sql.streaming.StreamingQueryManager$$anonfun$1.applyOrElse(StreamingQueryManager.scala:241)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:279)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:279)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:278)
at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:287)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.execution.streaming.Source$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 14 more

我的应用程序花了一些时间将DataFrame插入到Cassandra中,因此我正在尝试检查单次迭代是否会加快我的性能,但是它会给出上述错误。 使用3个节点集群 - 12个执行器,每个1个核心。它在cassandra中每秒进行6000次插入。需要优化这个。 请给我任何建议。谢谢,

0 个答案:

没有答案