How to save a Spark Streaming RDD from a DStream row by row and write each row into a Cassandra table with the spark-cassandra-connector

Date: 2017-07-12 16:45:57

Tags: apache-spark cassandra connector

I have an RDD containing 4 rows, each row holding 4 comma-separated fields. I want to write it row by row into a Cassandra table that has the same number of columns as an RDD row, so I think I need to go through each RDD line by line. For example:

RDD:

test1,2017-07-12T09:30:48.024-0700,19,Status
test1,2017-07-12T09:30:49.026-0700,91,Status
test2,2017-07-12T09:30:48.025-0700,21,Status
test2,2017-07-12T09:30:49.027-0700,83,Status

If I wanted to do it with SQL it would look like this:

insert into key3.table1(topic_name, date_time, mesure, status) values('test1','2017-07-12T09:30:48.024-0700',19,'Status')

insert into key3.table1(topic_name, date_time, mesure, status) values('test1','2017-07-12T09:30:49.026-0700',91,'Status')

insert into key3.table1(topic_name, date_time, mesure, status) values('test2','2017-07-12T09:30:48.025-0700',21,'Status')

insert into key3.table1(topic_name, date_time, mesure, status) values('test2','2017-07-12T09:30:49.027-0700',83,'Status')
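
If it helps, the target table I have in mind could be created from Scala with the connector roughly like the sketch below; the keyspace and table names come from the inserts above, but the column types and the primary key are only my assumption:

import org.apache.spark.SparkConf
import com.datastax.spark.connector.cql.CassandraConnector

val conf = new SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

// Assumed schema matching the inserts above; the types and primary key are a guess.
CassandraConnector(conf).withSessionDo { session =>
  session.execute(
    "CREATE KEYSPACE IF NOT EXISTS key3 WITH REPLICATION = " +
      "{'class': 'SimpleStrategy', 'replication_factor': 1}")
  session.execute(
    "CREATE TABLE IF NOT EXISTS key3.table1 (" +
      "topic_name text, date_time text, mesure int, status text, " +
      "PRIMARY KEY (topic_name, date_time))")
}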

I want to do the same thing, but using the DStream methods such as foreachRDD, transform, filter, and map. I have tried several times but it did not work. Here is the code I used:

package org.apache.sparkwordcount

import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.driver.core.Session
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.kafka._
//import org.apache.kafka.serializer.StringDecoder
import _root_.kafka.serializer.StringDecoder

object SparkCassandra {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf()
      .setAppName("KafkaStreamToCassandra").setMaster("spark://127.0.1.1:7077")
      .set("spark.cassandra.connection.host", "127.0.0.1")
      .set("spark.cassandra.connection.port", "9042")
      .set("spark.cassandra.auth.username", "cassandra")
      .set("spark.cassandra.auth.password", "cassandra")
      .set("spark.cassandra.connection.keep_alive_ms", "3600000")
      .set("spark.cassandra.connection.timeout_ms", "3600")

    /*CassandraConnector(sparkConf).withSessionDo { session =>
      session.execute("CREATE KEYSPACE test3 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute("CREATE TABLE test3.words (word text PRIMARY KEY, count int)")
      session.execute("insert into test3.words (word, count) values ('ddd', 1)")
    }*/

    val topics = "test1,test2"
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val keyspace = "sparkcassandra"
    val table = "kafkatable"
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    // Each Kafka record is a (key, value) pair; keep only the value
    val lines = messages.map(_._2)

    // Split each payload into individual CSV lines
    val words = lines.flatMap(_.split("\n"))
      //.saveToCassandra(keyspace, table, SomeColumns("topic_name", "date_time", "mesure", "status"))

    words.print()

    //val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)

    //lines.flatMap(_.split(","))
    //  .saveToCassandra(keyspace, table, SomeColumns("topic_name", "date_time", "mesure", "status"))

    //messages.foreachRDD { rdd => rdd.foreach { record => println(record._2) } }

    ssc.start()
    ssc.awaitTermination()
  }
}
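
What I imagine should work, from reading the connector documentation, is to split each line on the commas and call saveToCassandra on the resulting DStream of tuples, something like the sketch below (the keyspace and table names are the ones from my code above, and treating mesure as an Int is my own assumption):

import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.streaming._ // adds saveToCassandra to DStreams
import org.apache.spark.streaming.dstream.DStream

// lines would be the DStream of raw Kafka values from the code above,
// e.g. "test1,2017-07-12T09:30:48.024-0700,19,Status"
def saveLines(lines: DStream[String]): Unit = {
  lines
    .map(_.split(","))                            // -> Array(topic, time, mesure, status)
    .filter(_.length == 4)                        // skip malformed lines
    .map(f => (f(0), f(1), f(2).toInt, f(3)))     // assuming mesure is an Int
    .saveToCassandra("sparkcassandra", "kafkatable",
      SomeColumns("topic_name", "date_time", "mesure", "status"))
}

I would call this on lines right before ssc.start(), but I am not sure whether that is the intended way or whether it has to go through foreachRDD instead.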

If you have an idea, or if you can correct my code, please do not hesitate.

0 Answers:

There are no answers yet.