I have an RDD with 4 rows, each row containing 4 comma-separated fields, and I want to write it row by row into a Cassandra table that has the same number of columns as each RDD row. I think I need to go through the RDD row by row, for example:
RDD:
test1,2017-07-12T09:30:48.024-0700,19,Status
test1,2017-07-12T09:30:49.026-0700,91,Status
test2,2017-07-12T09:30:48.025-0700,21,Status
test2,2017-07-12T09:30:49.027-0700,83,Status
If I were doing this with SQL, it would look like this:
insert into key3.table1(topic_name, date_time, mesure, status) values("test1","2017-07-12T09:30:48.024-0700",19,"Status")
insert into key3.table1(topic_name, date_time, mesure, status) values("test1","2017-07-12T09:30:49.026-0700",91,"Status")
insert into key3.table1(topic_name, date_time, mesure, status) values("test2","2017-07-12T09:30:48.025-0700",21,"Status")
insert into key3.table1(topic_name, date_time, mesure, status) values("test2","2017-07-12T09:30:49.027-0700",83,"Status")
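Spelled out in Scala, each CSV line maps onto those four columns roughly like this (just a sketch; treating mesure as an Int is my assumption):

    // One raw line as it arrives in the RDD/DStream
    val line = "test1,2017-07-12T09:30:48.024-0700,19,Status"
    // Split into the four fields that correspond to the table columns
    val Array(topicName, dateTime, mesure, status) = line.split(",")
    // topicName = "test1", dateTime = "2017-07-12T09:30:48.024-0700",
    // mesure.toInt = 19, status = "Status"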
I want to produce exactly those inserts, but using the DStream's predefined operations such as foreachRDD, transform, filter, and map. I have tried several times without success; here is the code I used:
package org.apache.sparkwordcount

import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.SomeColumns
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.kafka._
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.driver.core.Session
//import org.apache.kafka.serializer.StringDecoder
import _root_.kafka.serializer.StringDecoder

object SparkCassandra {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf()
      .setAppName("KakfaStreamToCassandra").setMaster("spark://127.0.1.1:7077")
      .set("spark.cassandra.connection.host", "127.0.0.1")
      .set("spark.cassandra.connection.port", "9042")
      .set("spark.cassandra.auth.username", "cassandra")
      .set("spark.cassandra.auth.password", "cassandra")
      .set("spark.cassandra.connection.keep_alive_ms", "3600000")
      .set("spark.cassandra.connection.timeout_ms", "3600")

    /*CassandraConnector(sparkConf).withSessionDo { session =>
      session.execute("CREATE KEYSPACE test3 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute("CREATE TABLE test3.words (word text PRIMARY KEY, count int)")
      session.execute("insert into test2.words (word,count) values('ddd',1)")
    }*/

    val topics = "test1,test2"
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val keysspace = "sparkcassandra"
    val table = "kafkatable"
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")

    // Direct Kafka stream: each record is a (key, value) pair
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    // Keep only the message value (the CSV line)
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split("\n"))
    //.saveToCassandra(keysspace, table, SomeColumns("topic_name", "date_time", "mesure", "status"))
    words.print()

    //val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    //lines.flatMap(_.split(","))
    //  .saveToCassandra(keysspace, table, SomeColumns("topic_name", "date_time", "mesure", "status"))
    //messages.foreachRDD { rdd => rdd.foreach { record => println(record._2) } }

    ssc.start()
    //words.getClass
    ssc.awaitTermination()
  }
}
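What I imagine the missing step looks like is something along these lines, in place of words.print(): split each line on commas into a (topic_name, date_time, mesure, status) tuple and save the tuples with the connector's saveToCassandra on the DStream. This is only a sketch of my idea, not working code; the column types (in particular mesure as Int) are my assumption:

    // Sketch: parse each CSV line and write it to Cassandra.
    // Relies on com.datastax.spark.connector.streaming._ being imported,
    // which adds saveToCassandra to DStreams.
    lines
      .map(_.split(","))
      .filter(_.length == 4)                          // drop malformed lines
      .map(f => (f(0), f(1), f(2).trim.toInt, f(3)))  // (topic_name, date_time, mesure, status)
      .saveToCassandra(keysspace, table,
        SomeColumns("topic_name", "date_time", "mesure", "status"))

Is that the right way to use the connector on a DStream, or does it have to go inside foreachRDD?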
If you have any ideas, or if you can correct my code, please don't hesitate.