I am trying to stream messages from Kafka and send them to Elasticsearch using Spark. Spark receives messages from Kafka in the form of `df` (disk usage) output from various systems, generates a message for each system's usage level, and pushes it to Elasticsearch. The problem I am hitting is that the index gets created, but no messages show up in Elasticsearch. I am new to this.
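For context, each Kafka message is assumed to look like one line of `df` output; the sample line and column layout below are hypothetical, inferred from the index arithmetic in the code that follows (Use% is the second-to-last field, the filesystem name sits six fields from the end):

// Hypothetical sample of one df-style line as it might arrive on the Kafka topic.
val sample = "/dev/sda1  50G  40G  10G  80% /mnt/data"
val fields = sample.split("\\s+")        // Array(/dev/sda1, 50G, 40G, 10G, 80%, /mnt/data)
val usePercent = fields(fields.length - 2).replace("%", "").toInt  // 80
val sysName = fields(fields.length - 6)                            // /dev/sda1

With those values, 80% is above the 70% threshold, so the code below would emit "/dev/sda1 Increase ROM size by 20%".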
package rnd
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.sql._
import org.elasticsearch.spark._
object WordFind {
def main(args: Array[String]) {
import org.apache.spark.SparkConf
val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaReceiver")
val sc = new SparkContext(conf)
//val checkpointDir = "/usr/local/kafka/kafka_2.11-0.11.0.2/checkpoint/"
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
val batchIntervalSeconds = 2
//val ssc = new StreamingContext(conf, Seconds(10))
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.dstream.ReceiverInputDStream
val ssc = new StreamingContext(sc, Seconds(batchIntervalSeconds))
val kafkaStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, "localhost:2181",
"spark-streaming-consumer-group", Map("wordcounttopic" -> 5))
import org.apache.spark.streaming.dstream.DStream
val filteredStream: DStream[Array[String]] = kafkaStream
.filter(!_._2.contains("Filesystem")) // eliminate header
.map(_._2.split("\\s+")) // split with space
val outputDStream: DStream[String] = filteredStream.map {
row =>
val useIdx = row.length - 2
val useSystemInfo = row.length - 6
// If Use% > 70: message "Increase ROM size by 20%"
// If Use% < 30: message "Decrease ROM size by 25%"
val sysName = row(useSystemInfo).toString
val usePercent = row(useIdx).replace("%", "").toInt
usePercent match {
case x if x > 70 => sysName + " Increase ROM size by 20%"
case x if x < 30 => sysName + " Decrease ROM size by 25%"
case _ => "Undefined"
}
}
import org.elasticsearch.spark.sql._
// outputDStream.print()
val config: Map[String,String] = Map("es.index.auto.create" -> "yes")
outputDStream.foreachRDD{messageRDD =>
//messageRDD.saveToEs("dfvaluemessage_v1/km")
messageRDD.saveToEs("dfvaluemessage_v1/km", config)
}
//outputDStream.foreachRDD{messageRDD =>
//messageRDD.saveToEs("dfvaluemessage_v1/km")
//}
//outputDStream.saveToEs("kafkawordcount_v1/kwc")
// To make sure data is not deleted by the time we query it interactively
ssc.remember(Minutes(1))
//ssc.checkpoint(checkpointDir)
// This starts the streaming context in the background.
ssc.start()
// Block the driver for 5 * batchIntervalSeconds so the streaming job has time to process a few batches.
ssc.awaitTerminationOrTimeout(batchIntervalSeconds * 5 * 1000)
  }
}
The Elasticsearch output is below:
Answer 0 (score: -1):
The problem was that I was trying to push the messages to Elasticsearch by calling saveToEs on the raw RDD inside foreachRDD. It only worked once I created a DataFrame first. I made that change and it works fine; the corrected code is below.
package rnd
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.sql._
import org.elasticsearch.spark._
object WordFind {
def main(args: Array[String]) {
import org.apache.spark.SparkConf
val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaReceiver")
val sc = new SparkContext(conf)
//val checkpointDir = "/usr/local/kafka/kafka_2.11-0.11.0.2/checkpoint/"
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
val batchIntervalSeconds = 2
//val ssc = new StreamingContext(conf, Seconds(10))
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.dstream.ReceiverInputDStream
val ssc = new StreamingContext(sc, Seconds(batchIntervalSeconds))
val kafkaStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, "localhost:2181",
"spark-streaming-consumer-group", Map("wordcounttopic" -> 5))
import org.apache.spark.streaming.dstream.DStream
val filteredStream: DStream[Array[String]] = kafkaStream
.filter(!_._2.contains("Filesystem")) // eliminate header
.map(_._2.split("\\s+")) // split with space
val outputDStream: DStream[String] = filteredStream.map {
row =>
val useIdx = row.length - 2
val useSystemInfo = row.length - 6
// If Use% > 70: message "Increase ROM size by 20%"
// If Use% < 30: message "Decrease ROM size by 25%"
val sysName = row(useSystemInfo).toString
val usePercent = row(useIdx).replace("%", "").toInt
usePercent match {
case x if x > 70 => sysName + " Increase ROM size by 20%"
case x if x < 30 => sysName + " Decrease ROM size by 25%"
case _ => "Undefined"
}
}
import org.elasticsearch.spark.sql._
// outputDStream.print()
val config: Map[String,String] = Map("es.index.auto.create" -> "yes")
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
outputDStream.foreachRDD{messageRDD =>
val df = messageRDD.toDF("messages")
//messageRDD.saveToEs("dfvaluemessage_v1/km")
df.saveToEs("dfvaluemessage_v1/km", config)
}
//outputDStream.foreachRDD{messageRDD =>
//messageRDD.saveToEs("dfvaluemessage_v1/km")
//}
//outputDStream.saveToEs("kafkawordcount_v1/kwc")
// To make sure data is not deleted by the time we query it interactively
ssc.remember(Minutes(1))
//ssc.checkpoint(checkpointDir)
// This starts the streaming context in the background.
ssc.start()
// Block the driver for 5 * batchIntervalSeconds so the streaming job has time to process a few batches.
ssc.awaitTerminationOrTimeout(batchIntervalSeconds * 5 * 1000)
  }
}
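As a quick sanity check, here is a minimal sketch (assuming the same elasticsearch-spark dependency and the same running program, after the streaming timeout has elapsed) that reads the index back through the library's RDD API:

import org.elasticsearch.spark._   // adds esRDD to the SparkContext

// Each element is (documentId, map of fields); print a few to confirm the messages arrived.
val saved = sc.esRDD("dfvaluemessage_v1/km")
saved.take(10).foreach { case (id, doc) => println(s"$id -> $doc") }

If the documents were written correctly, each printed map should contain a "messages" field with one of the "Increase ROM size" / "Decrease ROM size" / "Undefined" strings.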