I am trying to read a Kafka topic through Apache Spark Streaming, but I cannot figure out how to convert the data in the DStream into a DataFrame and then store it in a temp table. The messages in Kafka are in Avro format and are created from a database by Kafka JDBC Connect. I have the code below, which works fine until it executes spark.read.json to read the JSON into a DataFrame.
package consumerTest

import io.confluent.kafka.serializers.KafkaAvroDeserializer
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import scala.util.parsing.json.{JSON, JSONObject}

object Consumer {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate();

    import spark.implicits._

    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "<kafka-server>:9092",
      "key.deserializer" -> classOf[KafkaAvroDeserializer],
      "value.deserializer" -> classOf[KafkaAvroDeserializer],
      "group.id" -> "sakwq",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false",
      "schema.registry.url" -> "http://<schema-registry>:8181"
    )

    val topics = Array("cdcemployee")

    val stream = KafkaUtils.createDirectStream[String, Object](
      ssc,
      PreferConsistent,
      Subscribe[String, Object](topics, kafkaParams)
    )

    val data = stream.map(record => {
      println(record.value.toString())
      record.value
      // The NullPointerException below is thrown here: spark.read is being
      // used inside the DStream map, which runs on the executors.
      val df = spark.read.json(record.value.toString())
    })

    data.print();

    ssc.start()
    ssc.awaitTermination()
  }
}
The error below is thrown when it executes the line val df = spark.read.json(record.value.toString()):

18/05/10 09:49:11 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:135)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:133)
at org.apache.spark.sql.DataFrameReader.<init>(DataFrameReader.scala:689)
at org.apache.spark.sql.SparkSession.read(SparkSession.scala:645)
at consumerTest.Consumer$.$anonfun$main$1(Consumer.scala:63)
at consumerTest.Consumer$.$anonfun$main$1$adapted(Consumer.scala:60)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:393)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
18/05/10 09:49:11 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:135)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:133)
at org.apache.spark.sql.DataFrameReader.<init>(DataFrameReader.scala:689)
at org.apache.spark.sql.SparkSession.read(SparkSession.scala:645)
at consumerTest.Consumer$.$anonfun$main$1(Consumer.scala:63)
at consumerTest.Consumer$.$anonfun$main$1$adapted(Consumer.scala:60)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:393)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Also, if I remove the spark.read.json statement, the println(record.value.toString()) statement prints the sample data just fine.

Can anyone help me with how to convert this into a DataFrame and temporarily store it in a table?
Answer 0 (score: 1):

JSON data:
{"timestamp": "1571053218000","t1": "55.23","t2": "10","t3": "ON"}
{"timestamp": "1571053278000","t1": "63.23","t2": "11","t3": "OFF"}
{"timestamp": "1571053338000","t1": "73.23","t2": "12","t3": "ON"}
{"timestamp": "1571053398000","t1": "83.23","t2": "13","t3": "ON"}
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import IntegerType, LongType, DecimalType, StructType, StructField, StringType
from pyspark.sql import Row
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql import Window

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
ssc = StreamingContext(sc, 5)

# Watch a directory for new JSON files; each micro-batch arrives as an RDD of lines
stream_data = ssc.textFileStream("/filepath/")

def readMyStream(rdd):
    if not rdd.isEmpty():
        # Convert the micro-batch RDD of JSON strings into a DataFrame
        df = spark.read.json(rdd)
        print('Started the Process')
        print('Selection of Columns')
        df = df.select('t1', 't2', 't3', 'timestamp').where(col("timestamp").isNotNull())
        df.show()

stream_data.foreachRDD(lambda rdd: readMyStream(rdd))
ssc.start()
ssc.stop()
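Applied back to the Scala/Kafka code in the question, the same idea is to do the JSON-to-DataFrame conversion inside foreachRDD on the driver, rather than inside stream.map, which runs on the executors where the SparkSession's state is not available (that is what produces the NullPointerException in the stack trace above). The following is a minimal, untested sketch, not a confirmed drop-in: it assumes Spark 2.2+ (for spark.read.json on a Dataset[String]), reuses the spark and stream values from the question's code, and treats record.value.toString as valid JSON, as the original code already does. The cdcemployee view name is only illustrative.

// Sketch: replace the stream.map(...) block and data.print() with a
// driver-side foreachRDD conversion (assumes the question's `spark` and
// `stream` values are in scope).
stream.foreachRDD { rdd =>
  // Pull out the JSON payload of each record; as in the question, the
  // Avro-decoded value's toString is assumed to be a JSON document.
  val jsonRdd = rdd.map(record => record.value.toString)
  if (!jsonRdd.isEmpty()) {
    import spark.implicits._                                // String encoder for createDataset
    val df = spark.read.json(spark.createDataset(jsonRdd))  // micro-batch -> DataFrame on the driver
    df.createOrReplaceTempView("cdcemployee")               // temp table for Spark SQL
    spark.sql("SELECT * FROM cdcemployee").show()
  }
}

Each micro-batch then lands in the temp view and can be queried with spark.sql. Note that createOrReplaceTempView overwrites the view on every batch, so it only ever holds the latest micro-batch unless the DataFrames are appended to a persistent table instead.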