What the Spark (v2.4) program is supposed to do: read JSON data from a Kafka topic.

The problem: instead of data, all I get is repeated messages like

Resetting offset for partition nifi-log-batch-0 to offset 2826180.

Source code:
package io.xyz.streaming

import org.apache.spark.sql.avro._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.functions._

object readKafkaJson {

  private val topic = "nifi-log-batch"
  private val kafkaUrl = "http://<hostname>:9092"
  private val chk = "/home/xyz/tmp/checkpoint"
  private val outputFileLocation = "/home/xyz/abc/data"

  private val sparkSchema = StructType(Array(
    StructField("timestamp", StringType),
    StructField("level", StringType),
    StructField("thread", StringType),
    StructField("class", StringType),
    StructField("message", StringType),
    StructField("updatedOn", StringType),
    StructField("stackTrace", StringType)))

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ConfluentConsumer")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // ===================Read Kafka data in JSON==================
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("startingOffsets", "latest")
      .option("subscribe", topic)
      .load()

    val dfs1 = df
      .selectExpr("CAST(value AS STRING)")
      .select(from_json(col("value"), sparkSchema).alias("my_column"))
      .select("my_column.*")

    // ===================Write to console==================
    dfs1
      .writeStream
      .format("console")
      .start()
      .awaitTermination()
  }
}
Detailed log from the console:
2019-04-10 01:12:58 INFO WriteToDataSourceV2Exec:54 - Start processing data source writer: org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@622d0057. The input RDD has 0 partitions.
2019-04-10 01:12:58 INFO SparkContext:54 - Starting job: start at readKafkaJson.scala:70
2019-04-10 01:12:58 INFO DAGScheduler:54 - Job 0 finished: start at readKafkaJson.scala:70, took 0.003870 s
2019-04-10 01:12:58 INFO WriteToDataSourceV2Exec:54 - Data source writer org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@622d0057 is committing.
-------------------------------------------
Batch: 0
-------------------------------------------
2019-04-10 01:12:58 INFO CodeGenerator:54 - Code generated in 41.952695 ms
+---------+-----+------+-----+-------+---------+----------+
|timestamp|level|thread|class|message|updatedOn|stackTrace|
+---------+-----+------+-----+-------+---------+----------+
+---------+-----+------+-----+-------+---------+----------+
2019-04-10 01:12:58 INFO WriteToDataSourceV2Exec:54 - Data source writer org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@622d0057 committed.
2019-04-10 01:12:58 INFO SparkContext:54 - Starting job: start at readKafkaJson.scala:70
2019-04-10 01:12:58 INFO DAGScheduler:54 - Job 1 finished: start at readKafkaJson.scala:70, took 0.000104 s
2019-04-10 01:12:58 INFO CheckpointFileManager:54 - Writing atomically to file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/0 using temp file file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/.0.eb290a31-1965-40e7-9028-d18f2eea0627.tmp
2019-04-10 01:12:58 INFO CheckpointFileManager:54 - Renamed temp file file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/.0.eb290a31-1965-40e7-9028-d18f2eea0627.tmp to file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/0
2019-04-10 01:12:58 INFO MicroBatchExecution:54 - Streaming query made progress: {
"id" : "fb44fbef-5d05-4bb8-ae72-3327b98af261",
"runId" : "ececfe49-bbc6-4964-8798-78980cbec525",
"name" : null,
"timestamp" : "2019-04-10T06:12:56.414Z",
"batchId" : 0,
"numInputRows" : 0,
"processedRowsPerSecond" : 0.0,
"durationMs" : {
"addBatch" : 1324,
"getBatch" : 10,
"getEndOffset" : 1,
"queryPlanning" : 386,
"setOffsetRange" : 609,
"triggerExecution" : 2464,
"walCommit" : 55
},
"stateOperators" : [ ],
"sources" : [ {
"description" : "KafkaV2[Subscribe[nifi-log-batch]]",
"startOffset" : null,
"endOffset" : {
"nifi-log-batch" : {
"0" : 2826180
}
},
"numInputRows" : 0,
"processedRowsPerSecond" : 0.0
} ],
"sink" : {
"description" : "org.apache.spark.sql.execution.streaming.ConsoleSinkProvider@6ced6212"
}
}
2019-04-10 01:12:58 INFO Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
2019-04-10 01:12:58 INFO MicroBatchExecution:54 - Streaming query made progress: {
"id" : "fb44fbef-5d05-4bb8-ae72-3327b98af261",
"runId" : "ececfe49-bbc6-4964-8798-78980cbec525",
"name" : null,
"timestamp" : "2019-04-10T06:12:58.935Z",
"batchId" : 1,
"numInputRows" : 0,
"inputRowsPerSecond" : 0.0,
"processedRowsPerSecond" : 0.0,
"durationMs" : {
"getEndOffset" : 1,
"setOffsetRange" : 11,
"triggerExecution" : 15
},
"stateOperators" : [ ],
"sources" : [ {
"description" : "KafkaV2[Subscribe[nifi-log-batch]]",
"startOffset" : {
"nifi-log-batch" : {
"0" : 2826180
}
},
"endOffset" : {
"nifi-log-batch" : {
"0" : 2826180
}
},
"numInputRows" : 0,
"inputRowsPerSecond" : 0.0,
"processedRowsPerSecond" : 0.0
} ],
"sink" : {
"description" : "org.apache.spark.sql.execution.streaming.ConsoleSinkProvider@6ced6212"
}
}
2019-04-10 01:12:58 INFO Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
2019-04-10 01:12:58 INFO Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
2019-04-10 01:12:58 INFO Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
I face the same problem even when I run the equivalent code in PySpark.

Please suggest how to resolve this issue.

The job is submitted with the following command:
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 --jars /home/xyz/Softwares/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar --class io.xyz.streaming.readKafkaJson --master local[*] /home/xyz/ScalaCode/target/SparkSchemaKafka-0.0.1-SNAPSHOT-jar-with-dependencies.jar
Answer 0 (score: 0)
The asker appears to have found the solution already; the relevant parts from the comments are summarized below.
Main resolution

It was a schema structure problem in the Scala code. Once the schema was corrected, the issue was resolved.
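The corrected schema itself is not shown in the comments. As a quick way to check whether a schema actually matches the incoming JSON before wiring it into the stream, the following is a minimal sketch; the sample record is hypothetical, not taken from the asker's data. Because from_json silently returns null columns when field names or types do not line up with the JSON keys, applying the schema to a hard-coded sample in batch mode makes mismatches easy to spot.

package io.xyz.streaming

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Hypothetical helper for validating a schema against a sample JSON record
object schemaCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("schemaCheck").master("local[*]").getOrCreate()
    import spark.implicits._

    // Assumed sample message; replace with a real record pulled from the topic.
    val sample = Seq(
      """{"timestamp":"2019-04-10 01:12:58","level":"INFO","thread":"main","class":"SomeClass","message":"ok","updatedOn":"2019-04-10","stackTrace":""}"""
    ).toDF("value")

    val sparkSchema = StructType(Array(
      StructField("timestamp", StringType),
      StructField("level", StringType),
      StructField("thread", StringType),
      StructField("class", StringType),
      StructField("message", StringType),
      StructField("updatedOn", StringType),
      StructField("stackTrace", StringType)))

    // Any column that comes back null for every row points at a field name
    // or type that does not match the JSON produced upstream.
    sample
      .select(from_json(col("value"), sparkSchema).alias("my_column"))
      .select("my_column.*")
      .show(false)
  }
}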
Secondary issue

The PySpark version of the code was actually processing data, but the messages never stopped: I was able to run the code and write the streaming data out to JSON files, yet the console kept filling up with the

Resetting offset for ...

log messages mentioned above. This turned out to be a PySpark logging issue, with INFO messages being printed; after I disabled them, everything was fine.
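The comments do not say exactly how the INFO output was disabled. One common way, offered here as an assumption rather than what was necessarily done, is to raise the log level on the SparkContext right after the session is created; the same call exists in PySpark as spark.sparkContext.setLogLevel("WARN"). Alternatively, the Kafka consumer logger can be turned down in conf/log4j.properties.

import org.apache.spark.sql.SparkSession

object quietConsumer {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ConfluentConsumer")
      .master("local[*]")
      .getOrCreate()

    // Suppress INFO-level driver output, including the Kafka consumer's
    // "Resetting offset for partition ..." messages.
    spark.sparkContext.setLogLevel("WARN")

    // ... build and start the streaming query as in the question ...
  }
}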