Kafka IllegalArgumentException: empty text

Date: 2018-06-22 14:27:25

Tags: scala apache-spark apache-kafka

I am trying to connect to a Kafka server running in Docker. When I run my program, the Kafka Manager container responds by printing:

kafkamanager     | [error] p.c.s.n.PlayDefaultUpstreamHandler - Exception caught in Netty
kafkamanager     | java.lang.IllegalArgumentException: empty text
kafkamanager     |      at org.jboss.netty.handler.codec.http.HttpVersion.<init>(HttpVersion.java:89) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.http.HttpVersion.valueOf(HttpVersion.java:62) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.http.HttpRequestDecoder.createMessage(HttpRequestDecoder.java:75) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.http.HttpMessageDecoder.decode(HttpMessageDecoder.java:191) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.http.HttpMessageDecoder.decode(HttpMessageDecoder.java:102) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.replay.ReplayingDecoder.callDecode(ReplayingDecoder.java:500) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.replay.ReplayingDecoder.cleanup(ReplayingDecoder.java:554) ~[io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.handler.codec.frame.FrameDecoder.channelDisconnected(FrameDecoder.java:365) [io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:102) [io.netty.netty-3.10.4.Final.jar:na]
kafkamanager     |      at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) [io.netty.netty-3.10.4.Final.jar:na]

This repeats over and over, even as my program itself crashes with a timeout exception:

18/06/22 10:10:51 ERROR Utils: Aborting task org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
18/06/22 10:10:51 ERROR Utils: Aborting task org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
18/06/22 10:10:51 ERROR Utils: Aborting task org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
18/06/22 10:10:51 ERROR DataWritingSparkTask: Writer for partition 2 is aborting.
18/06/22 10:10:51 ERROR DataWritingSparkTask: Writer for partition 0 is aborting.
18/06/22 10:10:51 ERROR DataWritingSparkTask: Writer for partition 1 is aborting.
18/06/22 10:10:51 ERROR DataWritingSparkTask: Writer for partition 2 aborted.
18/06/22 10:10:51 ERROR DataWritingSparkTask: Writer for partition 1 aborted.
18/06/22 10:10:51 ERROR DataWritingSparkTask: Writer for partition 0 aborted.
18/06/22 10:10:51 ERROR Executor: Exception in task 2.0 in stage 0.0 (TID 2)
org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
18/06/22 10:10:51 ERROR Executor: Exception in task 1.0 in stage 0.0 (TID 1)
org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
18/06/22 10:10:51 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
18/06/22 10:10:51 ERROR TaskSetManager: Task 2 in stage 0.0 failed 1 times; aborting job
18/06/22 10:10:51 ERROR WriteToDataSourceV2Exec: Data source writer org.apache.spark.sql.execution.streaming.sources.InternalRowMicroBatchWriter@240a98a7 is aborting.
18/06/22 10:10:51 ERROR WriteToDataSourceV2Exec: Data source writer org.apache.spark.sql.execution.streaming.sources.InternalRowMicroBatchWriter@240a98a7 aborted.
18/06/22 10:10:51 ERROR MicroBatchExecution: Query [id = 6b4e22ba-596e-4a9f-b14d-43b669008f36, runId = cdcda6fa-ec8b-4f3a-9c7c-388a4812e0d5] terminated with error
org.apache.spark.SparkException: Writing job aborted.
    at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.doExecute(WriteToDataSourceV2.scala:112)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
    at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:294)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272)
    at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2722)
    at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2722)
    at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252)
    at org.apache.spark.sql.Dataset.collect(Dataset.scala:2722)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:480)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
    at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 0.0 failed 1 times, most recent failure: Lost task 2.0 in stage 0.0 (TID 2, localhost, executor driver): org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
    at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.doExecute(WriteToDataSourceV2.scala:82)
... 31 more
Caused by: org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.

My docker-compose.yml looks like this:

version: "2"

services:
  kafkaserver:
    image: "spotify/kafka:latest"
    container_name: kafka
    hostname: kafkaserver
    networks:
      - kafkanet
    ports:
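      # 2181 = ZooKeeper, 9092 = Kafka broker (both published to the host)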
      - 2181:2181
      - 9092:9092
    environment:
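      # spotify/kafka uses these to set the broker's advertised listener (kafkaserver:9092)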
      ADVERTISED_HOST: kafkaserver
      ADVERTISED_PORT: 9092
  kafka_manager:
    image: "mzagar/kafka-manager-docker:1.3.3.4"
    container_name: kafkamanager
    networks:
      - kafkanet
    ports:
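      # 9000 = Kafka Manager web UI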
      - 9000:9000
    links:
      - kafkaserver
    environment:
      ZK_HOSTS: "kafkaserver:2181"

networks:
  kafkanet:
    driver: bridge

My actual code is as follows:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.types._

object SpeedTester {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local[4]")
      .appName("SpeedTester")
      .config("spark.driver.memory", "8g")
      .getOrCreate()
    val rootLogger = Logger.getRootLogger()
    rootLogger.setLevel(Level.ERROR)
    import spark.implicits._
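    // Schema for the incoming CSV records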
    val mySchema = StructType(Array(
      StructField("incident_id", StringType),
      StructField("date", StringType),
      StructField("state", StringType),
      StructField("city_or_county", StringType),
      StructField("n_killed", IntegerType),
      StructField("n_injured", IntegerType)
    ))

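    // Stream the CSV files from disk and write each row to Kafka as JSON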
    val streamingDataFrame = spark.readStream.schema(mySchema).csv("C:/Users/zoldham/IdeaProjects/flinkpoc/Data/test")
    streamingDataFrame.selectExpr("CAST(incident_id AS STRING) AS key", "to_json(struct(*)) AS value").writeStream
      .format("kafka")
      .option("topic", "testTopic")
      .option("kafka.bootstrap.servers", "localhost:9000")
      .option("checkpointLocation", "C:/Users/zoldham/IdeaProjects/flinkpoc/Data")
      .start()

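    // Read the same topic back, parse the JSON payload, and print to the console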
    val df = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9000")
      .option("subscribe", "testTopic").load()
    val df1 = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS TIMESTAMP)").as[(String, Timestamp)]
      .select(from_json(col("value"), mySchema).as("data"), col("timestamp"))
      .select("data.*", "timestamp")
    df1.writeStream
      .format("console")
      .option("truncate","false")
      .start()
      .awaitTermination()
  }
}
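
For reference, a minimal standalone producer check, independent of Spark, can help isolate whether the broker itself is reachable. This is only a sketch: it assumes the org.apache.kafka:kafka-clients dependency is on the classpath, and the object name, topic, and key/value strings are illustrative.

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

object ProducerCheck {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // 9092 is the broker port published by the docker-compose.yml above
    // (9000 is Kafka Manager's web UI, not a Kafka listener)
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    // fail after 10 s instead of the default 60 s metadata timeout
    props.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, "10000")

    val producer = new KafkaProducer[String, String](props)
    try {
      // send() first fetches metadata; a TimeoutException here reproduces the Spark failure
      val meta = producer.send(new ProducerRecord[String, String]("testTopic", "ping", "pong")).get()
      println(s"Delivered to ${meta.topic()}-${meta.partition()} at offset ${meta.offset()}")
    } finally {
      producer.close()
    }
  }
}

If this check also times out against localhost:9092, the broker's advertised listener (kafkaserver:9092 per the compose file) is likely not resolvable from the host machine.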

Why is the connection failing? Could I be getting some kind of authorization wrong? Thanks for any advice; I have been struggling to find anyone else who has run into this problem.

0 Answers:

No answers yet.