Spark Structured Streaming - Custom Sink works in Spark 2.2.0 but throws an exception in Spark 2.3.0

Date: 2018-04-20 03:55:39

Tags: apache-spark spark-structured-streaming

We recently migrated our project from Spark 2.2.0 cloudera2 to Spark 2.3.0 cloudera2 and noticed that some custom sinks that used to work now fail with an exception. To keep things simple, I have rewritten a minimal case so that anyone willing to help can copy, paste, and test the code.

package question

import java.io.PrintWriter
import java.net.Socket
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming._

class NCSink extends Sink {
    def addBatch(batchId: Long, data: DataFrame): Unit = {
        data.foreachPartition { iterator =>
            val socket = new Socket("localhost", 7778)
            val writer = new PrintWriter(socket.getOutputStream, true)
            iterator.foreach(row => writer.println(row.getString(0)))
            socket.close
        }
    }
}

class NCSinkProvider extends StreamSinkProvider {
    def createSink(sc: SQLContext, params: Map[String, String], columns: Seq[String], mode: OutputMode): Sink = new NCSink()
}

object NCStreaming {
    def main(args: Array[String]) = {
        val spark = SparkSession.builder.getOrCreate
        import spark.implicits._
        spark.readStream.format("socket").option("host", "localhost").option("port", 7777).load.as[String].writeStream.format("question.NCSinkProvider").outputMode("append").option("checkpointLocation", "checkpoint").start.awaitTermination
    }
}

The program above runs fine on Spark 2.2.0 (deployed via the cloudera2 parcel).

Sent

[johnlin@localhost ~]$ nc -lk 7777
good
better
best
never
let
it
rest

Received

[johnlin@localhost ~]$ nc -lk 7778
good
better
never
it
let
rest
best

However, on Spark 2.3.0 (also deployed via the cloudera2 parcel), it fails with the exception Queries with streaming sources must be executed with writeStream.start()

Exception in thread "main" org.apache.spark.sql.streaming.StreamingQueryException: Queries with streaming sources must be executed with writeStream.start();;
LogicalRDD [value#6], true

=== Streaming Query ===
Identifier: [id = 072fce9e-0cc5-482b-a971-17102da37528, runId = 415272e9-2c2a-47de-947e-fbf64c8cc0da]
Current Committed Offsets: {TextSocketSource[host: localhost, port: 7777]: 12}
Current Available Offsets: {TextSocketSource[host: localhost, port: 7777]: 13}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
TextSocketSource[host: localhost, port: 7777]
        at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:295)
        at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
LogicalRDD [value#6], true

        at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.org$apache$spark$sql$catalyst$analysis$UnsupportedOperationChecker$$throwError(UnsupportedOperationChecker.scala:374)
        at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:37)
        at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:35)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
        at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.checkForBatch(UnsupportedOperationChecker.scala:35)
        at org.apache.spark.sql.execution.QueryExecution.assertSupported(QueryExecution.scala:51)
        at org.apache.spark.sql.execution.QueryExecution.withCachedData$lzycompute(QueryExecution.scala:62)
        at org.apache.spark.sql.execution.QueryExecution.withCachedData(QueryExecution.scala:60)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:66)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:66)
        at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:72)
        at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:68)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
        at org.apache.spark.sql.Dataset.withNewRDDExecutionId(Dataset.scala:3234)
        at org.apache.spark.sql.Dataset.foreachPartition(Dataset.scala:2674)
        at question.NCSink.addBatch(NCStreaming.scala:12)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
        at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
        at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
        at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
        at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
        at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
        at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
        ... 1 more

I searched for this exception and learned that it is usually related to using operations that streaming does not support, such as multiple aggregations. But I cannot tell what the problem is from the query plan in the error message. I know Spark 2.3 Structured Streaming introduced some new features, but I cannot work out what changes my code needs. Could you help me?

2 Answers:

Answer 0 (score: 0)

I believe the reason is that two branches are created for the logical plan, but you only call start on the second branch, leaving the first one unprocessed; perhaps Spark 2.3.0 fixed this so it is now handled explicitly.

spark.readStream.format("socket").option("host", "localhost").option("port", 7777).load.as[String]
     .writeStream.format("question.NCSinkProvider").outputMode("append").option("checkpointLocation", "checkpoint")
     .start
     .awaitTermination

This is purely my assumption and I may be wrong, but could you try the following?

object NCStreaming {
    def main(args: Array[String]) = {
        val spark = SparkSession.builder.getOrCreate
        import spark.implicits._
        val data = spark.readStream.format("socket").option("host", "localhost").option("port", 7777).load.as[String]
        val query = data.writeStream.format("question.NCSinkProvider").outputMode("append").option("checkpointLocation", "checkpoint").start()
        query.awaitTermination()
    }
}

Answer 1 (score: 0)

I dug into Spark's source code to find out what had changed. I found that if I modify

    data.foreachPartition { iterator =>

to

    data.queryExecution.toRdd.foreachPartition { iterator =>

then it works correctly.
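
For completeness, here is a sketch of the sink with that one-line change applied. As far as I can tell, data.queryExecution reuses the execution that the streaming engine already planned for this micro-batch instead of re-planning the DataFrame as a new batch query the way Dataset.foreachPartition does, and the partitions then arrive as InternalRow, whose getString(0) still reads the single string column:

package question

import java.io.PrintWriter
import java.net.Socket
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming._

class NCSink extends Sink {
    def addBatch(batchId: Long, data: DataFrame): Unit = {
        // Iterate over the already-planned RDD[InternalRow] of this micro-batch instead of
        // Dataset.foreachPartition, which re-analyzes the streaming plan as a batch query.
        data.queryExecution.toRdd.foreachPartition { iterator =>
            val socket = new Socket("localhost", 7778)
            val writer = new PrintWriter(socket.getOutputStream, true)
            // getString(0) reads the single string column produced by the socket source
            iterator.foreach(row => writer.println(row.getString(0)))
            socket.close()
        }
    }
}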

In addition, I realized that since Spark 2.0.0 I could have used a ForeachWriter for this kind of output streaming.

package question

import java.io.PrintWriter
import java.net.Socket
import org.apache.spark.sql._

class NCWriter(host: String, port: Int) extends ForeachWriter[String] {
    var socket: Socket = _
    var writer: PrintWriter = _

    def open(partitionId: Long, version: Long): Boolean = {
        socket = new Socket(host, port)
        writer = new PrintWriter(socket.getOutputStream, true)
        true
    }

    def process(record: String): Unit = writer.println(record)

    def close(exception: Throwable): Unit = socket.close
}

object NCStreaming {
    def main(args: Array[String]) = {
        val spark = SparkSession.builder.getOrCreate
        import spark.implicits._
        spark.readStream.format("socket").option("host", "localhost").option("port", 7777).load.as[String].writeStream.foreach(new NCWriter("localhost", 7778)).outputMode("append").option("checkpointLocation", "checkpoint").start.awaitTermination
    }
}
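
As far as I understand, the foreach sink calls open/process/close once per partition for every micro-batch, so this version still opens one socket connection per partition per batch, much like the original custom sink did.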