Saving a Spark DStream to HDFS

Date: 2018-04-16 11:32:45

Tags: scala apache-spark

I have a problem writing a DataFrame to HDFS in yarn-client mode. When the job is run with spark-submit, a directory named with the System.currentTimeMillis() value is created under the HDFS path, but it stays empty.

The streaming is supposed to run for a period of time and then stop.

The application works fine on my local machine (but there the DStream is written to local files).

The code being used is below:

    val tweetSchemaString = "tweetID tweetCreatedAt tweetUserName tweetUserScreenName tweetUserLocation"

    import org.apache.spark.sql.{Row, SparkSession, SQLContext}
    import org.apache.spark.sql.types.{StructType, StructField, StringType}

    val spark: SparkSession = SparkSession.builder.master(args(1)).getOrCreate
    //val spark: SparkSession = SparkSession.builder.master("local[*]").getOrCreate

    val schema = StructType(tweetSchemaString.split(" ").map(fieldName => StructField(fieldName,StringType,true)))

    tweets.foreachRDD(
      tweet => {
        if(!tweet.partitions.isEmpty){
          val rowRDD = tweet.map(
            rddTweet => Row(
              rddTweet.getId().toString(),
              rddTweet.getCreatedAt.toString(),
              rddTweet.getUser().getName,
              rddTweet.getUser().getScreenName,
              rddTweet.getUser.getLocation
             )
          )
          //Get the Singleton Instance of SQL Context
          val sqlContext = SQLContext.getOrCreate(tweet.sparkContext)
          val tweetsDF = spark.createDataFrame(rowRDD,schema)
          tweetsDF.createOrReplaceTempView("tweetsTable")
          println("=======================================================")
          println("===================WRITING TO HDFS=====================")

          tweetsDF.write.json("hdfs://url:8020/home/gib/"+ System.currentTimeMillis())
        }
      }
    )
    // Start Streaming Context
    ssc.start()
    //ssc.awaitTermination()
    ssc.awaitTerminationOrTimeout(60000 * streamFor)
    ssc.stop()

Executed with: spark-submit --executor-memory 1g --driver-memory 1g --class str.Streamer TwitterStreamV1-1.0-jar-with-dependencies.jar 2 yarn-client

The error below appears once the specified run time has elapsed, in this case 2 minutes.

It occurs after the stop signal has been sent to the receivers:

18/04/16 11:11:31 INFO ReceiverTracker: Sent stop signal to all 1 receivers
18/04/16 11:11:31 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver

The error:

18/04/16 11:11:34 ERROR InsertIntoHadoopFsRelationCommand: Aborting job.
java.lang.InterruptedException
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:998)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1304)
        at scala.concurrent.impl.Promise$DefaultPromise.tryAwait(Promise.scala:202)
        at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:218)
        at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:153)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:623)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1906)
        at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelationCommand.scala:143)
        at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
        at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
        at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:115)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:136)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:133)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
        at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
        at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:525)
        at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:211)
        at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:194)
        at org.apache.spark.sql.DataFrameWriter.json(DataFrameWriter.scala:467)
        at dsti.Streamer$$anonfun$main$1.apply(Streamer.scala:100)
        at dsti.Streamer$$anonfun$main$1.apply(Streamer.scala:77)
        at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
        at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
        at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
        at scala.util.Try$.apply(Try.scala:192)
        at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:247)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
        at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:246)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$

Is there a way to achieve the following? (A tentative sketch follows the list.)

  1. Stream the data and write it to files
  2. Stop the streaming and the writing after a specified amount of time?
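For reference, a minimal, untested sketch of one way to do both, reusing the names from the code above (tweets, ssc, spark, schema, streamFor); the HDFS URL is only a placeholder:

    tweets.foreachRDD { (rdd, batchTime) =>
      // Only write batches that actually contain data
      if (!rdd.isEmpty()) {
        val rowRDD = rdd.map(t =>
          Row(
            t.getId.toString,
            t.getCreatedAt.toString,
            t.getUser.getName,
            t.getUser.getScreenName,
            t.getUser.getLocation
          )
        )
        val tweetsDF = spark.createDataFrame(rowRDD, schema)
        // One output directory per micro-batch, keyed by the batch time
        // instead of System.currentTimeMillis()
        tweetsDF.write.json(s"hdfs://url:8020/home/gib/${batchTime.milliseconds}")
      }
    }

    ssc.start()
    // Block until the requested duration has elapsed (or the stream terminates early)
    ssc.awaitTerminationOrTimeout(60000L * streamFor)
    // Stop gracefully so batches already being processed (including a pending
    // HDFS write) are allowed to finish before the context shuts down
    ssc.stop(stopSparkContext = true, stopGracefully = true)

A plain ssc.stop() interrupts running output jobs, which would be consistent with the InterruptedException thrown from InsertIntoHadoopFsRelationCommand in the trace above; with stopGracefully = true, Spark waits for already-received data to be processed before shutting down.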

0 Answers:

There are no answers yet.