I am having a problem writing a DataFrame to HDFS in YARN client mode: on spark-submit, a directory named with System.currentTimeMillis() is created under the HDFS path, but it stays empty.
The streaming is supposed to run for a given period of time and then stop. The application works fine locally (but there the DStream writes to local files). The code in use is the following:
val tweetSchemaString = "tweetID tweetCreatedAt tweetUserName tweetUserScreenName tweetUserLocation"

import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.sql.types.{StructField, StructType, StringType}

val spark: SparkSession = SparkSession.builder.master(args(1)).getOrCreate
//val spark: SparkSession = SparkSession.builder.master("local[*]").getOrCreate

// Build the schema from the space-separated field names
val schema = StructType(tweetSchemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))

tweets.foreachRDD { tweet =>
  if (!tweet.partitions.isEmpty) {
    // Map each status to a Row matching the schema
    val rowRDD = tweet.map(rddTweet =>
      Row(
        rddTweet.getId().toString(),
        rddTweet.getCreatedAt.toString(),
        rddTweet.getUser().getName,
        rddTweet.getUser().getScreenName,
        rddTweet.getUser.getLocation
      )
    )

    // Get the singleton instance of SQLContext
    val sqlContext = SQLContext.getOrCreate(tweet.sparkContext)

    val tweetsDF = spark.createDataFrame(rowRDD, schema)
    tweetsDF.createOrReplaceTempView("tweetsTable")

    println("=======================================================")
    println("===================WRITING TO HDFS=====================")
    tweetsDF.write.json("hdfs://url:8020/home/gib/" + System.currentTimeMillis())
  }
}

// Start the streaming context, run for the requested time, then stop
ssc.start()
//ssc.awaitTermination()
ssc.awaitTerminationOrTimeout(60000 * streamFor)
ssc.stop()
When I execute
spark-submit --executor-memory 1g --driver-memory 1g --class str.Streamer TwitterStreamV1-1.0-jar-with-dependencies.jar 2 yarn-client
the error below pops up after the specified run time, in this case 2 minutes. This happens after the stop signal has been sent to the receivers:
18/04/16 11:11:31 INFO ReceiverTracker: Sent stop signal to all 1 receivers
18/04/16 11:11:31 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
followed by the error:
18/04/16 11:11:34 ERROR InsertIntoHadoopFsRelationCommand: Aborting job.
java.lang.InterruptedException
at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:998)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1304)
at scala.concurrent.impl.Promise$DefaultPromise.tryAwait(Promise.scala:202)
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:218)
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:153)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:623)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1906)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelationCommand.scala:143)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:115)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:136)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:133)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:114)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:525)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:211)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:194)
at org.apache.spark.sql.DataFrameWriter.json(DataFrameWriter.scala:467)
at dsti.Streamer$$anonfun$main$1.apply(Streamer.scala:100)
at dsti.Streamer$$anonfun$main$1.apply(Streamer.scala:77)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:247)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:246)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$
Is there a way to make this work?
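One variant I have been considering, but have not verified against this error, is to request a graceful stop, on the assumption that it lets the batch that is still running (including the pending HDFS write) finish before the context is torn down:

ssc.start()
ssc.awaitTerminationOrTimeout(60000 * streamFor)
// Assumption: stopGracefully = true waits for already-received data to be
// processed, so an in-flight foreachRDD batch and its write can complete
// before the SparkContext is shut down.
ssc.stop(stopSparkContext = true, stopGracefully = true)

There is also the spark.streaming.stopGracefullyOnShutdown setting for JVM shutdown hooks, but I have not tried either option yet.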