I've run into a strange problem with Spark. Can anyone help? Thanks in advance. The code is here:
import re, sys
from pyspark import SparkContext, HiveContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime, date, time, timedelta
import time
import urllib2
import socket
import base64
from operator import add
from subprocess import call
# Convert a bigint epoch timestamp into a yyyy-MM-dd date string.
def transToTimeStamp(time_bigint):
    return time.strftime("%Y-%m-%d", time.localtime(float(time_bigint)))

# Download an image URL and return its base64-encoded content ("" on failure).
def getImageData(url):
    try:
        return base64.b64encode(urllib2.urlopen(url, timeout=10).read())
    except Exception, e:
        print('Exception: ' + e.message)
        return ""
if __name__ == '__main__':
    spark = SparkSession.builder\
        .appName('uniq_xxx')\
        .config('spark.sql.warehouse.dir', 'spark-warehouse')\
        .enableHiveSupport().getOrCreate()

    sql = 'select id, image, time from tabletest where time > 1506787200'
    df = spark.sql(sql)
    print df.count()

    out_path = "/user/test/images3/"
    myudf = udf(transToTimeStamp, StringType())
    img_udf = udf(getImageData, StringType())

    # Derive the day partition column and the base64 image data, then drop the raw columns.
    new_df = df.withColumn('dayaaa', myudf(df.time)) \
               .withColumn('img_data', img_udf(df.image)) \
               .drop('image').drop('time')
    new_df.write.partitionBy('dayaaa').mode('overwrite').save(out_path)

    spark.stop()
The data is only about 4,900 rows.
The error is:
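For context, the two helpers only convert a bigint epoch to a yyyy-MM-dd string and fetch a URL as base64, nothing Spark-specific. A minimal standalone sanity check (no Spark involved; the URL below is just a placeholder, not one of the real image URLs) would look like this:

# Quick local check of the helper functions, outside Spark.
# 'http://example.com/some_image.jpg' is a placeholder URL for illustration only.
print transToTimeStamp(1506787200)                        # e.g. '2017-10-01', depending on local timezone
encoded = getImageData('http://example.com/some_image.jpg')
print len(encoded)                                        # 0 if the fetch failed and "" was returned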
18/01/19 21:52:14 INFO scheduler.DAGScheduler: Job 1 failed: save at NativeMethodAccessorImpl.java:0, took 8083.693827 s
18/01/19 21:52:14 ERROR scheduler.LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerExecutorMetricsUpdate(1,WrappedArray((69,2,0,Vector(AccumulableInfo(1531,Some(internal.metrics.executorDeserializeTime),Some(0),None,true,true,None),AccumulableInfo(1532,Some(internal.metrics.executorDeserializeCpuTime),Some(0),None,true,true,None),AccumulableInfo(1533,Some(internal.metrics.executorRunTime),Some(0),None,true,true,None),AccumulableInfo(1534,Some(internal.metrics.executorCpuTime),Some(0),None,true,true,None),AccumulableInfo(1535,Some(internal.metrics.resultSize),Some(0),None,true,true,None),AccumulableInfo(1536,Some(internal.metrics.jvmGCTime),Some(594),None,true,true,None),AccumulableInfo(1537,Some(internal.metrics.resultSerializationTime),Some(0),None,true,true,None),AccumulableInfo(1538,Some(internal.metrics.memoryBytesSpilled),Some(0),None,true,true,None),AccumulableInfo(1539,Some(internal.metrics.diskBytesSpilled),Some(0),None,true,true,None),AccumulableInfo(1540,Some(internal.metrics.peakExecutionMemory),Some(0),None,true,true,None),AccumulableInfo(1541,Some(internal.metrics.updatedBlockStatuses),Some([(broadcast_4_piece0,BlockStatus(StorageLevel(memory, 1 replicas),41884,0)), (broadcast_4,BlockStatus(StorageLevel(memory, deserialized, 1 replicas),107120,0)), (broadcast_3_piece0,BlockStatus(StorageLevel(memory, 1 replicas),27122,0)), (broadcast_3,BlockStatus(StorageLevel(memory, deserialized, 1 replicas),408720,0))]),None,true,true,None),AccumulableInfo(1542,Some(internal.metrics.shuffle.read.remoteBlocksFetched),Some(0),None,true,true,None),AccumulableInfo(1543,Some(internal.metrics.shuffle.read.localBlocksFetched),Some(0),None,true,true,None),AccumulableInfo(1544,Some(internal.metrics.shuffle.read.remoteBytesRead),Some(0),None,true,true,None),AccumulableInfo(1545,Some(internal.metrics.shuffle.read.localBytesRead),Some(0),None,true,true,None),AccumulableInfo(1546,Some(internal.metrics.shuffle.read.fetchWaitTime),Some(0),None,true,true,None),AccumulableInfo(1547,Some(internal.metrics.shuffle.read.recordsRead),Some(0),None,true,true,None),AccumulableInfo(1548,Some(internal.metrics.shuffle.write.bytesWritten),Some(0),None,true,true,None),AccumulableInfo(1549,Some(internal.metrics.shuffle.write.recordsWritten),Some(0),None,true,true,None),AccumulableInfo(1550,Some(internal.metrics.shuffle.write.writeTime),Some(0),None,true,true,None),AccumulableInfo(1551,Some(internal.metrics.input.bytesRead),Some(6154349),None,true,true,None),AccumulableInfo(1552,Some(internal.metrics.input.recordsRead),Some(532922),None,true,true,None),AccumulableInfo(1553,Some(internal.metrics.output.bytesWritten),Some(0),None,true,true,None),AccumulableInfo(1554,Some(internal.metrics.output.recordsWritten),Some(0),None,true,true,None),AccumulableInfo(1530,Some(number of output rows),Some(532922),None,true,true,Some(sql)),AccumulableInfo(1528,Some(duration total (min, med, max)),Some(-1),None,true,true,Some(sql)),AccumulableInfo(1529,Some(number of output rows),Some(372600),None,true,true,Some(sql)),AccumulableInfo(0,None,Some([]),None,false,false,None),AccumulableInfo(1527,Some(duration total (min, med, max)),Some(-1),None,true,true,Some(sql))))))
18/01/19 21:52:14 INFO scheduler.DAGScheduler: ResultStage 2 (save at NativeMethodAccessorImpl.java:0) failed in 8083.670 s due to Stage cancelled because SparkContext was shut down
18/01/19 21:52:14 ERROR scheduler.LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerStageCompleted(org.apache.spark.scheduler.StageInfo@5663383a)
18/01/19 21:52:14 ERROR datasources.FileFormatWriter: Aborting job null.
org.apache.spark.SparkException: Job 1 cancelled because SparkContext was shut down
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:808)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:806)
    at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
    at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:806)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1668)
    at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
    at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1587)
    at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1826)
    at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1283)
    at org.apache.spark.SparkContext.stop(SparkContext.scala:1825)
    at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:581)
    at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
    at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
    at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
    at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1951)
    at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
    at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
    at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
    at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
    at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:127)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:121)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:101)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)