Module not found when calling a lambda function

Asked: 2018-01-23 03:58:00

Tags: python lambda pyspark bigdata

I am trying to run a Python program on PySpark 1.6. The script below uses a module called "dateutil" to convert times from one timezone to another. I have already checked that the dateutil module is installed on all the worker nodes as well as on the machine I use to submit the job.

Command used to submit the job:

spark-submit --packages "com.databricks:spark-csv_2.11:1.5.0" test.py

Script:

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz

conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')

sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)

def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
    utc_tz = dateutil.tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = dateutil.tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row

sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
    .load("/user/xxx/conviva/*")\
    .filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
    .rdd\
    .map(lambda y: utcToAESTDateString(y)))\
    .registerTempTable("table1")


#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()

Error:

Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1293, in takeUpToNumLeft
  File "/home/xxxx/test.py", line 50, in <lambda>
  File "/home/xxxx/test.py", line 34, in utcToAESTDateString
NameError: global name 'dateutil' is not defined

    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
    at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
    at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    ... 1 more

1 Answer:

Answer 0 (score: 0):

Change these two lines to: utc_tz = tz.gettz('UTC') and aest_time = tz.gettz('AEST')

When you import a specific submodule like this, from dateutil import tz, only the name tz is bound in your namespace; the name dateutil itself is never defined. So you cannot make calls such as dateutil.tz.gettz(...), which is exactly what raises the NameError above; you have to call tz.gettz(...) directly.
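
The difference is easy to demonstrate outside Spark. A minimal sketch of the name-binding rule (plain Python, nothing here is Spark-specific):

from dateutil import tz

print(tz.gettz('UTC'))            # works: the import bound the name "tz" in this namespace
#print(dateutil.tz.gettz('UTC'))  # NameError: the name "dateutil" was never bound

import dateutil.tz                # importing the package binds "dateutil" as well
print(dateutil.tz.gettz('UTC'))   # now the fully qualified call works too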

Your code should look like this:

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz

conf = SparkConf ()
conf.set('spark.kryoserializer.buffer.max', '2000m')  # the key is "kryoserializer"; Spark rejects values of 2048m or more
conf.set('spark.scheduler.mode', 'FAIR')

sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)

def utcToAESTDateString (row):
    #import pytz
    from dateutil import tz
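    # "tz" is bound by the import above, so call tz.gettz(...) directly; dateutil.tz.gettz(...) would raise NameError here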
    utc_tz = tz.gettz('UTC')
    #utc_tz = pytz.timezone('UTC')
    utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_tz)  # tzinfo must be utc_tz; "utc_time" is not yet bound on this line
    #print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
    aest_time = tz.gettz('AEST')
    math.acos (1)
    #print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
    #aedt_time = tz.gettz('AEDT')
    #print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
    #return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
    return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row

sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
    .load("/user/xxx/conviva/*")\
    .filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
    .rdd\
    .map(lambda y: utcToAESTDateString(y)))\
    .registerTempTable("table1")


#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
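
For reference, the timezone conversion itself can be verified in plain Python, independent of Spark. A small sketch (the timestamp value is made up for illustration; note that tz.gettz('AEST') may return None on systems whose zoneinfo database lacks that abbreviation, in which case the IANA name 'Australia/Sydney' is the safer choice):

import datetime
from dateutil import tz

utc_tz = tz.gettz('UTC')
aest_tz = tz.gettz('AEST') or tz.gettz('Australia/Sydney')  # fall back to the IANA zone name

utc_time = datetime.datetime.fromtimestamp(1516679880, tz=utc_tz)          # ~2018-01-23 03:58 UTC
print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))                              # the UTC wall-clock time
print(utc_time.astimezone(aest_tz).strftime('%Y-%m-%d %H:%M:%S'))          # local time in the target zone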