I am trying to package Python dependencies to ship to a Hadoop cluster with spark-submit, and I want to do this in the DRYest way possible. I would like my my_spark_app.py
to look like this:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('MyApp').setMaster('yarn-client')
sc = SparkContext(conf=conf)
sc.addPyFile('/path/to/dependencies.py')
from dependencies import DependencyManager
dm = DependencyManager(sc)
dm.register_lib('dateutil')
import dateutil
# do stuff with dateutil
and dependencies.py
would simply be this:
import zipfile, os

LIBPATH = '/path/to/my/python/env/lib/python2.7/site-packages/'

class DependencyManager(object):
    """
    Collects dependencies to be zipped and sent to the spark context
    """
    def __init__(self, spark_context):
        self.sc = spark_context

    def register_lib(self, p):
        libpath = os.path.join(LIBPATH, p)
        zippath = libpath + '.zip'
        zf = zipfile.PyZipFile(zippath, mode='w')
        try:
            zf.debug = 3
            zf.writepy(libpath)
            self.sc.addPyFile(zippath)
        finally:
            zf.close()
This produces the following output (because of zf.debug = 3
):
Adding package in /path/to/env/lib/python2.7/site-packages/dateutil as dateutil
Adding dateutil/__init__.pyc
Adding dateutil/rrule.pyc
Adding dateutil/relativedelta.pyc
Adding package in /path/to/env/lib/python2.7/site-packages/dateutil/zoneinfo as dateutil/zoneinfo
Adding dateutil/zoneinfo/__init__.pyc
Adding dateutil/zoneinfo/rebuild.pyc
Adding dateutil/parser.pyc
Adding dateutil/tzwin.pyc
Adding dateutil/easter.pyc
Adding package in /path/to/env/lib/python2.7/site-packages/dateutil/tz as dateutil/tz
Adding dateutil/tz/__init__.pyc
Adding dateutil/tz/tz.pyc
Adding dateutil/tz/win.pyc
Adding dateutil/tz/_common.pyc
Traceback (most recent call last):
File "/path/to/my_spark_app.py", line 25
import dateutil
ImportError: No module named dateutil
Somehow, calling self.sc.addPyFile()
from inside the DependencyManager
class has no effect on the SparkContext, even though the same call works fine when made directly in my_spark_app.py.
What is going on here?
Answer 0 (score: 1)
The problem is simple and has nothing to do with Spark. Here:
def register_lib(self, p):
    libpath = os.path.join(LIBPATH, p)
    zippath = libpath + '.zip'
    zf = zipfile.PyZipFile(zippath, mode='w')
    try:
        zf.debug = 3
        zf.writepy(libpath)
        self.sc.addPyFile(zippath)
    finally:
        zf.close()
When self.sc.addPyFile(zippath)
is called, the zf
handle is still open, so the zip archive on disk is not yet finalized (the central directory is only written on close()). We just need to close it before making the call:
def register_lib(self, p):
    libpath = os.path.join(LIBPATH, p)
    zippath = libpath + '.zip'
    zf = zipfile.PyZipFile(zippath, mode='w')
    try:
        zf.debug = 3
        zf.writepy(libpath)
        zf.close()  # file is now ready to add to the spark context
        self.sc.addPyFile(zippath)
    finally:
        zf.close()
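As a small variation, here is a sketch of the same fix using a with block, assuming Python 2.7+ where zipfile.ZipFile (and thus PyZipFile) supports the context-manager protocol; the archive is then guaranteed to be closed and finalized before it is handed to the SparkContext:

def register_lib(self, p):
    libpath = os.path.join(LIBPATH, p)
    zippath = libpath + '.zip'
    # Leaving the with block closes and finalizes the archive,
    # so addPyFile always sees a complete zip file.
    with zipfile.PyZipFile(zippath, mode='w') as zf:
        zf.debug = 3
        zf.writepy(libpath)
    self.sc.addPyFile(zippath)

This avoids the try/finally bookkeeping entirely and makes it impossible to ship a half-written archive.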