So I have the following PySpark code:
import csv
import json
import urllib.request
from io import StringIO


class networkcallmaker(object):
    def __init__(self, connectionparams):
        self.connectionparams = connectionparams
        self.headers = {"someheaders": 1}

    def sendRequest(self, req):
        url = make_url(self.connectionparams)
        # There is some logic in this function that also picks a
        # random host from a list of hostnames to deliver the request to.
        # I abstracted away all of that for the purpose
        # of this question under this function name alias.
        req = urllib.request.Request(url, req, self.headers)
        resp = urllib.request.urlopen(req)
        return resp.read().decode("utf-8")


def makeARequest(param1, param2):
    req = makeElaborateRequest(param1, param2)
    # connectionparams is defined elsewhere (not shown in this snippet)
    ncm = networkcallmaker(connectionparams)
    resp = ncm.sendRequest(req)
    respdict = json.loads(resp)
    return respdict["key1"]["key2"]["key3"]


def rddmaker(sc):
    raw_rdd = sc.textFile("hdfs:/hello/world/abc.txt")

    def rowsplitter(row):
        # Each input line is parsed as a single CSV record.
        rows = list(csv.reader(StringIO(row), delimiter=","))
        for row in rows:
            # Column 11 is sometimes empty; fill it via the REST service.
            if row[10].strip() == "":
                row[10] = makeARequest(row[2], row[3])
            return [row[0], row[1], row[10]]

    return raw_rdd.map(rowsplitter)
We have an in-house service that runs on 10 hosts. I load a huge CSV file from HDFS into an RDD. This CSV file is missing data in one of the target columns (say, column 11). For rows where that column is empty, I call the RESTful service with other fields from the row; the service does some computation and returns what the missing value should be. I plug that computed value into the empty slot and return the row.
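For context, the RDD returned by rddmaker is consumed on the driver roughly like the sketch below (simplified for this question; the real SparkSession setup and column names are omitted). The toDF() call is what kicks off the traceback further down.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

fixed_rdd = rddmaker(sc)
# Schema inference inside toDF()/createDataFrame() pulls the first record,
# so this is the point where the workers have to unpickle rowsplitter and
# everything it references.
df = fixed_rdd.toDF(["col_a", "col_b", "col_k"])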
Now the problem is that I am getting a pickle error:
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/sql/session.py", line 57, in toDF
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/sql/session.py", line 520, in createDataFram
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/sql/session.py", line 360, in _createFromRDD
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/sql/session.py", line 331, in _inferSchema
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/rdd.py", line 1361, in first
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/rdd.py", line 1343, in take
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/context.py", line 965, in runJob
File "/usr/spark/current/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call_
File "/usr/spark/current/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/spark/current/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_v
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.r
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 timeache.spark.api.python.PythonException: Traceback (most recent call last):
File "/disk/4/yarn/local/usercache/asdfadf/appcache/application_1533845182922_5376/container_e22
func, profiler, deserializer, serializer = read_command(pickleSer, infile)
File "/disk/4/yarn/local/usercache/asdfadf/appcache/application_1533845182922_5376/container_e22
command = serializer._read_with_length(file)
File "/disk/4/yarn/local/usercache/asdfasdf/appcache/application_1533845182922_5376/container_e22
return self.loads(obj)
File "/disk/4/yarn/local/usercache/asdfasd/appcache/application_1533845182922_5376/container_e22
return pickle.loads(obj)
TypeError: 'NoneType' object does not support item assignment
Can someone tell me what the problem might be?