I am trying to use Apache Spark to process the separate CSV files in a directory in parallel. Specifically, I want each worker node to add up all the numbers in the first column of each CSV file and send the result back. Here is my code:
import os, sys, inspect, csv
### Current directory path.
curr_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]
### Setup the environment variables
spark_home_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../../spark-1.4.0")))
python_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
os.environ["SPARK_HOME"] = spark_home_dir
os.environ["PYTHONPATH"] = python_dir
### Setup pyspark directory path
pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
sys.path.append(pyspark_dir)
### Import the pyspark
from pyspark import SparkConf, SparkContext
### Specify the data file directory, and load the data files
data_path = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "./test_dir")))
### myfunc is to add all numbers in the first column.
def myfunc(s):
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
    cr = csv.reader(open(s_new, "rb"))
    total = 0
    for row in cr:
        total += int(row[0])
    return total

def main():
    ### Initialize the SparkConf and SparkContext
    conf = SparkConf().setAppName("ruofan").setMaster("local")
    sc = SparkContext(conf = conf)
    datafile = sc.wholeTextFiles(data_path)
    ### Send the application to each of the slave nodes
    temp = datafile.foreach(myfunc)
    ### Collect the result and print it out.
    for x in temp.sample(False, 1).collect():
        print x

if __name__ == "__main__":
    main()
However, when I run the code, it shows the following error:
AttributeError: 'tuple' object has no attribute 'startswith'
The stack trace is as follows:
ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "sum.py", line 24, in myfunc
s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
File "/usr/lib/python2.7/posixpath.py", line 75, in join
if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/07/14 16:52:15 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "sum.py", line 24, in myfunc
s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
File "/usr/lib/python2.7/posixpath.py", line 75, in join
if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/07/14 16:52:15 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
15/07/14 16:52:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/07/14 16:52:15 INFO TaskSchedulerImpl: Cancelling stage 0
15/07/14 16:52:15 INFO DAGScheduler: ResultStage 0 (foreach at sum.py:40) failed in 0.408 s
15/07/14 16:52:15 INFO DAGScheduler: Job 0 failed: foreach at sum.py:40, took 0.458805 s
Traceback (most recent call last):
File "sum.py", line 47, in <module>
main()
File "sum.py", line 40, in main
temp = datafile.foreach(myfunc)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 721, in foreach
self.mapPartitions(processPartition).count() # Force evaluation
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 972, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 963, in sum
return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 771, in reduce
vals = self.mapPartitions(func).collect()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 745, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/java_gateway.py", line 538, in __call__
self.target_id, self.name)
File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/protocol.py", line 300, in get_return_value
format(target_id, '.', name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "sum.py", line 24, in myfunc
s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
File "/usr/lib/python2.7/posixpath.py", line 75, in join
if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
I don't know why my code doesn't work. I would really appreciate it if anyone could help me fix the problem. Thanks!
Answer 0 (score: 0)
The stack trace is quite clear: the error comes from this line in myfunc:

s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))

Here s should be a string (a file path), but you are passing it a tuple via

temp = datafile.foreach(myfunc)

because datafile comes from wholeTextFiles, which produces an RDD[(String, String)]. The first item of each tuple is the file path and the second is the file's content. So, by the look of your code, you need to pass only the first part of the tuple (the path) into myfunc:
datafile.foreach(lambda (path, content): myfunc(path))
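As a minimal sketch of where that leads (not part of the original answer): since wholeTextFiles already delivers each file's content to the workers as a string, you could sum the first column straight from that content with map instead of foreach (foreach returns nothing, so the later temp.sample(...).collect() call would fail as well). The helper name sum_first_column and the data_path value are placeholders for illustration, and the sketch assumes the CSV files have no header row and no quoted commas:

from pyspark import SparkConf, SparkContext

### Placeholder: same directory of csv files as in the question.
data_path = "./test_dir"

def sum_first_column(record):
    ### record is a (path, content) tuple from wholeTextFiles, so the
    ### whole file is already in memory; no need to reopen it on the worker.
    path, content = record
    total = 0
    for line in content.splitlines():
        if line.strip():                      # skip blank lines
            total += int(line.split(",")[0])  # first csv column
    return (path, total)

def main():
    conf = SparkConf().setAppName("ruofan").setMaster("local")
    sc = SparkContext(conf = conf)
    datafile = sc.wholeTextFiles(data_path)
    ### map (not foreach) so each per-file sum comes back to the driver.
    for path, total in datafile.map(sum_first_column).collect():
        print path, total

if __name__ == "__main__":
    main()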