Processing separate csv files in parallel with Apache Spark

Time: 2015-07-14 20:00:34

Tags: python csv apache-spark

I am trying to use Apache Spark to process separate csv files in a directory in parallel. Specifically, I want each slave node to add up all the numbers in the first column of each csv file and send the result back. Here is my code:

import os, sys, inspect, csv

### Current directory path.
curr_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]

### Setup the environment variables
spark_home_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../../spark-1.4.0")))
python_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
os.environ["SPARK_HOME"] = spark_home_dir
os.environ["PYTHONPATH"] = python_dir

### Setup pyspark directory path
pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
sys.path.append(pyspark_dir)

### Import pyspark
from pyspark import SparkConf, SparkContext

### Specify the directory that holds the data files
data_path = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "./test_dir")))

### myfunc adds up all the numbers in the first column of one csv file.
def myfunc(s):
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
    cr = csv.reader(open(s_new,"rb"))

    total = 0
    for row in cr:  
        total += int(row[0])
    return total

def main():
    ### Initialize the SparkConf and SparkContext
    conf = SparkConf().setAppName("ruofan").setMaster("local")
    sc = SparkContext(conf = conf)
    datafile = sc.wholeTextFiles(data_path)

    ### Send the application to each of the slave nodes
    temp = datafile.foreach(myfunc)

    ### Collect the result and print it out.
    for x in temp.sample(False, 1).collect(): 
            print x


if __name__ == "__main__":
    main()

However, when I run the code, it shows the following error:

AttributeError: 'tuple' object has no attribute 'startswith'

The full stack trace is as follows:

ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "sum.py", line 24, in myfunc
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
  File "/usr/lib/python2.7/posixpath.py", line 75, in join
    if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
15/07/14 16:52:15 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "sum.py", line 24, in myfunc
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
  File "/usr/lib/python2.7/posixpath.py", line 75, in join
    if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

15/07/14 16:52:15 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
15/07/14 16:52:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
15/07/14 16:52:15 INFO TaskSchedulerImpl: Cancelling stage 0
15/07/14 16:52:15 INFO DAGScheduler: ResultStage 0 (foreach at sum.py:40) failed in 0.408 s
15/07/14 16:52:15 INFO DAGScheduler: Job 0 failed: foreach at sum.py:40, took 0.458805 s
Traceback (most recent call last):
  File "sum.py", line 47, in <module>
    main()
  File "sum.py", line 40, in main
    temp = datafile.foreach(myfunc)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 721, in foreach
    self.mapPartitions(processPartition).count()  # Force evaluation
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 972, in count
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 963, in sum
    return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 771, in reduce
    vals = self.mapPartitions(func).collect()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 745, in collect
    port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/java_gateway.py", line 538, in __call__
    self.target_id, self.name)
  File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/protocol.py", line 300, in get_return_value
    format(target_id, '.', name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "sum.py", line 24, in myfunc
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
  File "/usr/lib/python2.7/posixpath.py", line 75, in join
    if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)

I have no idea why my code does not work. I would really appreciate it if anyone could help me fix the problem. Thanks!

1 Answer:

Answer 0 (score: 0)

The stack trace is quite clear: the error comes from this line in myfunc:

s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))

s is expected to be a string (the file path), but you are passing in a tuple from:

temp = datafile.foreach(myfunc)

where datafile comes from wholeTextFiles, which yields an RDD[(String, String)]. The first item of each tuple is the file path and the second is the file's content. So, from the look of your code, you only need to pass in the first part of the tuple (the path):

datafile.foreach(lambda (path, content): myfunc(path))
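
As a side note, wholeTextFiles already gives you each file's content as the second element of the tuple, so the workers do not need to reopen the files from a shared path at all. Below is a minimal sketch (with a hypothetical helper sum_first_column, and assuming each csv file fits comfortably in memory) that parses the content string directly and uses map instead of foreach, since foreach returns nothing that can be collected:

import csv
from StringIO import StringIO

### Hypothetical helper: sum the first column of one file's content string.
def sum_first_column(content):
    total = 0
    for row in csv.reader(StringIO(content)):
        if row:  # skip blank lines
            total += int(row[0])
    return total

### map keeps one (path, sum) pair per file; foreach would discard the results.
sums = datafile.map(lambda (path, content): (path, sum_first_column(content)))
for path, total in sums.collect():
    print path, total

This keeps the per-file summing on the workers while still letting the driver see which file produced which total.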