I wrote a simple Python script, sum.py, that loads the directory data from HDFS and sums all of the numbers in the first column of each CSV file in data. The code is shown below:
import os, sys, inspect, csv

### Current directory path.
curr_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]

### Setup the environment variables
spark_home_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../spark-1.4.0")))
python_dir = os.path.realpath(os.path.abspath(os.path.join(spark_home_dir, "./python")))
os.environ["SPARK_HOME"] = spark_home_dir
os.environ["PYTHONPATH"] = python_dir

### Setup pyspark directory path
#pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(spark_home_dir, "./python")))
sys.path.append(pyspark_dir)

### Import the pyspark
from pyspark import SparkConf, SparkContext

### myfunc adds up all the numbers in the first column.
def myfunc(s):
    total = 0
    if s.endswith(".csv"):
        #s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
        #cr = csv.reader(open(s_new,"rb"))
        cr = csv.reader(open(s,"rb"))
        for row in cr:
            total += int(row[0])
    print "The total number of ", s, " is: ", total
    return total

def main():
    ### Initialize the SparkConf and SparkContext
    conf = SparkConf().setAppName("ruofan").setMaster("local")
    sc = SparkContext(conf = conf)

    ### Load data from HDFS
    datafile = sc.wholeTextFiles("hdfs://localhost:9000/data")

    ### Send the function to each of the slave nodes
    temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))

if __name__ == "__main__":
    main()
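For clarity, the computation I expect myfunc to perform for each file is simply "sum the numbers in the first column". Here is a tiny standalone illustration of that expectation (the sample rows below are made up and are not the contents of my actual test1.csv / test2.csv):

import csv
from StringIO import StringIO

### Made-up sample data, just to show the kind of result I expect per file.
sample = StringIO("1,a\n2,b\n3,c\n")
total = sum(int(row[0]) for row in csv.reader(sample))
print "Expected kind of result: ", total   # prints 6 for this sample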
Before running my code, I put the directory data into my local HDFS. Then I typed the following command:
$ hdfs dfs -ls /data
Output:
Found 2 items
-rw-r--r-- 1 ying supergroup 6 2015-07-23 15:46 /data/test1.csv
-rw-r--r-- 1 ying supergroup 6 2015-07-23 15:46 /data/test2.csv
So the output shows that data is already in HDFS. I then ran my code with:
$ python sum.py
However, it shows the following error:
IOError: [Errno 2] No such file or directory:'hdfs://localhost:9000/data/test1.csv'
Here is the full traceback from my code:
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in <lambda>
temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 26, in myfunc
cr = csv.reader(open(s,"rb"))
IOError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/data/test1.csv'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/07/23 18:01:49 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in <lambda>
temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 26, in myfunc
cr = csv.reader(open(s,"rb"))
IOError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/data/test1.csv'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/07/23 18:01:49 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
15/07/23 18:01:49 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/07/23 18:01:49 INFO TaskSchedulerImpl: Cancelling stage 0
15/07/23 18:01:49 INFO DAGScheduler: ResultStage 0 (foreach at /home/ying/AWS_Tutorial/spark_codes/sum.py:41) failed in 0.392 s
15/07/23 18:01:49 INFO DAGScheduler: Job 0 failed: foreach at /home/ying/AWS_Tutorial/spark_codes/sum.py:41, took 0.438280 s
Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 45, in <module>
main()
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in main
temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 721, in foreach
self.mapPartitions(processPartition).count() # Force evaluation
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 972, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 963, in sum
return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 771, in reduce
vals = self.mapPartitions(func).collect()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 745, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/java_gateway.py", line 538, in __call__
self.target_id, self.name)
File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/protocol.py", line 300, in get_return_value
format(target_id, '.', name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in <lambda>
temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 26, in myfunc
cr = csv.reader(open(s,"rb"))
IOError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/data/test1.csv'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
I am quite sure that the namenode host and port, localhost:9000, are correct, but I cannot figure out what is wrong with my code. I would really appreciate it if anyone could help me fix the problem.
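For reference, one direction I was considering (just a rough, untested sketch; sum_first_column is a name I made up) is to parse the content string that wholeTextFiles already returns instead of reopening the path with open():

def sum_first_column(path, content):
    ### Untested sketch: sum the first column from the file content that
    ### wholeTextFiles already provides, instead of calling open() on the path.
    total = 0
    for row in csv.reader(content.splitlines()):
        if row:
            total += int(row[0])
    print "The total number of ", path, " is: ", total
    return total

### Inside main(), this would replace the foreach() call above; I used
### map() + collect() only so the per-file totals come back to the driver.
temp = datafile.map(lambda (path, content): sum_first_column(path, content)).collect()

Would something like that be the right way to handle HDFS paths here, or is there a way to make my original open()-based approach work?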