Apache-Spark从HDFS加载文件

时间:2015-07-23 22:11:06

标签: python apache-spark localhost hdfs

我编写了一个简单的python代码sum.py来从HDFS加载目录data,并为目录data中的每个csv文件添加所有第一列编号。代码如下所示:

import os, sys, inspect, csv

### Current directory path.
curr_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]

### Setup the environment variables
spark_home_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../spark-1.4.0")))
python_dir = os.path.realpath(os.path.abspath(os.path.join(spark_home_dir, "./python")))
os.environ["SPARK_HOME"] = spark_home_dir
os.environ["PYTHONPATH"] = python_dir

### Setup pyspark directory path
#pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(spark_home_dir, "./python")))
sys.path.append(pyspark_dir)

### Import the pyspark
from pyspark import SparkConf, SparkContext

### myfunc is to add all numbers in the first column.
def myfunc(s):
  total = 0
  if s.endswith(".csv"):
    #s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
    #cr = csv.reader(open(s_new,"rb"))
    cr = csv.reader(open(s,"rb"))
    for row in cr:
      total += int(row[0])
  print "The total number of ", s, " is: ", total
  return total

def main():
  ### Initialize the SparkConf and SparkContext
  conf = SparkConf().setAppName("ruofan").setMaster("local")
  sc = SparkContext(conf = conf)

  ### Load data from HDFS
  datafile = sc.wholeTextFiles("hdfs://localhost:9000/data")    

  ### Sent the application in each of the slave node
  temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))


if __name__ == "__main__":
  main()

在运行我的代码之前,我将目录data放入我的本地HDFS。然后我输入以下命令:

$ hdfs dfs -ls /data

输出:

Found 2 items
-rw-r--r--   1 ying supergroup          6 2015-07-23 15:46 /data/test1.csv
-rw-r--r--   1 ying supergroup          6 2015-07-23 15:46 /data/test2.csv

因此输出显示data已经在HDFS中。然后我输入sum.py来运行我的代码$ python sum.py。但是,它显示错误:

IOError: [Errno 2] No such file or directory:'hdfs://localhost:9000/data/test1.csv'

以下是我的代码的回溯:

File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in <lambda>
    temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 26, in myfunc
    cr = csv.reader(open(s,"rb"))
IOError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/data/test1.csv'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
15/07/23 18:01:49 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in <lambda>
    temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 26, in myfunc
    cr = csv.reader(open(s,"rb"))
IOError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/data/test1.csv'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

15/07/23 18:01:49 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
15/07/23 18:01:49 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
15/07/23 18:01:49 INFO TaskSchedulerImpl: Cancelling stage 0
15/07/23 18:01:49 INFO DAGScheduler: ResultStage 0 (foreach at /home/ying/AWS_Tutorial/spark_codes/sum.py:41) failed in 0.392 s
15/07/23 18:01:49 INFO DAGScheduler: Job 0 failed: foreach at /home/ying/AWS_Tutorial/spark_codes/sum.py:41, took 0.438280 s
Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 45, in <module>
    main()
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in main
    temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 721, in foreach
    self.mapPartitions(processPartition).count()  # Force evaluation
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 972, in count
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 963, in sum
    return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 771, in reduce
    vals = self.mapPartitions(func).collect()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 745, in collect
    port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/java_gateway.py", line 538, in __call__
    self.target_id, self.name)
  File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/protocol.py", line 300, in get_return_value
    format(target_id, '.', name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 41, in <lambda>
    temp = datafile.foreach(lambda (path, content): myfunc(str(path).strip('file:')))
  File "/home/ying/AWS_Tutorial/spark_codes/sum.py", line 26, in myfunc
    cr = csv.reader(open(s,"rb"))
IOError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/data/test1.csv'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)

我非常确定端口的主名称是localhost:9000,我不知道代码的错误。如果有人帮助我解决问题,我真的很感激。

0 个答案:

没有答案