我是SPARK概念的新手。 我已下载并安装了spark,并希望使用Spyder GUI开始学习。 我尝试运行以下名为" SpyderSetupForSpark.py"的代码片段。并得到一个巨大的错误列表。
# -*- coding: utf-8 -*-
"""
Make sure you give execute privileges
-----------------------------------------------------------------------------
Spark with Python: Setup Spyder IDE for Spark
Execute this script once when Spyder is started on Windows
-----------------------------------------------------------------------------
"""
import os
import sys
os.chdir("C:\Spark\spark-2.0.1-bin-hadoop2.7\python")
os.curdir
# Configure the environment. Set this up to the directory where
# Spark is installed
if 'SPARK_HOME' not in os.environ:
os.environ['SPARK_HOME'] = 'C:\Spark\spark-2.0.1-bin-hadoop2.7'
# Create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']
#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exist. The names might change
#as versions change.
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.3-src.zip"))
#Initiate Spark context. Once this is done all other applications can run
from pyspark import SparkContext
from pyspark import SparkConf
# Optionally configure Spark Settings
conf=SparkConf()
conf.set("spark.executor.memory", "1g")
conf.set("spark.cores.max", "2")
conf.setAppName("V2 Maestros")
## Initialize SparkContext. Run only once. Otherwise you get multiple
#Context Error.
sc = SparkContext('local', conf=conf)
#Test to make sure everything works.
lines=sc.textFile("auto-data.csv")
lines.count()
我收到以下错误列表:
Traceback (most recent call last):
File "<ipython-input-2-a17bcb5851dc>", line 2, in <module>
lines.count()
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 1008, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 999, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 873, in fold
vals = self.mapPartitions(func).collect()
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 776, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.SparkException: Python worker did not connect back in time
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:138)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:67)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:114)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:128)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
at java.net.PlainSocketImpl.accept(Unknown Source)
at java.net.ServerSocket.implAccept(Unknown Source)
at java.net.ServerSocket.accept(Unknown Source)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:133)
... 12 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker did not connect back in time
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:138)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:67)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:114)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:128)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
at java.net.PlainSocketImpl.accept(Unknown Source)
at java.net.ServerSocket.implAccept(Unknown Source)
at java.net.ServerSocket.accept(Unknown Source)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:133)
... 12 more
请帮忙。 此致
答案 0 :(得分:1)
最后我解决了这个问题。我更改了Python的目录设置,如下所示:
import os
import sys
os.chdir("C:\Python27")
os.curdir
它就像一个魅力!