Following the documentation, launch with:

PYSPARK_DRIVER_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook" ./bin/pyspark

then enter:
from os import path
from tempfile import gettempdir
#from pyspark import SparkFiles
filename = path.join(gettempdir(), 'somefile.txt')
with open(filename, 'w') as f:
    f.writelines(['foo\n' * 500])
#sc = SparkContext(appName="PythonSort")
sc.addFile(filename)
print 'sc.textFile(filename).count() =', sc.textFile(filename).count()
sc.stop()
Output: sc.textFile(filename).count() = 500
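Note that this works because the driver and executors share one machine, so the driver-local path is readable everywhere. On a real cluster, a file shipped with sc.addFile would be read on the executors via SparkFiles.get (the commented-out import above). A minimal sketch, assuming sc is the notebook's SparkContext and 'somefile.txt' was added as above:

from pyspark import SparkFiles

def count_lines(_):
    # Runs on an executor; opens that node's local copy of the file
    # shipped by sc.addFile.
    with open(SparkFiles.get('somefile.txt')) as f:
        return sum(1 for line in f)

print 'lines =', sc.parallelize([0]).map(count_lines).collect()  # -> [500]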
%pyspark
# Then run the same code as in the "IPython notebook" case above
Output:
(<class 'py4j.protocol.Py4JError'>, Py4JError(u'An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.MapPartitionsRDD, class [B, class java.util.HashMap, class java.util.ArrayList, class java.lang.Boolean, class java.lang.String, class java.lang.String, class java.util.ArrayList, class org.apache.spark.Accumulator]) does not exist
    at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:184)
    at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:202)
    at py4j.Gateway.invoke(Gateway.java:213)
    at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
    at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
    at py4j.GatewayConnection.run(GatewayConnection.java:207)
    at java.lang.Thread.run(Thread.java:745)
',), <traceback object at 0x7f3f79e24440>)
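The missing PythonRDD constructor likely points to a version mismatch: the %pyspark interpreter is loading pyspark/py4j libraries built for a different Spark release than the JVM it talks to, and the PythonRDD constructor signature changed between releases. A quick diagnostic sketch, assuming the interpreter still provides a working sc:

%pyspark
print 'JVM-side Spark version:', sc.version   # version of the Spark the JVM is running
import pyspark
print 'Python-side pyspark from:', pyspark.__file__   # which pyspark package was picked up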