I am very new to Spark, and I am using it with Jupyter. I want to load the shakespeare.txt file and run the following code:
from __future__ import print_function
from operator import add
# Use the SparkContext sc here, see below.
lines = sc.textFile("C:\Users\rajarshi.bhadra\Desktop\Spark_Testing\Gutenberg\shakespeare.txt")
counts = lines.flatMap(lambda x: x.split(' ')) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(add)
output = counts.collect()
# Print the first 100 word counts as pairs
for (word, count) in output[:100]:
print("%s: %i" % (word, count))
When I run this, I get the following error. Any help with it would be greatly appreciated.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-114-41cbbaab4214> in <module>()
5 lines = sc.textFile("C:\Users\rajarshi.bhadra\Desktop\Spark_Testing\Gutenberg\shakespeare.txt")
6
----> 7 counts = lines.flatMap(lambda x: x.split(' ')) .map(lambda x: (x, 1)) .reduceByKey(add)
8
9 output = counts.collect()
C:\spark-1.3.1\python\pyspark\rdd.pyc in reduceByKey(self, func, numPartitions)
1480 [('a', 2), ('b', 1)]
1481 """
-> 1482 return self.combineByKey(lambda x: x, func, func, numPartitions)
1483
1484 def reduceByKeyLocally(self, func):
C:\spark-1.3.1\python\pyspark\rdd.pyc in combineByKey(self, createCombiner, mergeValue, mergeCombiners, numPartitions)
1691 """
1692 if numPartitions is None:
-> 1693 numPartitions = self._defaultReducePartitions()
1694
1695 serializer = self.ctx.serializer
C:\spark-1.3.1\python\pyspark\rdd.pyc in _defaultReducePartitions(self)
2074 return self.ctx.defaultParallelism
2075 else:
-> 2076 return self.getNumPartitions()
2077
2078 def lookup(self, key):
C:\spark-1.3.1\python\pyspark\rdd.pyc in getNumPartitions(self)
319 2
320 """
--> 321 return self._jrdd.partitions().size()
322
323 def filter(self, f):
C:\spark-1.3.1\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
C:\spark-1.3.1\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o311.partitions.
ajarshi.bhadra/Desktop/Spark_Testing/Gutenberg/shakespeare.txt
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:197)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:208)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:203)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:57)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:64)
at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:46)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Unknown Source)
Answer 0 (score: 0)
It worked when I placed the gutenberg folder as a subfolder inside spark-1.3.1, my SPARK_HOME. To be precise, it worked when I did
sc.textFile("C:\spark-1.3.1\Data_Work\Gutenberg\shakespeare.txt")
after creating the appropriate directory structure.
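A likely root cause, going by the traceback rather than anything confirmed in this answer: in a plain Python 2 string, the \r in C:\Users\rajarshi.bhadra\... is parsed as a carriage return, which would explain why the error message above shows the path starting at ajarshi.bhadra. The relocated path happens to contain no valid escape sequences, which may be why it works. If the file should stay in its original location, here is a minimal sketch of two alternatives, assuming the same file layout as in the question:

# Sketch, assuming the file is still in the original Users folder.
# Option 1: a raw string, so backslashes are not treated as escape sequences.
lines = sc.textFile(r"C:\Users\rajarshi.bhadra\Desktop\Spark_Testing\Gutenberg\shakespeare.txt")
# Option 2: forward slashes, which Hadoop path handling accepts on Windows too.
lines = sc.textFile("C:/Users/rajarshi.bhadra/Desktop/Spark_Testing/Gutenberg/shakespeare.txt")

Either form should let sc.textFile resolve the local file without moving it under SPARK_HOME.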