I have a DataFrame in PySpark (call it df1), and one of its columns is of type DenseVector. This is the schema of the DataFrame:
DataFrame[prediction: double, probability: vector, label: double]
I tried converting it to an RDD using the df1.rdd method and then running count() on it, as sketched below.
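The failing call was essentially just:
df1.rdd.count()
It produced the following error message: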
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/spark/python/pyspark/rdd.py", line 1006, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/lib/spark/python/pyspark/rdd.py", line 997, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "/usr/lib/spark/python/pyspark/rdd.py", line 871, in fold
vals = self.mapPartitions(func).collect()
File "/usr/lib/spark/python/pyspark/rdd.py", line 773, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 36, in deco
return f(*a, **kw)
File "/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 243.0 failed 4 times, most recent failure: Lost task 1.3 in stage 243.0 (TID 62500, anp-r02wn04.c03.hadoop.td.com): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/lib/spark/python/pyspark/worker.py", line 111, in main
process()
File "/usr/lib/spark/python/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/lib/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
return func(split, prev_func(split, iterator))
File "/usr/lib/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
return func(split, prev_func(split, iterator))
File "/usr/lib/spark/python/pyspark/rdd.py", line 317, in func
return f(iterator)
File "/usr/lib/spark/python/pyspark/rdd.py", line 1006, in <lambda>
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/lib/spark/python/pyspark/rdd.py", line 1006, in <genexpr>
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/lib/spark/python/pyspark/serializers.py", line 139, in load_stream
yield self._read_with_length(stream)
File "/usr/lib/spark/python/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/usr/lib/spark/python/pyspark/serializers.py", line 422, in loads
return pickle.loads(obj)
File "/usr/lib/spark/python/pyspark/sql/types.py", line 728, in _parse_datatype_json_string
return _parse_datatype_json_value(json.loads(json_string))
File "/usr/lib/spark/python/pyspark/sql/types.py", line 748, in _parse_datatype_json_value
return _all_complex_types[tpe].fromJson(json_value)
File "/usr/lib/spark/python/pyspark/sql/types.py", line 525, in fromJson
return StructType([StructField.fromJson(f) for f in json["fields"]])
File "/usr/lib/spark/python/pyspark/sql/types.py", line 425, in fromJson
_parse_datatype_json_value(json["type"]),
File "/usr/lib/spark/python/pyspark/sql/types.py", line 750, in _parse_datatype_json_value
return UserDefinedType.fromJson(json_value)
File "/usr/lib/spark/python/pyspark/sql/types.py", line 663, in fromJson
m = __import__(pyModule, globals(), locals(), [pyClass])
File "/usr/lib/spark/python/pyspark/mllib/__init__.py", line 25, in <module>
import numpy
ImportError: ('No module named numpy', <function _parse_datatype_json_string at 0x1f09d70>, (u'{"type":"struct","fields": [{"name":"prediction","type":"double","nullable":true,"metadata":{}}, {"name":"probability","type":{"type":"udt","class":"org.apache.spark.mllib.linalg.VectorUDT","pyClass":"pyspark.mllib.linalg.VectorUDT","sqlType":{"type":"struct","fields": [{"name":"type","type":"byte","nullable":false,"metadata":{}}, {"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}, {"name":"label","type":"double","nullable":true,"metadata":{"ml_attr":{"vals":["0","1"],"type":"nominal","name":"label"}}}]}',))
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1294)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1282)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1281)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1281)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1507)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1469)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1850)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1921)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:905)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.collect(RDD.scala:904)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:373)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
What is going wrong here?
Answer 0 (score: 4)
Your stack trace says:
File "/usr/lib/spark/python/pyspark/mllib/__init__.py", line 25, in <module>
import numpy
ImportError: ('No module named numpy', ...
It seems that numpy is a dependency of mllib. Make sure numpy is installed, and since your code gets serialized and shipped over the wire, make sure the worker nodes can also access the numpy library.
The easiest way to get numpy is to install Anaconda.
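A quick way to check whether the executors can actually see numpy is to force the import inside a task (a hedged sketch; the partition count and the distinct() are only there so that more than one worker gets touched):
sc.parallelize(range(4), 4).map(lambda _: __import__('numpy').__version__).distinct().collect()
If any worker is missing numpy, this fails with the same ImportError; otherwise it returns the numpy version(s) installed on the workers.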
Answer 1 (score: 0)
I'll add another answer here, unrelated to the error itself, because I've lost track of how many times I've googled this.
Simply put, the easiest approach is to create a udf that extracts the first (or nth) element of the DenseVector.
A simple example (run in the pyspark shell):
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors

# `sc` (the SparkContext) is already available in the pyspark shell
FeatureRow = Row('id', 'features')
data = sc.parallelize([(0, Vectors.dense([9.7, 1.0, -3.2])),
                       (1, Vectors.dense([2.25, -11.1, 123.2])),
                       (2, Vectors.dense([-7.2, 1.0, -3.2]))])
df = data.map(lambda r: FeatureRow(*r)).toDF()

# udf that pulls the element at index 1 out of the vector and returns it as a double
vector_udf = udf(lambda vector: float(vector[1]), DoubleType())

# add a new column holding that element; .first() just materializes one resulting row
df.withColumn('feature_sums', vector_udf(df.features)).first()
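Applied to the DataFrame from the question, the same pattern would pull a single entry out of the probability vector column (a sketch only: the output column name p1 is illustrative, and index 1 is the probability of the second class, assuming the binary labels shown in the schema):
prob_udf = udf(lambda v: float(v[1]), DoubleType())
df1.withColumn('p1', prob_udf(df1.probability)).select('prediction', 'p1', 'label').show()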