This is our code:
from __future__ import print_function
import sys
from pyspark import SparkContext
if __name__ == "__main__":
sc = SparkContext(appName="m")
f = sc.textFile('/home/hduser/Downloads/ml-1m/b.csv')
genres = ["Action","Adventure","Animation", "Children's","Comedy","Crime Documentary","Drama"," Fantasy","Film- Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]
g = f.map(lambda x:x.split(','))
r = {}
for genre in genres:
a = g.filter(lambda x: genre in x[1])
for i in range(6041):
u = a.filter(lambda x: int(x[2]) == i)
x = u.map(lambda x:int(x[3])).sum()
if u.count() == 0:
r[i] = 0
else:
r[i] = float(x)/u.count()
print(x)
sc.stop()
调用sum()函数时显示错误:
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 15.0 failed 4 times, most recent failure: Lost task 1.3 in stage 15.0 (TID 53, 10.70.3.170, executor 1): java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.List$SerializationProxy to field org.apache.spark.rdd.RDD.org$apache$spark$rdd$RDD$$dependencies_ of type scala.collection.Seq in instance of org.apache.spark.rdd.MapPartitionsRDD
at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2127)
at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1305)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2247)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:420)
at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
at sun.reflect.GeneratedMethodAccessor3.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2132)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:420)
at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
at sun.reflect.GeneratedMethodAccessor3.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2132)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:420)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:80)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.List$SerializationProxy to field org.apache.spark.rdd.RDD.org$apache$spark$rdd$RDD$$dependencies_ of type scala.collection.Seq in instance of org.apache.spark.rdd.MapPartitionsRDD
at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2127)
at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1305)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2247)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:420)
at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
at sun.reflect.GeneratedMethodAccessor3.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2132)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:420)
at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:479)
at sun.reflect.GeneratedMethodAccessor3.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2132)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2241)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2165)
at 17/07/10 11:51:01 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 49, 10.70.3.232, executor 0): TaskKilled (killed intentionally)
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2023)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1533)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:420)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:80)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)