I have a sample pandas DataFrame with 10 rows and 9 columns that I want to use in PySpark.
For some reason the PySpark method collect()
raises an error, and the error message is cryptic enough to be meaningless to me.
My question is: why can't I use this DataFrame in PySpark?
All details follow below.
Python 3.6.5, Java 10.0.2,
pip-installed PySpark, running on a single machine.
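For completeness, here is a quick sketch to confirm the versions stated above from within Python (standard library only; it assumes the `java` on PATH is the one PySpark launches):

import platform
import subprocess

# Version of the current Python interpreter.
print(platform.python_version())  # expected: 3.6.5

# Java version on PATH; note that `java -version` prints to stderr.
subprocess.run(["java", "-version"])  # expected: java 10.0.2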
import pandas as pd
pd.__version__
#'0.23.4'
pf = pd.DataFrame.from_dict({'id1': {0: 'id046', 1: 'id041', 2: 'id036', 3: 'id067', 4: 'id047', 5: 'id003', 6: 'id052', 7: 'id024', 8: 'id051', 9: 'id048'}, 'id2': {0: 'id018', 1: 'id047', 2: 'id039', 3: 'id100', 4: 'id086', 5: 'id002', 6: 'id018', 7: 'id062', 8: 'id043', 9: 'id051'}, 'id3': {0: 'id0000006976', 1: 'id0000009477', 2: 'id0000003195', 3: 'id0000001035', 4: 'id0000005874', 5: 'id0000001900', 6: 'id0000002599', 7: 'id0000005962', 8: 'id0000003913', 9: 'id0000001059'}, 'id4': {0: 97, 1: 89, 2: 23, 3: 44, 4: 79, 5: 10, 6: 42, 7: 34, 8: 51, 9: 60}, 'id5': {0: 67, 1: 75, 2: 33, 3: 31, 4: 75, 5: 34, 6: 73, 7: 57, 8: 58, 9: 49}, 'id6': {0: 7103, 1: 699, 2: 7180, 3: 3189, 4: 5530, 5: 4013, 6: 4912, 7: 1677, 8: 9892, 9: 5903}, 'v1': {0: 3, 1: 4, 2: 1, 3: 3, 4: 5, 5: 3, 6: 5, 7: 5, 8: 3, 9: 1}, 'v2': {0: 5, 1: 4, 2: 2, 3: 2, 4: 4, 5: 1, 6: 1, 7: 5, 8: 5, 9: 4}, 'v3': {0: 2.4358, 1: 15.3333, 2: 82.1465, 3: 32.4652, 4: 2.4358, 5: 73.0214, 6: 13.958, 7: 30.7107, 8: 5.1784, 9: 63.2878}})
import pyspark as ps
ps.__version__
#'2.3.1'
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("appname").getOrCreate()
sf = spark.createDataFrame(pf)  # converting the pandas frame succeeds
sf.collect()                    # this call raises the error below
Error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/jan/git/my-project/spark/py-spark/lib/python3.6/site-packages/py
spark/sql/dataframe.py", line 466, in collect
sock_info = self._jdf.collectToPython()
spark/sql/dataframe.py", line 466, in collect [63/4510] sock_info = self._jdf.collectToPython()
File "/home/jan/git/my-project/spark/py-spark/lib/python3.6/site-packages/py
4j/java_gateway.py", line 1257, in __call__ answer, self.gateway_client, self.target_id, self.name)
File "/home/jan/git/my-project/spark/py-spark/lib/python3.6/site-packages/py
spark/sql/utils.py", line 63, in deco
return f(*a, **kw) File "/home/jan/git/my-project/spark/py-spark/lib/python3.6/site-packages/py
4j/protocol.py", line 328, in get_return_value format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o39.collectToPython.
: java.lang.IllegalArgumentException
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.s
cala:46)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodI
nsn$2.apply(ClosureCleaner.scala:449)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodI
nsn$2.apply(ClosureCleaner.scala:432)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$vis[42/4510]nsn$2.apply(ClosureCleaner.scala:432)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(
TraversableLike.scala:733) at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(Has
hMap.scala:103)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(Has
hMap.scala:103) at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala
:230) at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:103) at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.s
cala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(Closu
reCleaner.scala:432)
at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$C
losureCleaner$$clean$14.apply(ClosureCleaner.scala:262)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$$
losureCleaner$$clean$14.apply(ClosureCleaner.scala:262) [21/4510] at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$C
losureCleaner$$clean$14.apply(ClosureCleaner.scala:261)
at scala.collection.immutable.List.foreach(List.scala:381) at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCl
eaner$$clean(ClosureCleaner.scala:261)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2299) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2073)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099) at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.s
cala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.sca
la:297)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset
.scala:3195)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset
.scala:3192)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply[0/4510].scala:3192)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3192)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:844)
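To isolate whether the pandas conversion plays any role, the same job can be run on a DataFrame built directly in Spark. A minimal sketch, reusing the `spark` session from above; if this also raises java.lang.IllegalArgumentException, the problem lies in the Spark/Java setup rather than in the pandas DataFrame (for reference, the Spark 2.3.x documentation lists Java 8 as the supported Java version):

# Tiny DataFrame built directly in Spark, no pandas involved.
sf2 = spark.createDataFrame([(1, "a"), (2, "b")], ["n", "s"])
print(sf2.collect())

# Which Java version is the Spark JVM actually running? This goes
# through py4j via the private _gateway attribute, so treat it as a
# debugging aid, not a stable API.
print(spark.sparkContext._gateway.jvm.java.lang.System.getProperty("java.version"))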