Why can't I load parquet files into a Spark DataFrame?

Asked: 2017-04-06 03:19:09

Tags: amazon-s3 pyspark spark-dataframe

df = spark.read.load("s3://s3-cdp-prod-hive/novaya/impliciy_score_matrix/")

I am trying to load parquet files from S3, but it gives the following error:

Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-486331500042535422.py", line 344, in <module>
    raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-486331500042535422.py", line 337, in <module>
    exec(code)
  File "<stdin>", line 4, in <module>
  File "/usr/lib/spark/python/pyspark/sql/readwriter.py", line 149, in load
    return self._df(self._jreader.load(path))
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o73.load.
: com.amazon.ws.emr.hadoop.fs.consistency.exception.ConsistencyException: 3 items inconsistent (no s3 object for associated metadata item). First object: /s3-cdp-prod-hive/novaya/impliciy_score_matrix/.hive-staging_hive_2017-03-30_15-12-27_966_3482598128600403763-456
    at com.amazon.ws.emr.hadoop.fs.consistency.ConsistencyCheckerS3FileSystem.listStatus(ConsistencyCheckerS3FileSystem.java:727)
    at com.amazon.ws.emr.hadoop.fs.consistency.ConsistencyCheckerS3FileSystem.listStatus(ConsistencyCheckerS3FileSystem.java:497)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:191)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
    at com.sun.proxy.$Proxy43.listStatus(Unknown Source)
    at com.amazon.ws.emr.hadoop.fs.s3n2.S3NativeFileSystem2.listStatus(S3NativeFileSystem2.java:192)
    at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.listStatus(EmrFileSystem.java:339)
    at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex$.org$apache$spark$sql$execution$datasources$PartitioningAwareFileIndex$$listLeafFiles(PartitioningAwareFileIndex.scala:392)
    at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex$$anonfun$org$apache$spark$sql$execution$datasources$PartitioningAwareFileIndex$$bulkListLeafFiles$1.apply(PartitioningAwareFileIndex.scala:302)
    at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex$$anonfun$org$apache$spark$sql$execution$datasources$PartitioningAwareFileIndex$$bulkListLeafFiles$1.apply(PartitioningAwareFileIndex.scala:301)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
    at scala.collection.AbstractTraversable.map(Traversable.scala:104)
    at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex$.org$apache$spark$sql$execution$datasources$PartitioningAwareFileIndex$$bulkListLeafFiles(PartitioningAwareFileIndex.scala:301)
    at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.listLeafFiles(PartitioningAwareFileIndex.scala:253)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:74)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:50)
    at org.apache.spark.sql.execution.datasources.DataSource.tempFileIndex$lzycompute$1(DataSource.scala:133)
    at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$tempFileIndex$1(DataSource.scala:124)
    at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$getOrInferFileFormatSchema(DataSource.scala:138)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:387)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:135)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:280)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:214)
    at java.lang.Thread.run(Thread.java:745)
ERROR   
Took 8 min 39 sec. Last updated by yanan.chen at April 06 2017, 10:57:48 AM. (outdated)


%pyspark
df.show()
 Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-1196174262473053317.py", line 344, in <module>
    raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-1196174262473053317.py", line 337, in <module>
    exec(code)
  File "<stdin>", line 1, in <module>
NameError: name 'df' is not defined
ERROR   
Took 9 min 28 sec. Last updated by taoyang1 at April 06 2017, 6:55:45 AM.


%pyspark
als = ALS(maxIter=20, rank=10, regParam=0.01, userCol="member_srl", itemCol="productid", ratingCol="rating", seed=123)
#(training, test) = df.randomSplit([0.8, 0.2])
#model = als.fit(training)
model = als.fit(df)
FINISHED   
Took 2 min 12 sec. Last updated by yanan.chen at March 30 2017, 1:39:34 PM.


%pyspark
pred = model.transform(df)
FINISHED   
Took 0 sec. Last updated by yanan.chen at March 30 2017, 1:40:18 PM.


%pyspark
pred.count()
 Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-1727164243295023279.py", line 344, in <module>
    raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
  File "/tmp/zeppelin_pyspark-1727164243295023279.py", line 337, in <module>
    exec(code)
  File "<stdin>", line 1, in <module>
  File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 380, in count
    return int(self._jdf.count())
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o416.count.
: org.apache.spark.SparkException: Job 19 cancelled because SparkContext was shut down
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:808)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:806)
    at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
    at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:806)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1668)
    at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
    at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1587)
    at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1826)
    at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1283)
    at org.apache.spark.SparkContext.stop(SparkContext.scala:1825)
    at org.apache.spark.SparkContext$$anon$3.run(SparkContext.scala:1770)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:275)
    at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2371)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
    at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2765)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2370)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2377)
    at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2405)
    at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2404)
    at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2778)
    at org.apache.spark.sql.Dataset.count(Dataset.scala:2404)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:280)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:214)
    at java.lang.Thread.run(Thread.java:745)

I have successfully loaded another table stored as parquet, which is not as large as this one. I suspect the load failure is due to the size of the table, which has 200 million rows. Does anyone know what is wrong here and how to fix it?

1 Answer:

Answer 0 (score: 0):

You can try increasing the number of executors and the executor memory. For example, you can launch the shell like this:

pyspark --num-executors 4 --executor-memory 4g
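
If you build the SparkSession yourself (for example in a standalone script) rather than launching the pyspark shell, roughly the same settings can be passed through the builder. This is only a minimal sketch, not part of the original answer: the app name is hypothetical, and the values 4 and 4g are illustrative and should be tuned to your cluster.

from pyspark.sql import SparkSession

# Sketch: request more executors and more memory per executor before reading
# the large parquet table. The numbers below are illustrative, not a recommendation.
spark = (SparkSession.builder
         .appName("load-implicit-score-matrix")      # hypothetical app name
         .config("spark.executor.instances", "4")
         .config("spark.executor.memory", "4g")
         .getOrCreate())

# Same read as in the question; read.parquet() is equivalent to read.load()
# here because parquet is Spark's default data source format.
df = spark.read.parquet("s3://s3-cdp-prod-hive/novaya/impliciy_score_matrix/")
df.show()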