我在 Hortonworks HDP 2.5 的 Hadoop 集群上使用 pyspark 2.0。我尝试用以下代码读取 Parquet 文件:
dfsms = spark.read.parquet("/projects/data/parquetfolder")
我可以看到数据的列名(表头),也能打印出几行数据。但是,当我尝试执行类似下面的操作时:
dfsms.count()
dfsms.describe().show()
我收到以下错误:
java.io.IOException: can not read class org.apache.parquet.format.FileMetaData: Required field 'version' was not found in serialized data! Struct: FileMetaData(version:0, schema:null, num_rows:0, row_groups:null)
--------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-7-0717f4acccb0> in <module>()
1 #dfsms.show(10)
2
----> 3 dfsms.count()
/usr/hdp/2.5.0.0-1245/spark2/python/pyspark/sql/dataframe.py in count(self)
297 2
298 """
--> 299 return int(self._jdf.count())
300
301 @ignore_unicode_prefix
/usr/hdp/2.5.0.0-1245/spark2/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
931 answer = self.gateway_client.send_command(command)
932 return_value = get_return_value(
--> 933 answer, self.gateway_client, self.target_id, self.name)
934
935 for temp_arg in temp_args:
/usr/hdp/2.5.0.0-1245/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/hdp/2.5.0.0-1245/spark2/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
310 raise Py4JJavaError(
311 "An error occurred while calling {0}{1}{2}.\n".
--> 312 format(target_id, ".", name), value)
313 else:
314 raise Py4JError(
Py4JJavaError: An error occurred while calling o46.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 1.0 failed 4 times, most recent failure: Lost task 5.3 in stage 1.0 (TID 42, worker12.): java.io.IOException: can not read class org.apache.parquet.format.FileMetaData: Required field 'version' was not found in serialized data! Struct: FileMetaData(version:0, schema:null, num_rows:0, row_groups:null)