我最近在交互式 shell(Jupyter)中使用 PySpark 1.5 时碰到了一个之前没有遇到过的问题:对两个 Spark DataFrame 做 join 之后,收到了下面这个回溯。我尝试过把 SQLContext 换成 HiveContext,也试过两种将 Parquet 列按字符串读取的方法,但问题仍然没有解决。请问该如何排查?
AttributeError Traceback (most recent call last)
<ipython-input-36-068dbaf1acc4> in <module>()
3 fm = fm.withColumn("date_time",fm.date_time.astype('Timestamp'))
4 statement = [fm.tail==reas.tail,fm.date_time >= reas.d_date,fm.date_time <= reas.a_date]
----> 5 aca =reas.join(fm,statement,'inner').select(reas.tail,reas.dwn,reas.up,reas.d_date,reas.a_date,reas.id,reas.reason,reas.dt,reas.technology,fm.date_time,fm.lon,fm.lat,fm.cov,fm.reg)
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/dataframe.py in select(self, *cols)
764 [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
765 """
--> 766 jdf = self._jdf.select(self._jcols(*cols))
767 return DataFrame(jdf, self.sql_ctx)
768
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/dataframe.py in _jcols(self, *cols)
641 if len(cols) == 1 and isinstance(cols[0], list):
642 cols = cols[0]
--> 643 return self._jseq(cols, _to_java_column)
644
645 @since("1.3.1")
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/dataframe.py in _jseq(self, cols, converter)
628 def _jseq(self, cols, converter=None):
629 """Return a JVM Seq of Columns from a list of Column or names"""
--> 630 return _to_seq(self.sql_ctx._sc, cols, converter)
631
632 def _jmap(self, jm):
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in _to_seq(sc, cols, converter)
58 """
59 if converter:
---> 60 cols = [converter(c) for c in cols]
61 return sc._jvm.PythonUtils.toSeq(cols)
62
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in <listcomp>(.0)
58 """
59 if converter:
---> 60 cols = [converter(c) for c in cols]
61 return sc._jvm.PythonUtils.toSeq(cols)
62
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in _to_java_column(col)
46 jcol = col._jc
47 else:
---> 48 jcol = _create_column_from_name(col)
49 return jcol
50
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in _create_column_from_name(name)
39 def _create_column_from_name(name):
40 sc = SparkContext._active_spark_context
---> 41 return sc._jvm.functions.col(name)
42
43
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
527
528 args_command = ''.join(
--> 529 [get_command_part(arg, self.pool) for arg in new_args])
530
531 command = CALL_COMMAND_NAME +\
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in <listcomp>(.0)
527
528 args_command = ''.join(
--> 529 [get_command_part(arg, self.pool) for arg in new_args])
530
531 command = CALL_COMMAND_NAME +\
/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_command_part(parameter, python_proxy_pool)
263 command_part += ';' + interface
264 else:
--> 265 command_part = REFERENCE_TYPE + parameter._get_object_id()
266
267 command_part += '\n'
AttributeError: 'function' object has no attribute '_get_object_id'
以下是我使用的导入语句和初始化代码:
# --- Interactive-shell setup: imports and Spark SQL context configuration ---
import datetime
import pytz
import os
import time
import sys
import string
from datetime import date, timedelta
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# NOTE(review): `round` here shadows the Python builtin `round` for the rest
# of the session — passing the wrong callable where a Column is expected is
# one way to trigger "'function' object has no attribute '_get_object_id'".
from pyspark.sql.functions import col, unix_timestamp, round
from pyspark.sql.functions import when, lag
from pyspark.sql.functions import regexp_replace
from pyspark.sql import DataFrameWriter
from pyspark.sql import Window
# `sc` is not defined in this snippet — presumably the SparkContext provided
# by the Jupyter/PySpark shell; verify it exists before running.
sc.setLogLevel("WARN")
sqlContext = SQLContext(sc)
# Read binary Parquet columns as strings; set via SQL instead of setConf.
# sqlContext.setConf("spark.sql.parquet.binaryAsString","true")
sqlContext.sql("SET spark.sql.parquet.binaryAsString=true")