PySpark 1.5 Join -- AttributeError: 'function' object has no attribute '_get_object_id'

Asked: 2016-05-13 16:36:38

Tags: apache-spark pyspark

I recently hit a problem using PySpark 1.5 in the interactive shell (Jupyter) that I had not run into before. I join two Spark DataFrames and get the traceback below. I have tried switching between HiveContext and SQLContext, as well as two different ways of reading the Parquet columns as strings, but the problem is still not resolved.

AttributeError                            Traceback (most recent call last)
<ipython-input-36-068dbaf1acc4> in <module>()
      3 fm = fm.withColumn("date_time",fm.date_time.astype('Timestamp'))
      4 statement = [fm.tail==reas.tail,fm.date_time >=   reas.d_date,fm.date_time <= reas.a_date]
----> 5 aca =reas.join(fm,statement,'inner').select(reas.tail,reas.dwn,reas.up,reas.d_date,reas.a_date,reas.id,reas.reason,reas.dt,reas.technology,fm.date_time,fm.lon,fm.lat,fm.cov,fm.reg)                  

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/dataframe.py in select(self, *cols)
764         [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
765         """
--> 766         jdf = self._jdf.select(self._jcols(*cols))
767         return DataFrame(jdf, self.sql_ctx)
768 

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/dataframe.py in _jcols(self, *cols)
641         if len(cols) == 1 and isinstance(cols[0], list):
642             cols = cols[0]
--> 643         return self._jseq(cols, _to_java_column)
644 
645     @since("1.3.1")

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/dataframe.py in _jseq(self, cols, converter)
628     def _jseq(self, cols, converter=None):
629         """Return a JVM Seq of Columns from a list of Column or names"""
--> 630         return _to_seq(self.sql_ctx._sc, cols, converter)
631 
632     def _jmap(self, jm):

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in _to_seq(sc, cols, converter)
 58     """
 59     if converter:
---> 60         cols = [converter(c) for c in cols]
 61     return sc._jvm.PythonUtils.toSeq(cols)
 62 

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in <listcomp>(.0)
 58     """
 59     if converter:
---> 60         cols = [converter(c) for c in cols]
 61     return sc._jvm.PythonUtils.toSeq(cols)
 62 

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in _to_java_column(col)
 46         jcol = col._jc
 47     else:
---> 48         jcol = _create_column_from_name(col)
 49     return jcol
 50 

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/pyspark/sql/column.py in _create_column_from_name(name)
 39 def _create_column_from_name(name):
 40     sc = SparkContext._active_spark_context
---> 41     return sc._jvm.functions.col(name)
 42 
 43 

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
527 
528         args_command = ''.join(
--> 529                 [get_command_part(arg, self.pool) for arg in new_args])
530 
531         command = CALL_COMMAND_NAME +\

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in <listcomp>(.0)
527 
528         args_command = ''.join(
--> 529                 [get_command_part(arg, self.pool) for arg in new_args])
530 
531         command = CALL_COMMAND_NAME +\

/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_command_part(parameter, python_proxy_pool)
263             command_part += ';' + interface
264     else:
--> 265         command_part = REFERENCE_TYPE + parameter._get_object_id()
266 
267     command_part += '\n'

AttributeError: 'function' object has no attribute '_get_object_id'
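
The traceback itself points at the likely culprit: _to_java_column received a plain Python function rather than a Column, so py4j fell through to parameter._get_object_id(). Attribute access on a DataFrame only returns a column when the name does not shadow an existing DataFrame method, and cov is such a method (DataFrame.cov, added in Spark 1.4), so fm.cov resolves to the bound method instead of the cov column. A minimal sketch of the safer pattern, using bracket indexing and the names from the question:

# 'cov' collides with the DataFrame.cov() method, so attribute access
# yields the bound method instead of a Column:
fm.cov       # <bound method DataFrame.cov of ...> -- a function, not a Column
fm['cov']    # Column<cov> -- bracket indexing always returns the column

# The same select with unambiguous column references (join unchanged):
aca = (reas.join(fm, statement, 'inner')
           .select(reas['tail'], reas['dwn'], reas['up'], reas['d_date'],
                   reas['a_date'], reas['id'], reas['reason'], reas['dt'],
                   reas['technology'], fm['date_time'], fm['lon'],
                   fm['lat'], fm['cov'], fm['reg']))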

Here are my imports:

import datetime
import pytz
import os
import time
import sys
import string
from datetime import date, timedelta
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, unix_timestamp, round
from pyspark.sql.functions import when,lag
from pyspark.sql.functions import regexp_replace
from pyspark.sql import DataFrameWriter
from pyspark.sql import Window

sc.setLogLevel("WARN")
sqlContext = SQLContext(sc)
# sqlContext.setConf("spark.sql.parquet.binaryAsString","true")
sqlContext.sql("SET spark.sql.parquet.binaryAsString=true")

0 Answers:

No answers yet.