我正在尝试在select语句中进行简单的计算,如下所示:
for d in dataframes:
d = d.select(
'request_timestamp',
'shard_id',
'account_id',
repeat(lit('1'), (13 - length('account_id').cast(IntegerType()))).alias('custom'))
d.show()
`repeat` 函数返回以下错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-07f1c7fd01f2> in <module>()
56 'account_id',
57 # length('account_id').alias('len'))
---> 58 repeat(lit('1'), length('account_id').cast(IntegerType())).alias('padding'))
59 d.show()
/databricks/spark/python/pyspark/sql/functions.py in repeat(col, n)
1419 """
1420 sc = SparkContext._active_spark_context
-> 1421 return Column(sc._jvm.functions.repeat(_to_java_column(col), n))
1422
1423
/databricks/spark/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
1122
1123 def __call__(self, *args):
-> 1124 args_command, temp_args = self._build_args(*args)
1125
1126 command = proto.CALL_COMMAND_NAME +\
/databricks/spark/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _build_args(self, *args)
1086 def _build_args(self, *args):
1087 if self.converters is not None and len(self.converters) > 0:
-> 1088 (new_args, temp_args) = self._get_args(args)
1089 else:
1090 new_args = args
/databricks/spark/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _get_args(self, args)
1073 for converter in self.gateway_client.converters:
1074 if converter.can_convert(arg):
-> 1075 temp_arg = converter.convert(arg, self.gateway_client)
1076 temp_args.append(temp_arg)
1077 new_args.append(temp_arg)
/databricks/spark/python/lib/py4j-0.10.3-src.zip/py4j/java_collections.py in convert(self, object, gateway_client)
510 HashMap = JavaClass("java.util.HashMap", gateway_client)
511 java_map = HashMap()
--> 512 for key in object.keys():
513 java_map[key] = object[key]
514 return java_map
TypeError: 'Column' object is not callable
我知道这可以通过 `udf` 轻松完成,但我想了解:为什么即使我已经把它转换(cast)为 `Integer` 类型,这种写法仍然无法工作。
答案 0 :(得分:1)
Spark 的 `repeat` 函数只能接受普通整数作为第二个参数。但是 Hive 的 `repeat` 可以处理计算表达式,你可以通过 `expr` 来使用它,如下所示:
df.select(expr('repeat(1, 13 - length(account_id))').alias('custom'))