I want to create a date range on a Spark DataFrame. There is no built-in function that does this by default, so I wrote the following:
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
spark = SparkSession.builder.appName('test').getOrCreate()
data_frame = spark.range(1, 10).withColumn('date_start', F.to_date(F.lit('2018-01-01'), 'yyyy-MM-dd'))
The result is:
+---+----------+
| id|date_start|
+---+----------+
| 1|2018-01-01|
| 2|2018-01-01|
| 3|2018-01-01|
| 4|2018-01-01|
| 5|2018-01-01|
+---+----------+
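For context, here is a quick sanity check on the column types involved (spark.range gives a LongType id and to_date gives a DateType column):

# Sanity check on the schema: id should be LongType, date_start DateType
data_frame.printSchema()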
Now I want to add "id" to the "date_start" column, producing a column of dates that runs from the start date to the end:
data_frame.withColumn('date_window', F.date_add(F.col('date_start'), F.col('id')))
But I get a TypeError:
TypeError Traceback (most recent call last)
<ipython-input-151-9e46a2ad88a2> in <module>
----> 1 data_frame.withColumn('date_window', F.date_add(F.col('date_start'), F.col('id')))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\sql\functions.py in date_add(start, days)
1039 """
1040 sc = SparkContext._active_spark_context
-> 1041 return Column(sc._jvm.functions.date_add(_to_java_column(start), days))
1042
1043
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1246
1247 def __call__(self, *args):
-> 1248 args_command, temp_args = self._build_args(*args)
1249
1250 command = proto.CALL_COMMAND_NAME +\
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in _build_args(self, *args)
1210 def _build_args(self, *args):
1211 if self.converters is not None and len(self.converters) > 0:
-> 1212 (new_args, temp_args) = self._get_args(args)
1213 else:
1214 new_args = args
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in _get_args(self, args)
1197 for converter in self.gateway_client.converters:
1198 if converter.can_convert(arg):
-> 1199 temp_arg = converter.convert(arg, self.gateway_client)
1200 temp_args.append(temp_arg)
1201 new_args.append(temp_arg)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_collections.py in convert(self, object, gateway_client)
498 ArrayList = JavaClass("java.util.ArrayList", gateway_client)
499 java_list = ArrayList()
--> 500 for element in object:
501 java_list.add(element)
502 return java_list
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\sql\column.py in __iter__(self)
342
343 def __iter__(self):
--> 344 raise TypeError("Column is not iterable")
345
346 # string methods
TypeError: Column is not iterable
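If I pass a plain Python integer as the second argument instead of a Column, the same call goes through without complaint (a quick check to narrow things down; the column name date_plus_3 is just illustrative):

# Passing a literal int for the days argument works fine
data_frame.withColumn('date_plus_3', F.date_add(F.col('date_start'), 3))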
For some reason it does work when I use the Spark SQL function expr:
data_frame.withColumn("date_window", F.expr("date_add(date_start, id)"))
And voilà! It seems to work:
+---+----------+-----------+
| id|date_start|date_window|
+---+----------+-----------+
| 1|2018-01-01| 2018-01-02|
| 2|2018-01-01| 2018-01-03|
| 3|2018-01-01| 2018-01-04|
| 4|2018-01-01| 2018-01-05|
| 5|2018-01-01| 2018-01-06|
+---+----------+-----------+
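For completeness, the same expression can also be written via selectExpr, which as far as I understand is equivalent to the F.expr call above:

# Same SQL expression, written via selectExpr instead of F.expr
data_frame.selectExpr('id', 'date_start', 'date_add(date_start, id) AS date_window')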
My question is: what does the expr function do differently from the call I wrote above?