已知儒略日(Julian day number),我需要编写一个自定义的 Python UDF,把它转换回对应的日期,也就是下面这段逻辑的逆运算。请指教。
def date_to_julian_day(my_date):
    """Return the Julian day number of a Gregorian calendar date.

    Args:
        my_date: An object exposing integer ``year``, ``month`` and ``day``
            attributes (for example ``datetime.date``), or ``None``.

    Returns:
        The Julian day number as an ``int``, or ``None`` when ``my_date``
        is ``None`` (so a null date yields a null result instead of an
        ``AttributeError`` when this is used as a UDF).
    """
    if my_date is None:
        return None

    # Treat March as the first month of the year so the leap day falls at
    # the very end: ``a`` is 1 for January/February and 0 otherwise.
    a = (14 - my_date.month) // 12
    y = my_date.year + 4800 - a      # year count shifted to stay positive
    m = my_date.month + 12 * a - 3   # 0 = March, ..., 11 = February

    # (153*m + 2)//5 is the number of days preceding month ``m`` in the
    # March-based year; the y//4 - y//100 + y//400 terms apply the
    # Gregorian leap-year rule.
    return (
        my_date.day
        + (153 * m + 2) // 5
        + 365 * y
        + y // 4
        - y // 100
        + y // 400
        - 32045
    )
答案 0 :(得分:0)
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType
# Example: apply date_to_julian_day to a DataFrame column through a UDF.
# NOTE(review): `spark` is assumed to be an existing SparkSession (as in the
# pyspark shell); it is not created in this snippet.
rdd = spark.sparkContext.parallelize([(1, '2017-03-01'), (2, '2017-03-02')])
df = spark.createDataFrame(rdd, schema=['idx', 'dt'])

# Parse the string column into a real date so the UDF receives date objects.
df = df.select(df['idx'], f.to_date(df['dt']).alias('dt'))

# date_to_julian_day returns an integer, so declare the UDF as IntegerType
# (the previous StringType was never imported and did not match the value).
My_UDF = f.udf(date_to_julian_day, IntegerType())
df = df.withColumn('julian', My_UDF(df['dt']))
df.show()