I get an error when I try to return a dataframe as a list from my user-defined function:

import pandas as pd
from datetime import datetime
from pyspark.sql.functions import col, udf, explode
from pyspark.sql.types import ArrayType, MapType, StringType

myDataFrame = (
    sc.parallelize([
        (10001, "2017-02-12 12:01:40", "2017-02-12 12:56:32"),
        (10001, "2017-02-13 12:06:32", "2017-02-15 16:06:32"),
        (10001, "2017-02-16 21:45:56", "2017-02-21 21:45:56"),
        (10001, "2017-02-21 22:32:41", "2017-02-25 00:52:50"),
    ]).toDF(["id", "startTime", "endTime"])
    .withColumn("startTime", col("startTime").cast("timestamp"))
    .withColumn("endTime", col("endTime").cast("timestamp")))

return_type = ArrayType(MapType(StringType(), StringType()))

@udf(returnType=return_type)
def myUdf(start, end):
    start = pd.to_datetime(start, infer_datetime_format=True)
    end = pd.to_datetime(end, infer_datetime_format=True)
    rng = pd.date_range(start.floor('h'), end.floor('h'), freq='h')
    left = pd.Series(rng, index=rng).clip_lower(start)
    right = pd.Series(rng + 1, index=rng).clip_upper(end)
    timeSeries = right - left
    resultDataFrame = []
    for key, result in timeSeries.items():
        resultDataFrame.append((datetime.weekday(key.date()), key.time().hour,
                                int(result.total_seconds() // 60)))
    resultDataFrame = pd.DataFrame(resultDataFrame, columns=('day', 'hour', 'minute'))
    response = resultDataFrame.to_dict("index").values()
    return list(response)

extracted = myUdf("startTime", "endTime")
exploded = explode(extracted).alias("exploded")
expanded = [col("exploded").getItem(k).alias(k) for k in ["day", "hour", "minute"]]
result = myDataFrame.select("id", exploded).select("id", *expanded)
result.show()
I want to print a result like this:
+---------+----+----+------+
|utilityId|day |hour|minute|
+---------+----+----+------+
However, I get an error like:
ERROR Executor: Exception in task 0.0 in stage 1005.0 (TID 18845)
net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1$$anonfun$apply$6.apply(BatchEvalPythonExec.scala:156)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1$$anonfun$apply$6.apply(BatchEvalPythonExec.scala:155)
Answer 0 (score: 1):
You are returning a list of dictionaries of numpy-typed objects, and those are not types pyspark supports:
type(list(response)[0]['day'])
numpy.int64
使用" 2017-02-13 12:06:32" ," 2017-02-15 16:06:32"当start
和end
函数返回时:
+-----+------+-------+--------+
| | day | hour | minute |
+-----+------+-------+--------+
| 0 | 0 | 12 | 53 |
| 1 | 0 | 13 | 60 |
| 2 | 0 | 14 | 60 |
| 3 | 0 | 15 | 60 |
| 4 | 0 | 16 | 60 |
| 5 | 0 | 17 | 60 |
| 6 | 0 | 18 | 60 |
| 7 | 0 | 19 | 60 |
| 8 | 0 | 20 | 60 |
| 9 | 0 | 21 | 60 |
| 10 | 0 | 22 | 60 |
| 11 | 0 | 23 | 60 |
| 12 | 1 | 0 | 60 |
| 13 | 1 | 1 | 60 |
| 14 | 1 | 2 | 60 |
| 15 | 1 | 3 | 60 |
| 16 | 1 | 4 | 60 |
| 17 | 1 | 5 | 60 |
| 18 | 1 | 6 | 60 |
| 19 | 1 | 7 | 60 |
| 20 | 1 | 8 | 60 |
| 21 | 1 | 9 | 60 |
| 22 | 1 | 10 | 60 |
| 23 | 1 | 11 | 60 |
| 24 | 1 | 12 | 60 |
| 25 | 1 | 13 | 60 |
| 26 | 1 | 14 | 60 |
| 27 | 1 | 15 | 60 |
| 28 | 1 | 16 | 60 |
| 29 | 1 | 17 | 60 |
| 30 | 1 | 18 | 60 |
| 31 | 1 | 19 | 60 |
| 32 | 1 | 20 | 60 |
| 33 | 1 | 21 | 60 |
| 34 | 1 | 22 | 60 |
| 35 | 1 | 23 | 60 |
| 36 | 2 | 0 | 60 |
| 37 | 2 | 1 | 60 |
| 38 | 2 | 2 | 60 |
| 39 | 2 | 3 | 60 |
| 40 | 2 | 4 | 60 |
| 41 | 2 | 5 | 60 |
| 42 | 2 | 6 | 60 |
| 43 | 2 | 7 | 60 |
| 44 | 2 | 8 | 60 |
| 45 | 2 | 9 | 60 |
| 46 | 2 | 10 | 60 |
| 47 | 2 | 11 | 60 |
| 48 | 2 | 12 | 60 |
| 49 | 2 | 13 | 60 |
| 50 | 2 | 14 | 60 |
| 51 | 2 | 15 | 60 |
| 52 | 2 | 16 | 6 |
+-----+------+-------+--------+
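If you keep the UDF, a minimal fix (a sketch of one option, assuming the declared return type ArrayType(MapType(StringType(), StringType())) stays as it is) is to cast everything to built-in Python strings before returning, so no numpy object ever reaches the pickler:

# Hypothetical minimal fix inside myUdf: astype(str) turns every value into a
# plain Python str, matching MapType(StringType(), StringType()).
response = resultDataFrame.astype(str).to_dict("index").values()
return list(response)

That said, the pure-Spark approach below avoids the serialization problem entirely.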
We can instead build the date range hour by hour and derive the other columns from it:
import datetime as dt
import pyspark.sql.functions as psf
from pyspark.sql.types import ArrayType, TimestampType
date_range_udf = psf.udf(
    lambda start, end: [
        start + dt.timedelta(hours=x)
        for x in range(0, int((end.replace(second=0, minute=0)
                               - start.replace(second=0, minute=0)).total_seconds()) // 3600 + 1)],
    ArrayType(TimestampType()))

myDataFrame_range = myDataFrame \
    .withColumn("date", psf.explode(date_range_udf("startTime", "endTime")))
myDataFrame_range.show()
+-----+-------------------+-------------------+-------------------+
| id| startTime| endTime| date|
+-----+-------------------+-------------------+-------------------+
|10001|2017-02-12 12:01:40|2017-02-12 12:56:32|2017-02-12 12:01:40|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 12:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 13:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 14:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 15:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 16:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 17:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 18:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 19:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 20:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 21:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 22:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-13 23:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 00:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 01:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 02:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 03:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 04:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 05:06:32|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32|2017-02-14 06:06:32|
+-----+-------------------+-------------------+-------------------+
Now for the other columns (the 'u' date pattern returns the day of the week with 1 = Monday, so subtracting 1 matches Python's datetime.weekday):
myDataFrame_range \
    .select(
        'id', 'startTime', 'endTime',
        (psf.from_unixtime(psf.unix_timestamp('date'), 'u') - 1).cast('int').alias('day'),
        psf.hour('date').alias('hour'),
        psf.when(
            psf.col('startTime') == psf.col('date'),
            60 - psf.minute('startTime') - (psf.second('startTime') > 0).cast('int'))
        .when(
            ((psf.unix_timestamp('endTime') - psf.unix_timestamp('date')) / 3600).cast("int") == 0,
            psf.minute('startTime'))
        .otherwise(60).alias('minute')) \
    .show()
+-----+-------------------+-------------------+---+----+------+
| id| startTime| endTime|day|hour|minute|
+-----+-------------------+-------------------+---+----+------+
|10001|2017-02-12 12:01:40|2017-02-12 12:56:32| 6| 12| 58|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 12| 53|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 13| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 14| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 15| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 16| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 17| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 18| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 19| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 20| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 21| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 22| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 0| 23| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 0| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 1| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 2| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 3| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 4| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 5| 60|
|10001|2017-02-13 12:06:32|2017-02-15 16:06:32| 1| 6| 60|
+-----+-------------------+-------------------+---+----+------+
It is best to avoid UDFs whenever you can; note that pyspark.sql.functions is optimized for these computations.
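For example, on Spark 2.4+ the date-range UDF above could be replaced by the built-in sequence function (a sketch; sequence stops at endTime, so the last partial hour is handled slightly differently than by the UDF):

import pyspark.sql.functions as psf

# Sketch (Spark 2.4+): one timestamp per hour, generated inside the JVM
# instead of in a Python UDF.
myDataFrame_range = myDataFrame.withColumn(
    "date",
    psf.explode(psf.expr("sequence(startTime, endTime, interval 1 hour)")))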