我想根据数据框中已有的一列创建若干新列,该列的值是一个向量(约 10 个浮点值;下面的示例中为 13 个)。如果使用 udf,就会收到“无法序列化对象:Py4JError:调用 o90.__getstate__ 时发生错误”的错误。
我创建了一个 Spark 实例,并在其中执行下面的操作。
第1部分:(只需创建示例数据框)
代码(其输出见后面的表格):
from pyspark.sql.functions import lower, udf, lit, substring
from pyspark.sql.types import StringType, IntegerType
# Sample probability vector (13 values) stored in a single array column.
probabilities = [
    0.15338473676518236,
    0.11111990827514251,
    0.1094681573180615,
    0.08678457544315374,
    0.08565640776751143,
    0.08284697154989813,
    0.08199811585174373,
    0.07326450301060178,
    0.0660288568201205,
    0.0569631751252344,
    0.04420535033355498,
    0.02915343747658533,
    0.019125804263209503,
]
# One row whose only field is the probability list.
data = [(probabilities,)]

# Build the frame, then attach the constant metadata columns in one chain.
df = (
    spark.createDataFrame(data, ['probability'])
    .withColumn('ModelVersion', lit('1.0.1'))
    .withColumn('loannumber', lit('123456'))
    .withColumn('callstartdatetime', lit('2018-10-13 13:21:37'))
)

# Fix the column order used by the rest of the example and display it.
column_order = ['loannumber', 'ModelVersion', 'probability', 'callstartdatetime']
new_df = df.select(*column_order)
new_df.show()
上面代码的输出(第2部分的代码在表格之后):
+----------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|loannumber|ModelVersion|probability |callstartdatetime |
+----------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|123456 |1.0.1 |[0.15338473676518236, 0.11111990827514251, 0.1094681573180615, 0.08678457544315374, 0.08565640776751143, 0.08284697154989813, 0.08199811585174373, 0.07326450301060178, 0.0660288568201205, 0.0569631751252344, 0.04420535033355498, 0.02915343747658533, 0.019125804263209503]|2018-10-13 13:21:37|
+----------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
第2部分(运行以下代码时出现前面提到的序列化错误):
def get_by_index(v, index):
    """Return element ``index`` of ``v`` rendered as a string.

    Args:
        v: An indexable value (for example the list held in one row's
            column), or ``None``.
        index: Position of the element to extract.

    Returns:
        ``str(v[index])``, or ``None`` when ``v`` is ``None``.
    """
    # A null cell arrives as None; return None rather than raising
    # TypeError on ``None[index]``.
    if v is None:
        return None
    return str(v[index])
# Wrap the plain Python helper as a UDF whose declared return type is string.
get_by_index_udf = udf(get_by_index, StringType())

# Output column name paired with the position read from 'probability'.
# The order of positions (0, 3, 1, 4, 2, 5, ...) is kept exactly as given.
alias_and_position = [
    ('probability_hour8', 0),
    ('probability_hour9', 3),
    ('probability_hour10', 1),
    ('probability_hour11', 4),
    ('probability_hour12', 2),
    ('probability_hour13', 5),
    ('probability_hour14', 6),
    ('probability_hour15', 7),
    ('probability_hour16', 8),
    ('probability_hour17', 9),
    ('probability_hour18', 10),
    ('probability_hour19', 11),
    ('probability_hour20', 12),
]

# One UDF call per hourly column, in the same order as the table above.
hourly_columns = [
    get_by_index_udf('probability', lit(position)).alias(alias)
    for alias, position in alias_and_position
]

data = new_df.select('loannumber', *hourly_columns)
# Show up to 10 rows without truncating long cell values.
data.show(10, False)