I have a CSV file at an HDFS location which I have converted into a dataframe, and my dataframe looks like this:
column1,column2,column3
Node1, block1, 1,4,5
Node1, block1, null
Node1, block2, 3,6,7
Node1, block2, null
Node1, block1, null
I want to parse this dataframe, and my output dataframe should be as below:
column1,column2,column3
Node1, block1, counter0:1,counter1:4,counter2:5
Node1, block1, null
Node1, block2, counter0:3,counter1:6,counter2:7
Node1, block2, null
Node1, block1, null
I am getting the error mentioned below, so is there anything that can help me resolve it, or a corrected/modified version of the code? Thanks.
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.types as T
from pyspark.sql.functions import udf
start_value = 2
schema_name = 2
start_key = 0
df = spark.read.csv("hdfs://path/Ccounters/test.csv",header=True)
def dict(x):
    split_col = x.split(",")
    col_nm = df.schema.names[schema_name]
    convert = map(lambda x: col_nm + str(start_key) + ":" + str(x), split_col)
    con_str = ','.join(convert)
    return con_str
udf_dict = udf(dict, StringType())
df1 =df.withColumn('distance', udf_dict(df.column3))
df1.show()
Getting the error below:
File "/opt/data/data11/yarn/local/usercache/cdap/appcache/application_1555606923440_67815/container_e48_1555606923440_67815_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 160, in dump
pickle.PicklingError: Could not serialize object: Py4JError: An error occurred while calling o58.__getnewargs__. Trace:
py4j.Py4JException: Method __getnewargs__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Answer 0 (score: 1)
I found out that you cannot use Spark objects (such as the 'map'-style DataFrame functions) inside a UDF (https://stackoverflow.com/a/57230637). In your code, the UDF references the DataFrame df via df.schema.names, which cannot be serialized and is what triggers the PicklingError in your trace. Another way to do what you want is to use a for loop inside the UDF.
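As a minimal sketch of that point applied to your original UDF (assuming the same df as in your snippet; counters and udf_counters are hypothetical names), nothing from the DataFrame is referenced inside the function, and the counter index comes from enumerate instead:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Hypothetical sketch: the UDF closure captures only plain Python values.
# If the column name itself were needed as the prefix, it would be read on
# the driver first, e.g. col_nm = df.schema.names[schema_name], and passed in.
def counters(x):
    if x is None:
        return x
    return ','.join('counter' + str(i) + ':' + str(v)
                    for i, v in enumerate(x.split(',')))

udf_counters = udf(counters, StringType())
df1 = df.withColumn('distance', udf_counters(df.column3))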
Based on the answer to the question below, I also added a part that makes it easy to apply this UDF to multiple columns: how to get the name of column with maximum value in pyspark dataframe
df = spark.createDataFrame(
    [('Node1', 'block1', '1,4,5', None),
     ('Node1', 'block1', None, '1,2,3'),
     ('Node1', 'block2', '3,6,7', None),
     ('Node1', 'block2', None, '4,5,6'),
     ('Node1', 'block1', None, '7,8,9')],
    ['column1', 'column2', 'column3', 'column4'])
# df.show()
# +-------+-------+-------+-------+
# |column1|column2|column3|column4|
# +-------+-------+-------+-------+
# | Node1| block1| 1,4,5| null|
# | Node1| block1| null| 1,2,3|
# | Node1| block2| 3,6,7| null|
# | Node1| block2| null| 4,5,6|
# | Node1| block1| null| 7,8,9|
# +-------+-------+-------+-------+
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def columnfill(x):
    # if x is empty (null), return it unchanged
    if x is None:
        return x
    else:
        split = x.split(',')
        y = []
        z = 0
        for i in split:
            y.append('counter' + str(z) + ':' + str(i))
            z += 1
        return ','.join(y)
udf_columnfill = udf(columnfill, StringType())
### Apply UDF to a single column:
# df_result1 = df.withColumn('distance', udf_columnfill(df.column3))
### Code for applying UDF to multiple columns
# Define columns that should be transformed
columnnames = ['column3', 'column4']
# Create a condition that joins multiple string parts, containing column operations
cond = "df.withColumn" + ".withColumn".join(["('" + str(c) + "_new', udf_columnfill(df." + str(c) + ")).drop('"+ str(c) +"')" for c in (columnnames)])
# # Print condition to see which transformations are executed
# print(cond)
# df.withColumn('column3_new', udf_columnfill(df.column3)).drop('column3').withColumn('column4_new', udf_columnfill(df.column4)).drop('column4')
# Create the new dataframe that evaluates the defined condition
df_result2 = eval(cond)
# df_result2.show()
# +-------+-------+--------------------------------+--------------------------------+
# |column1|column2|column3_new |column4_new |
# +-------+-------+--------------------------------+--------------------------------+
# |Node1 |block1 |counter0:1,counter1:4,counter2:5|null |
# |Node1 |block1 |null |counter0:1,counter1:2,counter2:3|
# |Node1 |block2 |counter0:3,counter1:6,counter2:7|null |
# |Node1 |block2 |null |counter0:4,counter1:5,counter2:6|
# |Node1 |block1 |null |counter0:7,counter1:8,counter2:9|
# +-------+-------+--------------------------------+--------------------------------+
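As a side note, the same chaining can be done without building a string and calling eval, by folding over the column list with functools.reduce; this is only a sketch assuming the df, udf_columnfill and columnnames defined above (df_result2_alt is a hypothetical name):

from functools import reduce

# Apply one withColumn/drop pair per column, threading the dataframe through
df_result2_alt = reduce(
    lambda acc, c: acc.withColumn(c + '_new', udf_columnfill(acc[c])).drop(c),
    columnnames,
    df,
)
# df_result2_alt.show(truncate=False)  # same result as df_result2 above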
An extra UDF input was added through which the column name is passed in and used as the prefix for the column values:
from pyspark.sql import functions as f

# Updated UDF that takes the column name as a second argument and uses it as the prefix
def columnfill(cinput, cname):
    # if cinput is empty (null), return it unchanged
    if cinput is None:
        return cinput
    else:
        values = cinput.split(',')
        output = []
        count = 0
        for value in values:
            output.append(str(cname) + str(count) + ":" + str(value))
            count += 1
        return ','.join(output)
udf_columnfill = udf(columnfill, StringType())
# Define columns that should be transformed
columnnames = ['column3', 'column4']
# Create a condition that joins multiple string parts, containing column operations
cond2 = "df.withColumn" + ".withColumn".join(["('" + str(c) + "_new', udf_columnfill(df." + str(c) + ", f.lit('" + str(c) + "_new'))).drop('"+ str(c) +"')" for c in (columnnames)])
df_result3 = eval(cond2)
# +-------+-------+--------------------------------------------+--------------------------------------------+
# |column1|column2|column3_new |column4_new |
# +-------+-------+--------------------------------------------+--------------------------------------------+
# |Node1 |block1 |column3_new0:1,column3_new1:4,column3_new2:5|null |
# |Node1 |block1 |null |column4_new0:1,column4_new1:2,column4_new2:3|
# |Node1 |block2 |column3_new0:3,column3_new1:6,column3_new2:7|null |
# |Node1 |block2 |null |column4_new0:4,column4_new1:5,column4_new2:6|
# |Node1 |block1 |null |column4_new0:7,column4_new1:8,column4_new2:9|
# +-------+-------+--------------------------------------------+--------------------------------------------+
print(cond2)
# df.withColumn('column3_new', udf_columnfill(df.column3, f.lit('column3_new'))).drop('column3').withColumn('column4_new', udf_columnfill(df.column4, f.lit('column4_new'))).drop('column4')
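As a final side note, on Spark 2.4+ the same output can be produced without a Python UDF at all, using the built-in split, transform and array_join SQL functions; null values stay null because each of these functions propagates null. This is only a sketch under that version assumption (df_no_udf is a hypothetical name):

from pyspark.sql import functions as F

# split the string into an array, prefix each element with 'counter<index>:',
# then join the array back into a comma-separated string
df_no_udf = df.withColumn(
    'column3_new',
    F.expr("array_join(transform(split(column3, ','), "
           "(x, i) -> concat('counter', cast(i as string), ':', x)), ',')")
).drop('column3')
# df_no_udf.show(truncate=False)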