I am concatenating two array columns and converting the result back to an array. Now when I apply explode, nothing happens. I'm using Spark 2.3. Is there anything odd here?
df = spark.createDataFrame(
    [(1, 25, ['A','B','B','C'], ['A','B','B','C']),
     (1, 20, ['A','A','B','C'], ['A','B','B','C']),
     (1, 20, ['A','C','B','C'], ['A','B','B','C']),
     (2, 26, ['X','Y','Z','C'], ['A','B','B','C'])],
    ['id','age','one','two'])
+---+---+------------+------------+
| id|age| one| two|
+---+---+------------+------------+
| 1| 25|[A, B, B, C]|[A, B, B, C]|
| 1| 20|[A, A, B, C]|[A, B, B, C]|
| 1| 20|[A, C, B, C]|[A, B, B, C]|
| 2| 26|[X, Y, Z, C]|[A, B, B, C]|
+---+---+------------+------------+
>>> df.createOrReplaceTempView('df')
>>> df2 = spark.sql('''select id,age, array(concat_ws(',', one, two)) as three from df''')
>>> df2.show()
+---+---+-----------------+
| id|age| three|
+---+---+-----------------+
| 1| 25|[A,B,B,C,A,B,B,C]|
| 1| 20|[A,A,B,C,A,B,B,C]|
| 1| 20|[A,C,B,C,A,B,B,C]|
| 2| 26|[X,Y,Z,C,A,B,B,C]|
+---+---+-----------------+
>>> df2.createOrReplaceTempView('df2')
>>> spark.sql('''select id, age, four from df2 lateral view explode(three) tbl as four''').show()  # not exploding
+---+---+---------------+
| id|age| four|
+---+---+---------------+
| 1| 25|A,B,B,C,A,B,B,C|
| 1| 20|A,A,B,C,A,B,B,C|
| 1| 20|A,C,B,C,A,B,B,C|
| 2| 26|X,Y,Z,C,A,B,B,C|
+---+---+---------------+
Note that I can make it work with
>>> df2 = spark.sql('''select id,age, split(concat_ws(',', one, two),',') as three from df''')
but I'd like to know why the first approach doesn't work.
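For completeness, a minimal sketch of the full workaround (re-registering the split-based df2 and re-running the same explode query):
>>> df2.createOrReplaceTempView('df2')
>>> spark.sql('''select id, age, four from df2 lateral view explode(three) tbl as four''').show()  # one row per array element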
Answer 0: (score: 1)
concat_ws creates a single string column rather than an array:
from pyspark.sql import functions as F
df.select(F.size(df.one)).show()
df2.select(F.size(df2.three)).show()
Output:
+---------+
|size(one)|
+---------+
| 4|
| 4|
| 4|
| 4|
+---------+
+-----------+
|size(three)|
+-----------+
| 1|
| 1|
| 1|
| 1|
+-----------+
This means your array has only one element:
df2.select(df2.three.getItem(0)).show()
df2.select(df2.three.getItem(1)).show()
df2.printSchema()
Output:
+---------------+
| three[0]|
+---------------+
|A,B,B,C,A,B,B,C|
|A,A,B,C,A,B,B,C|
|A,C,B,C,A,B,B,C|
|X,Y,Z,C,A,B,B,C|
+---------------+
+--------+
|three[1]|
+--------+
| null|
| null|
| null|
| null|
+--------+
root
|-- id: long (nullable = true)
|-- age: long (nullable = true)
|-- three: array (nullable = false)
| |-- element: string (containsNull = false)
So what you should actually use is concat, available on Spark >= 2.4:
df3 = spark.sql('''select id,age, concat(one, two) as three from df''')
df3.show(truncate=False)
df3.printSchema()
df3.select(df3.three.getItem(0)).show()
df3.select(df3.three.getItem(1)).show()
Output:
+---+---+------------------------+
|id |age|three |
+---+---+------------------------+
|1 |25 |[A, B, B, C, A, B, B, C]|
|1 |20 |[A, A, B, C, A, B, B, C]|
|1 |20 |[A, C, B, C, A, B, B, C]|
|2 |26 |[X, Y, Z, C, A, B, B, C]|
+---+---+------------------------+
root
|-- id: long (nullable = true)
|-- age: long (nullable = true)
|-- three: array (nullable = true)
| |-- element: string (containsNull = true)
+--------+
|three[0]|
+--------+
| A|
| A|
| A|
| X|
+--------+
+--------+
|three[1]|
+--------+
| B|
| A|
| C|
| Y|
+--------+
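With three now a genuine array of single elements, the original lateral view explode query behaves as expected. A minimal sketch, assuming df3 from above:
df3.createOrReplaceTempView('df3')
spark.sql('''select id, age, four from df3 lateral view explode(three) tbl as four''').show()  # eight rows per input row, one per element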
Concatenating two arrays on Spark < 2.4 requires a UDF (for example, see this answer).
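If you have to stay on Spark < 2.4 and want to keep the SQL flavor of the question, one option is to register such a UDF for use in SQL. A minimal sketch, assuming the df temp view from the question (the UDF name concat_arrays is illustrative, not a built-in):
from pyspark.sql.types import ArrayType, StringType

# Register a UDF that concatenates two string arrays, then explode its result in SQL
spark.udf.register("concat_arrays", lambda x, y: x + y, ArrayType(StringType()))
spark.sql('''select id, age, four
             from (select id, age, concat_arrays(one, two) as three from df) t
             lateral view explode(three) tbl as four''').show()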
Answer 1: (score: 0)
An example of how to do this with a UDF:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# UDF that concatenates two string arrays into one
arraycat = F.udf(lambda x, y: x + y, ArrayType(StringType()))
df = df.withColumn("combined", arraycat("one", "two"))
df = df.withColumn("combined", F.explode("combined"))