如何在 PySpark 2.4 中按某个分隔符(例如 '/')拆分 Spark DataFrame 的列?
我的列包含:
getSomethingParent
谢谢
答案 0 :(得分:1)
# Build a single-column sample DataFrame. The rows cover values that contain
# the '/' separator, one value without it, an empty string, and a null.
values = [
    (tz,)
    for tz in (
        'America/New_York',
        'Africa/Casablanca',
        'Europe/Madrid',
        'Europe/Madrid',
        'Germany',
        '',
        None,
    )
]
df = sqlContext.createDataFrame(values, schema=['timezone'])
df.show(truncate=False)
+-----------------+
|timezone |
+-----------------+
|America/New_York |
|Africa/Casablanca|
|Europe/Madrid |
|Europe/Madrid |
|Germany |
| |
|null |
+-----------------+
# `col` and `when` are used below, so they are imported together with
# `instr` and `split`.
from pyspark.sql.functions import col, instr, split, when

# instr() gives the 1-based position of the first '/' in the string, 0 when
# the separator is absent, and null when the input is null. The isNotNull()
# guard makes the flag False (rather than null) for null input.
separator_pos = instr(col('timezone'), '/')
df = df.withColumn('separator_if_exists', (separator_pos > 0) & separator_pos.isNotNull())

# Split once and reuse the expression for both output columns. Rows without
# the separator get null in both col1 and col2.
parts = split(col('timezone'), '/')
df = df.withColumn('col1', when(col('separator_if_exists'), parts[0]).otherwise(None))
df = df.withColumn('col2', when(col('separator_if_exists'), parts[1]).otherwise(None)).drop('separator_if_exists')
df.show(truncate=False)
+-----------------+-------+----------+
|timezone |col1 |col2 |
+-----------------+-------+----------+
|America/New_York |America|New_York |
|Africa/Casablanca|Africa |Casablanca|
|Europe/Madrid |Europe |Madrid |
|Europe/Madrid |Europe |Madrid |
|Germany |null |null |
| |null |null |
|null |null |null |
+-----------------+-------+----------+