我在 PySpark 中有一个数据框,它是用下面的代码创建的。
# Sample data: one tuple per row, in the same order as the column names below.
df = sqlContext.createDataFrame(
    [
        (1, 'Y', 'Y', 0, 0, 0, 2, 'Y', 'N', 'Y', 'Y'),
        (2, 'N', 'Y', 2, 1, 2, 3, 'N', 'Y', 'Y', 'N'),
        (3, 'Y', 'N', 3, 1, 0, 0, 'N', 'N', 'N', 'N'),
        (4, 'N', 'Y', 5, 0, 1, 0, 'N', 'N', 'N', 'Y'),
        (5, 'Y', 'N', 2, 2, 0, 1, 'Y', 'N', 'N', 'Y'),
        (6, 'Y', 'Y', 0, 0, 3, 6, 'Y', 'N', 'Y', 'N'),
        (7, 'N', 'N', 1, 1, 3, 4, 'N', 'Y', 'N', 'Y'),
        (8, 'Y', 'Y', 1, 1, 2, 0, 'Y', 'Y', 'N', 'N'),
    ],
    ('id', 'compatible', 'product', 'ios', 'pc', 'other', 'devices',
     'customer', 'subscriber', 'circle', 'smb')
)
# show is a method: it has to be called, otherwise nothing is printed.
df.show()
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+
| id|compatible|product|ios| pc|other|devices|customer|subscriber|circle|smb|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+
| 1| Y| Y| 0| 0| 0| 2| Y| N| Y| Y|
| 2| N| Y| 2| 1| 2| 3| N| Y| Y| N|
| 3| Y| N| 3| 1| 0| 0| N| N| N| N|
| 4| N| Y| 5| 0| 1| 0| N| N| N| Y|
| 5| Y| N| 2| 2| 0| 1| Y| N| N| Y|
| 6| Y| Y| 0| 0| 3| 6| Y| N| Y| N|
| 7| N| N| 1| 1| 3| 4| N| Y| N| Y|
| 8| Y| Y| 1| 1| 2| 0| Y| Y| N| N|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+
现在从上面的数据框中,我想根据某些条件创建一个新列。
1)如果 compatible 列为 Y,则按下面的规则计分(否则 score 为 0):
2)对于 product、customer、subscriber、circle、smb 中的每一列,如果列值 = Y,则赋值 = 10,否则为 0
3)如果 ios、pc、other 三列之和大于 4,则赋值 = 10,否则为 0
4)如果 devices 列大于 4,则赋值 = 10,否则为 0
然后将以上所有值相加,并填充到 PySpark 数据框的 score 列中。
我想要的输出如下。
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
| id|compatible|product|ios| pc|other|devices|customer|subscriber|circle|smb|score|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
| 1| Y| Y| 0| 0| 0| 2| Y| N| Y| Y| 50|
| 2| N| Y| 2| 1| 2| 3| N| Y| Y| N| 0|
| 3| Y| N| 3| 1| 0| 0| N| N| N| N| 0|
| 4| N| Y| 5| 0| 1| 0| N| N| N| Y| 0|
| 5| Y| N| 2| 2| 0| 1| Y| N| N| Y| 30|
| 6| Y| Y| 0| 0| 3| 6| Y| N| Y| N| 40|
| 7| N| N| 1| 1| 3| 4| N| Y| N| Y| 0|
| 8| Y| Y| 1| 1| 2| 0| Y| Y| N| N| 30|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
我的尝试如下:
# NOTE(review): this attempt produces a null score for every row (see the
# output below). Only the last f.when() is followed by .otherwise(0); the
# other f.when() calls presumably evaluate to null when their condition is
# false, and null plus a number is null — confirm against the pyspark
# when/otherwise documentation.
# The where() filter also drops the rows whose compatible is not 'Y', and the
# devices threshold used here is > 0 although the stated rule is > 4.
df1 = df.where(f.col('compatible') == 'Y').\
withColumn('score', f.when(f.col('product') == 'Y', 10) +
f.when(f.col('ios') + f.col('pc') + f.col('other') > 4, 10) + f.when(f.col('devices') > 0, 10) +
f.when(f.col('customer') == 'Y', 10) + f.when(f.col('subscriber') == 'Y', 10) +
f.when(f.col('circle') == 'Y', 10) + f.when(f.col('smb') == 'Y', 10).otherwise(0))
我得到的输出如下
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
| id|compatible|product|ios| pc|other|devices|customer|subscriber|circle|smb|score|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
| 1| Y| Y| 0| 0| 0| 2| Y| N| Y| Y| null|
| 3| Y| N| 3| 1| 0| 0| N| N| N| N| null|
| 5| Y| N| 2| 2| 0| 1| Y| N| N| Y| null|
| 6| Y| Y| 0| 0| 3| 6| Y| N| Y| N| null|
| 8| Y| Y| 1| 1| 2| 0| Y| Y| N| N| null|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
我如何实现自己想要的?
答案 0 :(得分:1)
使用嵌套的 when/otherwise 条件应该可以满足您的要求:
# Points earned by a compatible row: 10 for every flag column equal to 'Y',
# 10 when ios + pc + other exceeds 4, and 10 when devices exceeds 4.
# Every when() carries its own otherwise(0) so that a failed condition
# contributes 0 to the sum.
points_if_compatible = (
    f.when(df['product'] == 'Y', 10).otherwise(0)
    + f.when(df['customer'] == 'Y', 10).otherwise(0)
    + f.when(df['subscriber'] == 'Y', 10).otherwise(0)
    + f.when(df['circle'] == 'Y', 10).otherwise(0)
    + f.when(df['smb'] == 'Y', 10).otherwise(0)
    + f.when((df['ios'] + df['pc'] + df['other']) > 4, 10).otherwise(0)
    + f.when(df['devices'] > 4, 10).otherwise(0)
)

# Rows whose compatible is not 'Y' are kept and receive a score of 0.
score_column = f.when(df['compatible'] == 'Y', points_if_compatible).otherwise(0)
df.withColumn('score', score_column).show(truncate=False)
应该给您如下结果(注意:严格按照问题中的规则计算,id 为 1 和 5 的行得分分别是 40 和 20,而不是问题期望输出中的 50 和 30):
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
|id |compatible|product|ios|pc |other|devices|customer|subscriber|circle|smb|score|
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
|1 |Y |Y |0 |0 |0 |2 |Y |N |Y |Y |40 |
|2 |N |Y |2 |1 |2 |3 |N |Y |Y |N |0 |
|3 |Y |N |3 |1 |0 |0 |N |N |N |N |0 |
|4 |N |Y |5 |0 |1 |0 |N |N |N |Y |0 |
|5 |Y |N |2 |2 |0 |1 |Y |N |N |Y |20 |
|6 |Y |Y |0 |0 |3 |6 |Y |N |Y |N |40 |
|7 |N |N |1 |1 |3 |4 |N |Y |N |Y |0 |
|8 |Y |Y |1 |1 |2 |0 |Y |Y |N |N |30 |
+---+----------+-------+---+---+-----+-------+--------+----------+------+---+-----+
您可以将其模块化为
def firstCondition(dataframe):
    """Return a Column scoring the 'Y'/'N' flag columns of *dataframe*.

    Each of product, customer, subscriber, circle and smb contributes 10
    when its value equals 'Y' and 0 otherwise; the result is their sum.
    """
    flag_names = ('product', 'customer', 'subscriber', 'circle', 'smb')
    terms = [
        f.when(dataframe[name] == 'Y', 10).otherwise(0)
        for name in flag_names
    ]
    # Add the terms left to right, starting from the first one, so that the
    # resulting expression is the same chain of additions as writing it out.
    total = terms[0]
    for term in terms[1:]:
        total = total + term
    return total
def secondCondition(dataframe):
    """Return a Column that is 10 when ios + pc + other exceeds 4, else 0."""
    usage_total = dataframe['ios'] + dataframe['pc'] + dataframe['other']
    return f.when(usage_total > 4, 10).otherwise(0)
def thirdCondition(dataframe):
    """Return a Column that is 10 when devices exceeds 4, else 0."""
    has_many_devices = dataframe['devices'] > 4
    return f.when(has_many_devices, 10).otherwise(0)
# Sum the three partial scores; they only count for rows whose compatible
# column is 'Y', every other row gets a score of 0.
combined_points = firstCondition(df) + secondCondition(df) + thirdCondition(df)
score_column = f.when(df['compatible'] == 'Y', combined_points).otherwise(0)
df.withColumn('score', score_column).show(truncate=False)
希望这个回答对您有所帮助。