PySpark:只要满足两个条件中的任意一个,就合并两个数据帧

时间:2018-02-13 03:30:44

标签: join pyspark spark-dataframe

问:有没有办法在两个条件中只要满足其中任意一个时,就合并(join)两个数据帧?

例如,我有两个Dataframe:

DF1

 name           Exam
 Ahmad          100
 Ahmad          95
 Ahmad          90
 Emma           80
 Emma           85

第二个数据框

DF2

 name       math     phy.   prev._Rank
 Ahmad      100      90     2
 Emma       80       85     1

我希望得到的DF如下:

NEW DF

name     Exam      math     phy.    Prev._Rank
 Ahmad   100       100      90      2
 Ahmad   95        null     null    2
 Ahmad   90        100      90      2
 Emma    80        80       85      1
 Emma    85        80       85      1   

1 个答案:

答案 0 :(得分:3)

# Build the exam-score table. `spark` is assumed to be an already-created
# SparkSession (it is not defined in this snippet).
# Scores are entered as strings and cast to integer right below.
DF1 = spark.createDataFrame(
    [
        ('Ahmad', '100'),
        ('Ahmad', '95'),
        ('Ahmad', '90'),
        ('Ahmad', '50'),
        ('Ahmad', '51'),
        ('Ahmad', '54'),
        ('Ahmad', '53'),
        ('Emma', '52'),
        ('Emma', '85'),
    ],
    schema=['namea', 'Exam'],
)

# Keep the name column unchanged and convert Exam from string to integer.
DF1 = DF1.select('namea', DF1['Exam'].cast('integer'))

DF1.show()

+-----+----+
|namea|Exam|
+-----+----+
|Ahmad| 100|
|Ahmad|  95|
|Ahmad|  90|
|Ahmad|  50|
|Ahmad|  51|
|Ahmad|  54|
|Ahmad|  53|
| Emma|  52|
| Emma|  85|
+-----+----+

# Build the per-subject table. `spark` is assumed to be an already-created
# SparkSession (it is not defined in this snippet).
DF2 = spark.createDataFrame(
    [
        ('Ahmad', '100', '90', '2'),
        ('Ahmad', '50', '54', '3'),
        ('Emma', '52', '85', '1'),
    ],
    schema=['name', 'math', 'phy', 'Prev_Rank'],
)

# Cast the two score columns to integer; name and Prev_Rank are passed
# through unchanged (Prev_Rank therefore stays a string column).
DF2 = DF2.select(
    'name',
    DF2['math'].cast('integer'),
    DF2['phy'].cast('integer'),
    'Prev_Rank',
)


DF2.show()

+-----+----+---+---------+
| name|math|phy|Prev_Rank|
+-----+----+---+---------+
|Ahmad| 100| 90|        2|
|Ahmad|  50| 54|        3|
| Emma|  52| 85|        1|
+-----+----+---+---------+

解决方案

# First condition: left outer join keeping every DF1 row and attaching the
# DF2 columns only where the names match AND Exam equals the math score.
DF3 = DF1.join(
    DF2,
    on=[DF1['namea'] == DF2['name'], DF1['Exam'] == DF2['math']],
    how='leftouter',
)

DF3.show()

+-----+----+-----+----+----+---------+
|namea|Exam| name|math| phy|Prev_Rank|
+-----+----+-----+----+----+---------+
|Ahmad|  90| null|null|null|     null|
| Emma|  85| null|null|null|     null|
|Ahmad|  50|Ahmad|  50|  54|        3|
|Ahmad|  53| null|null|null|     null|
|Ahmad|  54| null|null|null|     null|
| Emma|  52| Emma|  52|  85|        1|
|Ahmad|  95| null|null|null|     null|
|Ahmad| 100|Ahmad| 100|  90|        2|
|Ahmad|  51| null|null|null|     null|
+-----+----+-----+----+----+---------+

# Second condition: same left outer join, but matching Exam against the
# phy score instead of math. The DF2-side columns are renamed with a "1"
# suffix so they stay distinguishable from the un-suffixed DF2 columns.
DF4 = (
    DF1.join(
        DF2,
        on=[DF1['namea'] == DF2['name'], DF1['Exam'] == DF2['phy']],
        how='leftouter',
    )
    .withColumnRenamed('name', 'name1')
    .withColumnRenamed('math', 'math1')
    .withColumnRenamed('phy', 'phy1')
    .withColumnRenamed('Prev_Rank', 'Prev_Rank1')
)

DF4.show()

+-----+----+-----+-----+----+----------+
|namea|Exam|name1|math1|phy1|Prev_Rank1|
+-----+----+-----+-----+----+----------+
|Ahmad|  90|Ahmad|  100|  90|         2|
| Emma|  85| Emma|   52|  85|         1|
|Ahmad|  50| null| null|null|      null|
|Ahmad|  53| null| null|null|      null|
|Ahmad|  54|Ahmad|   50|  54|         3|
| Emma|  52| null| null|null|      null|
|Ahmad|  95| null| null|null|      null|
|Ahmad| 100| null| null|null|      null|
|Ahmad|  51| null| null|null|      null|
+-----+----+-----+-----+----+----------+

# Put the two join results side by side: one row per (namea, Exam) pair,
# carrying both the phy-match columns (suffix "1") and the math-match
# columns, sorted by name and then exam score.
DF5 = (
    DF4.join(DF3, on=['namea', 'Exam'], how='inner')
    .orderBy(['namea', 'Exam'])
)

DF5.show()

+-----+----+-----+-----+----+----------+-----+----+----+---------+
|namea|Exam|name1|math1|phy1|Prev_Rank1| name|math| phy|Prev_Rank|
+-----+----+-----+-----+----+----------+-----+----+----+---------+
|Ahmad|  50| null| null|null|      null|Ahmad|  50|  54|        3|
|Ahmad|  51| null| null|null|      null| null|null|null|     null|
|Ahmad|  53| null| null|null|      null| null|null|null|     null|
|Ahmad|  54|Ahmad|   50|  54|         3| null|null|null|     null|
|Ahmad|  90|Ahmad|  100|  90|         2| null|null|null|     null|
|Ahmad|  95| null| null|null|      null| null|null|null|     null|
|Ahmad| 100| null| null|null|      null|Ahmad| 100|  90|        2|
| Emma|  52| null| null|null|      null| Emma|  52|  85|        1|
| Emma|  85| Emma|   52|  85|         1| null|null|null|     null|
+-----+----+-----+-----+----+----------+-----+----+----+---------+

# `coalesce` must be imported explicitly; without this import the statement
# below fails with NameError because nothing else brings the name into scope.
from pyspark.sql.functions import coalesce

# Merge the two match results: for every DF2-derived column take the
# phy-match value (suffix "1") when present, otherwise fall back to the
# math-match value. The un-suffixed columns are dropped afterwards since
# their content now lives in the suffixed ones.
DF6 = (
    DF5
    .withColumn("name1", coalesce(DF5['name1'], DF5['name']))
    .withColumn("math1", coalesce(DF5['math1'], DF5['math']))
    .withColumn("phy1", coalesce(DF5['phy1'], DF5['phy']))
    .withColumn("Prev_Rank1", coalesce(DF5['Prev_Rank1'], DF5['Prev_Rank']))
    .drop('name', 'math', 'phy', 'Prev_Rank')
)

DF6.show()

+-----+----+-----+-----+----+----------+
|namea|Exam|name1|math1|phy1|Prev_Rank1|
+-----+----+-----+-----+----+----------+
|Ahmad|  50|Ahmad|   50|  54|         3|
|Ahmad|  51| null| null|null|      null|
|Ahmad|  53| null| null|null|      null|
|Ahmad|  54|Ahmad|   50|  54|         3|
|Ahmad|  90|Ahmad|  100|  90|         2|
|Ahmad|  95| null| null|null|      null|
|Ahmad| 100|Ahmad|  100|  90|         2|
| Emma|  52| Emma|   52|  85|         1|
| Emma|  85| Emma|   52|  85|         1|
+-----+----+-----+-----+----+----------+

import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as fn

# Fill the gaps in Prev_Rank1: within each name, ordered by Exam, take the
# most recent non-null Prev_Rank1 seen from the start of the partition up to
# the current row. The frame bounds use the named Window constants, which
# the PySpark documentation recommends over raw integral values.
# name1 duplicates namea for matched rows, so it is dropped.
DF7 = DF6.withColumn(
    "Prev_Rank1",
    fn.last('Prev_Rank1', ignorenulls=True).over(
        Window.partitionBy('namea')
        .orderBy('Exam')
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    ),
).drop('name1')


DF7.show()

+-----+----+-----+----+----------+
|namea|Exam|math1|phy1|Prev_Rank1|
+-----+----+-----+----+----------+
|Ahmad|  50|   50|  54|         3|
|Ahmad|  51| null|null|         3|
|Ahmad|  53| null|null|         3|
|Ahmad|  54|   50|  54|         3|
|Ahmad|  90|  100|  90|         2|
|Ahmad|  95| null|null|         2|
|Ahmad| 100|  100|  90|         2|
| Emma|  52|   52|  85|         1|
| Emma|  85|   52|  85|         1|
+-----+----+-----+----+----------+