问:如果两个条件中只要满足其中一个,是否有办法合并两个数据帧?
例如,我有两个Dataframe:
DF1
name Exam
Ahmad 100
Ahmad 95
Ahmad 90
Emma 80
Emma 85
第二个数据框
DF2
name math phy. Prev._Rank
Ahmad 100 90 2
Emma 80 85 1
我希望得到的DF如下:
NEW DF
name Exam math phy. Prev._Rank
Ahmad 100 100 90 2
Ahmad 95 null null 2
Ahmad 90 100 90 2
Emma 80 80 85 1
Emma 85 80 85 1
答案 0 :(得分:3)
# Sample exam table: one row per (student name, exam score).
# Scores are supplied as strings and then cast to integer in place.
exam_rows = [
    ('Ahmad', '100'),
    ('Ahmad', '95'),
    ('Ahmad', '90'),
    ('Ahmad', '50'),
    ('Ahmad', '51'),
    ('Ahmad', '54'),
    ('Ahmad', '53'),
    ('Emma', '52'),
    ('Emma', '85'),
]
DF1 = spark.createDataFrame(exam_rows, ['namea', 'Exam'])
# withColumn on an existing column replaces it and keeps its position.
DF1 = DF1.withColumn('Exam', DF1.Exam.cast('integer'))
DF1.show()
+-----+----+
|namea|Exam|
+-----+----+
|Ahmad| 100|
|Ahmad| 95|
|Ahmad| 90|
|Ahmad| 50|
|Ahmad| 51|
|Ahmad| 54|
|Ahmad| 53|
| Emma| 52|
| Emma| 85|
+-----+----+
# Reference table: per-student math/phy marks plus a previous rank.
# Only the two mark columns are cast to integer; Prev_Rank stays a string.
subject_rows = [
    ('Ahmad', '100', '90', '2'),
    ('Ahmad', '50', '54', '3'),
    ('Emma', '52', '85', '1'),
]
DF2 = spark.createDataFrame(subject_rows, ['name', 'math', 'phy', 'Prev_Rank'])
for mark_col in ('math', 'phy'):
    DF2 = DF2.withColumn(mark_col, DF2[mark_col].cast('integer'))
DF2.show()
+-----+----+---+---------+
| name|math|phy|Prev_Rank|
+-----+----+---+---------+
|Ahmad| 100| 90| 2|
|Ahmad| 50| 54| 3|
| Emma| 52| 85| 1|
+-----+----+---+---------+
解决方案
# First condition: same student AND the exam score equals the math mark.
# A left outer join keeps every DF1 row; unmatched rows get nulls for the
# DF2 columns.
math_match = (DF1.namea == DF2.name) & (DF1.Exam == DF2.math)
DF3 = DF1.join(DF2, on=math_match, how='leftouter')
DF3.show()
+-----+----+-----+----+----+---------+
|namea|Exam| name|math| phy|Prev_Rank|
+-----+----+-----+----+----+---------+
|Ahmad| 90| null|null|null| null|
| Emma| 85| null|null|null| null|
|Ahmad| 50|Ahmad| 50| 54| 3|
|Ahmad| 53| null|null|null| null|
|Ahmad| 54| null|null|null| null|
| Emma| 52| Emma| 52| 85| 1|
|Ahmad| 95| null|null|null| null|
|Ahmad| 100|Ahmad| 100| 90| 2|
|Ahmad| 51| null|null|null| null|
+-----+----+-----+----+----+---------+
# Second condition: same student AND the exam score equals the phy mark.
phy_match = (DF1.namea == DF2.name) & (DF1.Exam == DF2.phy)
DF4 = DF1.join(DF2, on=phy_match, how='leftouter')
# Suffix the DF2-side columns with "1" so they do not clash with DF3's
# columns when the two join results are combined.
for old_name, new_name in (
    ('name', 'name1'),
    ('math', 'math1'),
    ('phy', 'phy1'),
    ('Prev_Rank', 'Prev_Rank1'),
):
    DF4 = DF4.withColumnRenamed(old_name, new_name)
DF4.show()
+-----+----+-----+-----+----+----------+
|namea|Exam|name1|math1|phy1|Prev_Rank1|
+-----+----+-----+-----+----+----------+
|Ahmad| 90|Ahmad| 100| 90| 2|
| Emma| 85| Emma| 52| 85| 1|
|Ahmad| 50| null| null|null| null|
|Ahmad| 53| null| null|null| null|
|Ahmad| 54|Ahmad| 50| 54| 3|
| Emma| 52| null| null|null| null|
|Ahmad| 95| null| null|null| null|
|Ahmad| 100| null| null|null| null|
|Ahmad| 51| null| null|null| null|
+-----+----+-----+-----+----+----------+
# Put the phy-match result (DF4) and the math-match result (DF3) side by
# side, pairing rows on the shared (namea, Exam) key, then sort for display.
DF5 = DF4.join(DF3, on=['namea', 'Exam'], how='inner')
DF5 = DF5.orderBy('namea', 'Exam')
DF5.show()
+-----+----+-----+-----+----+----------+-----+----+----+---------+
|namea|Exam|name1|math1|phy1|Prev_Rank1| name|math| phy|Prev_Rank|
+-----+----+-----+-----+----+----------+-----+----+----+---------+
|Ahmad| 50| null| null|null| null|Ahmad| 50| 54| 3|
|Ahmad| 51| null| null|null| null| null|null|null| null|
|Ahmad| 53| null| null|null| null| null|null|null| null|
|Ahmad| 54|Ahmad| 50| 54| 3| null|null|null| null|
|Ahmad| 90|Ahmad| 100| 90| 2| null|null|null| null|
|Ahmad| 95| null| null|null| null| null|null|null| null|
|Ahmad| 100| null| null|null| null|Ahmad| 100| 90| 2|
| Emma| 52| null| null|null| null| Emma| 52| 85| 1|
| Emma| 85| Emma| 52| 85| 1| null|null|null| null|
+-----+----+-----+-----+----+----------+-----+----+----+---------+
# `coalesce` is not in scope at this point in the script (the only pyspark
# functions import appears further down, under the alias `fn`), so import it
# here to avoid a NameError.
from pyspark.sql.functions import coalesce

# Collapse the two sets of DF2 columns into one: for each pair, take the
# phy-match value (suffix "1") when it is non-null, otherwise fall back to
# the math-match value. Rows that matched neither condition stay null.
DF6 = (
    DF5
    .withColumn("name1", coalesce(DF5.name1, DF5.name))
    .withColumn("math1", coalesce(DF5.math1, DF5.math))
    .withColumn("phy1", coalesce(DF5.phy1, DF5.phy))
    .withColumn("Prev_Rank1", coalesce(DF5.Prev_Rank1, DF5.Prev_Rank))
    # The math-match columns have been folded into the "1" columns above.
    .drop('name', 'math', 'phy', 'Prev_Rank')
)
DF6.show()
+-----+----+-----+-----+----+----------+
|namea|Exam|name1|math1|phy1|Prev_Rank1|
+-----+----+-----+-----+----+----------+
|Ahmad| 50|Ahmad| 50| 54| 3|
|Ahmad| 51| null| null|null| null|
|Ahmad| 53| null| null|null| null|
|Ahmad| 54|Ahmad| 50| 54| 3|
|Ahmad| 90|Ahmad| 100| 90| 2|
|Ahmad| 95| null| null|null| null|
|Ahmad| 100|Ahmad| 100| 90| 2|
| Emma| 52| Emma| 52| 85| 1|
| Emma| 85| Emma| 52| 85| 1|
+-----+----+-----+-----+----+----------+
import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as fn
# Forward-fill Prev_Rank1 within each student, walking rows in ascending
# Exam order: each row takes the most recent non-null Prev_Rank1 seen so far
# (including its own). Rows that precede the first non-null value in their
# partition remain null.
# Use the named frame boundaries instead of a raw -sys.maxsize sentinel.
running_frame = (
    Window.partitionBy('namea')
    .orderBy('Exam')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
DF7 = DF6.withColumn(
    "Prev_Rank1",
    fn.last('Prev_Rank1', ignorenulls=True).over(running_frame),
).drop('name1')
DF7.show()
+-----+----+-----+----+----------+
|namea|Exam|math1|phy1|Prev_Rank1|
+-----+----+-----+----+----------+
|Ahmad| 50| 50| 54| 3|
|Ahmad| 51| null|null| 3|
|Ahmad| 53| null|null| 3|
|Ahmad| 54| 50| 54| 3|
|Ahmad| 90| 100| 90| 2|
|Ahmad| 95| null|null| 2|
|Ahmad| 100| 100| 90| 2|
| Emma| 52| 52| 85| 1|
| Emma| 85| 52| 85| 1|
+-----+----+-----+----+----------+