我的数据框如下所示:
对于状态 s1 和 s2 的第一次出现,将 open_price 设为 0;其余出现的行则采用 close_price 的滞后值作为 open_price。
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Tag every row with a monotonically increasing id so the original row order
# can be recovered inside the window specs (Spark gives no ordering guarantee).
# NOTE: the original snippet called monotonically_increasing_id / col / when
# without the F. prefix, which raises NameError under this import style.
df1 = df.withColumn('id', F.monotonically_increasing_id())

# w  : numbers rows within each (Input, Status, Num) group; rowNum == 1 marks
#      the first occurrence of that group.
# w1 : single global window in original row order, used for the running sum
#      of "first occurrences" and for the close-price lags.
# w2 : partitions by the running "sum" so the first row of each sum bucket
#      can be detected and folded back into bucket 1.
w = Window().partitionBy('Input', 'Status', 'Num').orderBy('id')
w1 = Window().orderBy('id')
w2 = Window().partitionBy('sum').orderBy('id')


def _with_open_price(frame, close_col, open_col):
    """Derive *open_col* from the 2-row / 3-row lags of *close_col*.

    While still in the first sum bucket the 2-row lag is used (when present);
    afterwards the 3-row lag is used; anything else defaults to 0.  The first
    row of each group (rowNum == 1) is then forced to an open price of 0.
    Leaves scratch columns lag1/lag2 behind; caller drops them at the end.
    """
    return (frame
            .withColumn("lag1", F.lag(close_col, 2).over(w1))
            .withColumn("lag2", F.lag(close_col, 3).over(w1))
            .withColumn(open_col,
                        F.when((F.col("sum") == 1) & (F.col("lag1").isNotNull()),
                               F.col("lag1"))
                         .when(F.col("sum") != 1, F.col("lag2"))
                         .otherwise(F.lit(0)))
            # First occurrence of the group always opens at 0.
            .withColumn(open_col,
                        F.when(F.col("rowNum") == 1, F.lit(0))
                         .otherwise(F.col(open_col))))


df2 = (df1
       .withColumn("rowNum", F.row_number().over(w))
       # Running count of group "first occurrences" seen so far.
       .withColumn("sum",
                   F.sum(F.when(F.col("rowNum") == 1, F.lit(1))
                          .otherwise(F.lit(0))).over(w1))
       # The very first row of bucket 2 still belongs logically to bucket 1.
       .withColumn("sum",
                   F.when((F.row_number().over(w2) == 1) & (F.col("sum") == 2),
                          F.lit(1))
                    .otherwise(F.col("sum"))))

# Same lag/open-price/stage pattern for all three price series.
# (The original pasted snippet named the first output column
# "Opening_Imp_Stock_Stage_1_n" but then read "Open_price1" — fixed here to
# the consistent Open_price1 used everywhere else.)
for i in (1, 2, 3):
    df2 = _with_open_price(df2, f"Close_price{i}", f"Open_price{i}")
    # Stage = close - open, zeroed out entirely when Year == 0.
    df2 = df2.withColumn(
        f"Stage{i}",
        F.when(F.col("Year") == 0, F.lit(0))
         .otherwise(F.col(f"Close_price{i}") - F.col(f"Open_price{i}")))

df2 = df2.drop("id", "lag1", "lag2", "rowNum")
如何识别状态S1和S2的首次出现,然后将0分配给open_price
open_price和open_price1是输出列
答案(得分:1):
可以使用若干 window functions(窗口函数)来实现。
df.show() #sample data
#+------+----+-----------+
#|status|year|close_price|
#+------+----+-----------+
#|    s1|   0|        1.2|
#|    s1|   0|        2.2|
#|    s1|   1|        3.2|
#|    s1|   1|        4.2|
#|    s2|   1|        5.2|
#|    s1|   2|        6.2|
#|    s1|   2|        7.2|
#+------+----+-----------+
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Three window specs, all ordered by a monotonically increasing id that
# freezes the original row order:
#   per_status - numbers rows inside each status (first row => rowNum == 1)
#   whole_frame - global window for the cumulative sum and the lags
#   per_sum    - groups rows sharing the same cumulative "sum" value
per_status = Window().partitionBy("status").orderBy("mono_id")
whole_frame = Window().orderBy("mono_id")
per_sum = Window().partitionBy("sum").orderBy("mono_id")

stamped = df.withColumn("mono_id", F.monotonically_increasing_id())

# rowNum == 1 flags the first occurrence of every status value.
stamped = stamped.withColumn("rowNum", F.row_number().over(per_status))

# Running count of "first occurrences" seen so far.
stamped = stamped.withColumn(
    "sum",
    F.sum(F.when(F.col("rowNum") == 1, F.lit(1)).otherwise(F.lit(0))).over(whole_frame),
)
# The first row of sum-bucket 2 still belongs logically to bucket 1.
stamped = stamped.withColumn(
    "sum",
    F.when((F.row_number().over(per_sum) == 1) & (F.col("sum") == 2), F.lit(1))
     .otherwise(F.col("sum")),
)

# Candidate open prices: close_price lagged by 2 and by 3 rows.
stamped = stamped.withColumn("lag1", F.lag("close_price", 2).over(whole_frame))
stamped = stamped.withColumn("lag2", F.lag("close_price", 3).over(whole_frame))

# In bucket 1 use the 2-row lag (when available); in later buckets the 3-row
# lag; default to 0 otherwise.
stamped = stamped.withColumn(
    "open_price",
    F.when((F.col("sum") == 1) & (F.col("lag1").isNotNull()), F.col("lag1"))
     .when(F.col("sum") != 1, F.col("lag2"))
     .otherwise(F.lit(0)),
)
# A status's first occurrence always opens at 0.
stamped = stamped.withColumn(
    "open_price",
    F.when(F.col("rowNum") == 1, F.lit(0)).otherwise(F.col("open_price")),
)

stamped.orderBy("mono_id").drop("mono_id", "lag1", "lag2", "rowNum").show()
#+------+----+-----------+---+----------+
#|status|year|close_price|sum|open_price|
#+------+----+-----------+---+----------+
#|    s1|   0|        1.2|  1|       0.0|
#|    s1|   0|        2.2|  1|       0.0|
#|    s1|   1|        3.2|  1|       1.2|
#|    s1|   1|        4.2|  1|       2.2|
#|    s2|   1|        5.2|  1|       0.0|
#|    s1|   2|        6.2|  2|       3.2|
#|    s1|   2|        7.2|  2|       4.2|
#+------+----+-----------+---+----------+