import pyspark
import pandas as pd  # used below for the date-range helper
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
print(pyspark.__version__)
>> 2.3.1
I have historical booking data in the following format, containing the demand, the date, the food type, and the number of days out at which the booking was made:
test_df.csv
location,date,daysout,foodtype,demand
NYC,2016-01-01,2,PIZZA,5
NYC,2016-01-01,1,PIZZA,2
NYC,2016-01-01,5,PIZZA,2
NYC,2016-01-02,5,TACOS,3
NYC,2016-01-02,3,PIZZA,2
MIA,2016-01-01,2,TACOS,2
MIA,2016-01-01,1,TACOS,1
MIA,2016-01-01,5,TACOS,3
MIA,2016-01-02,2,PIZZA,4
MIA,2016-01-01,3,TACOS,5
Loaded as a DataFrame:
schema = StructType([
    StructField("location", StringType(), True),
    StructField("date", DateType(), True),
    StructField("daysout", IntegerType(), True),
    StructField("foodtype", StringType(), True),
    StructField("demand", IntegerType(), True),
])
demand = sqlContext.read.format("csv") \
    .load("../data/test_demand.csv", header='true', schema=schema) \
    .select("location", "date", "daysout", "foodtype", "demand")
demand.show()
+--------+----------+-------+--------+------+
|location| date|daysout|foodtype|demand|
+--------+----------+-------+--------+------+
| NYC|2016-01-01| 2| PIZZA| 5|
| NYC|2016-01-01| 1| PIZZA| 2|
| NYC|2016-01-01| 5| PIZZA| 2|
| NYC|2016-01-02| 5| TACOS| 3|
| NYC|2016-01-02| 3| PIZZA| 2|
| MIA|2016-01-01| 2| TACOS| 2|
| MIA|2016-01-01| 1| TACOS| 1|
| MIA|2016-01-01| 5| TACOS| 3|
| MIA|2016-01-02| 2| PIZZA| 4|
| MIA|2016-01-01| 3| TACOS| 5|
+--------+----------+-------+--------+------+
I want to transform this into a remaining-demand format, so that I can supply a range of locations, dates, and days out and get a result like the one below. Here remaining_demand is the running total of demand within each location/date/foodtype group, ordered by increasing daysout; e.g., for NYC / 2016-01-01 / PIZZA the demand by daysout is 0, 2, 5, 0, 0, 2, giving running totals 0, 2, 7, 7, 7, 9.
Expected result
+--------+----------+-------+--------+------+----------------+
|location| date|daysout|foodtype|demand|remaining_demand|
+--------+----------+-------+--------+------+----------------+
| MIA|2016-01-01| 0| PIZZA| 0| 0|
| MIA|2016-01-01| 1| PIZZA| 0| 0|
| MIA|2016-01-01| 2| PIZZA| 0| 0|
| MIA|2016-01-01| 3| PIZZA| 0| 0|
| MIA|2016-01-01| 4| PIZZA| 0| 0|
| MIA|2016-01-01| 5| PIZZA| 0| 0|
| NYC|2016-01-01| 0| PIZZA| 0| 0|
| NYC|2016-01-01| 1| PIZZA| 2| 2|
| NYC|2016-01-01| 2| PIZZA| 5| 7|
| NYC|2016-01-01| 3| PIZZA| 0| 7|
| NYC|2016-01-01| 4| PIZZA| 0| 7|
| NYC|2016-01-01| 5| PIZZA| 2| 9|
| MIA|2016-01-02| 0| PIZZA| 0| 0|
| MIA|2016-01-02| 1| PIZZA| 0| 0|
| MIA|2016-01-02| 2| PIZZA| 4| 4|
| MIA|2016-01-02| 3| PIZZA| 0| 4|
| MIA|2016-01-02| 4| PIZZA| 0| 4|
| MIA|2016-01-02| 5| PIZZA| 0| 4|
| NYC|2016-01-02| 0| TACOS| 0| 0|
| NYC|2016-01-02| 1| TACOS| 0| 0|
| NYC|2016-01-02| 2| TACOS| 0| 0|
| NYC|2016-01-02| 3| TACOS| 0| 0|
| NYC|2016-01-02| 4| TACOS| 0| 0|
| NYC|2016-01-02| 5| TACOS| 3| 3|
| MIA|2016-01-01| 0| TACOS| 0| 0|
| MIA|2016-01-01| 1| TACOS| 1| 1|
| MIA|2016-01-01| 2| TACOS| 2| 3|
| MIA|2016-01-01| 3| TACOS| 5| 8|
| MIA|2016-01-01| 4| TACOS| 0| 8|
| MIA|2016-01-01| 5| TACOS| 3| 11|
| MIA|2016-01-02| 0| TACOS| 0| 0|
| MIA|2016-01-02| 1| TACOS| 0| 0|
| MIA|2016-01-02| 2| TACOS| 0| 0|
| MIA|2016-01-02| 3| TACOS| 0| 0|
| MIA|2016-01-02| 4| TACOS| 0| 0|
| MIA|2016-01-02| 5| TACOS| 0| 0|
| NYC|2016-01-02| 0| PIZZA| 0| 0|
| NYC|2016-01-02| 1| PIZZA| 0| 0|
| NYC|2016-01-02| 2| PIZZA| 0| 0|
| NYC|2016-01-02| 3| PIZZA| 2| 2|
| NYC|2016-01-02| 4| PIZZA| 0| 2|
| NYC|2016-01-02| 5| PIZZA| 0| 2|
| NYC|2016-01-01| 0| TACOS| 0| 0|
| NYC|2016-01-01| 1| TACOS| 0| 0|
| NYC|2016-01-01| 2| TACOS| 0| 0|
| NYC|2016-01-01| 3| TACOS| 0| 0|
| NYC|2016-01-01| 4| TACOS| 0| 0|
| NYC|2016-01-01| 5| TACOS| 0| 0|
+--------+----------+-------+--------+------+----------------+
My current solution involves pandas, but it does not work when I test it on the real dataset.
My inefficient approach
start_stay_date = '2016-01-01'
end_stay_date = '2016-01-02'
# Create date-range possibilities
daterange = pd.date_range(start_stay_date, end_stay_date)
daterange_df = pd.DataFrame(daterange, columns=['date'])
daterange_df['date'] = pd.to_datetime(daterange_df['date'])
datesschema = StructType([StructField("date", DateType(), True)])
dates_df = sqlContext.createDataFrame(daterange_df, schema=datesschema)
# List of potential days left
daysleft_df = pd.DataFrame(list(range(0, 6)), columns=['daysout'])
dl_schema = StructType([StructField("daysout", IntegerType(), True)])
dl = sqlContext.createDataFrame(daysleft_df, schema=dl_schema)
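# NOTE: a sketch of a pure-Spark alternative to the two pandas round-trips
# above (untested; assumes the to_date/date_add SQL functions in Spark 2.3):
#   from datetime import datetime as dt
#   n_days = (dt.strptime(end_stay_date, '%Y-%m-%d')
#             - dt.strptime(start_stay_date, '%Y-%m-%d')).days + 1  # inclusive
#   dates_df = sqlContext.range(0, n_days).selectExpr(
#       "date_add(to_date('{}'), cast(id as int)) as date".format(start_stay_date))
#   dl = sqlContext.range(0, 6).selectExpr("cast(id as int) as daysout")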
# Create SQL tables
demand.createOrReplaceTempView("demand")
dates_df.createOrReplaceTempView("alldates")
dl.createOrReplaceTempView("daysleft")
# Cross join
distinct_df = sqlContext.sql("""
SELECT loc.location,
dates.date,
dl.daysout,
food.foodtype,
demand.demand
FROM (SELECT distinct location FROM demand) loc cross join
(SELECT distinct foodtype FROM demand) food cross join
(SELECT distinct date FROM alldates) dates cross join
(SELECT distinct daysout FROM daysleft) dl
left join
demand
on demand.location = loc.location and
demand.date = dates.date and
demand.daysout = dl.daysout and
demand.foodtype = food.foodtype
""")
distinct_df = distinct_df.fillna(0)
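# NOTE: a sketch of the same cross join via the DataFrame API instead of
# temp views (untested; relies on crossJoin, available since Spark 2.1):
#   distinct_df = (demand.select('location').distinct()
#                  .crossJoin(demand.select('foodtype').distinct())
#                  .crossJoin(dates_df)
#                  .crossJoin(dl)
#                  .join(demand, ['location', 'date', 'daysout', 'foodtype'], 'left')
#                  .fillna(0, subset=['demand']))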
# Calculate cumsum using window partitionby
windowval = (Window.partitionBy(['location', 'foodtype', 'date']).orderBy('daysout')
.rangeBetween(Window.unboundedPreceding, 0))
rem_demand = distinct_df.withColumn('remaining_demand', sum('demand').over(windowval))  # sum here is pyspark.sql.functions.sum (star import)
rem_demand.show(50)
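One detail about the window: after the cross join each (location, date, foodtype, daysout) combination occurs exactly once, so rangeBetween and rowsBetween produce the same running total here; if duplicate daysout rows were possible, rowsBetween would be needed for a strict row-by-row cumulative sum. An equivalent formulation of the last step (a sketch, using a module alias so Python's built-in sum is not shadowed by the star import):

from pyspark.sql import functions as F

windowval = (Window.partitionBy('location', 'foodtype', 'date')
             .orderBy('daysout')
             .rowsBetween(Window.unboundedPreceding, Window.currentRow))
rem_demand = distinct_df.withColumn('remaining_demand', F.sum('demand').over(windowval))
rem_demand.orderBy('location', 'foodtype', 'date', 'daysout').show(50)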