我正在尝试把下面的 SQL 代码迁移到 PySpark,能帮帮我吗?我是 PySpark 新手,而且时间很紧。
这些表我都已经创建好了,所以代码里不需要再创建数据框(DataFrame)。我遇到的主要困难在第二个和第三个查询:它们需要在同一张表上先计算求和列,再与其他属性一起使用,而这两个查询我都没能转换成功。
-- Total SU per (data provider, customer key, product key, category,
-- customer name, period end date) from KPI_BOP_STEP1, restricted to data
-- providers that have at least one KPI_BOP_TARGET row defined at the
-- CUSTOMER / CATEGORY level.
SELECT
    attr_data_provider_key,
    CUST_KEY,
    prod_key,
    category_name,
    cust_name,
    time_period_end_date,
    SUM(SU) AS SU
FROM KPI_BOP_STEP1
-- String literals are single-quoted: double quotes denote identifiers in
-- ANSI SQL, so "CUSTOMER" could be read as a column name on a strict engine.
WHERE attr_data_provider_key IN (
    SELECT attr_data_provider_key
    FROM KPI_BOP_TARGET
    WHERE CUSTOMER_TEAM_BOP_LVL = 'CUSTOMER'
      AND PRODUCT_BOP_LVL = 'CATEGORY'
)
GROUP BY
    attr_data_provider_key,
    CUST_KEY,
    prod_key,
    category_name,
    cust_name,
    time_period_end_date;
-- For every KPI_BOP_STEP2 row, compute its SU as a fraction (SHARE) of the
-- total SU of its (data provider, customer name, category, period end date)
-- group.
WITH group_total AS (
    -- One row per group with the summed SU used as the share denominator.
    SELECT
        attr_data_provider_key,
        cust_name,
        category_name,
        time_period_end_date,
        SUM(SU) AS SUM_SU
    FROM KPI_BOP_STEP2
    GROUP BY
        attr_data_provider_key,
        cust_name,
        category_name,
        time_period_end_date
)
SELECT
    BOP1.time_period_end_date,
    BOP1.attr_data_provider_key,
    BOP1.CUST_KEY,
    BOP1.prod_key,
    BOP1.category_name,
    BOP1.cust_name,
    -- NULLIF guards against a zero group total: SHARE becomes NULL instead of
    -- raising a division-by-zero error on engines that enforce ANSI arithmetic.
    BOP1.su / NULLIF(BOP2.SUM_su, 0) AS SHARE
FROM KPI_BOP_STEP2 AS BOP1
-- LEFT JOIN keeps rows whose grouping keys contain NULL (they find no match
-- under equality, so their SHARE is NULL).
LEFT JOIN group_total AS BOP2
    ON BOP1.attr_data_provider_key = BOP2.attr_data_provider_key
   AND BOP1.cust_name = BOP2.cust_name
   AND BOP1.category_name = BOP2.category_name
   AND BOP1.time_period_end_date = BOP2.time_period_end_date;
-- Scale each share row by the matching BOP volume to produce TARGET.
-- Rows are matched on data provider, customer team/name, product/category,
-- and the last day of the BOP month against the period end date.
-- NOTE(review): the selected columns are unqualified in the original;
-- presumably SHARE comes from KPI_BOP_CUSTCAT and BOP_VOLUME_MSU from
-- KPI_BOP_TARGET -- confirm against the table schemas before qualifying them.
SELECT
    time_period_end_date,
    CUST_KEY,
    prod_key,
    SHARE * BOP_VOLUME_MSU AS TARGET
FROM KPI_BOP_CUSTCAT AS KPI
INNER JOIN KPI_BOP_TARGET AS BOP
    ON BOP.attr_data_provider_key = KPI.attr_data_provider_key
   AND BOP.CUSTOMER_TEAM = KPI.cust_name
   AND BOP.PRODUCT = KPI.category_name
   AND last_day(BOP.BOP_MONTH) = KPI.time_period_end_date;