我想根据现有表中的条件，为另一张表中的空值/NaN 单元格填入数值。请参见下面附上的示例。
现有数据
import pandas as pd
# Sample data for the question: one row per (Date, ID, Individual).
col_names = ['Date', 'ID', 'Individual', 'Category', 'Age', 'DW', 'Gender']
my_df = pd.DataFrame(columns=col_names)

# The '...' entries are elision placeholders for the days that are not listed;
# every column carries exactly one, so they line up as a single '...' row.
# NOTE: the opening '[' of this list was missing, which made the statement a
# syntax error.
my_df['Date'] = [
    2112019, 2112019, 2112019, 2112019, 2112019,
    2112019, 2112019, 2112019, 2112019, 2112019,
    3112019, 3112019, 3112019, 3112019, 3112019,
    3112019, 3112019, 3112019, 3112019, 3112019,
    '...',
    8112019, 8112019, 8112019, 8112019, 8112019,
    8112019, 8112019, 8112019, 8112019, 8112019,
]
my_df['ID'] = [
    1, 1, 1, 2, 2, 2, 2, 3, 3, 3,
    1, 1, 1, 2, 2, 2, 2, 3, 3, 3,
    '...',
    1, 1, 1, 2, 2, 2, 2, 3, 3, 3,
]
my_df['Individual'] = [
    1, 2, 3, 1, 2, 3, 4, 1, 2, 3,
    1, 2, 3, 1, 2, 3, 4, 1, 2, 3,
    '...',
    1, 2, 3, 1, 2, 3, 4, 1, 2, 3,
]
my_df['Category'] = [
    'DE', 'DE', 'DE', 'C', 'C', 'C', 'C', 'A', 'A', 'A',
    'DE', 'DE', 'DE', 'C', 'C', 'C', 'C', 'A', 'A', 'A',
    '...',
    'DE', 'DE', 'DE', 'C', 'C', 'C', 'C', 'A', 'A', 'A',
]
my_df['Age'] = [
    '51-60', '02-14', '31-40', '02-14', '31-40',
    '15-21', '22-30', '60+', '22-30', '02-14',
    '51-60', '02-14', '31-40', '02-14', '31-40',
    '15-21', '22-30', '60+', '22-30', '02-14',
    '...',
    '51-60', '02-14', '31-40', '02-14', '31-40',
    '15-21', '22-30', '60+', '22-30', '02-14',
]
my_df['DW'] = [
    6554, 7875, 10063, 5661, 7851, 10063, 6552, 2365, 8569, 7875,
    6554, 7875, 10063, 5661, 7875,
    '...',
    6554, 7875, 10063, 5661, 7851, 10063, 6552, 2365, 8569, 7875,
    6554, 7875, 10063, 5661, 7875,
]
my_df['Gender'] = [
    'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M',
    'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M',
    '...',
    'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M',
]
输出（O/P）：
Date ID Individual Category Age DW Gender
0 2112019 1 1 DE 51-60 6554 M
1 2112019 1 2 DE 02-14 7875 F
2 2112019 1 3 DE 31-40 10063 F
3 2112019 2 1 C 02-14 5661 M
4 2112019 2 2 C 31-40 7851 M
5 2112019 2 3 C 15-21 10063 F
6 2112019 2 4 C 22-30 6552 M
7 2112019 3 1 A 60+ 2365 F
8 2112019 3 2 A 22-30 8569 F
9 2112019 3 3 A 02-14 7875 M
10 3112019 1 1 DE 51-60 6554 M
11 3112019 1 2 DE 02-14 7875 F
12 3112019 1 3 DE 31-40 10063 F
13 3112019 2 1 C 02-14 5661 M
14 3112019 2 2 C 31-40 7875 M
15 3112019 2 3 C 15-21 10063 F
16 3112019 2 4 C 22-30 5661 M
17 3112019 3 1 A 60+ 2365 F
18 3112019 3 2 A 22-30 8569 F
19 3112019 3 3 A 02-14 7875 M
20 ... ... ... ... ... ... ...
21 8112019 1 1 DE 51-60 6554 M
22 8112019 1 2 DE 02-14 7875 F
23 8112019 1 3 DE 31-40 10063 F
24 8112019 2 1 C 02-14 5661 M
25 8112019 2 2 C 31-40 7851 M
26 8112019 2 3 C 15-21 10063 F
27 8112019 2 4 C 22-30 6552 M
28 8112019 3 1 A 60+ 2365 F
29 8112019 3 2 A 22-30 8569 F
30 8112019 3 3 A 02-14 7875 M
我想使用与上表不同的组合条件生成下表:
# Result template: a 'Target' description column followed by Day1..Day7, which
# stay empty (NaN) until they are filled in.
col = ['Target'] + ['Day%d' % day for day in range(1, 8)]
new_df = pd.DataFrame(columns=col)
new_df['Target'] = [
    'A-Category & Age 22+',
    'F-Female & ABC-Category & Age <21',
    'M & Age 22-30',
    '...',
]
new_df
Target Day1 Day2 Day3 Day4 Day5 Day6 Day7
0 A-Category & Age 22+ NaN NaN NaN NaN NaN NaN NaN
1 F-Female & ABC-Category & Age <21 NaN NaN NaN NaN NaN NaN NaN
2 M & Age 22-30 NaN NaN NaN NaN NaN NaN NaN
3 ... NaN NaN NaN NaN NaN NaN NaN
我想根据日期以及 Target 列中的不同条件（如上例所示），把 WT 的总和填入表中每一天对应的列。
答案 0 :(得分:0)
您没有WT列,因此我们现在不知道它是什么。但是,对于本示例,我将使用DW列作为聚合列。您可以根据需要进行更改。
import pandas as pd
col_names = ['Date', 'ID', 'Individual', 'Category', 'Age', 'DW', 'Gender']

# The sample holds the same ten (ID, Individual) rows for each of three dates,
# so most columns are a ten-value pattern repeated three times. DW is the one
# exception: it is a fifteen-value run repeated twice.
_sample_columns = {
    'Date': [day for day in (2112019, 3112019, 8112019) for _ in range(10)],
    'ID': [1, 1, 1, 2, 2, 2, 2, 3, 3, 3] * 3,
    'Individual': [1, 2, 3, 1, 2, 3, 4, 1, 2, 3] * 3,
    'Category': (['DE'] * 3 + ['C'] * 4 + ['A'] * 3) * 3,
    'Age': [
        '51-60', '02-14', '31-40', '02-14', '31-40',
        '15-21', '22-30', '60+', '22-30', '02-14',
    ] * 3,
    'DW': [
        6554, 7875, 10063, 5661, 7851, 10063, 6552, 2365, 8569, 7875,
        6554, 7875, 10063, 5661, 7875,
    ] * 2,
    'Gender': ['M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M'] * 3,
}

# Start from an empty frame and fill it column by column, in col_names order.
my_df = pd.DataFrame(columns=col_names)
for _name in col_names:
    my_df[_name] = _sample_columns[_name]

# Result template: target descriptions plus seven still-empty day columns.
col = ['Target'] + ['Day%d' % day for day in range(1, 8)]
new_df = pd.DataFrame(columns=col)
new_df['Target'] = [
    'A-Category & Age 22+',
    'F-Female & ABC-Category & Age <21',
    'M & Age 22-30',
    '...',
]
创建一个字典列表，其中包含所有需要匹配的条件。由于您的数据中没有任何 ABC 类别，因此我跳过了列表中的第二个示例。如果您指的是这三个类别中的任意一个，则必须对代码稍作修改。
# One dict per Target row. A column key maps either to the single value that
# column must equal or to 'any' (no restriction); age_min / age_max are plain
# numbers and 'Target' is the label for the resulting row.
groups = [
    {
        'ID': 'any',
        'Individual': 'any',
        'Category': 'A',
        'age_min': 22,
        'age_max': 100,
        'Gender': 'any',
        'Target': 'A-Category & Age 22+',
    },
    {
        'ID': 'any',
        'Individual': 'any',
        'Category': 'any',
        'age_min': 22,
        'age_max': 30,
        'Gender': 'M',
        'Target': 'M & Age 22-30',
    },
]

# For every group, build a list of allowed-value lists in the group's key
# order: 'any' expands to all distinct values of that my_df column, anything
# else becomes a one-element list.
condition_list = [
    [
        list(my_df[field].unique()) if wanted == 'any' else [wanted]
        for field, wanted in spec.items()
    ]
    for spec in groups
]
遍历条件列表：对数据框进行筛选切片、分组、对聚合列求和、做透视（pivot），然后追加到最终的数据框中。
def _age_bounds(label):
    """Return the (lowest, highest) age named in a bucket label.

    '02-14' gives (2, 14); an open-ended label such as '60+' gives (60, 60).
    The parts are converted to int *before* min/max so the comparison is
    numeric rather than lexicographic (which is only right for zero-padded
    labels).
    """
    ages = [int(part) for part in label.replace('+', '').split('-')]
    return min(ages), max(ages)


# The age bounds do not depend on the condition, so parse them once up front.
age_low = my_df['Age'].map(lambda label: _age_bounds(label)[0])
age_high = my_df['Age'].map(lambda label: _age_bounds(label)[1])

# One single-row pivot (index: target label, columns: dates) per condition.
pivots = []
for condition in condition_list:
    # Positional layout of a condition: allowed IDs, Individuals, Categories,
    # [age_min], [age_max], allowed Genders, [target label].
    ids, individuals, categories, age_min, age_max, genders, target = condition[:7]
    mask = (
        my_df['ID'].isin(ids)
        & my_df['Individual'].isin(individuals)
        & my_df['Category'].isin(categories)
        & (age_low >= age_min[0])
        & (age_high <= age_max[0])
        & my_df['Gender'].isin(genders)
    )
    t = my_df[mask]
    if t.empty:
        # Nothing matched: an empty pivot would contribute no rows anyway.
        continue
    pivots.append(
        # assign() works on a copy, so the slice of my_df is never written to.
        t.assign(Target=target[0])
        .groupby(['Target', 'Date'])['DW']
        .sum()
        .reset_index()
        .pivot(index='Target', columns='Date', values='DW')
    )

# DataFrame.append was removed in pandas 2.0; concatenate all pivots at once.
# The rows are indexed by target label; with no matches the result is the
# empty frame holding just a 'Target' column.
output = pd.concat(pivots) if pivots else pd.DataFrame(columns=['Target'])
分配目标列
# Copy the index labels into the 'Target' column, then drop that index in
# favour of a fresh positional one.
output = output.assign(Target=output.index).reset_index(drop=True)
输出
2112019 3112019 8112019 Target
0 10934.0 15724.0 15724.0 A-Category & Age 22+
1 6552.0 7875.0 7875.0 M & Age 22-30