When I have a DataFrame in pandas such as:
import pandas as pd

raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'name': ['A', 'B', 'C', 'D', 'E'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a'],
    'target': [0, 0, 0, 1, 1],
    'age_group': [1, 2, 1, 3, 1]}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'])
df_a['nationality'] = df_a['nationality'].astype('category')
df_a['alotdifferent'] = df_a['alotdifferent'].astype('category')
df_a['name'] = df_a['name'].astype('category')
Currently, I use:
FACTOR_FIELDS = df_a.select_dtypes(include=['category']).columns
columnsToDrop = ['alotdifferent']
columnsToBias_keep = FACTOR_FIELDS[~FACTOR_FIELDS.isin(columnsToDrop)]
target = 'target'
def quotients_slow(df_a):
    # parallelism = 8
    # original = dd.from_pandas(df.copy())
    original = df_a.copy()
    output_df = original
    ratio_weights = {}
    for colname in columnsToBias_keep.union(columnsToDrop):
        # group only a single time
        grouped = original.groupby([colname, target]).size()
        # first quotient: group size relative to the total number of target == 1 rows
        df = grouped / original[target].sum()
        nameCol = "pre_" + colname
        grouped_res = df.reset_index(name=nameCol)
        grouped_res = grouped_res[grouped_res[target] == 1]
        grouped_res = grouped_res.drop(columns=target)
        # todo persist the result in dict for transformer
        result_1 = grouped_res
        # second quotient: share of each target value within the column's group
        df = (grouped / grouped.groupby(level=0).sum())
        nameCol_2 = "pre2_" + colname
        grouped = df.reset_index(name=nameCol_2)
        grouped_res = grouped[grouped[target] == 1]
        grouped_res = grouped_res.drop(columns=target)
        result_2 = grouped_res
        # persist the result in dict for transformer
        # this is required to separate fit and transform stage (later on in a sklearn transformer)
        ratio_weights[nameCol] = result_1
        ratio_weights[nameCol_2] = result_2
        # retrieve results
        res_1 = ratio_weights['pre_' + colname]
        res_2 = ratio_weights['pre2_' + colname]
        # merge ratio_weight with original dataframe
        output_df = pd.merge(output_df, res_1, on=colname, how='left')
        output_df = pd.merge(output_df, res_2, on=colname, how='left')
        output_df.loc[output_df[nameCol].isnull(), nameCol] = 0
        output_df.loc[output_df[nameCol_2].isnull(), nameCol_2] = 0
        if colname in columnsToDrop:
            output_df = output_df.drop(columns=colname)
    return output_df
quotients_slow(df_a)
which computes, for every (categorical) column, the ratio of each group to target: 1 in two different ways. Since I want to perform this operation for multiple columns, I naively iterate over all of them. But this operation is very slow: in the example, 10 loops, best of 3: 37 ms per loop. For my real dataset of roughly 500,000 rows and about 100 columns, it really takes a while.
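Concretely, for nationality == 'US' in the example above (sum(target) is 2 and the US group has targets [0, 1, 1]), the two quotients can be checked by hand:

us = df_a[df_a['nationality'] == 'US']
print(us['target'].sum() / df_a['target'].sum())  # first quotient: 2 / 2 = 1.0
print(us['target'].sum() / len(us))               # second quotient: 2 / 3 ≈ 0.667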
Shouldn't it be possible to speed this up in dask or pandas (per-column, i.e. embarrassingly parallel)? Is it possible to implement it more efficiently in plain pandas? Is it possible to reduce the number of passes over the data needed to compute the quotients?
When trying to use dask.delayed in the for loop to achieve parallelism over the columns, I can't figure out how to build the graph over the columns, because I need to call compute to get the tuple:
# compute_weights (definition not shown) performs the fit step for a single
# column; judging from the unpacking below it returns (result_1, result_2, nameCol, nameCol_2)
delayed_res_name = delayed(compute_weights)(df_a, 'name')
a, b, c, d = delayed_res_name.compute()
ratio_weights = {}
ratio_weights[c] = a
ratio_weights[d] = b
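(For reference, the usual dask pattern is to collect the delayed objects first and trigger a single dask.compute at the end, instead of calling .compute() inside the loop. A minimal sketch, assuming compute_weights returns the (result_1, result_2, nameCol, nameCol_2) tuple described above:)

from dask import compute, delayed

delayed_results = [delayed(compute_weights)(df_a, col)
                   for col in columnsToBias_keep.union(columnsToDrop)]
# one compute call evaluates the whole graph, with one task per column
ratio_weights = {}
for a, b, c, d in compute(*delayed_results):
    ratio_weights[c] = a
    ratio_weights[d] = b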
Answer 0 (score: 1)
Here is a reasonably fast pandas solution for your first quotient. It assumes that you are not interested in computing the proportions for subject_id. I also added some data to your example to cover more edge cases.

First, generate the sample data:
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5', '6', '7'],
    'name': ['A', 'B', 'C', 'D', 'E', 'A', 'A'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US', 'DE', 'DE'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a', 'x', 'z'],
    'target': [0, 0, 0, 1, 1, 0, 1],
    'age_group': [1, 2, 1, 3, 1, 2, 1]}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'])
Now compute the proportions and time it:
def compute_prop(group):
    # proportion of target == 1 within the group
    return group.sum() / float(group.count())

def build_master(df):
    master = df.copy()
    fields = df.drop(columns=['subject_id', 'target']).columns
    for field in fields:
        master = pd.merge(master,
                          df.groupby(field, as_index=False)
                            .agg({'target': compute_prop})
                            .rename(columns={'target': 'pre_{}'.format(field)}),
                          on=field)
    # sort_values returns a new frame, so the result has to be reassigned
    master = master.sort_values('subject_id')
    return master
%timeit master = build_master(df_a)
10 loops, best of 3: 17.1 ms per loop
Output:
  subject_id name nationality alotdifferent  target  age_group  pre_name  \
0          1    A          DE             x       0          1  0.333333
5          2    B         AUT             y       0          2  0.000000
2          3    C          US             z       0          1  0.000000
6          4    D          US             x       1          3  1.000000
3          5    E          US             a       1          1  1.000000
4          6    A          DE             x       0          2  0.333333
1          7    A          DE             z       1          1  0.333333

   pre_nationality  pre_alotdifferent  pre_age_group
0         0.333333           0.333333            0.5
5         0.000000           0.000000            0.0
2         0.666667           0.500000            0.5
6         0.666667           0.333333            1.0
3         0.666667           1.000000            0.5
4         0.333333           0.333333            0.0
1         0.333333           0.500000            0.5
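As a side note, the same per-column proportions can be attached without any merges via groupby(...).transform('mean'), which broadcasts the within-group mean of target back onto every row; on a wide frame this may be faster still. A sketch under the same assumptions (skipping subject_id):

def build_master_transform(df):
    master = df.copy()
    fields = df.drop(columns=['subject_id', 'target']).columns
    for field in fields:
        # the mean of a 0/1 target is exactly the proportion computed by compute_prop
        master['pre_{}'.format(field)] = df.groupby(field)['target'].transform('mean')
    return master.sort_values('subject_id')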