I'm running a groupby on a 15M-row dataframe, grouping on 2 keys (up to 30 characters each) and applying a custom aggregation function that returns multiple values, then writing the result to CSV. I have two approaches: one fails by running out of memory, and the other works but is extremely slow (it takes ~24 hours to run...). Simplified code below.

Is there:
- any way to reduce the memory usage of approach 1, OR
- any way to speed up the iteration in approach 2?
import pandas as pd
import numpy as np
def myfunct(x):
    # test function
    return 1, 2, 3
# fake the dataset
df = pd.DataFrame(np.random.randn(1500,3), columns=['a', 'b', 'c'])
df['key1'] = np.random.choice(['A','B','C','D','E'], df.shape[0])
df['key2'] = np.random.choice(['A','B','C','D','E'], df.shape[0])
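# NOTE: the real dataset is 15M rows and the real key1/key2 values are
# up to 30 characters, so this toy frame understates the problem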
# group and aggregate
grouped = df.groupby(['key1', 'key2'], sort=False)
#
# APPROACH 1.
#
# Works but runs out of memory on the 15M row dataframe with 30 char key1/key2
#
f = {'a':['mean', 'count'], 'b':['mean'], 'c':myfunct}
grouped = grouped.aggregate(f)
grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]
# print(grouped.head(5))
# split out columns - expensive?
grouped[['c_myfunct_1', 'c_myfunct_2', 'c_myfunct_3']] = grouped['c_myfunct'].apply(pd.Series)
grouped.drop('c_myfunct', axis=1, inplace=True)
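# Possibly cheaper split I'm considering instead of the two lines above
# (sketch, not profiled): build all three columns in one pass from the
# tuples, avoiding the per-row Series that .apply(pd.Series) creates:
# c_cols = pd.DataFrame(grouped['c_myfunct'].tolist(), index=grouped.index,
#                       columns=['c_myfunct_1', 'c_myfunct_2', 'c_myfunct_3'])
# grouped = grouped.drop('c_myfunct', axis=1).join(c_cols)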
# print(grouped.head(5))
# write to file
grouped.to_csv('test1.csv')
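# Idea for approach 1's memory use (sketch, untested at the full 15M
# rows): cast the long string keys to categoricals so the groupby works
# on integer codes rather than 30-char objects; observed=True (pandas
# >= 0.23) keeps unobserved key combinations out of the result:
# df['key1'] = df['key1'].astype('category')
# df['key2'] = df['key2'].astype('category')
# grouped = df.groupby(['key1', 'key2'], sort=False, observed=True).aggregate(f)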
#
# APPROACH 2.
#
# Alternate approach works fine but is extremely slow!
#
grouped_again = df.groupby(['key1', 'key2'], sort=False)
header_ = True
with open('test2.csv', 'a') as f:
    for keys, group in grouped_again:
        (k1, k2) = keys
        # re-filter the full frame for this key pair - this rescans all
        # 15M rows on every iteration
        temp_df = df.loc[(df['key1'] == k1) & (df['key2'] == k2)].groupby(['key1', 'key2'], sort=False)
        temp_df = temp_df.aggregate({'a': ['mean', 'count'], 'b': ['mean'], 'c': myfunct})
        temp_df.columns = ['_'.join(col).strip() for col in temp_df.columns.values]
        # write the header only for the first group
        temp_df.to_csv(f, float_format='%.3f', header=header_)
        header_ = False
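#
# APPROACH 3?
#
# Variant of approach 2 I'm considering (sketch, not timed at full
# scale): aggregate the group that the iterator already yields instead
# of re-filtering the full frame for every key pair, so each iteration
# only touches that group's rows. The output name test3.csv is just a
# placeholder. Is this guaranteed to give the same output as approach 2?
#
header_ = True
with open('test3.csv', 'w') as f:
    for keys, group in df.groupby(['key1', 'key2'], sort=False):
        temp_df = group.groupby(['key1', 'key2'], sort=False).aggregate(
            {'a': ['mean', 'count'], 'b': ['mean'], 'c': myfunct})
        temp_df.columns = ['_'.join(col).strip() for col in temp_df.columns.values]
        temp_df.to_csv(f, float_format='%.3f', header=header_)
        header_ = False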