pandas - group by:使用多列创建聚合函数

时间:2018-04-13 05:44:14

标签: python-2.7 pandas aggregate-functions pandas-groupby

我有以下数据框:

id    my_year   my_month    waiting_time   target
001     2018       1           95            1
002     2018       1           3             3
003     2018       1           4             0
004     2018       1           40            1
005     2018       2           97            1
006     2018       2           3             3
007     2018       3           4             0
008     2018       3           40            1

我想按 my_year 和 my_month 分组,然后在每个组中按以下公式计算 my_rate:
(# of records with waiting_time <= 90 and target = 1)/ total_records in the group

也就是说,我期望得到如下输出:

my_year  my_month    my_rate
 2018     1          0.25
 2018     2          0.0
 2018     3          0.5

我编写了以下代码来计算所需的 my_rate 值:

def my_rate(data):
    """Intended to return the share of rows in ``data`` that have
    waiting_time <= 90 and target == 1, as a float.

    ``data`` is the per-group object handed over by ``groupby(...).apply``.
    NOTE(review): as written this function does not work; see the notes on
    the loop below.
    """
    # These are pandas Series, not Python lists: they keep the index labels
    # of the rows that belong to the group.
    waiting_time_list = data['waiting_time']
    target_list = data['target']

    total = len(data)
    my_count = 0

    for i in range(len(data)):
      # NOTE(review): `total_waiting_time_list` is never assigned in this
      # function; `waiting_time_list` was presumably intended.
      # NOTE(review): `series[i]` here is resolved against the index labels,
      # not positions, so it raises KeyError for any group whose labels do
      # not contain 0..len(data)-1.
      if total_waiting_time_list[i] <= 90 and target_list[i] == 1:
        my_count += 1

    # float() on both operands avoids Python 2 integer division.
    rate = float(my_count)/float(total)
    return rate

df.groupby(['my_year','my_month']).apply(my_rate)

但是,我收到以下错误:

KeyError 0 
KeyErrorTraceback (most recent call last)
<ipython-input-29-5c4399cefd05> in <module>()
     17 
---> 18 df.groupby(['my_year','my_month']).apply(my_rate)

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
    714         # ignore SettingWithCopy here in case the user mutates
    715         with option_context('mode.chained_assignment', None):
--> 716             return self._python_apply_general(f)
    717 
    718     def _python_apply_general(self, f):

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _python_apply_general(self, f)
    718     def _python_apply_general(self, f):
    719         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720                                                    self.axis)
    721 
    722         return self._wrap_applied_output(

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, f, data, axis)
   1727             # group might be modified
   1728             group_axes = _get_axes(group)
-> 1729             res = f(group)
   1730             if not _is_indexed_like(res, group_axes):
   1731                 mutated = True

<ipython-input-29-5c4399cefd05> in conversion_rate(data)
      8       #print total_waiting_time_list[i],  target_list[i]
      9       #print i, total_waiting_time_list[i], target_list[i]
---> 10       if total_waiting_time_list[i] <= 90:# and target_list[i] == 1:
     11         convert_90_count += 1
     12         #print 'convert ', convert_90_count

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_value(self, series, key)
   2426         try:
   2427             return self._engine.get_value(s, k,
-> 2428                                           tz=getattr(series.dtype, 'tz', None))
   2429         except KeyError as e1:
   2430             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4363)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4046)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5085)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13913)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13857)()

KeyError: 0

知道我哪里做错了吗?我该如何解决?谢谢!

1 个答案:

答案 0 :(得分:1)

我认为更好的做法是对每个组的布尔掩码取 mean:

def my_rate(x):
    """Return the fraction of rows with waiting_time <= 90 and target == 1.

    The two conditions are combined into one boolean mask; the mean of a
    boolean mask is the share of True values.
    """
    short_wait = x['waiting_time'] <= 90
    on_target = x['target'] == 1
    both = short_wait & on_target
    return both.mean()

# Apply my_rate to every (my_year, my_month) group, then move the group keys
# back into ordinary columns and name the value column 'my_rate'.
df = df.groupby(['my_year','my_month']).apply(my_rate).reset_index(name='my_rate')
print (df)
   my_year  my_month  my_rate
0     2018         1     0.25
1     2018         2     0.00
2     2018         3     0.50
  

知道我哪里做错了吗?

问题在于 waiting_time_list 和 target_list 不是 list,而是 Series:

waiting_time_list = data['waiting_time']
target_list = data['target']

print (type(waiting_time_list))
<class 'pandas.core.series.Series'>

print (type(target_list))
<class 'pandas.core.series.Series'>

因此,用 0、1 这样的位置去索引会失败,因为第二组的索引标签是 4、5,而不是 0、1:

if waiting_time_list[i] <= 90 and target_list[i] == 1:

为避免这个问题,可以将 Series 转换为 list:

# .tolist() yields plain Python lists, which are indexed by position,
# so list[i] is valid for every i in range(len(data)).
waiting_time_list = data['waiting_time'].tolist()
target_list = data['target'].tolist()