下面是一个列为多级索引(MultiIndex)的数据框,使用的是 Pandas v0.14.0。
> import pandas as pd
> import numpy as np
>
> rng = pd.date_range('1/1/2001', periods=6, freq='H')
> mi = [(dt, i) for dt in rng for i in range(2)]
> f = pd.DataFrame(np.random.randn(len(mi), 2),
> index = pd.MultiIndex.from_tuples(mi, names=['time', 'extra']),
columns =['A', 'B'])
> g = f.unstack('extra')
> g
A B
extra 0 1 0 1
time
2001-01-01 00:00:00 -0.169742 0.390842 -0.017884 1.043376
2001-01-01 01:00:00 -0.184442 -0.102512 -0.013702 0.675290
2001-01-01 02:00:00 0.244708 -0.360740 1.059269 -0.330537
2001-01-01 03:00:00 -2.275161 -1.782581 0.754368 -0.157851
2001-01-01 04:00:00 -0.554282 0.310691 0.917221 -0.114459
2001-01-01 05:00:00 0.599133 0.904824 1.858538 1.319041
我可以用同一种方法对 g 的所有列成功地重采样,例如 g.resample('6H', how=np.sum)。
如何对各列使用不同的方法重采样 g,例如对 'A' 列求和、对 'B' 列求平均值?
我尝试了下面的写法,它适用于非多级索引的列,但在这里会报错。
> g.resample('6H', how={'A': np.sum, 'B': np.mean})
KeyError Traceback (most recent call last)
<ipython-input-217-b1a72fd62178> in <module>()
4 g = f.unstack('extra')
5 print(g)
----> 6 g.resample('6H', how={'A': np.sum, 'B': np.mean})
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/generic.py in resample(self, rule, how, axis, fill_method, closed, label, convention, kind, loffset, limit, base)
2834 fill_method=fill_method, convention=convention,
2835 limit=limit, base=base)
-> 2836 return sampler.resample(self).__finalize__(self)
2837
2838 def first(self, offset):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/tseries/resample.py in resample(self, obj)
81
82 if isinstance(ax, DatetimeIndex):
---> 83 rs = self._resample_timestamps()
84 elif isinstance(ax, PeriodIndex):
85 offset = to_offset(self.freq)
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/tseries/resample.py in _resample_timestamps(self)
252 # downsample
253 grouped = obj.groupby(grouper, axis=self.axis)
--> 254 result = grouped.aggregate(self._agg_method)
255 else:
256 # upsampling shortcut
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
2402 colg = SeriesGroupBy(obj[col], selection=col,
2403 grouper=self.grouper)
-> 2404 result[col] = colg.aggregate(agg_how)
2405 keys.append(col)
2406
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2078 cyfunc = _intercept_cython(func_or_funcs)
2079 if cyfunc and not args and not kwargs:
-> 2080 return getattr(self, cyfunc)()
2081
2082 if self.grouper.nkeys > 1:
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in mean(self)
668 self._set_selection_from_grouper()
669 f = lambda x: x.mean(axis=self.axis)
--> 670 return self._python_agg_general(f)
671
672 def median(self):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
1012 # iterate through "columns" ex exclusions to populate output dict
1013 output = {}
-> 1014 for name, obj in self._iterate_slices():
1015 try:
1016 result, counts = self.grouper.agg_series(obj, f)
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in _iterate_slices(self)
650
651 def _iterate_slices(self):
--> 652 yield self.name, self._selected_obj
653
654 def transform(self, func, *args, **kwargs):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/lib.so in pandas.lib.cache_readonly.__get__ (pandas/lib.c:37563)()
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/groupby.py in _selected_obj(self)
461 return self.obj
462 else:
--> 463 return self.obj[self._selection]
464
465 def _set_selection_from_grouper(self):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/frame.py in __getitem__(self, key)
1682 return self._getitem_multilevel(key)
1683 else:
-> 1684 return self._getitem_column(key)
1685
1686 def _getitem_column(self, key):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/frame.py in _getitem_column(self, key)
1689 # get column
1690 if self.columns.is_unique:
-> 1691 return self._get_item_cache(key)
1692
1693 # duplicate columns & possible reduce dimensionaility
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1050 res = cache.get(item)
1051 if res is None:
-> 1052 values = self._data.get(item)
1053 res = self._box_item_values(item, values)
1054 cache[item] = res
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/internals.py in get(self, item)
2535
2536 if not isnull(item):
-> 2537 loc = self.items.get_loc(item)
2538 else:
2539 indexer = np.arange(len(self.items))[isnull(self.items)]
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/core/index.py in get_loc(self, key)
1154 loc : int if unique index, possibly slice or mask if not
1155 """
-> 1156 return self._engine.get_loc(_values_from_object(key))
1157
1158 def get_value(self, series, key):
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3650)()
/Users/araichev/anaconda/envs/py3k/lib/python3.3/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3577)()
KeyError: 'B'
答案 0 :(得分:3)
如果你从 f 出发,可以使用带有 TimeGrouper 的 groupby “手动”进行重采样:
In [11]: grp = f.groupby(pd.TimeGrouper('6H', level=0))
In [12]: grp['A'].sum()
Out[12]:
0
2001-01-01 -1.805954
Freq: 6H, Name: A, dtype: float64
In [13]: grp['B'].mean()
Out[13]:
0
2001-01-01 -0.461053
Freq: 6H, Name: B, dtype: float64
如果你还想按 extra 拆分,把它也加入 groupby *:
In [21]: grp2 = f.groupby([pd.TimeGrouper('6H', level=0),
f.index.get_level_values('extra')])
In [22]: grp2['A'].sum()
Out[22]:
0 extra
2001-01-01 0 2.030321
1 -3.836275
Name: A, dtype: float64
In [23]: grp2['B'].mean()
Out[23]:
0 extra
2001-01-01 0 -0.554839
1 -0.367267
Name: B, dtype: float64
*注意:在 0.14 之前的版本中,无法同时按某一列和 TimeGrouper 进行 groupby。
要从 g 还原回 f,可以使用 stack 进行重塑:
In [31]: f2 = g.stack(level=1) # Note: use stack to get f from g
把上面的分组结果重新组合成与 g 类似的格式:
In [32]: pd.DataFrame({'A': grp['A'].sum(), 'B': grp['B'].mean()})
Out[32]:
A B
0 extra
2001-01-01 0 -2.762064 -0.269427
1 -2.006839 -0.026213
In [33]: _.unstack(level=1)
Out[33]:
A B
extra 0 1 0 1
0
2001-01-01 -2.762064 -2.006839 -0.269427 -0.026213
另一种也许更“简单”的方法(因为你实际上是在做重采样)是根据列来构造 dict:
In [41]: dict(zip(g.columns,
map({'A': 'sum', 'B': 'mean'}.get,
[x[0] for x in g.columns])))
Out[41]: {('A', 0): 'sum', ('A', 1): 'sum', ('B', 0): 'mean', ('B', 1): 'mean'}
In [42]: g.resample('6H', _)
Out[42]:
A B A B
1 0 0 1
time
2001-01-01 -3.836275 -0.554839 2.030321 -0.367267