Cannot reindex from a duplicate axis when adding missing hours per group

Time: 2018-06-28 11:26:05

Tags: python pandas dataframe

This is my dataset:

customer_id      timestamp
8893             2018-06-24 04:00:00
8894             2018-06-24 16:00:00
8894             2018-06-25 14:00:00

Here is my code to add the missing hours for each group:

geolocs = geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))

This is the error message:

ValueError                                Traceback (most recent call last)
<ipython-input-37-6ae38f2531da> in <module>()
----> 1 geolocs = geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))

~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
    714         # ignore SettingWithCopy here in case the user mutates
    715         with option_context('mode.chained_assignment', None):
--> 716             return self._python_apply_general(f)
    717 
    718     def _python_apply_general(self, f):

~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
    718     def _python_apply_general(self, f):
    719         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720                                                    self.axis)
    721 
    722         return self._wrap_applied_output(

~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, f, data, axis)
   1800             # group might be modified
   1801             group_axes = _get_axes(group)
-> 1802             res = f(group)
   1803             if not _is_indexed_like(res, group_axes):
   1804                 mutated = True

<ipython-input-37-6ae38f2531da> in <lambda>(x)
----> 1 geolocs = geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in asfreq(self, freq, method, how, normalize, fill_value)
   4513         from pandas.core.resample import asfreq
   4514         return asfreq(self, freq, method=method, how=how, normalize=normalize,
-> 4515                       fill_value=fill_value)
   4516 
   4517     def at_time(self, time, asof=False):

~/anaconda3/lib/python3.6/site-packages/pandas/core/resample.py in asfreq(obj, freq, method, how, normalize, fill_value)
   1372         dti = date_range(obj.index[0], obj.index[-1], freq=freq)
   1373         dti.name = obj.index.name
-> 1374         new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
   1375         if normalize:
   1376             new_obj.index = new_obj.index.normalize()

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, index, columns, **kwargs)
   2731     def reindex(self, index=None, columns=None, **kwargs):
   2732         return super(DataFrame, self).reindex(index=index, columns=columns,
-> 2733                                               **kwargs)
   2734 
   2735     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   2513         # perform the reindex on the axes
   2514         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2515                                   fill_value, copy).__finalize__(self)
   2516 
   2517     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   2677         if index is not None:
   2678             frame = frame._reindex_index(index, method, copy, level,
-> 2679                                          fill_value, limit, tolerance)
   2680 
   2681         return frame

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   2688         return self._reindex_with_indexers({0: [new_index, indexer]},
   2689                                            copy=copy, fill_value=fill_value,
-> 2690                                            allow_dups=False)
   2691 
   2692     def _reindex_columns(self, new_columns, method, copy, level, fill_value=NA,

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   2625                                                 fill_value=fill_value,
   2626                                                 allow_dups=allow_dups,
-> 2627                                                 copy=copy)
   2628 
   2629         if copy and new_data is self._data:

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   3884         # some axes don't allow reindexing with dups
   3885         if not allow_dups:
-> 3886             self.axes[axis]._can_reindex(indexer)
   3887 
   3888         if axis >= self.ndim:

~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
   2834         # trying to reindex on an axis with duplicates
   2835         if not self.is_unique and len(indexer):
-> 2836             raise ValueError("cannot reindex from a duplicate axis")
   2837 
   2838     def reindex(self, target, method=None, level=None, limit=None,

ValueError: cannot reindex from a duplicate axis

1 Answer:

Answer 0 (score: 1)

I think the timestamps need to be unique within each group, so possible solutions are either to resample and aggregate each group (instead of groupby + asfreq), or to remove the duplicates with drop_duplicates on both columns - customer_id and timestamp:

print (geolocs)
   customer_id           timestamp  value
0         8893 2018-06-24 04:00:00      1 <- duplicated timestamp per group
1         8893 2018-06-24 04:00:00      7 <- duplicated timestamp per group
2         8894 2018-06-24 16:00:00      2
3         8894 2018-06-25 14:00:00     10

geolocs1 = geolocs.set_index('timestamp').groupby('customer_id').resample('h').sum()
print (geolocs1)
                                 customer_id  value
customer_id timestamp                              
8893        2018-06-24 04:00:00        17786      8 <- aggregated sum 1 + 7 = 8 (customer_id is summed too: 8893 + 8893 = 17786)
8894        2018-06-24 16:00:00         8894      2
            2018-06-24 17:00:00            0      0
            2018-06-24 18:00:00            0      0
            2018-06-24 19:00:00            0      0
            2018-06-24 20:00:00            0      0
            2018-06-24 21:00:00            0      0
            2018-06-24 22:00:00            0      0
            2018-06-24 23:00:00            0      0
            2018-06-25 00:00:00            0      0
            2018-06-25 01:00:00            0      0
            2018-06-25 02:00:00            0      0
            2018-06-25 03:00:00            0      0
            2018-06-25 04:00:00            0      0
            2018-06-25 05:00:00            0      0
            2018-06-25 06:00:00            0      0
            2018-06-25 07:00:00            0      0
            2018-06-25 08:00:00            0      0
            2018-06-25 09:00:00            0      0
            2018-06-25 10:00:00            0      0
            2018-06-25 11:00:00            0      0
            2018-06-25 12:00:00            0      0
            2018-06-25 13:00:00            0      0
            2018-06-25 14:00:00         8894     10
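
Note that resample with sum also aggregates the customer_id column itself (hence 17786 in the first row above). A minimal sketch, assuming the same geolocs columns, that restricts the aggregation to the value column so customer_id only appears in the index:

geolocs2 = (geolocs.set_index('timestamp')
                   .groupby('customer_id')['value']
                   .resample('h')
                   .sum()
                   .reset_index())
print (geolocs2)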

geolocs = geolocs.drop_duplicates(['customer_id','timestamp'])
geolocs1 =geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))
print (geolocs1)
                                 customer_id  value
customer_id timestamp                              
8893        2018-06-24 04:00:00       8893.0    1.0 <- only the first value is kept after drop_duplicates
8894        2018-06-24 16:00:00       8894.0    2.0
            2018-06-24 17:00:00          NaN    NaN
            2018-06-24 18:00:00          NaN    NaN
            2018-06-24 19:00:00          NaN    NaN
            2018-06-24 20:00:00          NaN    NaN
            2018-06-24 21:00:00          NaN    NaN
            2018-06-24 22:00:00          NaN    NaN
            2018-06-24 23:00:00          NaN    NaN
            2018-06-25 00:00:00          NaN    NaN
            2018-06-25 01:00:00          NaN    NaN
            2018-06-25 02:00:00          NaN    NaN
            2018-06-25 03:00:00          NaN    NaN
            2018-06-25 04:00:00          NaN    NaN
            2018-06-25 05:00:00          NaN    NaN
            2018-06-25 06:00:00          NaN    NaN
            2018-06-25 07:00:00          NaN    NaN
            2018-06-25 08:00:00          NaN    NaN
            2018-06-25 09:00:00          NaN    NaN
            2018-06-25 10:00:00          NaN    NaN
            2018-06-25 11:00:00          NaN    NaN
            2018-06-25 12:00:00          NaN    NaN
            2018-06-25 13:00:00          NaN    NaN
            2018-06-25 14:00:00       8894.0   10.0
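
If the NaN values left in the customer_id column are unwanted, note that the column only duplicates the first index level. A possible cleanup, sketched under the same assumptions as above, is to drop that column inside the lambda and flatten the result back to ordinary columns afterwards:

geolocs = geolocs.drop_duplicates(['customer_id','timestamp'])
geolocs1 = (geolocs.set_index('timestamp')
                   .groupby('customer_id')
                   .apply(lambda x: x.drop('customer_id', axis=1).asfreq('h'))
                   .reset_index())
print (geolocs1)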