这是我的数据集
customer_id timestamp
8893 2018-06-24 04:00:00
8894 2018-06-24 16:00:00
8894 2018-06-25 14:00:00
这是我用来为每个分组补全缺失时间戳的代码
geolocs = geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))
这是错误消息
ValueError Traceback (most recent call last)
<ipython-input-37-6ae38f2531da> in <module>()
----> 1 geolocs = geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
714 # ignore SettingWithCopy here in case the user mutates
715 with option_context('mode.chained_assignment', None):
--> 716 return self._python_apply_general(f)
717
718 def _python_apply_general(self, f):
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
718 def _python_apply_general(self, f):
719 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720 self.axis)
721
722 return self._wrap_applied_output(
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, f, data, axis)
1800 # group might be modified
1801 group_axes = _get_axes(group)
-> 1802 res = f(group)
1803 if not _is_indexed_like(res, group_axes):
1804 mutated = True
<ipython-input-37-6ae38f2531da> in <lambda>(x)
----> 1 geolocs = geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in asfreq(self, freq, method, how, normalize, fill_value)
4513 from pandas.core.resample import asfreq
4514 return asfreq(self, freq, method=method, how=how, normalize=normalize,
-> 4515 fill_value=fill_value)
4516
4517 def at_time(self, time, asof=False):
~/anaconda3/lib/python3.6/site-packages/pandas/core/resample.py in asfreq(obj, freq, method, how, normalize, fill_value)
1372 dti = date_range(obj.index[0], obj.index[-1], freq=freq)
1373 dti.name = obj.index.name
-> 1374 new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
1375 if normalize:
1376 new_obj.index = new_obj.index.normalize()
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, index, columns, **kwargs)
2731 def reindex(self, index=None, columns=None, **kwargs):
2732 return super(DataFrame, self).reindex(index=index, columns=columns,
-> 2733 **kwargs)
2734
2735 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
2513 # perform the reindex on the axes
2514 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2515 fill_value, copy).__finalize__(self)
2516
2517 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2677 if index is not None:
2678 frame = frame._reindex_index(index, method, copy, level,
-> 2679 fill_value, limit, tolerance)
2680
2681 return frame
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
2688 return self._reindex_with_indexers({0: [new_index, indexer]},
2689 copy=copy, fill_value=fill_value,
-> 2690 allow_dups=False)
2691
2692 def _reindex_columns(self, new_columns, method, copy, level, fill_value=NA,
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2625 fill_value=fill_value,
2626 allow_dups=allow_dups,
-> 2627 copy=copy)
2628
2629 if copy and new_data is self._data:
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3884 # some axes don't allow reindexing with dups
3885 if not allow_dups:
-> 3886 self.axes[axis]._can_reindex(indexer)
3887
3888 if axis >= self.ndim:
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
2834 # trying to reindex on an axis with duplicates
2835 if not self.is_unique and len(indexer):
-> 2836 raise ValueError("cannot reindex from a duplicate axis")
2837
2838 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
答案 0（得分：1）
我认为每组需要唯一的时间戳，因此可能的解决方案有两种：使用 resample 对每组进行汇总（而不是 groupby + asfreq），或者使用 drop_duplicates 按 customer_id 和 timestamp 两列删除重复行：
print (geolocs)
customer_id timestamp value
0 8893 2018-06-24 04:00:00 1 <-duplicated timestamp per group
1 8893 2018-06-24 04:00:00 7 <-duplicated timestamp per group
2 8894 2018-06-24 16:00:00 2
3 8894 2018-06-25 14:00:00 10
geolocs1 = geolocs.set_index('timestamp').groupby('customer_id').resample('h').sum()
print (geolocs1)
customer_id value
customer_id timestamp
8893 2018-06-24 04:00:00 17786 8 <- aggregation sum 1 + 7 = 8
8894 2018-06-24 16:00:00 8894 2
2018-06-24 17:00:00 0 0
2018-06-24 18:00:00 0 0
2018-06-24 19:00:00 0 0
2018-06-24 20:00:00 0 0
2018-06-24 21:00:00 0 0
2018-06-24 22:00:00 0 0
2018-06-24 23:00:00 0 0
2018-06-25 00:00:00 0 0
2018-06-25 01:00:00 0 0
2018-06-25 02:00:00 0 0
2018-06-25 03:00:00 0 0
2018-06-25 04:00:00 0 0
2018-06-25 05:00:00 0 0
2018-06-25 06:00:00 0 0
2018-06-25 07:00:00 0 0
2018-06-25 08:00:00 0 0
2018-06-25 09:00:00 0 0
2018-06-25 10:00:00 0 0
2018-06-25 11:00:00 0 0
2018-06-25 12:00:00 0 0
2018-06-25 13:00:00 0 0
2018-06-25 14:00:00 8894 10
geolocs = geolocs.drop_duplicates(['customer_id','timestamp'])
geolocs1 =geolocs.set_index('timestamp').groupby('customer_id').apply(lambda x: x.asfreq('h'))
print (geolocs1)
customer_id value
customer_id timestamp
8893 2018-06-24 04:00:00 8893.0 1.0 <-only first value
8894 2018-06-24 16:00:00 8894.0 2.0
2018-06-24 17:00:00 NaN NaN
2018-06-24 18:00:00 NaN NaN
2018-06-24 19:00:00 NaN NaN
2018-06-24 20:00:00 NaN NaN
2018-06-24 21:00:00 NaN NaN
2018-06-24 22:00:00 NaN NaN
2018-06-24 23:00:00 NaN NaN
2018-06-25 00:00:00 NaN NaN
2018-06-25 01:00:00 NaN NaN
2018-06-25 02:00:00 NaN NaN
2018-06-25 03:00:00 NaN NaN
2018-06-25 04:00:00 NaN NaN
2018-06-25 05:00:00 NaN NaN
2018-06-25 06:00:00 NaN NaN
2018-06-25 07:00:00 NaN NaN
2018-06-25 08:00:00 NaN NaN
2018-06-25 09:00:00 NaN NaN
2018-06-25 10:00:00 NaN NaN
2018-06-25 11:00:00 NaN NaN
2018-06-25 12:00:00 NaN NaN
2018-06-25 13:00:00 NaN NaN
2018-06-25 14:00:00 8894.0 10.0