import pandas as pd
import datetime
import numpy as np
from datetime import timedelta
def diff_func(row):
    """Return the gap between a row's timestamp and its previous timestamp.

    Args:
        row: Mapping-like record (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) that provides 'Timestamp'
            and 'previous_end' entries.

    Returns:
        The result of ``row['Timestamp'] - row['previous_end']``.
    """
    return row['Timestamp'] - row['previous_end']
# Mock click log: one (user, timestamp string, url) tuple per page hit.
mock_rows = [
    (1, "2017-01-01 09:00:00", "htt://x.org/page1.html"),
    (1, "2017-01-01 09:01:00", "htt://x.org/page2.html"),
    (1, "2017-01-01 09:02:00", "htt://x.org/page3.html"),
    (1, "2017-01-01 09:05:00", "htt://x.org/page3.html"),
    (1, "2017-01-01 09:30:00", "htt://x.org/page2.html"),
    (1, "2017-01-01 09:33:00", "htt://x.org/page1.html"),
    (1, "2017-01-01 09:37:00", "htt://x.org/page2.html"),
    (1, "2017-01-01 09:41:00", "htt://x.org/page3.html"),
    (1, "2017-01-01 10:00:00", "htt://x.org/page1.html"),
    (1, "2017-01-01 11:00:00", "htt://x.org/page2.html"),
    (2, "2017-01-01 09:41:00", "htt://x.org/page3.html"),
    (2, "2017-01-01 09:42:00", "htt://x.org/page1.html"),
    (2, "2017-01-01 09:43:00", "htt://x.org/page2.html"),
]
dfMockLog = pd.DataFrame(mock_rows, columns=['user', 'Timestamp', 'url'])

# Parse the timestamp strings into datetime values.
dfMockLog['Timestamp'] = pd.to_datetime(dfMockLog['Timestamp'])

# Order the hits chronologically within each user.
dfMockLog = dfMockLog.sort_values(by=['user', 'Timestamp'])

# Timestamp of the preceding hit for the same user; the first hit of each
# user has no predecessor and gets NaT.
per_user_timestamps = dfMockLog.groupby(['user'])['Timestamp']
dfMockLog['previous_end'] = per_user_timestamps.shift(1)

# Gap between each hit and the previous one, computed row by row.
dfMockLog['time_diff'] = dfMockLog.apply(diff_func, axis=1)

# Running total of the gaps over the whole column (not grouped by user).
dfMockLog['cum_sum'] = dfMockLog['time_diff'].cumsum()
print(dfMockLog)
我需要把 "time_diff" 列转换为秒。"cum_sum" 列应包含按 "user" 分组的累计总和。如果能顺便介绍 timedelta 的所有可能格式,那就更好了。
答案 0 :(得分:0)
你已经很接近了。我倾向的做法是:先通过 pd.Series.dt.seconds 新建一列,把 time_diff 表示为秒;然后使用 groupby.transform 按 user 分组计算 cumsum。
注意:dt.seconds 只返回秒数分量(0–86399),不包含整天的部分;如果时间间隔可能达到或超过一天,应改用 dt.total_seconds()。
示例代码:
# Convert each gap to seconds. total_seconds() is used instead of .dt.seconds
# because .dt.seconds is only the seconds component (0-86399) of the
# timedelta and silently drops whole days for gaps of one day or more.
dfMockLog['time_diff_secs'] = dfMockLog['time_diff'].dt.total_seconds()

# Running total of the gaps in seconds, restarted for each user.
dfMockLog['cum_sum'] = dfMockLog.groupby('user')['time_diff_secs'].transform('cumsum')
print(dfMockLog)
user Timestamp url previous_end \
0 1 2017-01-01 09:00:00 htt://x.org/page1.html NaT
1 1 2017-01-01 09:01:00 htt://x.org/page2.html 2017-01-01 09:00:00
2 1 2017-01-01 09:02:00 htt://x.org/page3.html 2017-01-01 09:01:00
3 1 2017-01-01 09:05:00 htt://x.org/page3.html 2017-01-01 09:02:00
4 1 2017-01-01 09:30:00 htt://x.org/page2.html 2017-01-01 09:05:00
5 1 2017-01-01 09:33:00 htt://x.org/page1.html 2017-01-01 09:30:00
6 1 2017-01-01 09:37:00 htt://x.org/page2.html 2017-01-01 09:33:00
7 1 2017-01-01 09:41:00 htt://x.org/page3.html 2017-01-01 09:37:00
8 1 2017-01-01 10:00:00 htt://x.org/page1.html 2017-01-01 09:41:00
9 1 2017-01-01 11:00:00 htt://x.org/page2.html 2017-01-01 10:00:00
10 2 2017-01-01 09:41:00 htt://x.org/page3.html NaT
11 2 2017-01-01 09:42:00 htt://x.org/page1.html 2017-01-01 09:41:00
12 2 2017-01-01 09:43:00 htt://x.org/page2.html 2017-01-01 09:42:00
time_diff time_diff_secs cum_sum
0 NaT NaN NaN
1 00:01:00 60.0 60.0
2 00:01:00 60.0 120.0
3 00:03:00 180.0 300.0
4 00:25:00 1500.0 1800.0
5 00:03:00 180.0 1980.0
6 00:04:00 240.0 2220.0
7 00:04:00 240.0 2460.0
8 00:19:00 1140.0 3600.0
9 01:00:00 3600.0 7200.0
10 NaT NaN NaN
11 00:01:00 60.0 60.0
12 00:01:00 60.0 120.0