Question

我有包含以下内容的事件日志：开始时间，结束时间，类别ID和计数。它们涵盖了几个月。

我想按时间对它们进行聚合，以便能够绘制给定的一天、一周或一个月内的直方图。所以我认为最好的方法是把这些时间段划分到固定长度的时间桶（bucket）中。我认为5分钟一个桶比较合适。

例如：如果一个事件在下午1:01开始并在下午1:07结束，我想得到2条记录，因为它跨越了2个5分钟的时段（0-5和5-10），并把原始数据的其余部分（类别和计数）复制到这些新记录中。

如果我的输入日志（x）是这样的：

start / end / catid / count     
2012-11-17 15:05:02.0,  2012-11-17 15:12:52.0,  1, 2    
2012-11-17 15:07:13.0,  2012-11-17 15:17:47.0,  2, 10   
2012-11-17 15:11:00.0,  2012-11-17 15:12:33.0,  3, 5    
2012-11-17 15:12:01.0,  2012-11-17 15:20:00.0,  4, 1

我想得到按5分钟分桶的输出（b），如下所示：

start / catid / count   
2012-11-17 15:05:00.0   1, 2    
2012-11-17 15:10:00.0   1, 2

2012-11-17 15:05:00.0   2, 10   
2012-11-17 15:10:00.0   2, 10
2012-11-17 15:15:00.0   2, 10

2012-11-17 15:10:00.0   3, 5

2012-11-17 15:10:00.0   4, 1
2012-11-17 15:15:00.0   4, 1

然后我可以轻松地将新数据框（b）聚合到我想要的时间段（小时，天，周，月）上的类别ID

我刚开始使用R，找到了很多关于如何对时间点（而不是时间段）进行分桶的解释。我看过zoo和xts这两个包，但没有找到该怎么做。

希望这对你们中的一些人有意义。

编辑：

我稍微修改了Ram的建议，即使用舍入的结束时间而不是原始结束时间来正确计算块。（谢谢拉姆！）

mnslot <- 15 # size of the buckets/slots in minutes
slot_secs <- mnslot * 60 # the same size in seconds, used for time arithmetic

# Expected to exist already: `st` and `en` (event start/end times, e.g.
# as.POSIXlt(df$start) and as.POSIXlt(df$end)) and `df`, the input log with
# `catid` and `count` columns.
# as.POSIXlt() is used instead of strptime(): strptime() on a date-time first
# converts it back to text, which can drop the time part (e.g. at midnight).
st_lt <- as.POSIXlt(st)
en_lt <- as.POSIXlt(en)

# Round the start time down to a multiple of mnslot within its hour.
min_st <- as.numeric(format(st_lt, "%M"))
rounded_start <- as.POSIXct(trunc(st_lt, units = "hours")) +
  floor(min_st / mnslot) * slot_secs

# Round the end time down to a multiple of mnslot within its hour.
min_en <- as.numeric(format(en_lt, "%M"))
rounded_end <- as.POSIXct(trunc(en_lt, units = "hours")) +
  floor(min_en / mnslot) * slot_secs

# Number of slots each event touches, based on the rounded start and end.
# The unit is requested explicitly: a bare `rounded_end - rounded_start`
# picks secs/mins/hours/days on its own depending on the data, so dividing
# it by a fixed factor is only right for some inputs.
# NOTE(review): an event that ends exactly on a slot boundary is also counted
# in the slot starting at that instant -- confirm this is the intended rule.
span_secs <- as.numeric(difftime(rounded_end, rounded_start, units = "secs"))
numblocks <- floor(span_secs / slot_secs) + 1
if (anyNA(numblocks) || any(numblocks < 1)) {
  stop("every event needs parseable times and end >= start", call. = FALSE)
}

# Replicate each input row once per slot it touches. rep()/sequence() build
# all rows at once instead of growing vectors inside a nested loop.
row_idx <- rep(seq_along(numblocks), times = numblocks)
# Offset of every replicated row from its rounded start: 0, 1, 2, ... slots.
# The step is the slot size itself, not a hard-coded 300 seconds.
slot_offset <- (sequence(numblocks) - 1) * slot_secs

newdf <- data.frame(
  start = rounded_start[row_idx] + slot_offset,
  CatId = df$catid[row_idx],
  Count = df$count[row_idx]
)
newdf

这会产生所需的输出。虽然它有点慢：p

Answer 1

这是一个完全正常的版本。它涉及到您所追求的一步一步的数据操作。

# Load the original data, stored as a csv.
df <- read.csv("tsdata.csv")
st <- as.POSIXlt(df$start)
en <- as.POSIXlt(df$end)

# Utility: turn a bare number of seconds since the epoch back into a
# date-time object. The class vector uses the order R itself gives to
# date-times ("POSIXct" first, then the virtual class "POSIXt").
unix2POSIXct <- function(time) {
  structure(time, class = c("POSIXct", "POSIXt"))
}

slot_secs <- 5 * 60 # bucket size: 5 minutes, expressed in seconds

# Round the minutes of start down to 5-minute starts: 0, 5, 10, etc.
min_st <- as.numeric(format(st, "%M"))
roundedmins <- floor(min_st / 5) * 5
df$rounded_start <- as.POSIXct(trunc(st, units = "hours")) + roundedmins * 60

# For each row, determine how many replications are needed.
# The count is measured from the rounded start to the end, so an event that
# crosses a bucket boundary is counted in both buckets even when it is short
# (15:04 -> 15:06 lasts 2 minutes but touches two buckets). The end is treated
# as exclusive: an event ending exactly at 15:20:00 does not get a 15:20 row.
# Units are explicit because a bare `en - st` chooses its own unit.
elapsed_secs <- as.numeric(difftime(en, df$rounded_start, units = "secs"))
# pmax() keeps at least one row per event.
numdups <- pmax(ceiling(elapsed_secs / slot_secs), 1)
if (anyNA(numdups)) {
  stop("start/end times could not be parsed for every row", call. = FALSE)
}

# Create the replicated rows in one step instead of growing vectors in a loop.
row_idx <- rep(seq_along(numdups), times = numdups)
# Offset of each replicated row from its rounded start: 0, 300, 600, ... s.
slot_offset <- (sequence(numdups) - 1) * slot_secs

newdf <- data.frame(
  start = df$rounded_start[row_idx] + slot_offset,
  CatId = df$catid[row_idx],
  Count = df$count[row_idx]
)
newdf

产生：

                start CatId Count
1 2012-11-17 15:05:00     1     2
2 2012-11-17 15:10:00     1     2
3 2012-11-17 15:05:00     2    10
4 2012-11-17 15:10:00     2    10
5 2012-11-17 15:15:00     2    10
6 2012-11-17 15:10:00     3     5
7 2012-11-17 15:10:00     4     1
8 2012-11-17 15:15:00     4     1

Answer 2

这不是一件容易的事情......我也错过了整个问题的结构，所以我希望如果我限制自己概述基本方法，如果事情不清楚，你可以回复我。首先（如果我是你）我会安装' lubridate '包，这样可以更轻松地玩日期/时间。然后可以尝试这样的事情：

# Reference start point, parsed from a day/month/two-digit-year string.
z <- strptime(
  "17/11/12 15:05:00.0",
  format = "%d/%m/%y %H:%M:%OS"
)

这将定义您的起始时间点，如果应该由第一个日志（x）时间定义，那么可以使用分钟命令，例如

z <- strptime("17/11/12 15:05:02.0", "%d/%m/%y %H:%M:%OS")
# Snap z back to the start of its 5-minute bucket (15:05:02 -> 15:05:00).
# The minute is derived from z itself rather than hard-coded, so the same
# two lines work for any first log time.
minute(z) <- floor(minute(z) / 5) * 5
second(z) <- 0

然后产生一个5分钟间隔的序列

# Bucket start times: 21 points, from 0 to 100 minutes after z in 5-minute
# steps.
z5s <- z + minutes(seq(from = 0, to = 100, by = 5))

这将产生一个由21个时间点组成的序列，即20个连续的5分钟间隔（共100分钟）；这里我同样不清楚整个方案需要多灵活。

最后，您可以使用例如模运算

# Example end time: two minutes after the start point z.
z2 <- z + minutes(2)

z2应该是结束时间，我只是在这里“手动”添加2分钟来说明概念

# Has at least one complete 5-minute span elapsed between z and z2?
# x %% 5 is always below 5, so the earlier `%% 5 > 5` test could never be
# TRUE; integer division (%/%) counts the completed spans instead.
# The unit is requested explicitly because a bare `z2 - z` chooses its own.
as.numeric(difftime(z2, z, units = "mins")) %/% 5 > 0
FALSE

或者如果您想查看涵盖了多少个完整的5分钟跨度，请使用整除运算 (as.integer(z2-z)) %/% 5（%% 给出的是余数，而不是个数），或者使用您希望的任何其他函数，把日志时间匹配/分配到 z5s 的 POSIXlt 间隔中。

希望这有点帮助，即给你一些方向。

时间段

2 个答案: