我有一个超过一百万行的数据框(按天汇总的代理和呼叫指标)。每个代理都会出现多次,因为他们每天要处理多个队列中的呼叫(d1$Calls)。我想确定每个代理的在岗周数。通常我可以用 difftime 计算代理的开始日期(d1$Start)与交互日期(d1$Interaction)之间的差值:
# Whole weeks between each row's recorded start date and its interaction date.
floor(
  difftime(time1 = d1$Interaction, time2 = d1$Start, units = "weeks")
)
但是,我的系统中的开始日期并不可靠,经常导致算出的周数为负:
# Dump the sample data frame d1 as an R expression.
dput(d1)
# Pasted output of the call above: a 10-row data.frame with the columns
# ID, QUEUE, Calls, Interaction, Start and Weeks. ID and QUEUE are factors,
# Interaction and Start are POSIXlt date-times, and Weeks is a difftime in
# units of weeks (all values negative in this sample).
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L), .Label = c("a123", "b123"), class = "factor"), QUEUE = structure(c(9L,
8L, 7L, 6L, 5L, 3L, 4L, 1L, 2L, 4L), .Label = c("MHEK", "MMED",
"MMEF", "MMEM", "MNEM", "MSED", "MSEE", "MSEK", "MSEP"), class = "factor"),
Calls = c(1L, 4L, 25L, 14L, 6L, 25L, 5L, 1L, 1L, 3L), Interaction = structure(list(
sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L, 2L, 6L, 12L,
12L, 2L, 6L, 6L, 6L, 6L), mon = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L,
2L, 1L, 1L, 5L, 2L, 2L, 2L, 2L), yday = c(1L, 1L, 5L,
11L, 11L, 1L, 5L, 5L, 5L, 5L), isdst = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), Start = structure(list(
sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), mon = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), yday = c(59L, 59L, 59L,
59L, 59L, 59L, 59L, 59L, 59L, 59L), isdst = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST", "PST"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), .Names = c("sec", "min",
"hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone",
"gmtoff"), class = c("POSIXlt", "POSIXt")), Weeks = structure(c(-9,
-9, -8, -7, -7, -9, -8, -8, -8, -8), units = "weeks", class = "difftime")), .Names = c("ID",
"QUEUE", "Calls", "Interaction", "Start", "Weeks"), row.names = c(NA,
-10L), class = "data.frame")
为了解决这个问题,我想计算每个交互日期(d1$Interaction)与系统中该代理(d1$ID)的首个交互日期之间的差值。该如何实现?
答案 0 :(得分:1)
这对我有用(全部在基础R中):
http_server.on('request', [yourFunction])
代码:
# Split the data frame into one piece per agent ID.
mylist <- split(df, factor(df$ID))
# Process every agent separately, then bind the pieces back into a single
# data frame (data.table::rbindlist is a faster alternative to do.call/rbind).
mydata <- do.call(rbind,
  lapply(mylist, function(x) {
    # Sort this agent's rows chronologically.
    x <- x[order(x$Interaction), ]
    # Weeks elapsed between each interaction and the one before it; the
    # leading 0 stands for the first row, which has no predecessor.
    # Negative indexing is used instead of 2:length(...) and
    # 1:(length(...) - 1): for a single-row group those ranges count
    # backwards (2:1, 1:0) and produce a vector longer than the group,
    # whereas x[-1] and x[-n] are simply empty and weekdif becomes 0.
    n_rows <- nrow(x)
    x$weekdif <- c(0, difftime(x$Interaction[-1],
                               x$Interaction[-n_rows],
                               units = "weeks"))
    x
  }))
上面代码的输出如下(之后我修改了函数,修改后的版本可以按照您的需要工作):
> mydata
ID QUEUE Calls Interaction Start Weeks weekdif
a123.1 a123 MSEP 1 2015-01-02 2015-03-01 -9 weeks 0.0000000
a123.2 a123 MSEK 4 2015-01-02 2015-03-01 -9 weeks 0.0000000
a123.3 a123 MSEE 25 2015-01-06 2015-03-01 -8 weeks 0.5714286
a123.4 a123 MSED 14 2015-01-12 2015-03-01 -7 weeks 0.8571429
a123.5 a123 MNEM 6 2015-01-12 2015-03-01 -7 weeks 0.0000000
b123.6 b123 MMEF 25 2015-01-02 2015-03-01 -9 weeks 0.0000000
b123.7 b123 MMEM 5 2015-01-06 2015-03-01 -8 weeks 0.5714286
b123.8 b123 MHEK 1 2015-01-06 2015-03-01 -8 weeks 0.0000000
b123.9 b123 MMED 1 2015-01-06 2015-03-01 -8 weeks 0.0000000
b123.10 b123 MMEM 3 2015-01-06 2015-03-01 -8 weeks 0.0000000
修改后的代码:
#you need to import this for the na.locf function
library(zoo)
# Split the data frame into one piece per agent ID.
mylist <- split(df, factor(df$ID))
# Process every agent separately, then bind the pieces back together.
mydata <- do.call(rbind,
  lapply(mylist, function(x) {
    # Sort this agent's rows chronologically.
    x <- x[order(x$Interaction), ]
    # Weeks between each interaction and the previous one, with a leading 0
    # for the first row. Negative indexing keeps this correct for a
    # single-row group, where 2:length(...) / 1:(length(...) - 1) would
    # count backwards and yield a vector longer than the group.
    n_rows <- nrow(x)
    x$weekdif <- c(0, difftime(x$Interaction[-1],
                               x$Interaction[-n_rows],
                               units = "weeks"))
    # Mark every zero difference (including the leading one) as missing so
    # that it can be filled from the previous non-zero difference.
    x$weekdif[x$weekdif == 0] <- NA
    # Carry the last non-missing value forward; see the examples at ?na.locf.
    # The result can be shorter than the group because the leading NAs are
    # not kept, which is why they are re-added below.
    temp <- as.numeric(na.locf(zoo(x$weekdif)))
    # Pad the front with zeros for the rows that had no earlier non-zero
    # difference to inherit.
    missing_length <- length(x$weekdif) - length(temp)
    x$weekdif <- c(rep(0, missing_length), temp)
    x
  }))
每个id的第一个值为0,因为之前没有交互日期。