自定义R函数中的循环用于转换数据

时间:2016-05-22 19:31:03

标签: r loops

我使用此代码创建事件的示例数据框:

set.seed(100)
mydf <-data.frame(time=(1:100),
                  status = sample(c('OK','UNKNOWN'),1000,replace=TRUE),
                  event = sample(1:10,1000,replace=TRUE)
                  )

数据如下所示:

head(mydf)
  time  status event
1    1      OK     1
2    2      OK     2
3    3 UNKNOWN     7
4    4      OK     7
5    5      OK     4
6    6 UNKNOWN     2

我想创建一个像这样的新数据集:

    StartTime EndTime SeqID Sequence
1      1         3      1    {1,2,7}    
2      4         6      2    {7,4,2}

基本上我想创建一个名为Sequence的列,它是一个事件数组,但是我想在status列等于UNKNOWN之后重新开始。我已尝试使用while循环进行for循环,但没有成功。

2 个答案:

答案 0 :(得分:3)

这是一个data.table解决方案:

library(data.table);
dt <- as.data.table(mydf);
dt[,.(StartTime=time[1L],EndTime=time[length(time)],Sequence=list(event)),.(SeqID=cumsum(status=='UNKNOWN')+1L)];
##      SeqID StartTime EndTime Sequence
##   1:     1         1       2      1,2
##   2:     2         3       6  7,7,4,2
##   3:     3         7       8      1,5
##   4:     4         9      10     6,10
##   5:     5        11      11        4
##  ---
## 513:   513        90      92    7,3,5
## 514:   514        93      93        2
## 515:   515        94      95     8,10
## 516:   516        96      99  3,2,3,1
## 517:   517       100     100        7

我相信你的预期输出有误。如果序列在每次状态列等于UNKNOWN时重新开始,则第一个数组应为1,2而不是1,2,7

更新:如果您想要在状态列等于UNKNOWN之后行中重新开始序列,那么您可以执行此操作:

dt[,.(StartTime=time[1L],EndTime=time[length(time)],Sequence=list(event)),.(SeqID=c(0L,cumsum(status[-length(status)]=='UNKNOWN'))+1L)];
##      SeqID StartTime EndTime Sequence
##   1:     1         1       3    1,2,7
##   2:     2         4       7  7,4,2,1
##   3:     3         8       9      5,6
##   4:     4        10      11    10, 4
##   5:     5        12      12        2
##  ---
## 512:   512        89      90      2,7
## 513:   513        91      93    3,5,2
## 514:   514        94      94        8
## 515:   515        95      96    10, 3
## 516:   516        97     100  2,3,1,7

请注意,您的预期输出仍然不正确;在此设计下,第二组应为7,4,2,1而不是7,4,2编辑:实际上,我认为这个问题可能与mydf中的差异有关;我用你的样本创建代码得到了这个:

head(mydf,10L);
##    time  status event
## 1     1      OK     1
## 2     2      OK     2
## 3     3 UNKNOWN     7
## 4     4      OK     7
## 5     5      OK     4
## 6     6      OK     2
## 7     7 UNKNOWN     1
## 8     8      OK     5
## 9     9 UNKNOWN     6
## 10   10      OK    10

请尝试使用100的种子再次运行样本创建代码。我们应该为mydf获得相同的结果。

这是围绕by()构建的基础R解决方案:

with(list(SeqID=c(0L,cumsum(mydf$status[-nrow(mydf)]=='UNKNOWN'))+1L),
    do.call(rbind,by(cbind(mydf,SeqID),SeqID,function(x)
        data.frame(
            SeqID=x$SeqID[1L],
            StartTime=x$time[1L],
            EndTime=x$time[length(x$time)],
            Sequence=I(list(x$event))
        )
    ))
);
##     SeqID StartTime EndTime     Sequence
## 1       1         1       3      1, 2, 7
## 2       2         4       7   7, 4, 2, 1
## 3       3         8       9         5, 6
## 4       4        10      11        10, 4
## 5       5        12      12            2
##
## ... snip ...
##
## 512   512        89      90         2, 7
## 513   513        91      93      3, 5, 2
## 514   514        94      94            8
## 515   515        95      96        10, 3
## 516   516        97     100   2, 3, 1, 7

基准

library(data.table);
library(microbenchmark);

bgoldst1 <- function(dt) dt[,.(StartTime=time[1L],EndTime=time[length(time)],Sequence=list(event)),.(SeqID=c(0L,cumsum(status[-length(status)]=='UNKNOWN'))+1L)];
bgoldst2 <- function(mydf) with(list(SeqID=c(0L,cumsum(mydf$status[-nrow(mydf)]=='UNKNOWN'))+1L),do.call(rbind,by(cbind(mydf,SeqID),SeqID,function(x) data.frame(SeqID=x$SeqID[1L],StartTime=x$time[1L],EndTime=x$time[length(x$time)],Sequence=I(list(x$event))))));
lebatsnok <- function(mydf) { mydfs <- split(mydf,  head(cumsum(c("", mydf$status) == "UNKNOWN"), -1)); res <- lapply(mydfs, function(x) data.frame(StartTime = x$time[1], EndTime = tail(x$time,1), SeqID = NA, Sequence = paste(x$event, collapse=","))); res <- do.call(rbind, res); res$SeqID <- seq_len(NROW(res)); res; };
set.seed(100L);
mydf <- data.frame(time=1:100,status=sample(c('OK','UNKNOWN'),1000L,T),event=sample(1:10,1000L,T),stringsAsFactors=F);
dt <- as.data.table(mydf);

ex <- as.data.frame(bgoldst1(dt)); o <- names(ex);
all.equal(ex,bgoldst2(mydf)[o],check.attributes=F);
## [1] TRUE
all.equal(transform(ex,Sequence=factor(sapply(Sequence,paste,collapse=','))),lebatsnok(mydf)[o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst1(dt),bgoldst2(mydf),lebatsnok(mydf));
## Unit: milliseconds
##             expr        min         lq       mean     median         uq        max neval
##     bgoldst1(dt)   1.363785   1.671909   1.896345   1.839763   2.041828   3.900621   100
##   bgoldst2(mydf) 217.960902 234.978058 244.491406 243.867674 251.392438 298.083774   100
##  lebatsnok(mydf) 254.961413 273.434086 284.439844 283.864322 291.889867 337.319627   100

答案 1 :(得分:2)

基础R解决方案(依赖于stringsAsFactorsFALSE,因此重新定义了mydf):

set.seed(100)
mydf <-data.frame(time=(1:100),
                  status = sample(c('OK','UNKNOWN'),1000,replace=TRUE),
                  event = sample(1:10,1000,replace=TRUE), stringsAsFactors=FALSE
)

mydfs <- split(mydf,  head(cumsum(c("", mydf$status) == "UNKNOWN"), -1))
res <- lapply(mydfs, function(x) 
          data.frame(StartTime = x$time[1], 
                     EndTime = tail(x$time,1), 
                     SeqID = NA, 
                     Sequence = paste(x$event, collapse=",")))
res <- do.call(rbind, res)
res$SeqID <- seq_len(NROW(res))
head(res)
#   StartTime EndTime SeqID Sequence
# 0         1       3     1    1,2,7
# 1         4       7     2  7,4,2,1
# 2         8       9     3      5,6
# 3        10      11     4     10,4
# 4        12      12     5        2
# 5        13      15     6   10,1,8