连续序列未在R中先验定义

时间:2017-03-31 13:16:51

标签: r sequence

对于每个主题和每个试用版ID,我需要检查唯一的 Current_Area_ID 是否等于唯一( Next_Area_ID - 1)。如果是这种情况,在 NewColumn (这是我想用R代码获得的输出)中,我将为该观察指定值0,如果不是这样,则在新列中我将分配值1. Current_Area_ID Next_Area_ID 都是数字类的列。

然而,棘手的部分是我需要考虑区域的独特价值。在以下示例中,带**的数字代表特定情况。当重复数字3时,我可以接受这些情况并在NewColumn中指定值0,因为在唯一值3之后的Next_Area_ID是4.但是,当我们考虑数字6时,重复但后面跟着数字8 (而不是数字7),那么我需要为所有相应的6号行分配值1。

Current_Area_ID   Next_Area_ID  NewColumn
   1              8             1
   8              1             1
   1              1             1
   2              3             0
 **3**          **3**           0
 **3**          **3**           0
 **3**          **4**           0
   4              5             0
   5              6             0
 **6**          **6**           1
 **6**          **6**           1
 **6**          **8**           1
   7              9             1
   8              10            1
   9              11            1
   10             11            0
   10             1             1
   11             1             1
   11             1             1
   11             1             1
   1              1             1
   1              1             1
   1              1             1

这也是一个可重复的例子:

Current_Area_ID <- c(1,8,1,2,3,3,3,4,5,6,6,6,7,8,9,10,10,11,11,11,1,1,1,1,1,1,1,1,5,5,5,6,8,8,10,11,12,12,11,13,1,1,1,1,1,4,5,6,6,8,8,6,6,8,9,9,11,10,11,11,1,1,1,1,1,1,1,5,5,6,7,7,7,6,7,8,8,11,11,12,13,13,13,1,5,1,5,6,7,7,6,1,1,1,4,4,4,5,5,6,6,7,7,9,10,9,11,11,12,1,1,1,4,5,6,7,7,9,10,10,10,11,12,12,12,12,12,11,12,1,1,1,1,1,3,4,4,5,6,6,8,8,9,10,11,11,11,1,11,5,3,3,3,6,5,6,8,9,8,6,6,8,9,10,5,3,3,4,1,1,1,1,5,6,6,7,8,8,10,10,11,11,12,12,13,13,12,1,1,1,1,1,1,1,1,1,3,3,3,3,4,4,5,5,4,6,6,7,8,8,8,9,9,10,10,10,6,4,3,6,7,7,3,3,3,5,6,1,1,1,3,4,5,5,3,6,5,3,6,6,6,6,9,10,10,11,11,12,12,1,12,1,1,1,1,1,4,4,5,6,6,7,7,9,9,10,11,11,11,1,6,4,5,4,6,7,6,6,5,1,1,3,3,4,3,4,5,5,7,7,8,8,9,9,10,10,1,1,1,1,1,1,1,1,4,5,5,6,7,6,7,7,9,10,10,11,11,12,12,1,6,4,4,5,5,6,5,1,1,1,1,1,7,1,7,8,8,9,9,10,12,13,13,14,15,1,9,1,1,1,1,4,4,5,6,7,7,7,7,8,11,12,13,13,14,1,1,1,7,6,4,4,4,6,7,7,7,7,4,5,4,4,1,1,4,4,4,5,6,6,7,8,9,9,10,10,10,11,11,11,1,1,1,1,1,1,1,1,6,6,7,8,8,9,11,11,12,14,1,11,6,1,1,7,8,9,11,12,1,1,1,1,1,1,4,5,6,7,7,7,7,10,11,12,12,11,12,13,1,1,1,5,6,7,7,10,12,7,5,6,1,1,1,1,4,4,5,6,5,5,6,7,7,10,11,12,12,13,1,1,5,1,1,1,3,3,4,4,4,5,6,8,8,9,9,9,10,11,11,11,1,1,1,1,1,1,1,1,1,1,4,1,5,5,6,7,7,7,9,10,11,11,12,12,11,11,12,1,1,7,4,5,7,7,9,7,6,1,1,1,1,1,1,1,3,4,5,6,6,6,8,8,10,10,11,11,1,1,1,1,1,1,3,3,4,4,5,5,6,6,6,6,8,9,10,9,10,9,10,11,11,1,1,1,1,3,4,4,4,4,4,5,6,6,6,9,9,10,11,11,12,12,12,1,12,9,6,5,1,1,1,3,4,5,5,5,5,6,6,8,9,10,10,11,11,1,1,1,1,3,3,3,4,4,5,6,6,6,6,6,8,8,8,9,10,11,11,11,1,5,3,4,5,6,6,8,9,10,6,6,3,5,1,1,1,1,3,4,4,4,5,6,8,8,9,10,11,1,1,3,4,5,6,9,9,3,3,1,1,1,1,1,3,4,5,5,4,4,4,3,4,6,6,6,9,9,10,10,11,12,12,1,6,4,1,1,1,1,5,6,6,7,8,8,10,11,12,13,12,13,1,8,1,5,8,10,11,10,10,7,10,7,6,1,1,1,1,1,6,6,7,7,8,8,9,9,11,12,12,13,1,1,1,1,1,3,4,4,3,3,4,4,5,6,5,6,6,8,9,9,10,11,11,11,11,1,1,1,1,1,1,3,3,4,5,5,6,6,8,9,9,10,11,10,11,11,11,1,1,1,9,1,1,1,4,5,5,6,7,7,7,9,10,11,11,12,1,1,1,1,1,1,3,3,4,4,5,5,6,7,7,8,8,9,9,10,10,10,1,1,5,1,4,1,1,1,1,1,1,4,4,5,6,5,6,7,7,9,10,11,12,11,10,11,12,12,1,1,1,1,1,3,3,4,5,4,3,4,4,5,6,6,10,10,10,12,12,12,11,10,12,12,12,1,1,1,1,1,1,1,1,4,5,5,6,7,8,9,9,10,11,11,11,1,1,1,1,4,4,5,5,6,7,7,9,10,11,12,12,12,1,1,1,6,1,1,1,4,4,5,6,7,7,9,10,11,12,1,1,1,1,1,1,3,4,4,5,5,4,5,6,6,8,9,10,11,11,1,11,11,11,1,1,4,4,5,5,6,7,9,9,10,11,12,12,12,11,12,1,1)
Next_Area_ID <- c(8,1,1,3,3,3,4,5,6,6,6,8,9,10,10,11,11,11,1,1,1,1,1,1,1,1,5,5,5,6,8,8,10,11,12,12,11,13,1,1,NA,1,1,4,5,6,6,8,8,6,6,8,9,9,11,10,11,11,1,1,1,1,1,1,1,5,5,6,7,7,7,6,7,8,8,11,11,12,13,13,13,1,5,1,5,6,7,7,6,NA,1,1,4,4,4,5,5,6,6,7,7,9,10,9,11,11,12,NA,1,1,4,5,6,7,7,9,10,10,10,11,12,12,12,12,12,11,12,1,1,NA,1,1,3,4,4,5,6,6,8,8,9,10,11,11,11,1,11,5,3,3,3,6,5,6,8,9,8,6,6,8,9,10,5,3,3,4,NA,1,1,1,5,6,6,7,8,8,10,10,11,11,12,12,13,13,12,1,1,1,NA,1,1,1,1,1,3,3,3,3,4,4,5,5,4,6,6,7,8,8,8,9,9,10,10,10,6,4,3,6,7,7,3,3,3,5,6,1,1,1,3,4,5,5,3,6,5,3,6,6,6,6,9,10,10,11,11,12,12,1,12,1,1,NA,1,1,4,4,5,6,6,7,7,9,9,10,11,11,11,1,6,4,5,4,6,7,6,6,5,1,1,3,3,4,3,4,5,5,7,7,8,8,9,9,10,10,1,1,1,1,1,NA,1,1,4,5,5,6,7,6,7,7,9,10,10,11,11,12,12,1,6,4,4,5,5,6,5,NA,1,1,1,1,7,1,7,8,8,9,9,10,12,13,13,14,15,1,9,NA,1,1,1,4,4,5,6,7,7,7,7,11,12,13,13,14,1,1,1,7,6,4,4,4,6,7,7,7,7,4,5,4,4,1,1,4,4,4,5,6,6,7,9,9,10,10,10,11,11,11,1,1,1,1,1,NA,1,1,6,6,7,8,8,9,11,11,12,14,1,11,6,1,1,7,8,9,11,12,1,1,1,1,1,1,4,5,6,7,7,7,7,10,11,12,12,11,12,13,1,1,1,5,6,7,7,10,12,7,5,6,NA,1,1,1,4,4,5,6,5,5,6,7,7,10,11,12,12,13,1,1,5,NA,1,1,3,3,4,4,4,5,6,8,8,9,9,9,10,11,11,11,1,1,1,1,1,1,1,1,1,1,4,1,5,5,6,7,7,7,9,10,11,11,12,12,11,11,12,1,1,7,4,5,7,7,9,7,6,1,1,1,1,1,1,1,3,4,5,6,6,6,8,8,10,10,11,11,1,1,1,1,1,1,3,3,4,4,5,5,6,6,6,6,8,9,10,9,10,9,10,11,11,1,1,NA,1,3,4,4,4,4,4,5,6,6,6,9,9,10,11,11,12,12,12,1,12,9,6,5,1,1,1,3,4,5,5,5,5,6,6,8,9,10,10,11,11,1,1,1,NA,3,3,3,4,4,5,6,6,6,6,6,8,8,8,9,10,11,11,11,1,5,3,4,5,6,6,8,9,10,6,6,3,5,NA,1,1,1,3,4,4,4,5,6,8,8,9,10,11,1,1,3,4,5,6,9,9,3,3,1,1,1,1,1,3,4,5,5,4,4,4,3,4,6,6,6,9,9,10,10,11,12,12,1,6,4,NA,1,1,1,5,6,6,7,8,8,10,11,12,13,12,13,1,8,1,5,8,10,11,10,10,7,10,7,6,1,1,1,1,1,6,6,7,7,8,8,9,9,11,12,12,13,1,1,1,1,1,3,4,4,3,3,4,4,5,6,5,6,6,8,9,9,10,11,11,11,11,1,1,1,1,1,1,3,3,4,5,5,6,6,8,9,9,10,11,10,11,11,11,1,1,1,9,1,1,1,4,5,5,6,7,7,7,9,10,11,11,12,1,1,1,1,1,1,3,3,4,4,5,5,6,7,7,8,8,9,9,10,10,10,1,1,5,1,4,1,1,1,1,1,1,4,4,5,6,5,6,7,7,9,10,11,12,11,10,11,12,12,1,1,1,1,1,3,3,4,5,4,3,4,4,5,6,6,10,10,10,12,12,12,11,10,12,12,12,1,1,1,12,1,1,1,1,4,5,5,6,7,8,9,9,10,11,11,11,12,1,1,1,4,4,5,5,6,7,7,9,10,11,12,12,12,1,1,1,6,12,1,1,4,4,5,6,7,7,9,10,11,12,1,1,1,1,1,1,3,4,4,5,5,4,5,6,6,8,9,10,11,11,1,11,11,11,12,1,4,4,5,5,6,7,9,9,10,11,12,12,12,11,12,1,1,1,1,1,1)
Subject <- rep(c(1,2), each=500)
Trial <- rep(1:25,each=20)
DataFrame <- data.frame(cbind(Current_Area_ID,Next_Area_ID,Subject,Trial))

为了创建 NewColumn ,我尝试使用:

library(dplyr)

DataFrame<-{DataFrame %>%
    group_by(Subject, Trial) %>% 
    mutate(NewColumn = ifelse(
    unique(DataFrame$Current_Area_ID) == unique(DataFrame$Next_Area_ID - 1),
    0, 1
))}

但它没有用。

4 个答案:

答案 0 :(得分:4)

使用包data.table的潜在解决方案:

如果我理解正确,根据您的评论,您可以为相同的Current_Area_ID / Subject / Trial提供不同的值。如果NewColumn + 1或Current_Area_IDCurrent_Area_ID相同,则Current_Area_ID需要为0,但在Subject之后的某行上为Trial + 1重复的)同样的价值 我们仍然需要# checking DataFrame is ordered by Subject and Trial: all(DataFrame==DataFrame[order(DataFrame$Subject, DataFrame$Trial), ], na.rm=TRUE) # [1] TRUE # turning DataFrame into a data.table (as previously done) and applying for # each unique Subject/Trial a function which split the 2 "Area Column" # according to unique consecutive Current_Value and assign 0 or 1 depending on whether the constraints are fulfilled: library(data.table) setDT(DataFrame)[, NewColumn:= unlist(lapply(split(.SD, rep(seq_along(rle(Current_Area_ID)$values), rle(Current_Area_ID)$lengths)), function(spdf) { spdf[, diff:=Next_Area_ID-Current_Area_ID] wh1 <- tail(which(spdf$diff==1), 1) # Initiate the Newcolumn to 1 for every row spnc <- rep(1, nrow(spdf)) # if there is a Next_Area consecutive to Current, put everything before to 0, # but only if it is either same as current, NA or consecutive to current if(length(wh1)) spnc[1:wh1][is.na(spdf$diff[1:wh1]) | (spdf$diff[1:wh1] %in% c(0, 1))] <- 0 spnc })) , by=c("Subject", "Trial"), .SDcols=c("Current_Area_ID", "Next_Area_ID")] cbind(Row=c(29:31, 35:40, 49:51), DataFrame[c(29:31, 35:40, 49:51)]) # Row Current_Area_ID Next_Area_ID Subject Trial NewColumn # 1: 29 5 5 1 2 0 # 2: 30 5 6 1 2 0 # 3: 31 5 8 1 2 1 # 4: 35 10 12 1 2 1 # 5: 36 11 12 1 2 0 # 6: 37 12 11 1 2 1 # 7: 38 12 13 1 2 0 # 8: 39 11 1 1 2 1 # 9: 40 13 1 1 2 1 #10: 49 6 8 1 3 1 #11: 50 8 6 1 3 1 #12: 51 8 6 1 3 1

NA

检查您提及的行:

all(as.data.frame(data_proc[, 2:6])==as.data.frame(DataFrame), na.rm=TRUE)
[1] TRUE

与@NickKennedy结果相比: 除了NA外,一切都是一样的:

sum(is.na(data_proc[, 2:6])) 
# [1] 34
sum(is.na(DataFrame))
# [1] 17

查看diffNA <- which(rowSums(is.na(data_proc[, 2:6])) != rowSums(is.na(DataFrame))) head(as.data.frame(data_proc[diffNA, 2:6])) # Current_Area_ID Next_Area_ID Subject Trial NewColumn #1 1 NA 1 3 NA #2 7 NA 1 5 NA #3 11 NA 1 6 NA #4 1 NA 1 7 NA #5 3 NA 1 9 NA #6 1 NA 1 10 NA head(as.data.frame(DataFrame[diffNA])) # Current_Area_ID Next_Area_ID Subject Trial NewColumn #1 1 NA 1 3 1 #2 7 NA 1 5 1 #3 11 NA 1 6 1 #4 1 NA 1 7 1 #5 3 NA 1 9 1 #6 1 NA 1 10 1 s:

Subject

由于NAs而不相同的行的比较:

Trial

上一个回答/编辑:

检查每个Current_Area_ID / Next_Area_ID / Current_Area_ID,如果任何Next_Area_ID满足条件 1等于({{1} }} - 1)并相应地将0Newcolumn分配给%in%。我们使用Next_Area_ID来避免在NAlibrary(data.table) setDT(DataFrame)[, NewColumn:=as.integer(!any((Current_Area_ID-Next_Area_ID+1) %in% 0)), by=c("Current_Area_ID", "Subject", "Trial")] DataFrame[1:20] # Current_Area_ID Next_Area_ID Subject Trial NewColumn # 1: 1 8 1 1 1 # 2: 8 1 1 1 1 # 3: 1 1 1 1 1 # 4: 2 3 1 1 0 # 5: 3 3 1 1 0 # 6: 3 3 1 1 0 # 7: 3 4 1 1 0 # 8: 4 5 1 1 0 # 9: 5 6 1 1 0 #10: 6 6 1 1 1 #11: 6 6 1 1 1 #12: 6 8 1 1 1 #13: 7 9 1 1 1 #14: 8 10 1 1 1 #15: 9 10 1 1 0 #16: 10 11 1 1 0 #17: 10 11 1 1 0 #18: 11 11 1 1 1 #19: 11 1 1 1 1 #20: 11 1 1 1 1 时获得NA。

NA

修改

如果您需要最后一个非Current_Area_ID值为NA + 1,而其他所有值与当前值相同或setDT(DataFrame)[, NewColumn:=as.integer(!(tail(na.omit(Current_Area_ID-Next_Area_ID+1), 1) %in% 0 & all(head(na.omit(Current_Area_ID-Next_Area_ID), -1) %in% 0))), by=c("Current_Area_ID", "Subject", "Trial")]

.Result

答案 1 :(得分:1)

这是使用dplyr的解决方案。

library(dplyr)
# First define a function to check whether the constraints are met
# This assumes that any rows with consecutive `Current_Area_ID` and `Next_Area_ID`
# should be zero, and any preceeding rows within that group where
# `Current_Area_ID` == `Next_Area_ID` should also be zero.
# Rows with `NA` as `Next_Area_ID` will come out as `NA`
check_areas <- function(cur, nxt) {
  ok <- nxt == cur + 1
  if (any(ok, na.rm = TRUE) && min(which(ok)) > 1L) {
    prev <- 1:min(which(ok) - 1)
    ok[prev] <- nxt[prev] == cur[prev]
  }
  as.integer(!ok)
}
# Now add a running sequence number to the data, `group_by` that and then
# add in the new column
data_proc <- DataFrame %>%
  mutate(Seq = cumsum(c(1, .$Current_Area_ID[-1] != .$Current_Area_ID[-nrow(DataFrame)] |
                          .$Subject[-1] != .$Subject[-nrow(DataFrame)] |
                          .$Trial[-1] != .$Trial[-nrow(DataFrame)]))) %>%
  group_by(Seq) %>%
  mutate(NewColumn = check_areas(Current_Area_ID, Next_Area_ID)) %>%
  select(-Seq)

注意这与@ Cath的回答有所不同,例如第37行,其中我返回1(正如@dede的评论

所示)
  如你所描述的那样,

第30行完美无缺,因为Current_Area和   Next_Area是连续的。但是,在第31行Current_Area = 5和   Next_Area = 8(而不是6)。在第37行,Current_Area = 12和   Next_Area = 11(而不是13)。在第39行Current_Area = 11和   Next_Area = 1(而不是12)。在第50行,Current_Area = 8和   Next_Area = 6(而不是9)。在所有这些例子中,应该有一个   NewColumn中的值为1,因为Current_Area和New_Area不是   连续的。

答案 2 :(得分:0)

我同意@Sotos,我并不是100%明白你想要获得NewColumn,但我看了你提供的内容并试图推理出来,我想这可能是做你想做的......

NewColumn <- rep(1, length(Current_Area_ID))
for(i in length(Current_Area_ID):1) {

  if(!is.na(Next_Area_ID[i])){

    if(Current_Area_ID[i] == (Next_Area_ID[i]-1)) NewColumn[i] <- 0

    if(!is.na(Next_Area_ID[i+1])){
      if(Current_Area_ID[i + 1] == Current_Area_ID[i] &
             NewColumn[i + 1] == 0) NewColumn[i] <- 0
    }
  }
}

答案 3 :(得分:0)

请找到这个可能的解决方案:

从您的数据中创建一个主键来区分所有不同的情况:

DataFrame$PK <- paste(DataFrame[,1],DataFrame[,2],DataFrame[,3],DataFrame[,4], sep = "_")

然后检查每个不同情况下的任何行是否满足您的条件(Next_Area_ID-1 == Current_Area_ID)并创建结果表

library(plyr)
result_table     <- ddply(unique(DataFrame), 
                 .(PK),
                 summarise, 
                 test = any((Next_Area_ID-1) == Current_Area_ID) ,
                 .parallel = F )

然后根据主键合并结果和原始数据,计算NewColumn并清理DF:

# creation of NewColumn
DataFrame$NewColumn <- 1
# assignation of 0 if needed
DataFrame$NewColumn[DataFrame$test] <- 0
# clean up 
DataFrame <-DataFrame[,c(-1,-6)] 
# quick test                                      
DataFrame[DataFrame$NewColumn == 0,]  

似乎有效