Question

我遇到了一个问题，希望它对资深 R 用户来说不算太难……

# Example data: one row per case/year; `level` varies within cases 1 and 2.
test.data <- data.frame(
  case  = rep(c(1, 2, 3), times = c(3, 3, 1)),
  year  = c(2006, 2007, 2008, 2007, 2006, 2008, 2006),
  level = c(10, 20, 20, 12, 20, 20, 20)
)

如您所见，每个案例（case）可能出现多次，以年份区分。同一案例内的 level 值不一致，我想把每个案例的所有 level 值都设为该案例的最小值来纠正这一点。在此示例中，case = 1 的所有 level 值都应为 10，case = 2 的所有 level 值都应为 12。对于某个具体的案例，我可以执行以下操作：

# Manual fix for a single case: overwrite every level of case 1 with that
# case's minimum level.
test.data$level <- with(
  test.data,
  replace(level, case == 1, min(level[case == 1]))
)

但由于我有几百个案例，逐个这样处理会花很长时间。因此，我想请教是否有更快的解决方案。

Answer 1

你可以尝试

 # setDT() turns test.data into a data.table; `:=` then overwrites level with
 # the minimum level within each case (the third argument is the grouping
 # column; na.rm = TRUE skips missing levels).
 library(data.table)
 setDT(test.data)[, level:= min(level, na.rm=TRUE), case]
 #    case year level
 #1:    1 2006    10
 #2:    1 2007    10
 #3:    1 2008    10
 #4:    2 2007    12
 #5:    2 2006    12
 #6:    2 2008    12
 #7:    3 2006    20

或使用dplyr

 library(dplyr)
 # Replace level with the per-case minimum (na.rm = TRUE skips missing levels),
 # then drop the grouping so later verbs operate on the whole table again.
 test.data %>%
   group_by(case) %>%
   mutate(level = min(level, na.rm = TRUE)) %>%
   ungroup()
 #   case year level
 #1    1 2006    10
 #2    1 2007    10
 #3    1 2008    10
 #4    2 2007    12
 #5    2 2006    12
 #6    2 2008    12
 #7    3 2006    20

或使用sqldf/dplyr

  library(sqldf)
  library(dplyr)
  # The subquery computes min(level) per case under the name Level; the left
  # join attaches it to every row, and select(-level) then drops the original
  # level column so that only Level remains (see the printed result).
  sqldf('select * from "test.data"
            left join(select "case", 
              min(level) as Level
              from "test.data" 
              group by "case")
            using ("case")') %>%
                         select(-level)
  #   case year Level
  #1    1 2006    10
  #2    1 2007    10
  #3    1 2008    10
  #4    2 2007    12
  #5    2 2006    12
  #6    2 2008    12
  #7    3 2006    20

根据 @G.Grothendieck 的建议修改后，只使用 sqldf：

  # sqldf-only variant: the subquery leaves its aggregate column unaliased, so
  # the outer select refers to it by the quoted name "min(level)" and renames
  # it to Level while listing only the columns to keep.
  sqldf('select "case", year, "min(level)" as Level 
            from "test.data" 
               left join(select "case", min(level)
                         from "test.data" 
                         group by "case") 
                     using ("case")')

  #   case year Level
  #1    1 2006    10
  #2    1 2007    10
  #3    1 2008    10
  #4    2 2007    12
  #5    2 2006    12
  #6    2 2008    12
  #7    3 2006    20

或使用base R

 # ave() returns, for every row, the minimum level within that row's case.
 test.data$level <- ave(test.data$level, test.data$case, FUN = min)

Answer 2

下面是使用 base R 函数的经典做法。

# may not be optimal for larger datasets due to merge
# Step 1: one row per case holding that case's minimum level.
min.lvl <- aggregate(level ~ case, data = test.data, FUN = min)
# Step 2: join the minima back onto the original rows. Both inputs have a
# level column, so the result carries level.x (original value) and level.y
# (per-case minimum) rather than a single corrected level column.
merge(x = test.data, y = min.lvl, by = "case", all.x = TRUE, sort = FALSE)

  case year level.x level.y
1    1 2006      10      10
2    1 2007      20      10
3    1 2008      20      10
4    2 2007      12      12
5    2 2006      20      12
6    2 2008      20      12
7    3 2006      20      20

第二种纯 base R（vanilla）的做法：

# Split the rows by case, set each piece's level to its own minimum, and
# keep the modified pieces in a by-object.
new.data <- by(test.data, test.data$case, function(grp) {
  grp$level <- min(grp$level)
  grp
})

# Stack the per-case pieces back into a single data frame.
do.call(rbind, new.data)

    case year level
1.1    1 2006    10
1.2    1 2007    10
1.3    1 2008    10
2.4    2 2007    12
2.5    2 2006    12
2.6    2 2008    12
3      3 2006    20

Answer 3

使用 doBy 的备选方案

library(doBy)
# Per-case minimum of level, with year carried along through id=.
# NOTE(review): in the printed result below, year repeats a single value per
# case (2006 / 2007 / 2006) instead of the original per-row years — confirm
# whether passing year via id= is really what is wanted here.
summaryBy(level ~ case, id=~ year, test.data, 
          full.dimension=TRUE,  keep.names=TRUE, min)

#   case level year
#1:    1    10 2006
#2:    1    10 2006
#3:    1    10 2006
#4:    2    12 2007
#5:    2    12 2007
#6:    2    12 2007
#7:    3    20 2006

或者更紧凑

library(plyr)
# For each case, mutate overwrites level with that case's minimum level;
# the other columns are left as they are (see the printed result).
ddply(test.data, .(case), mutate, level = min(level))

#  case year level
#1    1 2006    10
#2    1 2007    10
#3    1 2008    10
#4    2 2007    12
#5    2 2006    12
#6    2 2008    12
#7    3 2006    20

另一种使用 lapply 的 base R 方法

# Split the data by case, set each piece's level to its own minimum, then
# row-bind the pieces back together.
do.call(
  rbind,
  lapply(split(test.data, test.data$case), function(grp) {
    grp$level <- min(grp$level)
    grp
  })
)

#   case year level
#1:    1 2006    10
#2:    1 2007    10
#3:    1 2008    10
#4:    2 2007    12
#5:    2 2006    12
#6:    2 2008    12
#7:    3 2006    20

R：调整案例多次出现的值

3 个答案: