Question

我正在处理一个记录患者健康状态随时间变化的数据集。我想构建一个数据框，表示从当前健康状态到下一个健康状态的转换。

下面是一个示例，其中健康状态仅通过 AFP 水平和体重来衡量。健康状态的测量数据可能如下所示：

**************setting things up**************
INFO: preparing database
INFO: create new users
**************end of setup****************
INFO: starting test one
INFO: ...
**************Cleaning things**************
INFO: delete users
...

所需的输出如下所示：

# Sample measurements: two patients (id 1 and 2), each observed on days 1-3.
# The "death" row for patient 1 has NA for both afp and weight.
x <- data.frame(
  id = rep(c(1, 2), each = 3),
  day = rep(c(1, 2, 3), times = 2),
  event = c(rep("status", 2), "death", rep("status", 3)),
  afp = c(10, 50, NA, 20, 30, 40),
  weight = c(100, 105, NA, rep(200, 3))
)

获得输出的一种低效方法是：

将测量数据框与其自身做笛卡尔积（交叉连接）
仅保留匹配ID的行，并且day.x + 1 = day.y
重命名列

是否有更有效的方法来获得输出？

注意：实际的测量数据框可能有 10 列以上，因此逐列显式地编写代码在代码行数上并不高效。

# Transition table: one row per (current state, next state) pair of a patient.
# The row whose next_event is "death" has NA for next_afp and next_weight.
y <- data.frame(
  id = rep(c(1, 2), each = 2),
  current_afp = c(10, 50, 20, 30),
  current_weight = c(100, 105, rep(200, 2)),
  next_event = c("status", "death", "status", "status"),
  next_afp = c(50, NA, 30, 40),
  next_weight = c(105, NA, rep(200, 2))
)

等等。

Answer 1

你可以尝试：

library(dplyr)

# Build the (current state, next state) table with dplyr.
#
# For every patient, each row is paired with the values of that patient's
# following row. Compared with the mutate_each()/funs() version this:
#   * uses across(), since mutate_each() and funs() are deprecated;
#   * computes lead() within each id, so values never leak across patients;
#   * renames columns by name rather than by position, so it keeps working
#     when the data frame has more measurement columns than afp and weight.
x %>%
  # lead() is positional, so rows must be in day order within each patient.
  arrange(id, day) %>%
  group_by(id) %>%
  # Grouping column `id` is excluded by across() automatically; `day` is
  # excluded explicitly. Every other column gets a "next_" companion.
  mutate(across(-day, lead, .names = "next_{.col}")) %>%
  # The last day of each patient has no following state.
  filter(day != last(day)) %>%
  ungroup() %>%
  # The current event is not part of the requested output.
  select(-event) %>%
  rename_with(
    function(nm) paste0("current_", nm),
    .cols = -c(id, day, starts_with("next_"))
  )

给出了：

#  id day current_afp current_weight next_event next_afp next_weight
#1  1   1          10            100     status       50         105
#2  1   2          50            105      death       NA          NA
#3  2   1          20            200     status       30         200
#4  2   2          30            200     status       40         200

Answer 2

将base R与split-apply-combine方法结合使用

# Build one (current state, next state) table per patient, then stack them.
# Column 2 of x (`day`) is dropped up front: consecutive rows of a patient are
# paired as consecutive states, so this assumes x is ordered by day within
# each id.
res <- lapply(split(x[-2], x$id), function(y) {
  value_cols <- colnames(y)[-1]
  # head(y, -1) / tail(y, -1) pair every row with its successor. Unlike
  # 1:(nrow(y) - 1), they give zero rows (not a bogus row) when a patient has
  # only a single measurement.
  xx <- cbind(head(y, -1), tail(y, -1)[-1])
  colnames(xx) <- c("id",
                    paste0("current_", value_cols),
                    paste0("next_", value_cols))
  # The current event is not part of the requested output.
  xx[, colnames(xx) != "current_event", drop = FALSE]
})
do.call(rbind, res)

  id current_afp current_weight next_event next_afp next_weight
1  1          10            100     status       50         105
2  1          50            105      death       NA          NA
3  2          20            200     status       30         200
4  2          30            200     status       40         200

或者，下面是一个并非所有天数都连续的例子：

# Sample measurements with a gap: patient 2 is observed on days 1, 2 and 4
# (there is no row for day 3).
x <- data.frame(
  id = rep(c(1, 2), each = 3),
  day = c(1, 2, 3, 1, 2, 4),
  event = c(rep("status", 2), "death", rep("status", 3)),
  afp = c(10, 50, NA, 20, 30, 40),
  weight = c(100, 105, NA, rep(200, 3))
)
x
  id day  event afp weight
1  1   1 status  10    100
2  1   2 status  50    105
3  1   3  death  NA     NA
4  2   1 status  20    200
5  2   2 status  30    200
6  2   4 status  40    200

某些转换的值为 NA，如有需要可以将其删除。

# Build the (current state, next state) table when days may be missing.
# Each patient's rows are first padded so that every day from 1 up to the
# last observed day is present; days without a measurement become all-NA
# rows, which in turn produce NA transitions in the result.
res <- lapply(split(x, x$id), function(y) {
  all_days <- data.frame(id = unique(y$id), day = seq_len(max(y$day)))
  # merge() sorts by the key columns, so rows come back in day order.
  y <- merge(all_days, y, by = c("id", "day"), all.x = TRUE)
  # `day` is only needed for padding; drop it by name before pairing rows.
  y <- y[, colnames(y) != "day", drop = FALSE]
  value_cols <- colnames(y)[-1]
  # head(y, -1) / tail(y, -1) pair every row with its successor. Unlike
  # 1:(nrow(y) - 1), they give zero rows (not a bogus row) when only one day
  # is present.
  xx <- cbind(head(y, -1), tail(y, -1)[-1])
  colnames(xx) <- c("id",
                    paste0("current_", value_cols),
                    paste0("next_", value_cols))
  # The current event is not part of the requested output.
  xx[, colnames(xx) != "current_event", drop = FALSE]
})
do.call(rbind, res)
    id current_afp current_weight next_event next_afp next_weight
1.1  1          10            100     status       50         105
1.2  1          50            105      death       NA          NA
2.1  2          20            200     status       30         200
2.2  2          30            200       <NA>       NA          NA
2.3  2          NA             NA     status       40         200

在R中构建（当前状态，下一状态）数据帧的有效方法

2 个答案: