R:按索引对记录分组并更新记录

时间:2017-04-11 19:15:21

标签: r

我想按setID.x对记录进行分组。然后对于每个分组,我想分别计算VAR1和VAR2的总和,确定哪个总和更大,并求出两者的比率(较大的总和除以较小的总和)。例如,如果sum(VAR1)大于sum(VAR2),则ratio = sum(VAR1)/ sum(VAR2)。计算出比率后,总和较小的那个变量(本例中为VAR2)的每个值都将乘以该比率,而总和较大的变量保持不变。调整后的值将形成新的变量(newVAR1和newVAR2)。这样一来,两个新变量在每个分组内的总和就相同了。我如何在R中执行此操作?这是一个示例数据集:

dput(testfile)
structure(list(setID.x = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 
3L, 3L, 3L, 3L), VAR1 = c(NA, 30.96, 0.85, 17.78, 30.96, 0.85, 
1.23, 6.92, 0.64, 2.5, 1.52, 4.11, 0.35), VAR2 = c(NA, 0, 0.18, 
4.11, 0, 0.18, 1.09, 0.9, 2.24, 6.96, 8.89, 17.78, 3.72)), .Names =     c("setID.x", 
"VAR1", "VAR2"), row.names = c(NA, -13L), spec = structure(list(
    cols = structure(list(setID.x = structure(list(), class =     c("collector_integer", 
    "collector")), VAR1 = structure(list(), class = c("collector_double", 
    "collector")), VAR2 = structure(list(), class = c("collector_double", 
    "collector")), newVAR1 = structure(list(), class = c("collector_double", 
    "collector")), newVAR2 = structure(list(), class = c("collector_double", 
    "collector"))), .Names = c("setID.x", "VAR1", "VAR2", "newVAR1", 
    "newVAR2")), default = structure(list(), class = c("collector_guess", 
    "collector"))), .Names = c("cols", "default"), class = "col_spec"),     class = c("tbl_df", 
"tbl", "data.frame"))

这是我的预期输出:

dput(output)
  structure(list(setID.x = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 
  3L, 3L, 3L, 3L), VAR1 = c(NA, 30.96, 0.85, 17.78, 30.96, 0.85, 
  1.23, 6.92, 0.64, 2.5, 1.52, 4.11, 0.35), VAR2 = c(NA, 0, 0.18, 
  4.11, 0, 0.18, 1.09, 0.9, 2.24, 6.96, 8.89, 17.78, 3.72), newVAR1 = c(NA, 
  30.96, 0.85, 17.78, 30.96, 0.85, 1.23, 6.92, 2.78, 10.85, 6.6, 
  17.84, 1.52), newVAR2 = c(NA, 0, 2.08, 47.51, 0, 3.31, 20.07, 
  16.57, 2.24, 6.96, 8.89, 17.78, 3.72)), class = c("tbl_df", "tbl", 
  "data.frame"), row.names = c(NA, -13L), .Names = c("setID.x", 
  "VAR1", "VAR2", "newVAR1", "newVAR2"), spec = structure(list(
      cols = structure(list(setID.x = structure(list(), class =       c("collector_integer", 
      "collector")), VAR1 = structure(list(), class = c("collector_double", 
      "collector")), VAR2 = structure(list(), class = c("collector_double", 
      "collector")), newVAR1 = structure(list(), class = c("collector_double", 
      "collector")), newVAR2 = structure(list(), class = c("collector_double", 
      "collector"))), .Names = c("setID.x", "VAR1", "VAR2", "newVAR1", 
      "newVAR2")), default = structure(list(), class = c("collector_guess", 
      "collector"))), .Names = c("cols", "default"), class = "col_spec"))

1 个答案:

答案 0 :(得分:0)

这是使用dplyr快速完成的方法。

library(dplyr)
# For each setID.x group: total VAR1 and VAR2 (ignoring NA), then scale the
# variable with the smaller group total up by (larger total / smaller total)
# so that newVAR1 and newVAR2 have the same sum within the group.
# The result is still grouped by setID.x.
testfile %>%
  group_by(setID.x) %>%
  mutate(
    # Group totals; NA rows are skipped in the sums but stay NA in the output.
    sumVar1 = sum(VAR1, na.rm = TRUE),
    sumVar2 = sum(VAR2, na.rm = TRUE),
    # TRUE when VAR1 has the larger total, i.e. VAR2 is the one to scale up.
    test = sumVar1 > sumVar2,
    ratio = ifelse(test, sumVar1 / sumVar2, sumVar2 / sumVar1),
    newVAR1 = ifelse(test, VAR1, VAR1 * ratio),
    newVAR2 = ifelse(test, VAR2 * ratio, VAR2)
  ) %>%
  # Drop the helper columns (sumVar1, sumVar2, test, ratio).
  select(setID.x, VAR1, VAR2, newVAR1, newVAR2)

您得到如下结果。

# Source: local data frame [13 x 5]
# Groups: setID.x [3]
# 
# setID.x  VAR1  VAR2   newVAR1   newVAR2
# <int> <dbl> <dbl>     <dbl>     <dbl>
# 1        1    NA    NA        NA        NA
# 2        1 30.96  0.00 30.960000  0.000000
# 3        1  0.85  0.18  0.850000  2.080699
# 4        1 17.78  4.11 17.780000 47.509301
# 5        2 30.96  0.00 30.960000  0.000000
# 6        2  0.85  0.18  0.850000  3.314654
# 7        2  1.23  1.09  1.230000 20.072074
# 8        2  6.92  0.90  6.920000 16.573272
# 9        3  0.64  2.24  2.778246  2.240000
# 10       3  2.50  6.96 10.852522  6.960000
# 11       3  1.52  8.89  6.598333  8.890000
# 12       3  4.11 17.78 17.841546 17.780000
# 13       3  0.35  3.72  1.519353  3.720000