我想按 setID.x 对记录进行分组。对于每个分组,我想分别计算 VAR1 和 VAR2 的总和,确定哪个总和更大,并求出两者的比率(较大的总和除以较小的总和)。例如,如果 sum(VAR1) 大于 sum(VAR2),则 ratio = sum(VAR1) / sum(VAR2)。计算出比率后,将总和较小的变量(本例中为 VAR2)的各个值乘以该比率,调整后的值构成新变量。这样一来,两个新变量在每个分组内的总和就相同了。我如何在 R 中执行此操作?这是一个示例数据集:
# Sample input, shown as the output of dput(testfile): a tibble-classed data
# frame with 13 rows and the columns setID.x (integer group id), VAR1 and VAR2
# (doubles). Row 1 has NA in both VAR1 and VAR2.
# Note: the "spec" attribute also lists newVAR1/newVAR2 collectors even though
# the data itself holds no such columns.
dput(testfile)
structure(list(setID.x = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L), VAR1 = c(NA, 30.96, 0.85, 17.78, 30.96, 0.85,
1.23, 6.92, 0.64, 2.5, 1.52, 4.11, 0.35), VAR2 = c(NA, 0, 0.18,
4.11, 0, 0.18, 1.09, 0.9, 2.24, 6.96, 8.89, 17.78, 3.72)), .Names = c("setID.x",
"VAR1", "VAR2"), row.names = c(NA, -13L), spec = structure(list(
cols = structure(list(setID.x = structure(list(), class = c("collector_integer",
"collector")), VAR1 = structure(list(), class = c("collector_double",
"collector")), VAR2 = structure(list(), class = c("collector_double",
"collector")), newVAR1 = structure(list(), class = c("collector_double",
"collector")), newVAR2 = structure(list(), class = c("collector_double",
"collector"))), .Names = c("setID.x", "VAR1", "VAR2", "newVAR1",
"newVAR2")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"), class = c("tbl_df",
"tbl", "data.frame"))
这是我的预期输出:
# Expected result, shown as the output of dput(output): the same 13 rows as the
# input plus the columns newVAR1 and newVAR2.
# For setID.x 1 and 2, newVAR1 equals VAR1 and newVAR2 holds the scaled VAR2
# values; for setID.x 3, newVAR2 equals VAR2 and newVAR1 holds the scaled VAR1
# values. The scaled values appear to be rounded to two decimals.
dput(output)
structure(list(setID.x = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L), VAR1 = c(NA, 30.96, 0.85, 17.78, 30.96, 0.85,
1.23, 6.92, 0.64, 2.5, 1.52, 4.11, 0.35), VAR2 = c(NA, 0, 0.18,
4.11, 0, 0.18, 1.09, 0.9, 2.24, 6.96, 8.89, 17.78, 3.72), newVAR1 = c(NA,
30.96, 0.85, 17.78, 30.96, 0.85, 1.23, 6.92, 2.78, 10.85, 6.6,
17.84, 1.52), newVAR2 = c(NA, 0, 2.08, 47.51, 0, 3.31, 20.07,
16.57, 2.24, 6.96, 8.89, 17.78, 3.72)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -13L), .Names = c("setID.x",
"VAR1", "VAR2", "newVAR1", "newVAR2"), spec = structure(list(
cols = structure(list(setID.x = structure(list(), class = c("collector_integer",
"collector")), VAR1 = structure(list(), class = c("collector_double",
"collector")), VAR2 = structure(list(), class = c("collector_double",
"collector")), newVAR1 = structure(list(), class = c("collector_double",
"collector")), newVAR2 = structure(list(), class = c("collector_double",
"collector"))), .Names = c("setID.x", "VAR1", "VAR2", "newVAR1",
"newVAR2")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
答案 0 :(得分:0)
这是使用dplyr快速完成的方法。
library(dplyr)
# Within each setID.x group, scale whichever of VAR1 / VAR2 has the smaller
# group total so that newVAR1 and newVAR2 end up with the same group total.
# The result keeps the setID.x grouping and the five columns selected below.
testfile %>%
  group_by(setID.x) %>%
  mutate(
    # Per-group totals; NA entries are skipped. Spell out TRUE: `T` is an
    # ordinary variable that can be reassigned.
    sumVar1 = sum(VAR1, na.rm = TRUE),
    sumVar2 = sum(VAR2, na.rm = TRUE),
    # TRUE when VAR1 has the larger total, i.e. VAR2 is the column to scale.
    test = sumVar1 > sumVar2,
    # Larger total divided by the smaller one (1 when the totals are equal).
    # NOTE(review): a smaller total of 0 yields Inf/NaN here -- confirm that
    # such groups cannot occur in the real data.
    ratio = ifelse(test, sumVar1 / sumVar2, sumVar2 / sumVar1),
    # Only the column with the smaller total is multiplied by the ratio.
    newVAR1 = ifelse(test, VAR1, VAR1 * ratio),
    newVAR2 = ifelse(test, VAR2 * ratio, VAR2)
  ) %>%
  # Drop the helper columns (sumVar1, sumVar2, test, ratio).
  select(setID.x, VAR1, VAR2, newVAR1, newVAR2)
您得到如下结果。
# Source: local data frame [13 x 5]
# Groups: setID.x [3]
#
# setID.x VAR1 VAR2 newVAR1 newVAR2
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 NA NA NA NA
# 2 1 30.96 0.00 30.96 0.000000
# 3 1 0.85 0.18 0.85 2.080699
# 4 1 17.78 4.11 17.78 47.509301
# 5 2 30.96 0.00 30.96 0.000000
# 6 2 0.85 0.18 0.85 3.314654
# 7 2 1.23 1.09 1.23 20.072074
# 8 2 6.92 0.90 6.92 16.573272
# 9 3 0.64 2.24 2.778246 2.240000
# 10 3 2.50 6.96 10.852522 6.960000
# 11 3 1.52 8.89 6.598333 8.890000
# 12 3 4.11 17.78 17.841546 17.780000
# 13 3 0.35 3.72 1.519353 3.720000