我有两个大型数据框,结构如下。对于 DF1 中 variable 为 VA 的每一行,我想把 Sec1 至 Sec3 列中的值除以 DF2 中相同 name 所对应的 value 值;其他行保持不变。
Df1
name variable year Sec1 Sec2 Sec3
CHN VA 1950 23 45 32
CHN VA 1951 43 45 67
CHN E 1950 45 67 87
CHN E 1951 34 53 62
IND VA 1950 45 56 24
DF2
name value
CHN 3
IND 6
MLY 7
EUR 4
结果应该类似于
DF1
name variable year Sec1 Sec2 Sec3
CHN VA 1950 23/3 45/3 32/3
CHN VA 1951 43/3 45/3 67/3
CHN E 1950 45 67 87
CHN E 1951 34 53 62
IND VA 1950 45/6 56/6 24/6
我尝试过先取子集(subsetting)再用 cbind() 合并,但是遇到了问题。
答案 0 :(得分:4)
1)将 DF1 与 DF2 左联接,然后将每个 Sec 列除以 value,最后删除 value 列。
library(dplyr)
# Attach each name's divisor from DF2, replace it with 1 on rows that are not
# "VA" (so those rows stay unchanged), divide every Sec* column by it, and
# finally drop the helper column.
left_join(DF1, DF2, by = "name") %>%
  mutate(value = if_else(variable != "VA", 1L, value)) %>%
  mutate_at(vars(starts_with("Sec")), ~ . / value) %>%
  mutate(value = NULL)
结果为:
name variable year Sec1 Sec2 Sec3
1 CHN VA 1950 7.666667 15.000000 10.66667
2 CHN VA 1951 14.333333 15.000000 22.33333
3 CHN E 1950 45.000000 67.000000 87.00000
4 CHN E 1951 34.000000 53.000000 62.00000
5 IND VA 1950 7.500000 9.333333 4.00000
2)基本R版本为:
# Base R: attach each name's divisor with merge(), then divide only the
# Sec* columns of the "VA" rows.
# merge() does not keep the input row order (by default it sorts on the key),
# so tag every row with its original position and restore DF1's order
# afterwards.
m <- merge(cbind(DF1, .row = seq_len(nrow(DF1))), DF2,
           by = "name", all.x = TRUE, all.y = FALSE)
m <- m[order(m$.row), ]
rownames(m) <- NULL
# which() drops NA comparisons; a logical row index containing NA would make
# the data-frame assignment below fail.
ix <- which(m$variable == "VA")
jx <- grep("^Sec", names(m))
m[ix, jx] <- m[ix, jx] / m$value[ix]
# Keep only DF1's original columns (drops `value` and the `.row` helper).
m <- m[names(DF1)]
3)另一种方法是将其转换为长格式,执行联接和除法,然后转换回去。请注意,这将对行进行重新排序。
library(dplyr)
library(tidyr)
# Reshape to long form (one row per name/variable/year/Sec column), look up
# each name's divisor, divide, and reshape back to wide form.
DF1 %>%
  gather(key, val, -name, -variable, -year) %>%
  left_join(DF2, by = "name") %>%
  # Use a divisor of 1 on rows that are not "VA" so they stay unchanged.
  mutate(value = if_else(variable == "VA", value, 1L)) %>%
  mutate(val = val / value) %>%
  # Drop the helper divisor before spreading; otherwise it survives as an
  # extra `value` column in the result.
  select(-value) %>%
  spread(key, val)
4)如果您不介意把每一列的计算逐个写出来:
library(dplyr)
# Same idea as the join-based solution, but with one explicit expression per
# Sec column instead of a column selector.
left_join(DF1, DF2, by = "name") %>%
  mutate(value = if_else(variable != "VA", 1L, value)) %>%
  mutate(
    Sec1 = Sec1 / value,
    Sec2 = Sec2 / value,
    Sec3 = Sec3 / value
  ) %>%
  select(-value)
4a)或使用 base R:
# Base R, with one explicit expression per Sec column.
# merge() does not keep the input row order (by default it sorts on the key),
# so tag every row with its original position and restore DF1's order
# afterwards.
m <- merge(cbind(DF1, .row = seq_len(nrow(DF1))), DF2,
           by = "name", all.x = TRUE, all.y = FALSE)
m <- m[order(m$.row), ]
m$.row <- NULL
rownames(m) <- NULL
# Use a divisor of 1 on rows that are not "VA" so they stay unchanged.
m <- transform(m, value = ifelse(variable == "VA", value, 1))
# Setting `value = NULL` removes the helper column from the result.
transform(m, Sec1 = Sec1 / value, Sec2 = Sec2 / value, Sec3 = Sec3 / value,
          value = NULL)
# Reproducible input for the solutions above: DF1 holds the measurements,
# DF2 holds one divisor per name. Character columns are kept as character
# (not converted to factors).
Lines1 <- "name variable year Sec1 Sec2 Sec3
CHN VA 1950 23 45 32
CHN VA 1951 43 45 67
CHN E 1950 45 67 87
CHN E 1951 34 53 62
IND VA 1950 45 56 24"
DF1 <- read.table(header = TRUE, stringsAsFactors = FALSE, text = Lines1)

Lines2 <- "name value
CHN 3
IND 6
MLY 7
EUR 4"
DF2 <- read.table(header = TRUE, stringsAsFactors = FALSE, text = Lines2)
答案 1 :(得分:1)
1)使用 data.table
下面是用 data.table 联接(join)实现的一种方案
library(data.table) # v 1.12.4
# Names of the columns to rescale.
nm1 <- sprintf("Sec%d", 1:3)
# Convert df1 to a data.table by reference, then run an update join on `name`:
# for rows matched in df2, divide the Sec columns by `value` where the row is
# "VA" and keep them unchanged otherwise.
setDT(df1)
df1[
  df2,
  (nm1) := lapply(
    mget(nm1),
    function(col) fifelse(variable == "VA", col / value, col)
  ),
  on = "name"
]
df1
# name variable year Sec1 Sec2 Sec3
#1: CHN VA 1950 7.666667 15.000000 10.66667
#2: CHN VA 1951 14.333333 15.000000 22.33333
#3: CHN E 1950 45.000000 67.000000 87.00000
#4: CHN E 1951 34.000000 53.000000 62.00000
#5: IND VA 1950 7.500000 9.333333 4.00000
2)使用dplyr
library(dplyr)
# No join: look up each row's divisor in df2 with match() and divide only
# where the row is "VA"; every other row keeps its value.
df1 %>%
  mutate_at(
    vars(starts_with("Sec")),
    ~ case_when(
      variable == "VA" ~ .x / df2$value[match(name, df2$name)],
      TRUE ~ .x
    )
  )
# name variable year Sec1 Sec2 Sec3
#1 CHN VA 1950 7.666667 15.000000 10.66667
#2 CHN VA 1951 14.333333 15.000000 22.33333
#3 CHN E 1950 45.000000 67.000000 87.00000
#4 CHN E 1951 34.000000 53.000000 62.00000
#5 IND VA 1950 7.500000 9.333333 4.00000
3)使用 base R 的 match()
# Rows whose Sec columns must be rescaled.
i1 <- df1[["variable"]] == "VA"
# For those rows, look up the divisor for each name in df2 and divide the
# columns listed in nm1 by it.
df1[i1, nm1] <- df1[i1, nm1] / df2$value[match(df1$name[i1], df2$name)]
df1
# name variable year Sec1 Sec2 Sec3
#1 CHN VA 1950 7.666667 15.000000 10.66667
#2 CHN VA 1951 14.333333 15.000000 22.33333
#3 CHN E 1950 45.000000 67.000000 87.00000
#4 CHN E 1951 34.000000 53.000000 62.00000
#5 IND VA 1950 7.500000 9.333333 4.00000
# Sample inputs as plain data frames: df1 holds the measurements, df2 holds
# one divisor per name. Strings are kept as character vectors.
df1 <- data.frame(
  name = c("CHN", "CHN", "CHN", "CHN", "IND"),
  variable = c("VA", "VA", "E", "E", "VA"),
  year = c(1950L, 1951L, 1950L, 1951L, 1950L),
  Sec1 = c(23, 43, 45, 34, 45),
  Sec2 = c(45, 45, 67, 53, 56),
  Sec3 = c(32, 67, 87, 62, 24),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  name = c("CHN", "IND", "MLY", "EUR"),
  value = c(3L, 6L, 7L, 4L),
  stringsAsFactors = FALSE
)