如何将一个数据框中多个列的部分行除以另一个数据框中对应的值

时间:2019-10-13 13:53:29

标签: r dataframe multiple-columns

我有2个大型数据框,结构如下。对于df1中variable为VA的行,我想把Sec1至Sec3列的值除以df2的value列中与该行name相对应的值;其他行(例如variable为E的行)保持不变。

Df1

name    variable    year    Sec1    Sec2    Sec3    
CHN VA  1950        23    45    32   
CHN VA  1951        43    45    67    
CHN E   1950        45    67    87    
CHN E   1951    34  53  62    
IND VA  1950    45  56  24

DF2

name  value    
CHN 3    
IND 6    
MLY 7    
EUR 4

结果应该类似于

DF1

name  variable   year   Sec1    Sec2    Sec3   
CHN VA  1950    23/3    45/3    32/3    
CHN VA  1951    43/3    45/3    67/3    
CHN E   1950    45  67  87    
CHN E   1951    34  53  62   
IND VA  1950    45/6    56/6    24/6

我尝试过先取子集(subsetting)再用cbind()合并,但是遇到了问题。

2 个答案:

答案 0 :(得分:4)

1)将DF1与DF2按name左联接,把variable不是VA的行的value设为1,然后将每个Sec列除以value,最后删除value列。

library(dplyr)

# Attach each name's divisor, neutralise it (set to 1) on rows that are not
# "VA", divide every Sec* column by it, then remove the helper column.
DF1 %>%
  left_join(DF2, by = "name") %>%
  mutate(value = if_else(variable != "VA", 1L, value)) %>%
  mutate_at(
    .vars = vars(starts_with("Sec")),
    .funs = ~ .x / value
  ) %>%
  select(-value)

结果为:

  name variable year      Sec1      Sec2     Sec3
1  CHN       VA 1950  7.666667 15.000000 10.66667
2  CHN       VA 1951 14.333333 15.000000 22.33333
3  CHN        E 1950 45.000000 67.000000 87.00000
4  CHN        E 1951 34.000000 53.000000 62.00000
5  IND       VA 1950  7.500000  9.333333  4.00000

2)基本R版本为:

# Base R: bring the divisor in with a left merge, divide only the "VA" rows
# of the Sec* columns, then keep just DF1's original columns.
m <- merge(DF1, DF2, by = "name", all.x = TRUE)
sec_cols <- grep("^Sec", names(m))
va_rows <- m$variable == "VA"
m[va_rows, sec_cols] <- m[va_rows, sec_cols] / m$value[va_rows]
m <- m[names(DF1)]

3)另一种方法是将其转换为长格式,执行联接和除法,然后转换回去。请注意,这将对行进行重新排序。

library(dplyr)
library(tidyr)

# Reshape to long form so one division covers every Sec column, then reshape
# back to wide. The row order of the result may differ from DF1.
DF1 %>%
  gather(key, val, -name, -variable, -year) %>%
  left_join(DF2, by = "name") %>%
  # Rows other than "VA" must stay unchanged, so their divisor becomes 1.
  mutate(value = if_else(variable == "VA", value, 1L)) %>%
  mutate(val = val / value) %>%
  # Drop the helper divisor before widening; otherwise spread() keeps it as
  # an id column and it leaks into the result.
  select(-value) %>%
  spread(key, val)

4)如果您不介意把每一列逐个写出来:

library(dplyr)

# Same idea with each Sec column spelled out; assigning NULL to value inside
# mutate() removes the helper column in the same step.
DF1 %>%
  left_join(DF2, by = "name") %>%
  mutate(value = if_else(variable != "VA", 1L, value)) %>%
  mutate(
    Sec1 = Sec1 / value,
    Sec2 = Sec2 / value,
    Sec3 = Sec3 / value,
    value = NULL
  )

4a)或使用基础R(base R):

# Base R with explicit columns: left-merge the divisor, set it to 1 on rows
# that are not "VA", then divide each Sec column and drop the helper column.
m <- merge(DF1, DF2, by = "name", all.x = TRUE)
m$value <- ifelse(m$variable == "VA", m$value, 1)
transform(
  m,
  Sec1 = Sec1 / value,
  Sec2 = Sec2 / value,
  Sec3 = Sec3 / value,
  value = NULL
)

注:上面各方法所用的输入数据可按如下方式重现:

# Reproducible input: the question's first table as whitespace-separated text,
# parsed with a header row and with strings kept as character (not factor).
Lines1 <- "name variable year Sec1 Sec2 Sec3
CHN VA 1950 23 45 32
CHN VA 1951 43 45 67
CHN E 1950 45 67 87
CHN E 1951 34 53 62
IND VA 1950 45 56 24"
DF1 <- read.table(header = TRUE, text = Lines1, stringsAsFactors = FALSE)

# Reproducible input: the name -> value lookup table from the question,
# parsed with a header row and with strings kept as character (not factor).
Lines2 <- "name value
CHN 3
IND 6
MLY 7
EUR 4"
DF2 <- read.table(header = TRUE, text = Lines2, stringsAsFactors = FALSE)

答案 1 :(得分:1)

1)使用data.table:下面是通过data.table连接(join)实现的一种方法

library(data.table)# v 1.12.4
# Update join: df1 is converted to a data.table in place, then the Sec columns
# are rewritten by reference while joining df2 on name. Rows whose variable is
# "VA" are divided by value; all other rows keep their current values.
nm1 <- paste0("Sec", 1:3)
setDT(df1)
df1[
  df2,
  (nm1) := lapply(
    mget(nm1),
    function(col) fifelse(variable == "VA", col / value, col)
  ),
  on = "name"
]
df1
#   name variable year      Sec1      Sec2     Sec3
#1:  CHN       VA 1950  7.666667 15.000000 10.66667
#2:  CHN       VA 1951 14.333333 15.000000 22.33333
#3:  CHN        E 1950 45.000000 67.000000 87.00000
#4:  CHN        E 1951 34.000000 53.000000 62.00000
#5:  IND       VA 1950  7.500000  9.333333  4.00000

2)使用dplyr

library(dplyr)
# No join: for every Sec column, divide the "VA" rows by the df2 value looked
# up through match() on name; every other row keeps its original value.
df1 %>%
  mutate_at(
    vars(starts_with("Sec")),
    ~ case_when(
      variable == "VA" ~ .x / df2$value[match(name, df2$name)],
      TRUE ~ .x
    )
  )
#  name variable year      Sec1      Sec2     Sec3
#1  CHN       VA 1950  7.666667 15.000000 10.66667
#2  CHN       VA 1951 14.333333 15.000000 22.33333
#3  CHN        E 1950 45.000000 67.000000 87.00000
#4  CHN        E 1951 34.000000 53.000000 62.00000
#5  IND       VA 1950  7.500000  9.333333  4.00000

3)使用基础R的match()

# Base R with match(): divide the Sec columns of the "VA" rows by the df2
# value that belongs to each row's name, modifying df1 in place.
# nm1 is defined here so the snippet runs on its own instead of relying on a
# variable created by a different snippet.
nm1 <- paste0("Sec", 1:3)
i1 <- df1$variable == "VA"
# One divisor per selected row, looked up in df2 by name.
divisor <- df2$value[match(df1$name[i1], df2$name)]
df1[i1, nm1] <- df1[i1, nm1] / divisor
df1
#  name variable year      Sec1      Sec2     Sec3
#1  CHN       VA 1950  7.666667 15.000000 10.66667
#2  CHN       VA 1951 14.333333 15.000000 22.33333
#3  CHN        E 1950 45.000000 67.000000 87.00000
#4  CHN        E 1951 34.000000 53.000000 62.00000
#5  IND       VA 1950  7.500000  9.333333  4.00000

数据

# Example input: five rows keyed by name / variable / year, with an integer
# year column and three double-valued Sec columns.
df1 <- data.frame(
  name = c("CHN", "CHN", "CHN", "CHN", "IND"),
  variable = c("VA", "VA", "E", "E", "VA"),
  year = c(1950L, 1951L, 1950L, 1951L, 1950L),
  Sec1 = c(23, 43, 45, 34, 45),
  Sec2 = c(45, 45, 67, 53, 56),
  Sec3 = c(32, 67, 87, 62, 24),
  stringsAsFactors = FALSE
)

# Example lookup table: one integer divisor per name.
df2 <- data.frame(
  name = c("CHN", "IND", "MLY", "EUR"),
  value = c(3L, 6L, 7L, 4L),
  stringsAsFactors = FALSE
)