如何将数据框中每一行的值除以同一数据框中对应列的值,并循环使用(回收)除数

时间:2018-06-23 13:15:01

标签: r dataframe dplyr data.table purrr

我有一个看起来像这样的数据框:

# Example input: one row per gene entry. Rows are grouped by gene
# ("ESR", "ESR.1", ... share one group; "PKB", "PKB.1", ... another).
Gene.names <- c(
  "ESR", "ESR.1", "ESR.2", "ESR.3",
  "PKB", "PKB.1", "PKB.2", "PKB.3"
)

# Numerator columns (*.x): populated on every row.
mean_0.x <- c(3, 2, 5, 9, 2, 4, 6, 7)
mean_1.x <- c(6, 2, 5, 1, 9, 1, 1, 9)
mean_2.x <- c(3, 2, 9, 9, 6, 7, 3, 3)

# Denominator columns (*.y): a value only on the first row of each gene
# group, NA on the remaining rows of that group.
mean_0.y <- c(1, NA, NA, NA, 6, NA, NA, NA)
mean_1.y <- c(1, NA, NA, NA, 3, NA, NA, NA)
mean_2.y <- c(6, NA, NA, NA, 4, NA, NA, NA)

# Column names are taken from the variable names; check.names = FALSE keeps
# them exactly as written.
df <- data.frame(
  Gene.names,
  mean_0.x, mean_1.x, mean_2.x,
  mean_0.y, mean_1.y, mean_2.y,
  check.names = FALSE
)

我想要的输出:

# Desired output: each *.x value divided by the *.y value of its gene group.
# The ratios were written out by hand to two decimals, so they only
# approximate the exact quotients (e.g. 0.33 stands for 1/3).
Gene.names <- c("ESR", "ESR.1", "ESR.2", "ESR.3", "PKB", "PKB.1", "PKB.2", "PKB.3")
mean_0_diff <- c(3, 2, 5, 9, 0.33, 0.66, 1, 1.16)
# The seventh value previously read `.0.33`, which is not a valid numeric
# literal and made the whole snippet fail to parse.
mean_1_diff <- c(6, 2, 5, 1, 3, 0.33, 0.33, 3)
mean_2_diff <- c(0.5, 0.33, 1.5, 1.5, 1.5, 1.75, 0.75, 0.75)

df_out <- cbind.data.frame(Gene.names, mean_0_diff, mean_1_diff, mean_2_diff)
  1. 我的数据框包含数千行和> 50列
  2. 我想划分相应的列,例如mean_0.x/mean_0.y; mean_1.x/mean_1.y; mean_2.x/mean_2.y;
  3. 我想循环使用(回收)*.y 列中的行值:在此示例中,mean_0.y 中的每个非 NA 值要依次用于 mean_0.x 中的 4 行。但在我的真实数据集中,每个值需要被“回收”的次数是未知的,可以是任意次。

3 个答案:

答案 0 :(得分:2)

使用tidyverse

library(tidyverse)
# Divide every column ending in "x" by the column ending in "y" at the same
# position. The "y" columns are passed through fill() first so that each
# non-NA denominator is reused for the NA rows that follow it. The resulting
# ratio columns keep the names of the "x" columns.
res <- cbind(
  df[1],
  select(df, ends_with("x")) /
    fill(select(df, ends_with("y")), everything())
)

#   Gene.names  mean_0.x  mean_1.x  mean_2.x
# 1        ESR 3.0000000 6.0000000 0.5000000
# 2      ESR.1 2.0000000 2.0000000 0.3333333
# 3      ESR.2 5.0000000 5.0000000 1.5000000
# 4      ESR.3 9.0000000 1.0000000 1.5000000
# 5        PKB 0.3333333 3.0000000 1.5000000
# 6      PKB.1 0.6666667 0.3333333 1.7500000
# 7      PKB.2 1.0000000 0.3333333 0.7500000
# 8      PKB.3 1.1666667 3.0000000 0.7500000

这将是惯用的方式:

df %>%
  # Carry each non-NA *.y value down over the NA rows that follow it.
  fill(ends_with("y")) %>%
  # Long format: every column except the first becomes a key/value pair.
  gather(key = "key", value = "value", -1) %>%
  # Split e.g. "mean_0.x" into key = "mean_0" and xy = "x".
  separate(key, into = c("key", "xy"), sep = "\\.") %>%
  # One column per suffix, so x and y sit side by side on each row.
  spread(key = xy, value = value) %>%
  transmute(Gene.names, key, value = x / y) %>%
  # Back to wide format: one ratio column per mean_* key.
  spread(key = key, value = value)

#   Gene.names    mean_0    mean_1    mean_2
# 1        ESR 3.0000000 6.0000000 0.5000000
# 2      ESR.1 2.0000000 2.0000000 0.3333333
# 3      ESR.2 5.0000000 5.0000000 1.5000000
# 4      ESR.3 9.0000000 1.0000000 1.5000000
# 5        PKB 0.3333333 3.0000000 1.5000000
# 6      PKB.1 0.6666667 0.3333333 1.7500000
# 7      PKB.2 1.0000000 0.3333333 0.7500000
# 8      PKB.3 1.1666667 3.0000000 0.7500000 

答案 1 :(得分:1)

以下方法需要用到 zoo 包中的函数 na.locf:

# Positions of the numerator (*.x) and denominator (*.y) columns. The
# patterns are anchored on the literal ".x" / ".y" suffix so that an
# unrelated column whose name merely ends in "x" or "y" is not picked up.
inx.x <- grep("\\.x$", names(df))
inx.y <- grep("\\.y$", names(df))

# Forward-fill the denominators so each non-NA value is reused for the NA
# rows below it. na.rm = FALSE keeps any leading NAs in place; the default
# (na.rm = TRUE) drops them, which would shorten the vector and break the
# assignment back into the data frame. Note that this overwrites df.
df[inx.y] <- lapply(df[inx.y], zoo::na.locf, na.rm = FALSE)

# Data-frame division pairs the *.x and *.y columns by position.
df_out2 <- cbind(df[1], df[inx.x] / df[inx.y])

# Rename "mean_0.x" -> "mean_0_diff", etc.
nms <- sub("\\.x$", "", names(df)[inx.x])
names(df_out2)[-1] <- paste0(nms, "_diff")

df_out2
#  Gene.names mean_0_diff mean_1_diff mean_2_diff
#1        ESR   3.0000000   6.0000000   0.5000000
#2      ESR.1   2.0000000   2.0000000   0.3333333
#3      ESR.2   5.0000000   5.0000000   1.5000000
#4      ESR.3   9.0000000   1.0000000   1.5000000
#5        PKB   0.3333333   3.0000000   1.5000000
#6      PKB.1   0.6666667   0.3333333   1.7500000
#7      PKB.2   1.0000000   0.3333333   0.7500000
#8      PKB.3   1.1666667   3.0000000   0.7500000

请注意,两个结果并不完全相等,因为你给出的期望输出(df_out)是四舍五入后的值:

# Compare the hand-rounded expected output (df_out) with the computed result
# (df_out2); all.equal() reports the mean relative difference per column.
all.equal(df_out, df_out2)
#[1] "Component “mean_0_diff”: Mean relative difference: 0.007751938"
#[2] "Component “mean_1_diff”: Mean relative difference: 0.01010101" 
#[3] "Component “mean_2_diff”: Mean relative difference: 0.01010101"

答案 2 :(得分:0)

另一种选择是直接处理宽格式(wide-format)的数据。基于 dplyr 的 mutate_at 的解决方案可以写为:

library(dplyr)

# Group rows by the base gene name: gsub() strips everything from the first
# "." onward, so "ESR.1" becomes "ESR" (names without a "." are left as-is).
df %>% group_by(Gene = gsub("(^\\w+)\\..*","\\1", Gene.names)) %>%
  # Divide each *.x column by the first value (`[1]`) of the matching *.y
  # column within the group. The *.y name is derived from the current
  # column's name via quo_name(quo(.)); this presumably relies on funs()
  # substituting `.` with the column being processed -- verify on your
  # dplyr version.
  # NOTE(review): assumes the first row of each group carries the non-NA
  # *.y value, as in the example data.
  # NOTE(review): mutate_at() is superseded in current dplyr (see across());
  # funs() may no longer be available -- confirm before reuse.
  mutate_at(vars(ends_with(".x")), 
            funs(diff = ./get(sub("\\.x",".y",quo_name(quo(.))))[1] )) %>%
  ungroup() %>%
  # Keep the gene label plus the newly created *_diff columns.
  select( Gene.names, ends_with("diff"))


# # A tibble: 8 x 4
# Gene.names   mean_0.x_diff mean_1.x_diff mean_2.x_diff
# <fctr>             <dbl>         <dbl>         <dbl>
# 1 ESR                3.00          6.00          0.500
# 2 ESR.1              2.00          2.00          0.333
# 3 ESR.2              5.00          5.00          1.50 
# 4 ESR.3              9.00          1.00          1.50 
# 5 PKB                0.333         3.00          1.50 
# 6 PKB.1              0.667         0.333         1.75 
# 7 PKB.2              1.00          0.333         0.750
# 8 PKB.3              1.17          3.00          0.750