我有一个看起来像这样的数据框:
# Example input: one row per gene entry. The ".x" columns are populated on
# every row; the ".y" columns carry a value only on the first row of each gene
# family ("ESR", "PKB") and are NA on the remaining rows.
Gene.names <- c(
  "ESR", "ESR.1", "ESR.2", "ESR.3",
  "PKB", "PKB.1", "PKB.2", "PKB.3"
)
mean_0.x <- c(3, 2, 5, 9, 2, 4, 6, 7)
mean_1.x <- c(6, 2, 5, 1, 9, 1, 1, 9)
mean_2.x <- c(3, 2, 9, 9, 6, 7, 3, 3)
mean_0.y <- c(1, NA, NA, NA, 6, NA, NA, NA)
mean_1.y <- c(1, NA, NA, NA, 3, NA, NA, NA)
mean_2.y <- c(6, NA, NA, NA, 4, NA, NA, NA)
# Column names are taken from the variable names; check.names = FALSE keeps
# them exactly as written.
df <- data.frame(
  Gene.names,
  mean_0.x, mean_1.x, mean_2.x,
  mean_0.y, mean_1.y, mean_2.y,
  check.names = FALSE
)
我想要的输出:
# Expected result, typed by hand and rounded to two decimals: each *_diff
# column is the ".x" value divided by the ".y" value of the gene family's
# first row.
Gene.names <- c(
  "ESR", "ESR.1", "ESR.2", "ESR.3",
  "PKB", "PKB.1", "PKB.2", "PKB.3"
)
mean_0_diff <- c(3, 2, 5, 9, 0.33, 0.66, 1, 1.16)
# The seventh value was written as ".0.33", which is not a valid numeric
# literal and stopped the snippet from parsing; it is 1 / 3, rounded to 0.33.
mean_1_diff <- c(6, 2, 5, 1, 3, 0.33, 0.33, 3)
mean_2_diff <- c(0.5, 0.33, 1.5, 1.5, 1.5, 1.75, 0.75, 0.75)
df_out <- data.frame(
  Gene.names, mean_0_diff, mean_1_diff, mean_2_diff,
  check.names = FALSE
)
其中各 diff 列的计算方式为:mean_0.x/mean_0.y; mean_1.x/mean_1.y; mean_2.x/mean_2.y;(.y 列中的 NA 先用同一基因组第一行的值填充,再做除法。)
答案 0 :(得分:2)
使用 tidyverse:
library(tidyverse)
# Divide every ".x" column by its ".y" counterpart (paired by position) and
# put the identifier column back in front. local() keeps the two helper
# tables out of the global environment.
res <- local({
  # Numerators: all columns whose name ends in "x".
  numerators <- select(df, ends_with('x'))
  # Denominators: all columns whose name ends in "y", with each NA replaced
  # by the closest non-missing value above it.
  denominators <- df %>%
    select(ends_with('y')) %>%
    fill(everything())
  cbind(df[1], numerators / denominators)
})
# Gene.names mean_0.x mean_1.x mean_2.x
# 1 ESR 3.0000000 6.0000000 0.5000000
# 2 ESR.1 2.0000000 2.0000000 0.3333333
# 3 ESR.2 5.0000000 5.0000000 1.5000000
# 4 ESR.3 9.0000000 1.0000000 1.5000000
# 5 PKB 0.3333333 3.0000000 1.5000000
# 6 PKB.1 0.6666667 0.3333333 1.7500000
# 7 PKB.2 1.0000000 0.3333333 0.7500000
# 8 PKB.3 1.1666667 3.0000000 0.7500000
这将是惯用的方式:
# Tidy approach: reshape to long form, compute the ratio once, reshape back.
# pivot_longer()/pivot_wider() replace gather()/separate()/spread(), which are
# superseded, and avoid the cryptic empty arguments of `gather(,,-1)`.
df %>%
  # Carry each ".y" value down over the NA rows that follow it.
  fill(ends_with("y")) %>%
  # Split names such as "mean_0.x" at the dot: the prefix is stored in `key`,
  # and the ".value" sentinel turns the suffix into separate `x` and `y`
  # columns, so every row holds one numerator/denominator pair.
  pivot_longer(
    cols = -Gene.names,
    names_to = c("key", ".value"),
    names_sep = "\\."
  ) %>%
  transmute(Gene.names, key, value = x / y) %>%
  # Back to one row per gene entry with one ratio column per measurement.
  pivot_wider(names_from = key, values_from = value) %>%
  # Return a plain data.frame rather than a tibble so it prints in full.
  as.data.frame()
# Gene.names mean_0 mean_1 mean_2
# 1 ESR 3.0000000 6.0000000 0.5000000
# 2 ESR.1 2.0000000 2.0000000 0.3333333
# 3 ESR.2 5.0000000 5.0000000 1.5000000
# 4 ESR.3 9.0000000 1.0000000 1.5000000
# 5 PKB 0.3333333 3.0000000 1.5000000
# 6 PKB.1 0.6666667 0.3333333 1.7500000
# 7 PKB.2 1.0000000 0.3333333 0.7500000
# 8 PKB.3 1.1666667 3.0000000 0.7500000
答案 1 :(得分:1)
下面的方法需要用到 zoo 软件包中的函数 na.locf。
# Positions of the numerator (".x") and denominator (".y") columns. The
# patterns are anchored on the literal ".x" / ".y" suffix so that an unrelated
# column whose name merely ends in "x" or "y" is not picked up.
inx.x <- grep("\\.x$", names(df))
inx.y <- grep("\\.y$", names(df))
# Columns are paired by position, so both sets must have the same size.
stopifnot(length(inx.x) == length(inx.y))
# Carry the last observed value forward over the NAs of each ".y" column.
# na.rm = FALSE keeps a leading NA in place instead of dropping it, so every
# filled column still has nrow(df) elements and can be assigned back.
df[inx.y] <- lapply(df[inx.y], zoo::na.locf, na.rm = FALSE)
# Identifier column followed by the element-wise ratios.
df_out2 <- cbind(df[1], df[inx.x] / df[inx.y])
# Rename "mean_0.x" -> "mean_0_diff", and so on.
nms <- sub("\\.x$", "", names(df)[inx.x])
names(df_out2)[-1] <- paste(nms, "diff", sep = "_")
df_out2
# Gene.names mean_0_diff mean_1_diff mean_2_diff
#1 ESR 3.0000000 6.0000000 0.5000000
#2 ESR.1 2.0000000 2.0000000 0.3333333
#3 ESR.2 5.0000000 5.0000000 1.5000000
#4 ESR.3 9.0000000 1.0000000 1.5000000
#5 PKB 0.3333333 3.0000000 1.5000000
#6 PKB.1 0.6666667 0.3333333 1.7500000
#7 PKB.2 1.0000000 0.3333333 0.7500000
#8 PKB.3 1.1666667 3.0000000 0.7500000
请注意,df_out2 与问题中给出的期望结果 df_out 并不完全相等,因为 df_out 中的数值是四舍五入后的值:
# Compare the hand-typed expected table with the computed ratios; all.equal()
# reports the mean relative difference for each column that differs.
all.equal(target = df_out, current = df_out2)
#[1] "Component “mean_0_diff”: Mean relative difference: 0.007751938"
#[2] "Component “mean_1_diff”: Mean relative difference: 0.01010101"
#[3] "Component “mean_2_diff”: Mean relative difference: 0.01010101"
答案 2 :(得分:0)
另一个选择是直接处理宽格式(wide-format)的数据本身。使用 dplyr 的基于 mutate_at 的解决方案可以写为:
library(dplyr)
# Group rows by the base gene name, i.e. the part of `Gene.names` before the
# first dot ("ESR.1" -> "ESR"; names without a dot are kept as they are).
df %>%
  group_by(Gene = gsub("(^\\w+)\\..*", "\\1", Gene.names)) %>%
  # For each column ending in ".x", divide it by the first value in the group
  # of the matching ".y" column. across() with cur_column() is the current
  # replacement for mutate_at() + funs() (funs() is deprecated), and .names
  # yields the same "<column>_diff" output names.
  mutate(across(
    ends_with(".x"),
    ~ .x / get(sub("\\.x$", ".y", cur_column()))[1],
    .names = "{.col}_diff"
  )) %>%
  ungroup() %>%
  # Keep only the identifier and the newly created ratio columns.
  select(Gene.names, ends_with("diff"))
# # A tibble: 8 x 4
# Gene.names mean_0.x_diff mean_1.x_diff mean_2.x_diff
# <fctr> <dbl> <dbl> <dbl>
# 1 ESR 3.00 6.00 0.500
# 2 ESR.1 2.00 2.00 0.333
# 3 ESR.2 5.00 5.00 1.50
# 4 ESR.3 9.00 1.00 1.50
# 5 PKB 0.333 3.00 1.50
# 6 PKB.1 0.667 0.333 1.75
# 7 PKB.2 1.00 0.333 0.750
# 8 PKB.3 1.17 3.00 0.750