我有两个行数和列数都不相同的数据框。我想按样本(即逐列)从 df_1 的强度值中减去 df_2 中对应行的强度值。我的数据如下:
数据框1
# Data frame 1: one row per peptide, three per-sample intensity columns and the
# gene each peptide belongs to. Intensities are given as strings, and missing
# readings are the literal string "NA" (not a real NA value).
gene_nm <- rep(c("A", "B", "C"), times = c(6, 3, 7))
pep_seq <- c(
  "aaaaaaaaa", "ababababba", "dfsfsfsfds", "xbbcbcncncc",
  "fbbdsgffhhh", "dggdgdgegeggerr", "dfgthrgfgf", "wegregegg",
  "egegegergewge", "sfngegebser", "qegqeefbew", "qegqetegqt",
  "qwtqtewr", "etghsfrgf", "sfsdfbdfbergeagaegr", "wasfqertsdfaefwe"
)
int_samp_1 <- c(
  "2421432", "24242424", "NA", "4684757849",
  "NA", "10485040", "NA", "6849400",
  "40300", "NA", "NA", "NA",
  "556456466", "4646456466", "246464266", "4564242646"
)
int_samp_2 <- c(
  "NA", "5342353", "14532556", "43566",
  "46367367", "768769769", "797899", "NA",
  "NA", "NA", "686899", "7898979",
  "678568", "NA", "68886", "488"
)
int_samp_3 <- c(
  "11351", "NA", "NA", "NA",
  "1354151345", "1351351354", "314534", "1535",
  "3145354", "4353455", "324535", "3543445",
  "34535", "34535534", "NA", "NA"
)
# Column names are taken from the variable names; check.names = FALSE mirrors
# what cbind.data.frame() passes on to data.frame().
df_1 <- data.frame(
  pep_seq, int_samp_1, int_samp_2, int_samp_3, gene_nm,
  check.names = FALSE
)
数据框2
# Data frame 2: one row per gene holding the per-sample intensities that are to
# be subtracted from df_1. Column names carry an "a" marker (gene_nm.a,
# int_samp_1a, ...). As in df_1, values are strings and missing readings are
# the literal string "NA".
gene_nm.a <- c("A", "B", "C")
int_samp_1a <- c("2421432", "24242424", "NA")
int_samp_2a <- c("NA", "5342353", "14532556")
int_samp_3a <- c("11351", "NA", "NA")
# check.names = FALSE mirrors what cbind.data.frame() passes on to data.frame().
df_2 <- data.frame(
  gene_nm.a, int_samp_1a, int_samp_2a, int_samp_3a,
  check.names = FALSE
)
请建议。
答案 0 :(得分:2)
一种做法是用 dplyr 将 df_1 与 df_2 连接(join),然后执行一次简单的矩阵减法。
注意:数据框中的强度读数是以因子(factor)形式存储的。既然要做减法,我认为把测量值保留为因子并不合适,因此我先把它们转换成了数值类型。
library(dplyr)

# Attach each gene's row of df_2 to every peptide row of df_1, then convert all
# intensity columns to numeric.
# - as.character() comes first so the conversion also works when the columns
#   are factors.
# - as.numeric() (double) is used instead of as.integer(): readings such as
#   4684757849 exceed .Machine$integer.max (2147483647) and would be turned
#   into NA (with only a warning) by an integer conversion.
# - NA values coming from df_2 are replaced with 0, since keeping them would
#   turn valid df_1 values into NA for no reason.
mod <- df_1 %>%
  left_join(df_2, by = c("gene_nm" = "gene_nm.a")) %>% # join on gene
  mutate_at(vars(starts_with("int_samp")), ~ as.numeric(as.character(.x))) %>%
  mutate_at(vars(matches("^int_samp_\\d+a$")), ~ replace(.x, is.na(.x), 0))

# The modified data.frame has columns from both df_1 and df_2. Pair every df_1
# sample column with its df_2 counterpart by name (int_samp_1 with int_samp_1a,
# ...) rather than relying on both column sets being in the same order.
samp_cols <- grep("^int_samp_\\d+$", names(mod), value = TRUE)
base_cols <- paste0(samp_cols, "a")
stopifnot(all(base_cols %in% names(mod)))
mod[samp_cols] <- mod[samp_cols] - mod[base_cols]

# Take columns from df_1.
mod[names(df_1)]
# pep_seq int_samp_1 int_samp_2 int_samp_3 gene_nm
# 1 aaaaaaaaa 0 NA 0 A
# 2 ababababba 21820992 5342353 NA A
# 3 dfsfsfsfds NA 14532556 NA A
# 4 xbbcbcncncc 4682336417 43566 NA A
# 5 fbbdsgffhhh NA 46367367 1354139994 A
# 6 dggdgdgegeggerr 8063608 768769769 1351340003 A
# 7 dfgthrgfgf NA -4544454 314534 B
# 8 wegregegg -17393024 NA 1535 B
# 9 egegegergewge -24202124 NA 3145354 B
# 10 sfngegebser NA NA 4353455 C
# 11 qegqeefbew NA -13845657 324535 C
# 12 qegqetegqt NA -6633577 3543445 C
# 13 qwtqtewr 556456466 -13853988 34535 C
# 14 etghsfrgf 4646456466 NA 34535534 C
# 15 sfsdfbdfbergeagaegr 246464266 -14463670 NA C
# 16 wasfqertsdfaefwe 4564242646 -14532068 NA C
答案 1 :(得分:2)
如果我理解正确(IIUC),df_1 和 df_2 中有同名的列(例如 int_samp_X,其中 X 为某个整数),您希望按 gene_nm 分组,计算同名列之间的差值(例如 df_1[df_1$gene_nm == 'A', int_samp_1] - df_2[df_2$gene_nm == 'A', int_samp_1])。
我们可以使用 tidyverse 系列软件包来解决此问题,尤其是 dplyr 和 purrr。
首先,用 left_join 合并 df_1 和 df_2,以确保 df_1 中的所有条目在与 df_2 中基因层面的条目匹配时都被保留:
library(tidyverse)

# df_2 marks its columns with a trailing "a" (gene_nm.a, int_samp_1a, ...).
# Strip that marker so both tables share the key `gene_nm` and the intensity
# columns have identical names; left_join() then disambiguates them with the
# suffixes `.x` (from df_1) and `.y` (from df_2). Names that already match are
# left untouched by sub().
df_3 <- df_1 %>%
  left_join(
    setNames(
      df_2,
      sub("^(gene_nm)\\.a$|^(int_samp_\\d+)a$", "\\1\\2", names(df_2))
    ),
    by = "gene_nm"
  )
df_3
pep_seq int_samp_1.x int_samp_2.x int_samp_3.x gene_nm int_samp_1.y int_samp_2.y int_samp_3.y
1 aaaaaaaaa 2421432 NA 11351 A 2421432 NA 11351
2 ababababba 24242424 5342353 NA A 2421432 NA 11351
3 dfsfsfsfds NA 14532556 NA A 2421432 NA 11351
4 xbbcbcncncc 4684757849 43566 NA A 2421432 NA 11351
5 fbbdsgffhhh NA 46367367 1354151345 A 2421432 NA 11351
6 dggdgdgegeggerr 10485040 768769769 1351351354 A 2421432 NA 11351
7 dfgthrgfgf NA 797899 314534 B 24242424 5342353 NA
8 wegregegg 6849400 NA 1535 B 24242424 5342353 NA
9 egegegergewge 40300 NA 3145354 B 24242424 5342353 NA
10 sfngegebser NA NA 4353455 C NA 14532556 NA
11 qegqeefbew NA 686899 324535 C NA 14532556 NA
12 qegqetegqt NA 7898979 3543445 C NA 14532556 NA
13 qwtqtewr 556456466 678568 34535 C NA 14532556 NA
14 etghsfrgf 4646456466 NA 34535534 C NA 14532556 NA
15 sfsdfbdfbergeagaegr 246464266 68886 NA C NA 14532556 NA
16 wasfqertsdfaefwe 4564242646 488 NA C NA 14532556 NA
然后用 map 遍历感兴趣的列名,对每一对列求差。(请注意,您需要先将 int_samp 列从 factor 转换为 numeric。)
更新(根据OP评论):
要在计算差值之前将 NA 转换为 0,我们可以使用 mutate_if() 和 replace(),把下面这一步添加到方法链中:
# Replace NA with 0 in every numeric column (lambda form; funs() is deprecated).
mutate_if(is.numeric, ~ replace(.x, is.na(.x), 0))
最后,把结果重新合并(join)回 df_1:
# Names of the intensity columns to process, taken from df_1.
var_names <- df_1 %>% select(starts_with("int_samp")) %>% names()
var_names # [1] "int_samp_1" "int_samp_2" "int_samp_3"

# For each intensity column, compute (df_1 value) - (df_2 value) from the
# `.x` / `.y` pair in df_3, and bind the resulting *_diff columns to df_1.
var_names %>%
  # Name the inputs first so the result columns are called <name>_diff.
  set_names(paste0(var_names, "_diff")) %>%
  map_dfc(function(nm) {
    # Anchored pattern: selects exactly nm.x and nm.y, so "int_samp_1" cannot
    # also pick up columns such as "int_samp_10.x".
    pair <- paste0("^", nm, "\\.[xy]$")
    df_3 %>%
      # as.character() first so factor columns convert to their printed values.
      mutate_at(vars(matches(pair)), ~ as.numeric(as.character(.x))) %>%
      # Treat missing readings as 0 before subtracting.
      mutate_if(is.numeric, ~ replace(.x, is.na(.x), 0)) %>%
      select(matches(pair)) %>%
      # Columns keep their df_3 order (.x before .y), so this is .x - .y.
      reduce(`-`)
  }) %>%
  bind_cols(df_1)
输出:
int_samp_1_diff int_samp_2_diff int_samp_3_diff pep_seq int_samp_1 int_samp_2 int_samp_3 gene_nm
<dbl> <dbl> <dbl> <fct> <fct> <fct> <fct> <fct>
1 0. 0. 0. aaaaaaaaa 2421432 NA 11351 A
2 21820992. 5342353. -11351. ababababba 24242424 5342353 NA A
3 -2421432. 14532556. -11351. dfsfsfsfds NA 14532556 NA A
4 4682336417. 43566. -11351. xbbcbcncncc 4684757849 43566 NA A
5 -2421432. 46367367. 1354139994. fbbdsgffhhh NA 46367367 1354151345 A
6 8063608. 768769769. 1351340003. dggdgdgegeggerr 10485040 768769769 1351351354 A
7 -24242424. -4544454. 314534. dfgthrgfgf NA 797899 314534 B
8 -17393024. -5342353. 1535. wegregegg 6849400 NA 1535 B
9 -24202124. -5342353. 3145354. egegegergewge 40300 NA 3145354 B
10 0. -14532556. 4353455. sfngegebser NA NA 4353455 C
11 0. -13845657. 324535. qegqeefbew NA 686899 324535 C
12 0. -6633577. 3543445. qegqetegqt NA 7898979 3543445 C
13 556456466. -13853988. 34535. qwtqtewr 556456466 678568 34535 C
14 4646456466. -14532556. 34535534. etghsfrgf 4646456466 NA 34535534 C
15 246464266. -14463670. 0. sfsdfbdfbergeagaegr 246464266 68886 NA C
16 4564242646. -14532068. 0. wasfqertsdfaefwe 4564242646 488 NA C
注意:这个答案主要参考了 akrun 的回答(见原文链接)。