逐列减去数据帧的行,保留多因子列

时间:2018-06-02 19:46:41

标签: r dataframe dplyr purrr

我有两个列数不等的数据帧。我想按列(即按样本)从df1的强度值中减去df2中对应基因行的强度值。 我的条件是:

  1. 在df1中,每个基因(gene_nm)对应多行,每行包含一个肽序列(pep_seq)及其在各样品中的强度(int_samp)。同一基因多次出现,即占据多行。
  2. 在df2中,基因(行)仅以其相应的强度值出现一次
  3. 因此,df1比df2长得多(例如,55000行对6000行)
  4. 强度列数(int_samp)可以很多。我在这个例子中有3个
  5. 数据框1

    # Peptide-level table: one row per peptide, several peptides per gene.
    # Intensities are deliberately kept as strings (missing readings are the
    # literal text "NA"), exactly as in the original question.
    pep_seq <- c(
      "aaaaaaaaa", "ababababba", "dfsfsfsfds", "xbbcbcncncc",
      "fbbdsgffhhh", "dggdgdgegeggerr", "dfgthrgfgf", "wegregegg",
      "egegegergewge", "sfngegebser", "qegqeefbew", "qegqetegqt",
      "qwtqtewr", "etghsfrgf", "sfsdfbdfbergeagaegr", "wasfqertsdfaefwe"
    )
    int_samp_1 <- c(
      "2421432", "24242424", "NA", "4684757849",
      "NA", "10485040", "NA", "6849400",
      "40300", "NA", "NA", "NA",
      "556456466", "4646456466", "246464266", "4564242646"
    )
    int_samp_2 <- c(
      "NA", "5342353", "14532556", "43566",
      "46367367", "768769769", "797899", "NA",
      "NA", "NA", "686899", "7898979",
      "678568", "NA", "68886", "488"
    )
    int_samp_3 <- c(
      "11351", "NA", "NA", "NA",
      "1354151345", "1351351354", "314534", "1535",
      "3145354", "4353455", "324535", "3543445",
      "34535", "34535534", "NA", "NA"
    )
    # Gene A owns the first 6 peptides, B the next 3, C the remaining 7.
    gene_nm <- rep(c("A", "B", "C"), times = c(6, 3, 7))
    # check.names = FALSE mirrors what cbind.data.frame() passes to data.frame().
    df_1 <- data.frame(
      pep_seq, int_samp_1, int_samp_2, int_samp_3, gene_nm,
      check.names = FALSE
    )
    

    数据框2

    # Gene-level table: exactly one row per gene. Column names carry an "a"
    # suffix, and intensities are strings with the literal text "NA" for
    # missing readings, as in the original question.
    gene_nm.a <- LETTERS[1:3]
    int_samp_1a <- c("2421432", "24242424", "NA")
    int_samp_2a <- c("NA", "5342353", "14532556")
    int_samp_3a <- c("11351", "NA", "NA")
    # check.names = FALSE mirrors what cbind.data.frame() passes to data.frame().
    df_2 <- data.frame(
      gene_nm.a, int_samp_1a, int_samp_2a, int_samp_3a,
      check.names = FALSE
    )
    

    请建议。

2 个答案:

答案 0 :(得分:2)

一种选择是使用dplyr将df_1与df_2连接(join),然后执行简单的矩阵减法。

注意:数据框中的强度读数是以因子(factor)形式存储的。既然要做减法,我认为把测量值保存为因子并不是一个好主意,因此我先用as.character()再将它们转换为数值。

library(dplyr)

# Join the gene-level intensities of df_2 onto every peptide row of df_1 and
# convert all intensity columns to numeric (double). Doubles are used rather
# than integers because some readings (e.g. 4684757849) exceed
# .Machine$integer.max; as.integer() would turn them into NA with only a
# coercion warning.
# NA values coming from df_2 are replaced by 0, since keeping them would turn
# the corresponding df_1 readings into NA for no reason.
mod <- df_1 %>%
  left_join(df_2, by = c("gene_nm" = "gene_nm.a")) %>% # join on gene
  mutate(
    across(starts_with("int_samp"), ~ as.numeric(as.character(.x)))
  ) %>%
  mutate(
    across(matches("^int_samp_\\d+[a-z]+$"), ~ replace(.x, is.na(.x), 0))
  )

# mod now holds the intensity columns of both df_1 (int_samp_<n>) and
# df_2 (int_samp_<n><suffix>). The subtraction pairs them by position, so
# both sets must appear in the same sample order.
df1_cols <- grepl("^int_samp_\\d+$", names(mod))
df2_cols <- grepl("^int_samp_\\d+[a-z]+$", names(mod))
mod[, df1_cols] <- mod[, df1_cols] - mod[, df2_cols]

# Take columns from df_1.
mod[names(df_1)]
#                pep_seq int_samp_1 int_samp_2 int_samp_3 gene_nm
# 1            aaaaaaaaa          0         NA          0       A
# 2           ababababba   21820992    5342353         NA       A
# 3           dfsfsfsfds         NA   14532556         NA       A
# 4          xbbcbcncncc 4682336417      43566         NA       A
# 5          fbbdsgffhhh         NA   46367367 1354139994       A
# 6      dggdgdgegeggerr    8063608  768769769 1351340003       A
# 7           dfgthrgfgf         NA   -4544454     314534       B
# 8            wegregegg  -17393024         NA       1535       B
# 9        egegegergewge  -24202124         NA    3145354       B
# 10         sfngegebser         NA         NA    4353455       C
# 11          qegqeefbew         NA  -13845657     324535       C
# 12          qegqetegqt         NA   -6633577    3543445       C
# 13            qwtqtewr  556456466  -13853988      34535       C
# 14           etghsfrgf 4646456466         NA   34535534       C
# 15 sfsdfbdfbergeagaegr  246464266  -14463670         NA       C
# 16    wasfqertsdfaefwe 4564242646  -14532068         NA       C

答案 1 :(得分:2)

IIUC(如果我理解正确),您在df_1和df_2中有相同名称的列(例如int_samp_X,其中X为某个整数),并且希望按gene_nm分组,计算同名列之间的差值(例如df_1[df_1$gene_nm == 'A', int_samp_1] - df_2[df_2$gene_nm == 'A', int_samp_1])。

我们可以使用tidyverse系列软件包来解决此问题,尤其是dplyr和purrr。

首先,用left_join将df_1和df_2合并,以确保df_1中的所有条目在与df_2中的基因级条目匹配时都得以保留:

library(tidyverse)

# df_2 as posted uses suffixed column names (gene_nm.a, int_samp_1a, ...),
# so joining by "gene_nm" would fail. Strip a trailing "a" / ".a" so that
# df_2 shares df_1's column names; names that do not end in "a" are left
# untouched.
df_2_aligned <- setNames(df_2, sub("\\.?a$", "", names(df_2)))

# Left join keeps every peptide row of df_1 and attaches the matching
# gene-level row of df_2. The clashing intensity columns receive dplyr's
# default .x (df_1) and .y (df_2) suffixes.
df_3 <- df_1 %>% left_join(df_2_aligned, by = "gene_nm")

df_3
               pep_seq int_samp_1.x int_samp_2.x int_samp_3.x gene_nm int_samp_1.y int_samp_2.y int_samp_3.y
1            aaaaaaaaa      2421432           NA        11351       A      2421432           NA        11351
2           ababababba     24242424      5342353           NA       A      2421432           NA        11351
3           dfsfsfsfds           NA     14532556           NA       A      2421432           NA        11351
4          xbbcbcncncc   4684757849        43566           NA       A      2421432           NA        11351
5          fbbdsgffhhh           NA     46367367   1354151345       A      2421432           NA        11351
6      dggdgdgegeggerr     10485040    768769769   1351351354       A      2421432           NA        11351
7           dfgthrgfgf           NA       797899       314534       B     24242424      5342353           NA
8            wegregegg      6849400           NA         1535       B     24242424      5342353           NA
9        egegegergewge        40300           NA      3145354       B     24242424      5342353           NA
10         sfngegebser           NA           NA      4353455       C           NA     14532556           NA
11          qegqeefbew           NA       686899       324535       C           NA     14532556           NA
12          qegqetegqt           NA      7898979      3543445       C           NA     14532556           NA
13            qwtqtewr    556456466       678568        34535       C           NA     14532556           NA
14           etghsfrgf   4646456466           NA     34535534       C           NA     14532556           NA
15 sfsdfbdfbergeagaegr    246464266        68886           NA       C           NA     14532556           NA
16    wasfqertsdfaefwe   4564242646          488           NA       C           NA     14532556           NA

然后用map遍历感兴趣的列名,计算每一对列的差值。(请注意,您需要先将int_samp列从factor转换为numeric。)

更新(根据OP评论): 要在计算差异之前将NA转换为0,我们可以使用mutate_if()和replace(),将以下内容添加到方法链中:

# Pipeline step: replace NA with 0 in every numeric column.
# A formula lambda is used because funs() is deprecated in dplyr.
mutate_if(is.numeric, ~ replace(., is.na(.), 0))

最后,将结果合并回df_1:

# Names of the intensity columns in df_1; df_3 holds each of them twice,
# as <name>.x (from df_1) and <name>.y (from df_2).
var_names <- df_1 %>% select(starts_with("int_samp")) %>% names()

var_names # [1] "int_samp_1" "int_samp_2" "int_samp_3"

var_names %>%
  map_dfc(function(nm) {
    # Anchor the pattern to exactly <nm>.x / <nm>.y. An unanchored
    # matches(nm) would also pick up int_samp_10.x, int_samp_11.y, ...
    # as soon as there are ten or more samples.
    pair <- paste0("^", nm, "\\.[xy]$")
    df_3 %>%
      # factor/character -> numeric for the two columns of this pair
      mutate_at(vars(matches(pair)), ~ as.numeric(as.character(.))) %>%
      # treat missing readings as 0 before taking the difference
      mutate_if(is.numeric, ~ replace(., is.na(.), 0)) %>%
      select(matches(pair)) %>%
      # columns are ordered .x then .y, so this is df_1 value - df_2 value
      reduce(`-`)
  }) %>%
  set_names(paste0(var_names, "_diff")) %>%
  bind_cols(df_1)

输出:

   int_samp_1_diff int_samp_2_diff int_samp_3_diff pep_seq             int_samp_1 int_samp_2 int_samp_3 gene_nm
             <dbl>           <dbl>           <dbl> <fct>               <fct>      <fct>      <fct>      <fct>  
 1              0.              0.              0. aaaaaaaaa           2421432    NA         11351      A      
 2       21820992.        5342353.         -11351. ababababba          24242424   5342353    NA         A      
 3       -2421432.       14532556.         -11351. dfsfsfsfds          NA         14532556   NA         A      
 4     4682336417.          43566.         -11351. xbbcbcncncc         4684757849 43566      NA         A      
 5       -2421432.       46367367.     1354139994. fbbdsgffhhh         NA         46367367   1354151345 A      
 6        8063608.      768769769.     1351340003. dggdgdgegeggerr     10485040   768769769  1351351354 A      
 7      -24242424.       -4544454.         314534. dfgthrgfgf          NA         797899     314534     B      
 8      -17393024.       -5342353.           1535. wegregegg           6849400    NA         1535       B      
 9      -24202124.       -5342353.        3145354. egegegergewge       40300      NA         3145354    B      
10              0.      -14532556.        4353455. sfngegebser         NA         NA         4353455    C      
11              0.      -13845657.         324535. qegqeefbew          NA         686899     324535     C      
12              0.       -6633577.        3543445. qegqetegqt          NA         7898979    3543445    C      
13      556456466.      -13853988.          34535. qwtqtewr            556456466  678568     34535      C      
14     4646456466.      -14532556.       34535534. etghsfrgf           4646456466 NA         34535534   C      
15      246464266.      -14463670.              0. sfsdfbdfbergeagaegr 246464266  68886      NA         C      
16     4564242646.      -14532068.              0. wasfqertsdfaefwe    4564242646 488        NA         C  

注意:这个答案主要来自akrun的回答here