为什么函数的dplyr :: mutate仅影响第一行而不影响其余

时间:2018-06-22 05:43:04

标签: r dplyr

我有以下代码:

library(tidyverse)


# Function ----------------------------------------------------------------

convert <- function(three_aa_seq = NULL) {

  pep_dat <- structure(list(full = c(
    "Alanine", "Arginine", "Asparagine",
    "Aspartate", "Cysteine", "Glutamine", "Glutamate", "Glycine",
    "Histidine", "Isoleucine", "Leucine", "Lysine", "Methionine",
    "Phenylalanine", "Proline", "Serine", "Threonine", "Tryptophan",
    "Tyrosine", "Valine"
  ), three = c(
    "Ala", "Arg", "Asn", "Asp",
    "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys", "Met",
    "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val"
  ), one = c(
    "A",
    "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F",
    "P", "S", "T", "W", "Y", "V"
  )), .Names = c("full", "three", "one"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(
    NA,
    -20L
  ))
  str_split(three_aa_seq, pattern = "-")[[1]] %>%
    as.tibble() %>%
    rename(three = value) %>%
    inner_join(pep_dat, by = "three") %>%
    pull(one) %>%
    paste(., collapse = "")
}

基本上将一组三个字母的氨基酸字符串更改为单个字母字符串。例如"His-Ser-Leu"HSL

但是当我尝试使用此代码时:

tribble(
  ~ pep_name, ~ three_seq,
  "PA_19", "His-Ser-Leu-Gly-Lys-Trp-Leu-Gly-His-Pro-Asp-Lys-Phe",
  "PA_20", "Thr-Ala-Pro-Arg-Ser-Leu-Arg-Arg-Ser-Ser-Cys-Phe-Gly-Gly-Arg-Met-Asp-Arg-Ile-Gly-Ala-Gln-Ser-Gly-Leu-Gly-Cys-Asn-Ser-Phe-Arg-Tyr"
) %>%
  mutate(pep = convert(three_aa_seq = three_seq)) %>%
  select(pepname, pep)

它返回:

# A tibble: 2 x 2
  pep_name pep          
  <chr>    <chr>        
1 PA_19    HSLGKWLGHPDKF
2 PA_20    HSLGKWLGHPDKF

只有pep的结果列PA_20不会被TAPRSLRRSSCFGGRMDRIGAQSGLGCNSFRY更新。正确的方法是什么?

2 个答案:

答案 0 :(得分:3)

str_split的输出为list,而OP仅使用list提取[[1]]的第一个元素。相反,它应该被循环。如果初始数据集为“ tbl”

tbl %>%
   mutate(pep =  str_split(three_seq, pattern = "-") %>% 
                     map_chr(~
                            as_tibble(.x) %>%
                            rename(three=value) %>%
                            inner_join(pep_dat, by = 'three') %>% 
                            pull(one) %>%
                            paste(collapse=""))) %>%
   select(-three_seq)
# A tibble: 2 x 2
#  pep_name pep                       
#  <chr>    <chr>                           
#1 PA_19    HSLGKWLGHPDKF                   
#2 PA_20    TAPRSLRRSSCFGGRMDRIGAQSGLGCNSFRY

将以上内容转换为功能

convertfn <- function(three_aa_seq, keydat) {
               str_split(three_aa_seq, pattern = "-") %>%
                   map_chr(~ 
                         as_tibble(.x) %>% 
                         rename(three = value) %>%
                         inner_join(keydat, by = 'three') %>% 
                         pull(one) %>% 
                         paste(collapse = ""))
                         }

tbl %>% 
     mutate(pep = convertfn(three_seq, pep_dat)) %>%
     select(-three_seq)

答案 1 :(得分:1)

请注意,与更多行一起使用时,为每个替换创建一个tibble会非常慢。最好的方法是立即使用stringr中可用的矢量化字符串替换:

library(tidyverse)
library(stringr)

tibble(three = c(
    "Ala", "Arg", "Asn", "Asp",
    "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys", "Met",
    "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val"
  ), one = c(
    "A",
    "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F",
    "P", "S", "T", "W", "Y", "V"
  )) %>% 
  {set_names(.$one, .$three)} ->
  aa_map

tribble(
  ~ pep_name, ~ three_seq,
  "PA_19", "His-Ser-Leu-Gly-Lys-Trp-Leu-Gly-His-Pro-Asp-Lys-Phe",
  "PA_20", "Thr-Ala-Pro-Arg-Ser-Leu-Arg-Arg-Ser-Ser-Cys-Phe-Gly-Gly-Arg-Met-Asp-Arg-Ile-Gly-Ala-Gln-Ser-Gly-Leu-Gly-Cys-Asn-Ser-Phe-Arg-Tyr") %>% 
  mutate(one_seq = str_replace_all(three_seq, aa_map) %>% str_replace_all("-", ""))