Reduce the execution time of a sequential process in R

Asked: 2017-07-31 12:27:10

Tags: r string performance parallel-processing

Below I post three functions that I wrote to compute and generate a grid containing all possible combinations between the positions of specified letters. Let me give an example first and then show the actual code.

E.g.

Let a phrase be equal to "Today is a perfect day" and assume we have a list (phrase1) of all its words: phrase1 = list("Today", "is", "a", "perfect", "day")

This variable (phrase1) is part of a larger list containing more phrases: phrases = list( phrase1=list(), phrase2=list() ...)

Now, for each word of each phrase, I want to find all possible combinations between the positions of the specified letters.

At the start, I feed my function a character list containing the target characters, a list of categories, and the list of phrases.

char_list = c("o","a")
cat_list = c("cat1", "cat2")

cat1 always corresponds to the letter "o" and cat2 to the letter "a", respectively.

If that is the case, then for "Today" in the first phrase (phrase1) I get the table below:

 word   | cat1 | cat2 
 -------|----- |-----
  p1w1  |  2   |  0   
  p1w1  |  0   |  0   
  p1w1  |  0   |  4   
  p1w1  |  2   |  4  

Edit: So here is the table for the first word (w1) of the first phrase (p1), containing all possible combinations between the positions of the two previously specified letters.
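The per-word table above can be reproduced in a few lines of base R. This is only a minimal sketch, not the full pipeline posted below: gregexpr finds the letter positions and expand.grid enumerates the present/absent combinations, with 0 standing for "letter not used".

```r
word <- "Today"

# Positions of "o" and "a" in the word (gregexpr returns -1 when absent)
pos_o <- unlist(gregexpr("o", word))   # 2
pos_a <- unlist(gregexpr("a", word))   # 4

# 0 encodes "letter not used"; enumerate every present/absent combination
grid <- expand.grid(cat1 = c(0, pos_o), cat2 = c(0, pos_a))
print(grid)
#   cat1 cat2
# 1    0    0
# 2    2    0
# 3    0    4
# 4    2    4
```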

And for the whole list of phrases, the grid looks like this:

 word   | cat1  | cat2 
 -------|------ |-----
  p1w1  |  ..   |  ..   
  p1w1  |  ..   |  ..   
  p1w2  |  ..   |  ..   
  p1w2  |  ..   |  ..   
  p2w1  |  ..   |  ..   
  p2w1  |  ..   |  ..  
  ...

Edit: The final table of the whole process contains all possible combinations between the positions of the specified letters for all words in all phrases.

The problem now is that if my initial list is large enough (4096 phrases with a few hundred words each), this process takes a huge amount of time to complete, since it reads every word one by one, and so on.

For example, running my script on a Windows server with 25 cores and 32 GB of RAM, it uses only 5-10% of a single core and takes about 4 hours to finish.

I have started reading about parallel computing in R, but I am asking here about this specific approach to see whether there are any other good ideas to improve the performance.
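Since each phrase can be processed independently of the others, the outer phrase loop is a natural fit for the base parallel package. The sketch below uses a toy stand-in for the real per-phrase work; the worker function, the data, and the cluster size are illustrative, not part of the original code:

```r
library(parallel)

# Toy stand-in for the real per-phrase work (assumption: processing one
# phrase does not depend on any other phrase)
process_phrase <- function(phrase) {
  lapply(phrase, nchar)  # placeholder for the all.combs.word.grid calls
}

phrases <- list(p1 = list("Today", "is"), p2 = list("a", "perfect", "day"))

cl <- makeCluster(2)   # in practice, roughly one worker per available core
res <- parLapply(cl, phrases, process_phrase)
stopCluster(cl)

# Per-phrase results come back as a list and are stitched together afterwards
length(res)  # 2
```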

I should also mention that I am not a professional R coder, so you may find many performance mistakes.

Thank you.

Functions:

I simply call all.combs.grid(char_list, cat_list, phrases_list).

all.combs.grid = function(char_list , cat_list , phrases_list){

  mod_words = matrix()

  final.grid = matrix(ncol = 2+length(cat_list))
  colnames(final.grid) = c("phrase", "sequence" , cat_list)

  # Iterate by index so phrase_id always matches the current phrase
  for(i in seq_along(phrases_list)){

    phrase = phrases_list[[i]]

    # Get phrase ID
    phrase_id = names(phrases_list)[i]

    # Get word after word for each phrase
    for(d in 1:length(phrase[[1]])){
      # word sequence
      word = phrase[[1]][d]

      # get the matrix of all possible combinations for that word
      word_pos_combs = all.combs.word.grid(char_list , cat_list, word)


      # number of combinations
      no_of_comb = nrow(word_pos_combs)
      # Create a phrase id and word vector
      phrase_tag = rep(phrase_id, no_of_comb)
      word_tag = rep(word, no_of_comb)

      # Combine phrase_tag , word_tag and word_pos_combs
      mod_words = cbind(phrase_tag , word_tag, word_pos_combs)

      # Combine mod_words variable for all words in a phrase into a matrix
      # one under the other
      final.grid = rbind(final.grid, mod_words)

    } # word loop    
  }

  # Remove the first row which has NAs
  final.grid = na.omit(final.grid)
  # Set proper row names.
  row.names(final.grid) = 1:nrow(final.grid)

  write.csv(x = final.grid , file = "combs.csv", row.names = F)

}
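Independently of parallelism, a large cost in all.combs.grid is growing final.grid with rbind inside the loop, which copies the whole accumulated matrix on every iteration. A common remedy, shown here as a generic sketch rather than a drop-in rewrite, is to collect the pieces in a list and bind them once at the end:

```r
# Quadratic: every rbind copies everything accumulated so far
slow <- function(pieces) {
  out <- NULL
  for (p in pieces) out <- rbind(out, p)
  out
}

# Linear: accumulate in a list, bind once
fast <- function(pieces) {
  do.call(rbind, pieces)
}

# 100 small matrices as stand-ins for the per-word combination grids
pieces <- replicate(100, matrix(1:4, nrow = 2), simplify = FALSE)
identical(slow(pieces), fast(pieces))  # TRUE
```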

all.combs.word.grid = function(char_list , cat_list , word ){

  comb_list = list()

  for( i in seq_along(char_list) ){

      char = char_list[i]
      char_combs = char.combs( word, char )

      # If there is no instance of the target char in the word
      if( length(char_combs) == 1 && char_combs == "0" )
        return("0")

      comb_list = c( comb_list, list( char_combs ) )
  }

  # Generate the matrix of the unique instances for each category.
  # e.g
  #      cat1     cat2    cat3
  # [1,] "1"      "0"      "0"      
  # [2,] "3"      "0"      "0"      
  # [3,] "6"      "0"      "0"      
  # [4,] "1:3"    "0"      "0"      
  # [5,] "1:6"    "0"      "0"      
  # [6,] "3:6"    "0"      "0"      
  # [7,] "1:3:6"  "0"      "0"      
  # [8,] "0"      "2"      "0"      
  # [9,] "0"      "9"      "0"      
  # [10,] "0"     "2:9"    "0"      
  # [11,] "0"     "0"      "8"      

  m1 = create_mat(comb_list)

  # Get all the possible combinations between the above categories
  m1 = as.list(data.frame(m1))
  m1 = expand.grid(lapply(m1,unique))
  colnames(m1) = cat_list

  # Get all the possible combinations of the rest (comb_list)
  exp = expand.grid(comb_list)
  colnames(exp) = cat_list

  # Combine the two matrices
  result = rbind(m1,exp)

  result = unique(result)

  return(result)
}

char.combs = function(word , char ){

  # GET ALL INDICES of INSTANCES
  pos = unlist(gregexpr(char, word ))
  char = c()

  # If there is no instance of the target char
  if( length(pos) == 1 && pos == -1 ){

    char = "0"
    return(char)

  # If there is only one instance of the target char
  }else if( length(pos) == 1 ){

    char = c(char , as.character(pos))
    return(char)

  # if there are more than one instances of the target char
  }else{

    for( i in 1:length(pos) ){
        comb = t(combn(pos,i))
        for( y in 1:nrow(comb)){
          comb_n = as.character(comb[y,])
          comb_n = paste(comb_n, collapse = ":")
          char = c(char , comb_n)
        }
    }

    return(char)
  } # else

}
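For reference, the subset-enumeration core of char.combs can be checked in isolation: it lists every non-empty subset of the match positions, encoded as ":"-joined strings. The word below is just an illustrative example:

```r
word <- "banana"
pos <- unlist(gregexpr("a", word))   # 2 4 6

# All non-empty subsets of the positions, one subset size at a time
subsets <- unlist(lapply(seq_along(pos), function(i) {
  apply(combn(pos, i), 2, paste, collapse = ":")
}))
print(subsets)
# "2" "4" "6" "2:4" "2:6" "4:6" "2:4:6"
```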

1 Answer:

Answer 0 (score: 2)

Starting variables
phrase1 = list("Today", "is", "a", "perfect", "day")
phrases = list( phrase1, phrase1 )
char_list = c("o","a")

The following solution needs these libraries

library(stringr)          # str_locate
library(purrr)            # map2
library(dplyr)            # filter, mutate and the %>% pipe

I made a function for a single phrase

parsephrase <- function(phrase, z) {
                  intermediate1 <- lapply(phrase, function(x) sapply(char_list, function(y) str_locate(x,y)[1,1]))
                  intermediate2 <- map2(intermediate1, 1:length(intermediate1), ~expand.grid(c(0,.x[1]),c(0,.x[2])) %>% filter(complete.cases(.)) %>% mutate(ID=paste0("p", z, "w",.y)))
                  intermediate3 <- Reduce("rbind", intermediate2)
                  return(intermediate3)
               }

Using just phrase1 here, the head of the output is:

   Var1 Var2   ID
1     0    0 p1w1
2     2    0 p1w1
3     0    4 p1w1
4     2    4 p1w1
5     0    0 p1w2

To operate on the full phrases list, use

final <- Reduce("rbind", map2(phrases, 1:length(phrases), ~ parsephrase(.x, .y)))