R: Preprocessing text strings with gsub takes too long

Asked: 2013-11-06 11:02:22

Tags: r preprocessor text-mining gsub

I would really appreciate your help with this problem. I have been working on it for several weeks now and cannot find a solution. Please read on. I am posting quite a lot of detail so that you fully understand the setup, although the problem itself is not hard to grasp.

Situation:

The data frame consists of 4 million entries (total size: 250 MB). Two columns: ID and TEXT. The text strings are up to 200 characters long.

Task:

Preprocess the text strings.

Problem:

It takes far too long. On my dual-core machine with 8 GB of RAM: cancelled after one day. On a 70 GB, 8-core Amazon EC2 instance: cancelled after one day.

Details:

What I basically do:

  • Count how often certain signal words occur in a string
  • Write this number into a new column (COUNT)
  • Replace these (counted) words
  • Replace other words (which I do not need to count first)
  • Replace some regular expressions

The vectors used as patterns look like this:

"\\bWORD1\\b|\\bWORD2\\b|\\bWORD3\\b|\\bWORD4\\b..."

So those 'replacement vectors' are character vectors of length 1, each containing up to 800 words.
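
For illustration, one way such a pattern vector could be built from a plain word list (the question does not show the actual construction, so this is an assumption; the words are placeholders):

words <- c("WORD1", "WORD2", "WORD3", "WORD4")
rep_wordsA <- paste0("\\b", words, "\\b", collapse = "|")
# => "\\bWORD1\\b|\\bWORD2\\b|\\bWORD3\\b|\\bWORD4\\b"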

Main code:

library("parallel")
library("stringr")

preprocessText <- function(x) {

  # Replace the HTML-encoded ampersand ('&amp;')
  y <- gsub("\\&amp\\;", "and", x, ignore.case = TRUE)

  # Remove some special characters
  y <- gsub("[^-[:alnum:]\\'\\:\\/\\$\\%\\.\\,\\+\\-\\#\\@\\_\\!\\?+[:space:]]", "", y,
            ignore.case = TRUE)

  # Match the (lowercased) ticker pattern and keep the first match
  first <- str_match(y, tolower(rep_ticker))

  # Identify signal words and count them.
  # This needs to be done in parts, because otherwise R can't handle
  # this many alternatives in a single pattern.
  t1 <- str_extract_all(x, rep_words_part1)
  t2 <- str_extract_all(x, rep_words_part2)
  t3 <- str_extract_all(x, rep_words_part3)
  t4 <- str_extract_all(x, rep_words_part4)

  count <- length(t1[[1]]) + length(t2[[1]]) + length(t3[[1]]) + length(t4[[1]])
  signal_words <- c(t1[[1]], t2[[1]], t3[[1]], t4[[1]])

  # Replacements
  y <- gsub(rep_wordsA,      " [wordA] ", y, ignore.case = TRUE)
  y <- gsub(rep_wordB_part1, " [wordB] ", y, ignore.case = TRUE)
  y <- gsub(rep_wordB_part2, " [wordB] ", y, ignore.case = TRUE)
  y <- gsub(rep_wordB_part3, " [wordB] ", y, ignore.case = TRUE)
  y <- gsub(rep_wordB_part4, " [wordB] ", y, ignore.case = TRUE)
  y <- gsub(rep_email, " [email_address] ", y, ignore.case = TRUE)
  y <- gsub(rep_url,   " [url] ",   y, ignore.case = TRUE)
  y <- gsub(rep_wordC, " [wordC] ", y, ignore.case = TRUE)

  # Some regular expressions (numbers and percentages, e.g. "+3.5%")
  y <- gsub("\\+[[:digit:]]*\\.?[[:digit:]]+%", " [positive_percentage] ", y, ignore.case = TRUE)
  y <- gsub("-[[:digit:]]*\\.?[[:digit:]]+%",   " [negative_percentage] ", y, ignore.case = TRUE)
  y <- gsub("[[:digit:]]*\\.?[[:digit:]]+%",    " [percentage] ",   y, ignore.case = TRUE)
  y <- gsub("\\$[[:digit:]]*\\.?[[:digit:]]+",  " [dollar_value] ", y, ignore.case = TRUE)
  y <- gsub("\\+[[:digit:]]*\\.?[[:digit:]]+",  " [pos_number] ",   y, ignore.case = TRUE)  # remaining numbers
  y <- gsub("\\-[[:digit:]]*\\.?[[:digit:]]+",  " [neg_number] ",   y, ignore.case = TRUE)
  y <- gsub("[[:digit:]]*\\.?[[:digit:]]+",     " [number] ",       y, ignore.case = TRUE)
  y <- gsub(rep_question, " [question] ", y, ignore.case = TRUE)

  # Unify synonyms
  y <- gsub(rep_syno1, "happy",  y, ignore.case = TRUE)
  y <- gsub(rep_syno2, "sad",    y, ignore.case = TRUE)
  y <- gsub(rep_syno3, "people", y, ignore.case = TRUE)
  y <- gsub(rep_syno4, "father", y, ignore.case = TRUE)
  y <- gsub(rep_syno5, "mother", y, ignore.case = TRUE)
  y <- gsub(rep_syno6, "money",  y, ignore.case = TRUE)

  # Remove punctuation (I know there are pre-defined R commands for
  # this, but I need to customize it)
  y <- gsub(rem_punct, "", y, ignore.case = TRUE)

  # Remove line breaks
  y <- gsub(rem_linebreak, " ", y, ignore.case = TRUE)

  # Append positive or negative emotion tags
  y <- appendEmotion(y)

  # Output
  result <- list(
    textclean    = y,
    first_signal = first,
    all_signals  = signal_words,
    signal_count = count)

  return(result)
}

resultList <- mclapply(dataframe$text_column, preprocessText)

(The return value will be a list, which I intend to convert into a data.frame.)
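
One detail worth noting about this call: mclapply defaults to getOption("mc.cores", 2L), i.e. two worker processes, so as written it may not use all cores of the EC2 instance. Setting mc.cores explicitly (my addition, not part of the original call) would look like:

# Use every available core; detectCores() is also in the parallel package.
resultList <- mclapply(dataframe$text_column, preprocessText,
                       mc.cores = detectCores())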

Before this, I also tried calling each gsub separately, i.e. executing the first gsub on every text string, then the second, and so on... but I would guess that this is even less efficient (see the sketch below).
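
For clarity, that variant looks roughly like this; gsub is vectorized over its x argument, so each call processes the whole column at once (txt is a stand-in name):

# One call per pattern over the entire column, instead of the whole
# pipeline per string.
txt <- dataframe$text_column
txt <- gsub("\\&amp\\;", "and", txt, ignore.case = TRUE)
txt <- gsub(rep_wordsA, " [wordA] ", txt, ignore.case = TRUE)
# ... and so on for the remaining patterns. For purely literal patterns,
# fixed = TRUE skips the regex engine (note it is case-sensitive).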

The code itself works, but it seems to me that it could be sped up considerably. Unfortunately, I am not familiar with hash tables, though I have heard they might be a solution.
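
For reference, one common reading of the hash-table idea: R environments created with new.env(hash = TRUE) behave as hash tables, so single-word replacements can be done by per-token lookup instead of scanning an 800-alternative regex for every string. A minimal sketch, assuming exact one-word matches (syno_map, replaceTokens and the example entries are hypothetical, not taken from the code above):

# Hypothetical hash-table lookup for one-word synonym replacement.
syno_map <- new.env(hash = TRUE)
syno_map[["dad"]] <- "father"
syno_map[["mum"]] <- "mother"

replaceTokens <- function(s) {
  # Split into whitespace-separated tokens and map each one.
  tokens <- strsplit(tolower(s), " ", fixed = TRUE)[[1]]
  mapped <- vapply(tokens, function(w) {
    if (exists(w, envir = syno_map, inherits = FALSE))
      get(w, envir = syno_map)
    else w
  }, character(1), USE.NAMES = FALSE)
  paste(mapped, collapse = " ")
}

replaceTokens("My dad and me")  # "my father and me"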

Thank you very much for any suggestions and help!

Definition of a function called inside preprocessText:
appendEmotion <- function(x) {

  # Tag the string if it matches the positive / negative emotion patterns
  if (grepl(app_pos, x)) {
    x <- paste(x, " [posemo] ")
  }
  if (grepl(app_neg, x)) {
    x <- paste(x, " [negemo] ")
  }

  # Output
  return(x)
}
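
Since grepl is vectorized, a whole-column variant of this helper is possible as well; appendEmotionVec below is a hypothetical name, sketched against the same app_pos / app_neg patterns:

# Tags every element of a character vector in two vectorized passes.
appendEmotionVec <- function(x) {
  x <- ifelse(grepl(app_pos, x), paste(x, " [posemo] "), x)
  x <- ifelse(grepl(app_neg, x), paste(x, " [negemo] "), x)
  x
}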

Example data:

+-----+------------------------------------------------+
| ID  | Text                                           |
+-----+------------------------------------------------+
| 123 | My dad and me finished the race top 5%         |
| 456 | Look at this http://www.google.com, Like it ?  |
| ... | ...                                            |
+-----+------------------------------------------------+

should become

+-----+------------------------------------------------------+
| ID  | Text                                                 |
+-----+------------------------------------------------------+
| 123 | my father and me finished the race top [percentage]  |
| 456 | look at this [url] like it [question]                |
| ... | ...                                                  |
+-----+------------------------------------------------------+

0 Answers