我非常感谢你对我的问题的帮助。我已经在这几个星期了,并且找不到解决方案。请继续阅读。我想我发布了许多细节,所以你完全得到了问题,虽然它并不难理解。
情况:
数据框由400万个条目组成(总大小:250 MB)。两列:ID
和TEXT
。文本字符串最多为200个字符。
任务:
预处理文本字符串
问题:
这需要太长时间。在我的8GB RAM双核机器上:1天后取消。在70GB 8核Amazon EC2实例上:1天后取消。
详细信息:
我基本上是
用作模式的向量如下所示:
"\\bWORD1\\b|\\bWORD2\\b|\\bWORD3\\b|\\bWORD4\\b..."
因此,那些'替换向量'是长度为1的字符向量,每个向量包含多达800个字
主要
library("parallel")
library("stringr")
preprocessText<-function(x){
# Replace the 'html-and'
arguments<-list(pattern="\\&\\;",replacement="and",x=x, ignore.case=TRUE)
y<-do.call(gsub, arguments)
# Remove some special characters
arguments<-list(pattern="[^-[:alnum:]\\'\\:\\/\\$\\%\\.\\,\\+\\-\\#\\@\\_\\!\\?+[:space:]]",replacement="",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
# Lowercase
arguments<-list(string=y,pattern=tolower(rep_ticker))
first<-do.call(str_match,arguments)
# Identify signal words and count them
# Need to be done in parts, because otherwise R can't handle this many at once
arguments<-list(string=x, pattern=rep_words_part1)
t1<-do.call(str_extract_all,arguments)
arguments<-list(string=x, pattern=rep_words_part2)
t2<-do.call(str_extract_all,arguments)
arguments<-list(string=x, pattern=rep_words_part3)
t3<-do.call(str_extract_all,arguments)
arguments<-list(string=x, pattern=rep_words_part4)
t4<-do.call(str_extract_all,arguments)
count=length(t1[[1]])+length(t2[[1]])+length(t3[[1]])+length(t4[[1]])
signal_words=c(t1[[1]],t2[[1]],t3[[1]],t4[[1]])
# Replacements
arguments<-list(pattern=rep_wordsA,replacement=" [wordA] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_wordB_part1,replacement=" [wordB] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_wordB_part2,replacement=" [wordB] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_wordB_part3,replacement=" [wordB] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_wordB_part4,replacement=" [wordB] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_email,replacement=" [email_adress] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_url,replacement=" [url] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_wordC,replacement=" [wordC] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
# Some regular expressions
arguments<-list(pattern="\\+[[:digit:]]*.?[[:digit:]]+%",replacement=" [positive_percentage] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern="-[[:digit:]]*.?[[:digit:]]+%",replacement=" [negative_percentage] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern="[[:digit:]]*.?[[:digit:]]+%",replacement=" [percentage] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern="\\$[[:digit:]]*.?[[:digit:]]+",replacement=" [dollar_value] ",x=y,ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern="\\+[[:digit:]]*.?[[:digit:]]+",replacement=" [pos_number] ",x=y, ignore.case=TRUE)# remaining numbers
y<-do.call(gsub, arguments)
arguments<-list(pattern="\\-[[:digit:]]*.?[[:digit:]]+",replacement=" [neg_number] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern="[[:digit:]]*.?[[:digit:]]+",replacement=" [number] ",x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_question,replacement=" [question] ", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
# Unify synonyms
arguments<-list(pattern=rep_syno1,replacement="happy", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_syno2,replacement="sad", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_syno3,replacement="people", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_syno4,replacement="father", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_syno5,replacement="mother", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rep_syno6,replacement="money", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
# Remove words
# Punctuation (I know there a pre-defined R commands for this, but I need to customize this
arguments<-list(pattern=rem_punct,replacement="", x=y, ignore.case=TRUE)
y<-do.call(gsub, arguments)
arguments<-list(pattern=rem_linebreak,replacement=" ", x=y, ignore.case=TRUE) #Remove line breaks
y<-do.call(gsub, arguments)
#Append Positive or Negative Emotion
arguments<-list(x=y)
y<-do.call(appendEmotion, arguments)
# Output
result<-list(
textclean=y,
first_signal=first,
all_signals=signal_words,
signal_count=count)
return(result)
}
resultList<-mclapply(dataframe$text_column,preprocessText)
(返回将是一个列表,我打算将其转换为data.frame)。
之前,我还尝试单独调用每个gsub
,从而在每个文本字符串上执行第一个gsub
,然后在第二个gsub
上执行等等...但我猜这个效率更低。
代码本身有效,但对我来说似乎可以加速。不幸的是,我不熟悉哈希表,我听说这可能是一个解决方案。
非常感谢您的建议和帮助!
在preprocessText
appendEmotion<-function(x){
if (grepl(app_pos,x)){
x<-paste(x," [posemo] ")
}
if(grepl(app_neg,x)){
x<-paste(x," [negemo] ")
}
#Output
return(x)
}
示例数据:
+------------+-----------------------------------------+
| ID | Text |
+------------+-----------------------------------------+
| 123 | My dad and me finished the race top 5% |
| 456 | Look at this http://www.google.com, Like it ? |
..
+------------+-----------------------------------------+
应该成为
+------------+-------------------------------------------------+
| ID | Text |
+------------+-------------------------------------------------+
| 123 | my father and me finished the race top [percentage] |
| 456 | look at this [url] like it [question] |
..
+------------+--------------------------------------------------+