加速R代码通过向量化从字符串中删除停用词

时间:2016-02-23 23:50:48

标签: r tm

我在下面列出了一个可以成功运行的代码,用于删除文本中的停用词以及相应的词性标记[POS]。但是它运行得很慢,大约需要4小时。我在想,如果通过矢量化摆脱for循环,是否能加速。但我不知道这是否可行或是否有用。我需要帮助,以更好的方式加快这段代码。

我可以使用tm packge R tm removeWords stopwords is not removing stopwords删除停用词,但我需要删除相应的POS标记,这在tm包中是不可能的。

注意:我已经能够使用foreach并行化最外层的for循环,以便在12个核心上运行。

代码:

# Reproducible data
# id is to identify the source
# phrase contains original string
# modifiedphrase contains string with stop words removed

id <- c(1, 2, 3)
phrase <- c("choice_for_selection", "accordingly_choices_for_selection",
            "only_top_selection")
pos <- c("NN JJ NN", "NN JJ NN NN", "NNS NN NNS") # fake part of speech
# Build the data.frame directly: the original as.data.frame(cbind(...))
# routes every column through a character matrix, coercing `id` to
# character as well. modifiedphrase / modpos start as working copies of
# phrase / pos that the removal loop below overwrites in place.
df <- data.frame(
  id = id,
  phrase = phrase,
  pos = pos,
  modifiedphrase = phrase,
  modpos = pos,
  stringsAsFactors = FALSE
)

# Stop-word list: the SMART lexicon bundled with the tm package
library(tm)
library(stringr)

SWList <- stopwords("SMART")

# Remove stop words from each '_'-separated phrase, together with the
# space-separated POS tag at the same token position (tokens and tags are
# assumed to be positionally aligned).
# This is the outermost loop the question parallelizes with foreach.
# Fixes vs. the original: seq_len() instead of 1:length(df[,1]); the inner
# j-loop (which grew `forremoval` with append() and re-ran paste() on every
# removed token) is replaced by one vectorized %in% mask; the dead
# `length(tmp) == 0` guard is dropped -- paste() always returns a length-1
# string, and paste(character(0), collapse = "_") is already "".
for (i in seq_len(nrow(df))) {
  tokens <- strsplit(df[i, "phrase"], "_", fixed = TRUE)[[1]]
  tags <- strsplit(df[i, "pos"], " ", fixed = TRUE)[[1]]
  keep <- !(tokens %in% SWList)  # TRUE for tokens to retain
  if (!all(keep)) {
    # An all-stop-word phrase correctly collapses to the empty string.
    df[i, "modifiedphrase"] <- paste(tokens[keep], collapse = "_")
    df[i, "modpos"] <- paste(tags[keep], collapse = " ")
  }
}

# Final output: phrases and POS strings with stop-word positions removed
print(df)

  id                            phrase         pos    modifiedphrase modpos
1  1              choice_for_selection    NN JJ NN  choice_selection  NN NN
2  2 accordingly_choices_for_selection NN JJ NN NN choices_selection  JJ NN
3  3                only_top_selection  NNS NN NNS     top_selection NN NNS

> 

1 个答案:

答案 0 :(得分:1)

这是一个矢量化版本:

# stop words list (SMART lexicon from the tm package)
library(tm)
SWList <- stopwords(kind = "SMART")

# Split each phrase/POS string once, build one keep-mask per row, and apply
# it to both columns so tokens and tags stay positionally aligned.
# Fixes vs. the original answer:
#  * `df$modified` only worked through `$` partial matching (it happened to
#    resolve to `modifiedphrase`, a copy of `phrase`); read `df$phrase`
#    explicitly instead.
#  * setdiff() de-duplicates, so a repeated non-stop-word token (e.g.
#    "top_choice_top") would silently lose its duplicate; a positional
#    mask preserves duplicates.
#  * apply() on a data.frame coerces rows through a character matrix;
#    strsplit()/mapply() over the columns avoids that.
tok_list <- strsplit(df$phrase, "_", fixed = TRUE)
pos_list <- strsplit(df$pos, " ", fixed = TRUE)
keep_list <- lapply(tok_list, function(tok) !(tok %in% SWList))

df$modifiedphrase <- mapply(function(tok, keep) paste(tok[keep], collapse = "_"),
                            tok_list, keep_list)
df$modpos <- mapply(function(tag, keep) paste(tag[keep], collapse = " "),
                    pos_list, keep_list)

我知道这只是假数据,但您可能还想考虑删除停用词中的撇号:

# Strip apostrophes from the stop-word list (e.g. "don't" -> "dont")
SWList <- gsub("'", "", SWList, fixed = TRUE)

**更新**

效率检查:

(1)设置数据功能:我们可以在每次效率检查之前设置数据。

# Build the small reproducible test data set used by the benchmarks below.
#
# Returns a data.frame with character columns `phrase` and `pos`, plus
# working copies `modifiedphrase` / `modpos` that the benchmarked methods
# overwrite. Constructed directly instead of via as.data.frame(cbind(...)),
# which pushed every column through a character matrix (coercing `id` too).
setup_data <- function() {
  phrase <- c("choice_for_selection", "accordingly_choices_for_selection",
              "only_top_selection")
  pos <- c("NN JJ NN", "NN JJ NN NN", "NNS NN NNS") # fake part of speech
  data.frame(
    id = c(1, 2, 3),
    phrase = phrase,
    pos = pos,
    modifiedphrase = phrase, # working copy of phrase
    modpos = pos,            # working copy of pos
    stringsAsFactors = FALSE
  )
}

(2)原始的For循环方法:

# Original row-by-row approach, kept for benchmarking.
#
# Reads `df` and `SWList` from the enclosing environment (as the original
# did). For each row, drops '_'-separated tokens found in SWList together
# with the space-separated POS tag at the same position.
# Fixes vs. the original: seq_len() instead of 1:length(df[,1]); the inner
# append()/repeated-paste() loop is replaced by one %in% mask; the function
# now returns the modified data.frame (the original returned NULL, so its
# work was unobservable).
forloop_method <- function() {
  for (i in seq_len(nrow(df))) {
    tokens <- strsplit(df[i, "phrase"], "_", fixed = TRUE)[[1]]
    tags <- strsplit(df[i, "pos"], " ", fixed = TRUE)[[1]]
    keep <- !(tokens %in% SWList)
    if (!all(keep)) {
      df[i, "modifiedphrase"] <- paste(tokens[keep], collapse = "_")
      df[i, "modpos"] <- paste(tags[keep], collapse = " ")
    }
  }
  df
}

(3)申请方法:

# Vectorized replacement for forloop_method(), kept for benchmarking.
#
# Reads `df` and `SWList` from the enclosing environment and returns the
# data.frame with stop words and their aligned POS tags removed.
# Fixes vs. the original answer: `df$modified` worked only through `$`
# partial matching (resolving to `modifiedphrase` by accident), and
# setdiff() silently de-duplicated repeated tokens; one positional
# keep-mask per row handles both columns correctly.
apply_method <- function() {
  tok_list <- strsplit(df$phrase, "_", fixed = TRUE)
  pos_list <- strsplit(df$pos, " ", fixed = TRUE)
  keep_list <- lapply(tok_list, function(tok) !(tok %in% SWList))
  df$modifiedphrase <- mapply(function(tok, keep) paste(tok[keep], collapse = "_"),
                              tok_list, keep_list)
  df$modpos <- mapply(function(tag, keep) paste(tag[keep], collapse = " "),
                      pos_list, keep_list)
  df
}

(4)使用 'microbenchmark' 包以微秒为单位测量效率:

# Time the row-by-row method on freshly initialized data
library(microbenchmark)
df <- setup_data()
microbenchmark(forloop_method(), unit = "us")

Unit: microseconds
           expr     min       lq     mean  median      uq      max neval
 forloop_method 884.229 965.2805 1050.775 992.224 1032.69 2680.374   100

df <- setup_data()
# NOTE: the call parentheses are required. The original passed the bare
# symbol `apply_method`, so microbenchmark timed only the name lookup
# (~0.5 us), not the function's actual work -- making the comparison with
# forloop_method() meaningless.
microbenchmark(apply_method(), unit = "us")

Unit: microseconds
         expr   min    lq    mean median    uq    max neval
 apply_method 0.018 0.025 0.49948  0.026 0.027 45.379   100

1050.775 / 0.49948 ≈ 在我的系统上加速2103.7倍。(注意:上面第(4)步中 `microbenchmark(apply_method, unit='us')` 缺少调用括号,测得的只是符号查找时间而不是函数的实际运行时间;应写成 `apply_method()` 后重新比较,实际加速倍数会小得多。)