Question

我正在遍历文本，查找并统计各个词典中特定单词的出现次数。我使用了两层 for 循环，速度非常慢，需要数天才能跑完。可重现的代码如下：

library(stringr)

# Sample data: three example tweets and the three dictionary words to search for
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "This is a tweet that contains word1",
    "And here you can find word1 and word2 word2",
    "And here is only one word3 and one word3a"
  )
)

words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3")
)

# Brute-force search: every word (inner loop) is tested against every tweet
# (outer loop), i.e. nrow(tweets) * nrow(words) iterations in total.
for(i in 1:nrow(tweets)){
  for(j in 1:nrow(words)){
    # Wrap the word in \< and \> -- presumably intended as word-boundary
    # anchors. NOTE(review): verify that the regex engine behind str_count()
    # supports them.
    term = paste("\\<",words[j,2],"\\>", sep="")
    # NOTE: str_count() is evaluated up to three times for the same tweet/word
    # pair: in this condition, in the data.frame() call and in message().
    if (str_count(tweets[i,2], term) != 0) {
     # One-row result (tweet id, word, count). It is overwritten on every match
     # and only consumed by the commented-out sqlSave() call below.
     tmp <- data.frame(id=tweets[i,1],termfound=words[j,2],count=str_count(tweets[i,2], term), row.names=NULL)
     # Progress/debug output for each match.
     message("ID ",tweets[i,1]," - Word '",words[j,2],"' found ",str_count(tweets[i,2], term)," times")
     #sqlSave(myconn, tmp, "DataTable", append=T, rownames=F)
    }
  }
}

备注：
我有大约 100 万行文本和约 25,000 个单词。message 那一行仅用于调试；最终结果会写入 SQL——该行已被注释掉，因为它不可重现。

有什么办法可以改进吗？我在考虑是否可以使用 apply 系列函数？

谢谢！——B

Answer 1

观察：你的代码对每个单词统计了三次：一次在 if 语句中，一次在 tmp 赋值中，一次在调试消息中。减少对字符串计数函数的调用次数肯定会提高代码的效率。

如上所述，stringi包提供了一组更快的字符串函数。

以下向量化代码会生成一个包含所需结果的二维矩阵，之后您可以将其转换为数据库所需的格式。

library(stringi)

# Sample data. stringsAsFactors = FALSE keeps the text columns as character
# vectors on R < 4.0 as well.
tweets <- data.frame(id = c(1, 2, 3),
                     text = c("This is a tweet that contains word1",
                              "And here you can find word1 and word2 word2",
                              "And here is only one word3 and one word3a"),
                     stringsAsFactors = FALSE)
words <- data.frame(id = c(1, 2, 3), word = c("word1", "word2", "word3"),
                    stringsAsFactors = FALSE)

# One regex per dictionary word. \\b anchors the match on word boundaries, so
# "word3" is not counted inside "word3a".
pat <- paste0("\\b", words$word, "\\b")

# Count the occurrences of every pattern in a single tweet.
# Returns an integer vector with one element per entry of `pat`.
# (Deliberately not called `sd`, which would mask stats::sd.)
count_terms <- function(text) {
  stri_count(text, regex = pat)
}

# vapply() delivers the counts tweet by tweet (one column per tweet). The
# values are therefore laid out row-wise to obtain one row per tweet and one
# column per word; labelling the untransposed matrix would swap tweets and
# words and fail outright whenever their numbers differ.
counts <- vapply(tweets$text, count_terms, integer(length(pat)),
                 USE.NAMES = FALSE)
results <- matrix(counts, nrow = nrow(tweets), ncol = length(pat),
                  byrow = TRUE,
                  dimnames = list(paste("ID", tweets$id),
                                  as.character(words$word)))
results

产生以下输出：

##      word1 word2 word3
## ID 1     1     0     0
## ID 2     1     2     0
## ID 3     0     0     1

Answer 2

更新：您还可以尝试将 data.table 与 stringi 结合使用。

library(data.table); library(stringi)

## convert tweets to a data.table and set the key on the 'id' column
dtweets <- as.data.table(tweets)
setkey(dtweets, id)

## convert words to a data.table and set up the regex
dtw <- as.data.table(words)
dtw[, term := stri_c("\\b", word, "\\b")]

## count every term in every tweet: one group per tweet id, one row per word.
## No join is used: joining on 'id' would pair tweet i only with word i.
## Assumes 'id' is unique per tweet, so 'text' is a single string per group.
dtn <- dtweets[, .(word = dtw$word, V1 = stri_count_regex(text, dtw$term)),
               by = key(dtweets)]
#    id  word V1
# 1:  1 word1  1
# 2:  1 word2  0
# 3:  1 word3  0
# 4:  2 word1  1
# 5:  2 word2  2
# 6:  2 word3  0
# 7:  3 word1  0
# 8:  3 word2  0
# 9:  3 word3  1

## melt to long format (keeping 'word' as an id column), then cast to one
## column per word; casting on the count value itself would only produce
## columns named after the distinct counts
melted <- melt(dtn, id.vars = c("id", "word"), measure.vars = "V1")
dcast(melted, id ~ word, fun.aggregate = sum, value.var = "value")
# expected values (exact print layout depends on the data.table version):
#    id word1 word2 word3
# 1:  1     1     0     0
# 2:  2     1     2     0
# 3:  3     0     0     1

原始回答

这是另一种方法：先只做逻辑匹配，再根据匹配结果计算计数。我必须在 term 中使用 \\b 作为单词边界。

library(stringi)

# One regex per dictionary word, anchored on word boundaries.
term <- stri_c("\\b", words$word, "\\b")

# For each tweet: first detect which terms occur at all, then count only
# those. A cumulative sum over the detected flags is not an occurrence count,
# so the count is taken with stri_count_regex() on the matching terms.
# FUN.VALUE has one slot per term (not per tweet).
out <- vapply(seq_along(tweets$text), function(i) {
  hit <- stri_detect_regex(tweets$text[i], term)
  counts <- integer(length(term))
  counts[hit] <- stri_count_regex(tweets$text[i], term[hit])
  counts
}, integer(length(term)))

# vapply() returns the counts tweet by tweet (one column per tweet); lay them
# out row-wise so that each row is a tweet and each column a word.
out <- matrix(out, nrow = nrow(tweets), ncol = length(term), byrow = TRUE,
              dimnames = list(NULL, as.character(words$word)))

cbind(tweets[1], out)
#   id word1 word2 word3
# 1  1     1     0     0
# 2  2     1     2     0
# 3  3     0     0     1

Answer 3

我和Daddy the Runner一样对这个问题有同样的想法：

# Create a regex for every word. stringr uses the ICU regex engine, where \\b
# is the word-boundary anchor; \\< and \\> are not word anchors there.
term <- paste0("\\b", words$word, "\\b")
# [1] "\\bword1\\b" "\\bword2\\b" "\\bword3\\b"

# Count the occurrences of every word in every tweet.
# Rows are words, columns are tweets. USE.NAMES = FALSE keeps the tweet texts
# out of the dimnames so that melt() below yields plain integer indices.
m <- vapply(tweets$text, function(tweet) str_count(tweet, term),
            integer(length(term)), USE.NAMES = FALSE)
m <- matrix(m, nrow = length(term)) # stay a matrix even with a single word
#      [,1] [,2] [,3]
# [1,]    1    1    0
# [2,]    0    2    0
# [3,]    0    0    1



library(reshape)
df <- melt(m) # convert the result into the data frame format
#   X1 X2 value
# 1  1  1     1
# 2  2  1     0
# 3  3  1     0
# 4  1  2     1
# 5  2  2     2
# 6  3  2     0
# 7  1  3     0
# 8  2  3     0
# 9  3  3     1

# X1 is the row index of m (the word), X2 the column index (the tweet).
colnames(df) <- c('id.word','id.tweet','count')

# Create a data frame similar to the one in the example, looking up the real
# tweet id and the word from their positions.
tmp <- with(df, data.frame(id = tweets$id[id.tweet],
                           termfound = words$word[id.word],
                           count = count))
#   id termfound count
# 1  1     word1     1
# 2  1     word2     0
# 3  1     word3     0
# 4  2     word1     1
# 5  2     word2     2
# 6  2     word3     0
# 7  3     word1     0
# 8  3     word2     0
# 9  3     word3     1

使用R从文本字符串中提取字向量数组的更快的解决方案

3 个答案: