我需要遍历文本,查找并统计各个词典中特定单词的出现次数。我使用了两层嵌套的 for 循环,但速度非常慢,需要数天才能跑完。下面是可重现的代码:
# Nested-loop approach as posted in the question: for every (tweet, word)
# pair, count the occurrences of the word in the tweet text and report the
# pairs whose count is non-zero.
library(stringr)
# Sample data: three tweets and a three-word dictionary.
tweets=data.frame(id=c(1,2,3),text=c("This is a tweet that contains word1",
"And here you can find word1 and word2 word2",
"And here is only one word3 and one word3a"))
words=data.frame(id=c(1,2,3),word=c("word1","word2","word3"))
# Outer loop: one iteration per tweet row; inner loop: one per dictionary row,
# so the body runs nrow(tweets) * nrow(words) times.
for(i in 1:nrow(tweets)){
for(j in 1:nrow(words)){
# Wrap the word (column 2 of `words`) in \< ... \>, intended as whole-word
# anchors.
# NOTE(review): \< and \> are not ICU regex syntax; with an ICU-based stringr
# they presumably do not act as word boundaries -- confirm, or use \\b.
term = paste("\\<",words[j,2],"\\>", sep="")
# str_count() is evaluated here and again in the two lines below, i.e. up to
# three times for the same (tweet, word) pair.
if (str_count(tweets[i,2], term) != 0) {
# One-row data frame for this pair: tweet id, matched word, match count.
tmp <- data.frame(id=tweets[i,1],termfound=words[j,2],count=str_count(tweets[i,2], term), row.names=NULL)
# Debug output only.
message("ID ",tweets[i,1]," - Word '",words[j,2],"' found ",str_count(tweets[i,2], term)," times")
# Database write, commented out so the example stays reproducible.
#sqlSave(myconn, tmp, "DataTable", append=T, rownames=F)
}
}
}
备注:
我有大约 100 万行文本和约 25,000 个单词。
message 那一行仅用于调试。
最终结果会写入 SQL——该行已被注释掉,因为它无法在示例中重现。
有什么方法可以改进吗?我在考虑使用某个 apply 系列函数?
谢谢! B
答案 0 :(得分:8)
观察:你的代码对每个单词都计数了三次:一次在 if 语句中,一次在 tmp 赋值中,一次在调试消息中。减少对字符串计数函数的调用次数肯定会提高代码的效率。
如上所述,stringi 包提供了一组更快的字符串函数。
以下矢量化代码将生成一个二维矩阵,其中包含您想要的结果,之后可以再转换为数据库所需的格式。
library(stringi)

# Sample data: three tweets and a three-word dictionary.
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "This is a tweet that contains word1",
    "And here you can find word1 and word2 word2",
    "And here is only one word3 and one word3a"
  ),
  stringsAsFactors = FALSE
)
words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# One whole-word regex per dictionary entry.
pat <- paste0("\\b", words$word, "\\b")

# Count every pattern in each tweet with a single vectorised stri_count() call
# per tweet. vapply() stacks the per-tweet count vectors as COLUMNS, so the raw
# result is laid out words x tweets.
counts <- vapply(
  tweets$text,
  function(text) stri_count(text, regex = pat),
  integer(length(pat)),
  USE.NAMES = FALSE
)

# Re-lay the counts as tweets x words: each tweet's counts are contiguous in
# `counts`, so filling by row puts one tweet per row. Labelling the untransposed
# matrix would attach the word names to the tweet axis (and only "works" at all
# when the number of tweets equals the number of words).
results <- matrix(
  counts,
  nrow = nrow(tweets),
  ncol = length(pat),
  byrow = TRUE,
  dimnames = list(paste("ID", tweets$id), words$word)
)
results
# This produces the following output:
##      word1 word2 word3
## ID 1     1     0     0
## ID 2     1     2     0
## ID 3     0     0     1
答案 1 :(得分:5)
更新:您还可以尝试将 data.table 与 stringi 结合使用。
library(data.table)
library(stringi)

## convert tweets to a data.table and set the key on the 'id' column
dtweets <- as.data.table(tweets)
setkey(dtweets, id)

## convert words to a data.table and build one whole-word regex per word
dtw <- as.data.table(words)
dtw[, term := stri_c("\\b", word, "\\b")]

## count every term in every tweet, one group per tweet id.
## The terms are taken explicitly from dtw (not from a global `term`), and the
## matched word is kept next to its count so the rows are self-describing.
## NOTE: assumes each id identifies exactly one tweet, so `text` has length 1
## inside each group.
dtn <- dtweets[
  ,
  .(termfound = dtw$word, count = stri_count_regex(text, dtw$term)),
  by = id
]
#    id termfound count
# 1:  1     word1     1
# 2:  1     word2     0
# 3:  1     word3     0
# 4:  2     word1     1
# 5:  2     word2     2
# 6:  2     word3     0
# 7:  3     word1     0
# 8:  3     word2     0
# 9:  3     word3     1

## rows worth storing in the database: only the terms that actually occur
dtn[count > 0L]
#    id termfound count
# 1:  1     word1     1
# 2:  2     word1     1
# 3:  2     word2     2
# 4:  3     word3     1

## wide view: one row per tweet, one column per word.
## Pivot on the word (termfound), not on the count value.
dcast(dtn, id ~ termfound, value.var = "count")
#    id word1 word2 word3
# 1:  1     1     0     0
# 2:  2     1     2     0
# 3:  3     0     0     1
原始回答
这是另一种方法:只做逻辑匹配,然后根据这些匹配值计算结果。我必须使用 \\b 作为 term 中的单词边界。
library(stringi)

# One whole-word regex per dictionary entry.
term <- stri_c("\\b", words$word, "\\b")

# Count the occurrences of every term in each tweet.
# NOTE: this counts matches directly with stri_count_regex(). Deriving the
# numbers from logical detection flags (cumsum over TRUE/FALSE) yields running
# totals of *which words were present*, not occurrence counts, and only
# coincides with the real counts on this sample data.
# Each call returns one integer per term, hence integer(length(term)).
out <- vapply(
  tweets$text,
  function(tweet) stri_count_regex(tweet, term),
  integer(length(term)),
  USE.NAMES = FALSE
)

# vapply() lays the per-tweet vectors out as columns (words x tweets); refill
# by row so that each tweet becomes one row and each word one named column.
out <- matrix(
  out,
  nrow = nrow(tweets),
  ncol = length(term),
  byrow = TRUE,
  dimnames = list(NULL, words$word)
)

# Attach the tweet ids as the first column.
cbind(tweets[1], out)
#   id word1 word2 word3
# 1  1     1     0     0
# 2  2     1     2     0
# 3  3     0     0     1
答案 2 :(得分:3)
对于这个问题,我和 Daddy the Runner 的想法相同:
# Create a whole-word regex for every dictionary word. \\b is used because
# \\< and \\> are not word-boundary anchors in the ICU regex engine that
# stringr is built on.
term <- paste0("\\b", words$word, "\\b")
# [1] "\\bword1\\b" "\\bword2\\b" "\\bword3\\b"

# Count the occurrences of every word in every tweet.
# Layout: one ROW per word, one COLUMN per tweet (vapply() stacks each tweet's
# count vector as a column). matrix() keeps that shape even for a single word.
m <- matrix(
  vapply(
    tweets$text,
    function(tweet) str_count(tweet, term),
    integer(length(term)),
    USE.NAMES = FALSE
  ),
  nrow = length(term)
)
#      [,1] [,2] [,3]
# [1,]    1    1    0
# [2,]    0    2    0
# [3,]    0    0    1

# Convert the matrix to long format with base R: col(m) is the tweet index and
# row(m) is the word index of each cell (not the other way round).
df <- data.frame(
  id.tweet = as.vector(col(m)),
  id.word = as.vector(row(m)),
  count = as.vector(m)
)
#   id.tweet id.word count
# 1        1       1     1
# 2        1       2     0
# 3        1       3     0
# 4        2       1     1
# 5        2       2     2
# 6        2       3     0
# 7        3       1     0
# 8        3       2     0
# 9        3       3     1

# Create a data frame similar to the one in the question, translating the
# positional indices into the real tweet ids and the matched words.
tmp <- with(
  df,
  data.frame(
    id = tweets$id[id.tweet],
    termfound = words$word[id.word],
    count = count
  )
)
#   id termfound count
# 1  1     word1     1
# 2  1     word2     0
# 3  1     word3     0
# 4  2     word1     1
# 5  2     word2     2
# 6  2     word3     0
# 7  3     word1     0
# 8  3     word2     0
# 9  3     word3     1