我正在寻求一些建议,希望以最佳方式重新实现下面这个最小可运行示例(MWE)的目标:用 agrep 把一个列表中的每个元素与另一个列表中的每个元素逐一进行模糊匹配。这个例子的规模只有 2x2,但我的实际问题是 2,500x75,000,所以任何有关并行化的提示都会很有帮助。
# Fuzzy-match every search word against every text and record, for each text,
# the words that matched as a ";"-terminated list in `texts$match`.
text <- c(
  "The quack brown fox jumps over a lazy dog.",
  "Pack my box with five dozzen liquor jugs."
)
texts <- data.frame(text, stringsAsFactors = FALSE)
words <- c("quick", "dozen")
search <- data.frame(words, stringsAsFactors = FALSE)

# Start from an empty string so matched words can be appended one at a time.
texts$match <- ""

# seq_len() yields an empty sequence when `search` has no rows, whereas
# 1:nrow(search) would iterate over c(1, 0).
for (i in seq_len(nrow(search))) {
  print(i) # progress: index of the search word being processed
  # agrepl() is vectorised over its second argument, so one call tests the
  # current word against every text; no inner loop over rows is needed.
  hit <- agrepl(
    search$words[i], texts$text,
    max.distance = 0.1, costs = NULL,
    ignore.case = TRUE, fixed = TRUE, useBytes = FALSE
  )
  # Logical indexing appends the word only to the rows that matched; when no
  # row matched the assignment is a no-op.
  texts$match[hit] <- paste0(texts$match[hit], search$words[i], ";")
}
texts
text match
1 The quack brown fox jumps over a lazy dog. quick;
2 Pack my box with five dozzen liquor jugs. dozen;
答案 0 :(得分:0)
你至少可以把整个文本向量作为第二个参数传给 agrepl(agrep 的返回逻辑值的版本),从而去掉内层的第二个循环。
# Fuzzy-match each search word against all texts in a single agrepl() call
# and collect, for each text, the matching words in `texts$matchs`.
text <- c(
  "The quack brown fox jumps over a lazy dog.",
  "Pack my box with five dozzen liquor jugs."
)
texts <- data.frame(text, stringsAsFactors = FALSE)
words <- c("quick", "dozen")
search <- data.frame(words, stringsAsFactors = FALSE)

# Start from an empty string so matched words can be appended one at a time.
texts$matchs <- ""

# seq_len() is safe when `search` has zero rows (1:0 would run twice).
for (i in seq_len(nrow(search))) {
  print(i) # progress: index of the search word being processed
  # One logical flag per text: TRUE where the word approximately matches.
  matched <- agrepl(
    search$words[i], texts$text,
    max.distance = 0.1, costs = NULL,
    ignore.case = TRUE, fixed = TRUE, useBytes = FALSE
  )
  # any() states the intent directly; max() on a zero-length vector warns.
  if (any(matched)) {
    # Append "<word>;" rather than paste(..., sep = ";"): pasting onto the
    # initial "" would produce a leading separator (";quick"), which differs
    # from the "quick;" format of the nested-loop version this replaces.
    texts$matchs[matched] <- paste0(
      texts$matchs[matched], search$words[i], ";"
    )
  }
}