下面是使用模糊匹配识别重复项的代码。原始的 for 循环已被注释掉，并替换为 foreach 循环。运行后既没有任何输出，也没有任何错误消息。
library(foreach)
library(parallel)
library(doParallel)
# Start a local cluster with two worker processes and register it as the
# backend used by foreach's %dopar% operator.
cl <- parallel::makeCluster(2)
doParallel::registerDoParallel(cl)
# Three-row sample data set: first and last name stored as factors, plus a
# numeric person identifier. Rows 1 and 2 share last name and identifier.
sample_data <- structure(
  list(
    idnty_frst_nm = factor(c("Ima", "I", "L"), levels = c("I", "Ima", "L")),
    idnty_last_nm = factor(
      c("NINJA", "NINJA", "starr"),
      levels = c("NINJA", "starr")
    ),
    PERSON_IDENTIFIER_VALUE = c(9021099834, 9021099834, 8021088834)
  ),
  names = c("idnty_frst_nm", "idnty_last_nm", "PERSON_IDENTIFIER_VALUE"),
  row.names = c(NA, 3L),
  class = "data.frame"
)

# Full name: first and last name joined by a single space.
sample_data[["name"]] <- paste(
  sample_data[["idnty_frst_nm"]],
  sample_data[["idnty_last_nm"]]
)

# Working copy whose row names are reset to 1..n.
innov.df <- sample_data
rownames(innov.df) <- seq_len(nrow(innov.df))
# Find duplicate candidates: every pair of distinct rows that share the same
# last name (column 2) and the same person identifier (column 3). The result
# `p` has one row per pair, with the smaller row index in `a` and the larger
# one in `b`.
#
# Why the earlier version printed an empty table: the body of a %dopar% loop
# is evaluated on the worker processes, so assignments to `p` and `k` made
# inside it never reach the master session, and the value returned by
# foreach() was thrown away. Each iteration now *returns* its result and
# `.combine = rbind` stacks the returned rows into a single data frame.
n_rows <- nrow(innov.df)
p <- foreach(i = seq_len(n_rows), .combine = rbind) %:% # nesting operator
  foreach(j = seq_len(n_rows), .combine = rbind) %dopar% {
    # NOTE(review): the agrep() result is computed but, as in the original
    # code, not used by the duplicate test below. If fuzzy name matching is
    # meant to gate the result, add `length(...) > 0` to the condition.
    agrep(innov.df$name[i], innov.df$name[j],
      ignore.case = TRUE, value = FALSE,
      max.distance = 0.07, useBytes = TRUE
    )
    if (i != j &&
      innov.df[i, 2] == innov.df[j, 2] &&
      innov.df[i, 3] == innov.df[j, 3]) {
      data.frame(a = min(i, j), b = max(i, j))
    } else {
      # Non-matching pairs contribute nothing; rbind() drops NULL.
      NULL
    }
  }

if (is.null(p)) {
  # No pair matched: keep the documented empty two-column shape.
  p <- data.frame(a = integer(), b = integer())
} else {
  # Each unordered pair is visited twice, as (i, j) and as (j, i); keep one.
  p <- unique(p)
  rownames(p) <- NULL
}

# Release the worker processes now that the parallel work is finished.
stopCluster(cl)
p
有人知道我哪里做错了吗？预期的输出是：
a b
1 1 2
表示第一项与第二项重复。