我想比较两组地址数据:一组较小(pams,约 30,000 行),另一组较大(nppes,约 450 万行)。我希望先找出名字和姓氏都非常接近的候选记录,然后再根据地址从中选出最佳匹配(地址部分我使用了邮政编码的若干子串)。下面是我目前编写的代码。
问题是: - 它很慢(下面的代码比我上次运行的版本略有优化,但运行了几个小时仍未完成)。 - 它只在姓氏首字母相同的记录之间进行匹配。我加上这个限制只是为了让代码能够跑完,但我更希望不需要这个额外的限制。
如果能就如何改进这段代码给出任何建议,我将不胜感激。
# Normalised copy of the NPPES table used for matching: names are lower-cased
# so that the string comparisons further down are case-insensitive.
# stringsAsFactors = FALSE keeps ids, names and zips as plain vectors on every
# R version (before R 4.0 data.frame() turned strings into factors, and
# combining a factor with c() yields its integer level codes, not the labels).
npi_1 <- data.frame(
  npi = nppes$npi,
  first_name = tolower(nppes$first_name),
  last_name = tolower(nppes$last_name),
  zip = nppes$b_post_code,
  stringsAsFactors = FALSE
)

# Normalised copy of the PAMS table: lower-cased names, and hyphens stripped
# from the zip so it can be compared with the NPPES zip column.
# NOTE(review): this presumes nppes$b_post_code contains no hyphens - confirm.
pams_1 <- data.frame(
  pams_id = pams$ID,
  npi = pams$NPI,
  first_name = tolower(pams$First.Name),
  last_name = tolower(pams$Last.Name),
  zip = gsub("-", "", pams$Zip),
  stringsAsFactors = FALSE
)
# Match every PAMS record to its closest NPPES record.
#
# Reads:  pams_1 (pams_id, first_name, last_name, zip) and
#         npi_1  (npi, first_name, last_name, zip).
# Writes: result, one row per row of pams_1, with columns
#           pams_id - PAMS identifier (character)
#           npi     - npi of the best-scoring candidate (character),
#                     NA when no candidate passes the name filters
#           match   - that candidate's score (numeric, lower is closer)
#
# A candidate must share the first letter of the last name with the PAMS
# record and be within `max_name_dist` on both last and first name. All
# distances come from stringdist(method = "jw").
# NOTE(review): with stringdist's default p = 0 this is the plain Jaro
# distance; pass p > 0 if the Winkler prefix bonus is wanted.

# Largest "jw" distance accepted for the last name and for the first name.
max_name_dist <- 0.1

# Pull the columns out once as plain character vectors: indexing a vector is
# far cheaper than filtering a multi-million-row data frame per iteration, and
# as.character() guards against factor columns.
npi_id <- as.character(npi_1$npi)
npi_last <- as.character(npi_1$last_name)
npi_first <- as.character(npi_1$first_name)
npi_zip <- as.character(npi_1$zip)

# Group the NPPES row numbers by last-name initial ONCE. The grouping does not
# depend on the PAMS record, so rebuilding it inside the loop (a full scan of
# npi_1 per record) was loop-invariant work. Rows with an NA last name are
# dropped by split(), matching how a filter on `==` drops NA comparisons.
npi_rows_by_initial <- split(seq_along(npi_last), substr(npi_last, 1, 1))

# Return list(npi, score) for the best NPPES candidate of one PAMS record, or
# NA values when no candidate survives the name filters.
best_npi_match <- function(last, first, zip) {
  no_match <- list(npi = NA_character_, score = NA_real_)

  # match() is used instead of [[name]] because list indexing by name never
  # matches "" or NA, whereas match() handles an empty initial correctly.
  block <- match(substr(last, 1, 1), names(npi_rows_by_initial))
  if (is.na(block)) {
    return(no_match)
  }
  rows <- npi_rows_by_initial[[block]]

  # Last-name filter; which() also discards NA distances.
  d_last <- stringdist(npi_last[rows], last, method = "jw")
  keep <- which(d_last <= max_name_dist)
  rows <- rows[keep]
  d_last <- d_last[keep]
  if (length(rows) == 0L) {
    return(no_match)
  }

  # First-name filter on the survivors only.
  d_first <- stringdist(npi_first[rows], first, method = "jw")
  keep <- which(d_first <= max_name_dist)
  rows <- rows[keep]
  d_last <- d_last[keep]
  d_first <- d_first[keep]
  if (length(rows) == 0L) {
    return(no_match)
  }

  # Zip similarity at three granularities. The 5-digit comparison now takes
  # five characters from BOTH sides (it previously compared a 3-character
  # NPPES prefix against a 5-character PAMS prefix).
  cand_zip <- npi_zip[rows]
  d_zip3 <- stringdist(substr(cand_zip, 1, 3), substr(zip, 1, 3), method = "jw")
  d_zip5 <- stringdist(substr(cand_zip, 1, 5), substr(zip, 1, 5), method = "jw")
  d_zip <- stringdist(cand_zip, zip, method = "jw")

  score <- ((d_last + d_first) * 1.4) + (d_zip3 * 1.2) + d_zip5 + (d_zip * 0.8)

  # order() sorts NA scores last and keeps ties in row order, so element 1 is
  # the lowest-scoring candidate (or the first candidate if every score is NA).
  best <- order(score)[1L]
  list(npi = npi_id[rows[best]], score = score[best])
}

pams_last <- as.character(pams_1$last_name)
pams_first <- as.character(pams_1$first_name)
pams_zip <- as.character(pams_1$zip)

# Preallocate the outputs instead of growing a data frame one row at a time,
# and iterate over the actual row count rather than a hard-coded 27809.
n_pams <- nrow(pams_1)
matched_npi <- rep(NA_character_, n_pams)
matched_score <- rep(NA_real_, n_pams)

for (i in seq_len(n_pams)) {
  hit <- best_npi_match(pams_last[i], pams_first[i], pams_zip[i])
  matched_npi[i] <- hit$npi
  matched_score[i] <- hit$score
}

result <- data.frame(
  pams_id = as.character(pams_1$pams_id),
  npi = matched_npi,
  match = matched_score,
  stringsAsFactors = FALSE
)