我尝试了一些事情,但没有做出任何明显的更改。我将其隔离到for循环中,希望有人可以给我一些想法。
快速背景:流程处于SaS中,SaS即将消失。我们采用两个数据集,通过zip合并,然后比较带有Jaro-Winkler的字符串,以识别两个实体在两个集合中可能是相同的。例如。邮编是一样的吗?是的,移至下一个条件,X是否相同?不,但是它是 close ,它会继续抛出不需要的内容,将匹配项放入新的数据框中。 这部分内容完全不需要花费时间。
我们不能直接合并,因为我们在EC2实例上使用了所有128g的RAM。
因此,逐行循环并仅保留那些符合我们的匹配条件的循环,但是循环速度是一个很大的问题。对于R来说还算是新手,所以我可能会遗漏一些显而易见的东西。
structure(list(PPOppID = c("785041315", "829852094", "854136412",
"787141118"), BusinessName = c("HAPPY COMPANY",
"SAD COMPANY", "HORRIBLE COMPANY", "MILDLY UPSET COMPANY"
), StreetName = c("HAPPY TRAIL", "SAD TRAIL", "HORRIBLE TRAIL",
"MILDLY UPSET TRAIL"), City = c("TOWNA", "TOWNB", "TOWNC", "TOWND"
), State = structure(c(52L, 52L, 52L, 52L), .Label = c("AK",
"AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI",
"IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI",
"MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV",
"NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX",
"UT", "VA", "VT", "WA", "WI", "WV", "WY"), class = "factor"),
Zip = c("12345", "12345", "12345", "12345"), street_num = c("1435",
"110", "105", "875"), street_pre_direction = c("S", "E",
"", ""), street_name = c("HAPPY TRAIL", "SAD TRAIL", "HORRIBLE TRAIL", "MILDLY UPSET TRAIL"
), suffix = c("RD", "ST", "", ""), streetdirection = c("",
"", "", ""), unit = c("", "STE", "", ""), unit = c("",
"202", "", ""), po_box = c("", "", "", ""), phone = c("1234567891",
"3231234543", "9876543219", "9087653456"), state_code = c("56029",
"56021", "56013", "56043"), name = c("", "", "",
"PERSONA"), FirstName = c("", "", "", "K"), contact_middle = c("",
"", "", ""), LastName = c("", "", "", "LAST_NAME"), contact_title = c("",
"", "", "OWNER"), year_started = c("1996", "2005",
"1993", "1981"), number_of_employees = c("3", "5",
"5", "5"), size_code = c("A", "B", "B", "B"), type = c("C",
"C", "C", "C"), industry_class = c("", "",
"A", "A"), code= c(NA, NA, "139", "119"
), code2 = c(NA, NA, "111100", "111100"), annual_amount = c("289",
"378", "0", "306"), annual_amount2 = c("A", "A",
"A", "A"), acc002 = c("3", "1", "1", "0"), brc014 = c("0",
"0", "0", "0"), t055 = c("1", "0", "1", "0"), t068 = c("2",
"2", "1", "1"), loaddate = c("2018-05-25", "2018-05-25",
"2018-05-25", "2018-05-25"), DBAName = c("HAPPY COMPANY",
"SAD COMPANY", "HORRIBLE COMPANY", "MILDLY UPSET COMPANY"
), Phone = c(NA_character_, NA_character_, NA_character_,
NA_character_), PhoneAreaCode = c(NA_character_, NA_character_,
NA_character_, NA_character_), StreetNum = c("1435", "110",
"105", "875")), class = c("data.table", "data.frame"), row.names = c(NA, -4L)
structure(list(rn = c("1", "2", "3", "4"), Id = c("0000000000abcG7MAI",
"000C000000abcg9MAA", "000QC000000abcG9MAI", "000C000000abcGaMAI"
), NumId = c("187639087", "237893456", "923785629", "298777656"
), BusinessName = c("HAPPY COMPANY", "K&W PHARMACY INC",
"SCOTTISH INN", "CORY SMITH STUDIOS"), DBABusinessName = c("",
"", "", ""), Phone = c("123456789", "987654321", "9999999999",
"6086892577"), PhoneAreaCode = c("999", "123", "456", "678"),
FirstName = c("SAM", "KYLE", "TONY", "MIKE"), LastName = c("SAM",
"SMITH", "TRAVIS", "JOHNSON"), StreetNum = c("7585", "170",
"2457", ""), StreetName = c("Avoderm WAY", "Blue Buffalo U", "Farmina BLVD",
"PO BOX 0"), State = c("NJ", "NY", "AK", "PR"), City = c("P",
"X", "X", ""), Zip = c("19425", "08765",
"37355", "54632")), class = c("data.table", "data.frame"), row.names = c(NA,-4L)
for(row in 1:length(df1$Zip)) {
df1 <- inner_join(df1, df2, by = c('Zip'))
df1[] <- lapply(df1, as.character)
df1$MatchBizName <- pmax(1-stringdist(df1$BusinessName, df1$BusinessName, method="jw", p=0.1),
1-stringdist(df1$DBAName, df1$DBAName, method="jw", p=0.1),
1-stringdist(df1$BusinessName, df1$DBABusinessName, method="jw", p=0.1),
1-stringdist(df1$DBAName, df1$BusinessName, method="jw", p=0.1))
df1$MatchPhone <- ifelse(1-stringdist(df1$Phone, df1$Phone, method="jw", p=0.1)>=1,1,0)
df1$MatchFirstName <- 1-stringdist(df1$FirstName, df1$FirstName, method="jw", p=0.1)
df1$MatchLastName <- 1-stringdist(df1$LastName, df1$LastName, method="jw", p=0.1)
df1$MatchStreetNum <- 1-stringdist(df1$StreetNum, df1$StreetNum, method="jw", p=0.1)
df1$MatchStreetName <- 1-stringdist(df1$StreetName, df1$StreetName, method="jw", p=0.1)
df1$MatchCity <- 1-stringdist(df1$City, df1$City, method="jw", p=0.1)
df1matches <- subset(df1,(MatchBizName >= 0.9 & MatchCity == 1 & MatchStreetName >= 0.7 & MatchStreetNum >= 0.7) |
(MatchPhone == 1 & MatchFirstName == 1 & MatchLastName == 1 & MatchStreetNum == 1 & MatchStreetName == 1 & MatchCity == 1) |
(MatchBizName >= 0.9 & MatchStreetNum == 1 & MatchStreetName >= 0.9 & MatchCity == 1) |
(MatchStreetNum == 1 & MatchPhone == 1 & MatchStreetName == 1 & MatchCity == 1) |
(MatchLastName >= 0.9 & MatchPhone == 1 & MatchBizName >= 0.9 & MatchCity == 1) |
(MatchPhone == 1 & MatchCity == 1 & MatchStreetNum == 1 & MatchStreetName >= 0.9 & MatchBizName >= 0.6) )
rm(df1)
rm(df1matches)
}