需要大大提高R

时间:2018-07-01 16:00:21

标签: r

我尝试了一些事情,但没有做出任何明显的更改。我将其隔离到for循环中,希望有人可以给我一些想法。

快速背景:流程处于SaS中,SaS即将消失。我们采用两个数据集,通过zip合并,然后比较带有Jaro-Winkler的字符串,以识别两个实体在两个集合中可能是相同的。例如。邮编是一样的吗?是的,移至下一个条件,X是否相同?不,但是它是 close ,它会继续抛出不需要的内容,将匹配项放入新的数据框中。 这部分内容完全不需要花费时间。

我们不能直接合并,因为我们在EC2实例上使用了所有128g的RAM。

因此,逐行循环并仅保留那些符合我们的匹配条件的循环,但是循环速度是一个很大的问题。对于R来说还算是新手,所以我可能会遗漏一些显而易见的东西。

DataSet1

structure(list(PPOppID = c("785041315", "829852094", "854136412", 
"787141118"), BusinessName = c("HAPPY COMPANY", 
"SAD COMPANY", "HORRIBLE COMPANY", "MILDLY UPSET COMPANY"
), StreetName = c("HAPPY TRAIL", "SAD TRAIL", "HORRIBLE TRAIL", 
"MILDLY UPSET TRAIL"), City = c("TOWNA", "TOWNB", "TOWNC", "TOWND"
), State = structure(c(52L, 52L, 52L, 52L), .Label = c("AK", 
"AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", 
"IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", 
"MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", 
"NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX", 
"UT", "VA", "VT", "WA", "WI", "WV", "WY"), class = "factor"), 
    Zip = c("12345", "12345", "12345", "12345"), street_num = c("1435", 
    "110", "105", "875"), street_pre_direction = c("S", "E", 
    "", ""), street_name = c("HAPPY TRAIL", "SAD TRAIL", "HORRIBLE TRAIL", "MILDLY UPSET TRAIL"
    ), suffix = c("RD", "ST", "", ""), streetdirection = c("", 
    "", "", ""), unit = c("", "STE", "", ""), unit = c("", 
    "202", "", ""), po_box = c("", "", "", ""), phone = c("1234567891", 
    "3231234543", "9876543219", "9087653456"), state_code = c("56029", 
    "56021", "56013", "56043"), name = c("", "", "", 
    "PERSONA"), FirstName = c("", "", "", "K"), contact_middle = c("", 
    "", "", ""), LastName = c("", "", "", "LAST_NAME"), contact_title = c("", 
    "", "", "OWNER"), year_started = c("1996", "2005", 
    "1993", "1981"), number_of_employees = c("3", "5", 
    "5", "5"), size_code = c("A", "B", "B", "B"), type = c("C", 
    "C", "C", "C"), industry_class = c("", "", 
    "A", "A"), code= c(NA, NA, "139", "119"
    ), code2 = c(NA, NA, "111100", "111100"), annual_amount = c("289", 
    "378", "0", "306"), annual_amount2 = c("A", "A", 
    "A", "A"), acc002 = c("3", "1", "1", "0"), brc014 = c("0", 
    "0", "0", "0"), t055 = c("1", "0", "1", "0"), t068 = c("2", 
    "2", "1", "1"), loaddate = c("2018-05-25", "2018-05-25", 
    "2018-05-25", "2018-05-25"), DBAName = c("HAPPY COMPANY", 
    "SAD COMPANY", "HORRIBLE COMPANY", "MILDLY UPSET COMPANY"
    ), Phone = c(NA_character_, NA_character_, NA_character_, 
    NA_character_), PhoneAreaCode = c(NA_character_, NA_character_, 
    NA_character_, NA_character_), StreetNum = c("1435", "110", 
    "105", "875")), class = c("data.table", "data.frame"), row.names = c(NA, -4L)

DataSet2

structure(list(rn = c("1", "2", "3", "4"), Id = c("0000000000abcG7MAI", 
"000C000000abcg9MAA", "000QC000000abcG9MAI", "000C000000abcGaMAI"
), NumId = c("187639087", "237893456", "923785629", "298777656"
), BusinessName = c("HAPPY COMPANY", "K&W PHARMACY INC", 
"SCOTTISH INN", "CORY SMITH STUDIOS"), DBABusinessName = c("", 
"", "", ""), Phone = c("123456789", "987654321", "9999999999", 
"6086892577"), PhoneAreaCode = c("999", "123", "456", "678"), 
    FirstName = c("SAM", "KYLE", "TONY", "MIKE"), LastName = c("SAM", 
    "SMITH", "TRAVIS", "JOHNSON"), StreetNum = c("7585", "170", 
    "2457", ""), StreetName = c("Avoderm WAY", "Blue Buffalo U", "Farmina BLVD", 
    "PO BOX 0"), State = c("NJ", "NY", "AK", "PR"), City = c("P", 
    "X", "X", ""), Zip = c("19425", "08765", 
    "37355", "54632")), class = c("data.table", "data.frame"), row.names = c(NA,-4L) 

代码

for(row in 1:length(df1$Zip)) { 


  df1 <- inner_join(df1, df2, by = c('Zip')) 
  df1[] <- lapply(df1, as.character)

  df1$MatchBizName <- pmax(1-stringdist(df1$BusinessName, df1$BusinessName, method="jw", p=0.1),
                           1-stringdist(df1$DBAName, df1$DBAName, method="jw", p=0.1),
                           1-stringdist(df1$BusinessName, df1$DBABusinessName, method="jw", p=0.1),
                           1-stringdist(df1$DBAName, df1$BusinessName, method="jw", p=0.1))
  df1$MatchPhone <- ifelse(1-stringdist(df1$Phone, df1$Phone, method="jw", p=0.1)>=1,1,0)
  df1$MatchFirstName <- 1-stringdist(df1$FirstName, df1$FirstName, method="jw", p=0.1)
  df1$MatchLastName <- 1-stringdist(df1$LastName, df1$LastName, method="jw", p=0.1)
  df1$MatchStreetNum <- 1-stringdist(df1$StreetNum, df1$StreetNum, method="jw", p=0.1)
  df1$MatchStreetName <- 1-stringdist(df1$StreetName, df1$StreetName, method="jw", p=0.1)
  df1$MatchCity <- 1-stringdist(df1$City, df1$City, method="jw", p=0.1)

  df1matches <- subset(df1,(MatchBizName >= 0.9 & MatchCity == 1 & MatchStreetName >= 0.7 & MatchStreetNum >= 0.7) |
                         (MatchPhone == 1 & MatchFirstName == 1 & MatchLastName == 1 & MatchStreetNum == 1 & MatchStreetName == 1 & MatchCity == 1) |
                         (MatchBizName >= 0.9 & MatchStreetNum == 1 & MatchStreetName >= 0.9 & MatchCity == 1) |
                         (MatchStreetNum == 1 & MatchPhone == 1 & MatchStreetName == 1 & MatchCity == 1) |
                         (MatchLastName >= 0.9 & MatchPhone == 1 & MatchBizName >= 0.9 & MatchCity == 1) |
                         (MatchPhone == 1 & MatchCity == 1 & MatchStreetNum == 1 & MatchStreetName >= 0.9 & MatchBizName >= 0.6) )

  rm(df1)
  rm(df1matches)

  }

0 个答案:

没有答案