两个数据集的行的模糊匹配,不使用for循环

时间:2016-07-05 11:27:04

标签: r matching sapply fuzzy record-linkage

我有两个数据集A和B,每个数据集有8个coloumns。数据集A有942行,数据集B有5079行。我必须比较数据集A和数据集B并进行模糊匹配。如果在数据集B中匹配任何行,我必须标记"匹配"在附加列中的数据集A中。

我对R来说相对较新,并且不确定如何使用lapply,mapply或sapply而不是forloop来优化r代码。

以下是我的代码

##############################
# Install Necessary Packages #
##############################


#install.packages("openxlsx")
#install.packages("stringdist")
#install.packages("XLConnect")


##############################
#        Load Packages       #
##############################


library(openxlsx)
library(stringdist)
library(XLConnect)


cmd_newleads <- read.xlsx("Src/CMD - New Leads to Load.xlsx", sheet = "Top Leads Full Data", startRow = 1, colNames = TRUE)
cmd_newleads[c("Lead_Match","Opportunity_Match")] <- ""
c4c_leads <- read.xlsx("Src/C4C - Leads.xlsx", sheet = "Leads", startRow = 1, colNames = TRUE)
#c4c_opportunities <- read.xlsx("Src/C4C - Opportunities Data 6-24-16.xlsx", sheet = "Export 06-24-2016 04.55.46 PM", startRow = 1, colNames = TRUE)


cmd_newleads_selcols <- cmd_newleads[,c("project_name","project_address","project_city","project_state_province_region_code","project_postalcode","project_country","project_sector","project_type")]
cmd_newleads_selcols[is.na(cmd_newleads_selcols)] <- ""
#rownames(cmd_newleads_selcols)

c4cleads_selcols <- c4c_leads[,c("Lead","Address1.(Lead)","City.(Lead)","Region.(Lead)","Postal.Code.(Lead)","Country.(Lead)","Sector.(Lead)","Type.(Lead)")]
c4cleads_selcols[is.na(c4cleads_selcols)] <- ""
#cmd_c4copportunities_selcols <- c4c_opportunities[,c("project_name","project_address","project_city","project_state_province_region_code","project_postalcode","project_country","project_sector","project_type")]

rcount_cmdnewleads <- nrow(cmd_newleads)
rcount_c4cleads <- nrow(c4c_leads)
#rcount_c4copportunities <- nrow(c4c_opportunities)




for(i in 1:rcount_cmdnewleads)
{

    cmd_project_name <- cmd_newleads_selcols[i,1]
    cmd_project_address <- cmd_newleads_selcols[i,2]
    cmd_project_city <- cmd_newleads_selcols[i,3]
    cmd_project_region_code <- cmd_newleads_selcols[i,4]
    cmd_project_postalcode <- cmd_newleads_selcols[i,5]
    cmd_project_country <- cmd_newleads_selcols[i,6]
    cmd_project_sector <- cmd_newleads_selcols[i,7]
    cmd_project_type <- cmd_newleads_selcols[i,8]


    for(j in 1:rcount_c4cleads)
    {

      c4cleads_project_name <- c4cleads_selcols[j,1]
      c4cleads_project_address <- c4cleads_selcols[j,2]
      c4cleads_project_city <- c4cleads_selcols[j,3]
      c4cleads_project_region_code <- c4cleads_selcols[j,4]
      c4cleads_project_postalcode <- c4cleads_selcols[j,5]
      c4cleads_project_country <- c4cleads_selcols[j,6]
      c4cleads_project_sector <- c4cleads_selcols[j,7]
      c4cleads_project_type <- c4cleads_selcols[j,8]

      project_percent <- stringsim(cmd_project_name,c4cleads_project_name, method="dl", p=0.1)
      address_percent <- stringsim(cmd_project_address,c4cleads_project_address, method="dl", p=0.1)
      city_percent <- stringsim(cmd_project_city,c4cleads_project_city, method="dl", p=0.1)
      region_percent <- stringsim(cmd_project_region_code,c4cleads_project_region_code, method="dl", p=0.1)
      postalcode_percent <- stringsim(cmd_project_postalcode,c4cleads_project_postalcode, method="dl", p=0.1)
      country_percent <- stringsim(cmd_project_country,c4cleads_project_country, method="dl", p=0.1)
      sector_percent <- stringsim(cmd_project_sector,c4cleads_project_sector, method="dl", p=0.1)
      type_percent <- stringsim(cmd_project_type,c4cleads_project_type, method="dl", p=0.1)

      if(project_percent > 0.833 && address_percent > 0.833 && city_percent > 0.833 && region_percent > 0.833 && postalcode_percent > 0.833 && country_percent > 0.833 && sector_percent > 0.833 && type_percent > 0.833)
      {
        cmd_newleads[i,51] <- c4cleads[j,c4cleads$Lead.ID]
      }
      else 
      {
        cmd_newleads[i,51] <- "New Lead"
      }

    }

}

分别为cmd_newleads_selcols和c4cleads_selcols的示例数据

 project_name project_address project_city
1     Wynn Mystic Casino & Hotel  22 Chemical Ln      Everett
2 Northpoint Complex Development     East Street    Cambridge
3 Northpoint Complex Development     East Street    Cambridge
4 Northpoint Complex Development     East Street    Cambridge
5 Northpoint Complex Development     East Street    Cambridge
6 Northpoint Complex Development     East Street    Cambridge
  project_state_province_region_code project_postalcode
1                                 MA              02149
2                                 MA              02138
3                                 MA              02138
4                                 MA              02138
5                                 MA              02138
6                                 MA              02138
           project_country project_sector project_type
1 United States of America    Hospitality New Building
2 United States of America     Apartments New Building
3 United States of America     Apartments New Building
4 United States of America     Apartments New Building
5 United States of America     Apartments New Building
6 United States of America     Apartments New Building







Lead           Address1.(Lead) City.(Lead) Region.(Lead) Postal.Code.(Lead) Country.(Lead)
1 1 Hotel Brooklyn Bridge Park Old Fulton St & Furman St    Brooklyn      New York              11201  United States
2      10 Trinity Square Hotel         10 Trinity Square      London             #               EC3P United Kingdom
3                  100 Stewart           1900 1st Avenue     Seattle    Washington              98101  United States
4                1136 S Wabash                         #           #             #                  #   Not assigned
5          115-129 37th Street       115-129 37th Street  Union CIty    New Jersey                  #  United States
6               1418 W Addison            1418 w Addison     Chicago             #              60613   Not assigned
          Sector.(Lead)      Type.(Lead)
1           Hospitality     New Building
2           Hospitality Brand Conversion
3           Hospitality     New Building
4 High Rise Residential     New Building
5             Developer     New Building
6 High Rise Residential     New Building

2 个答案:

答案 0 :(得分:1)

如果您遇到效率问题,那不是因为您使用的是for循环。主要问题是,您正在为两个数据集中的每个可能的行组合做很多工作。使用更有效的语言功能可能会加快速度,但它不会改变您进行大量不必要计算的事实。

提高数据匹配问题效率的最佳方法之一是排除明显的不匹配以减少不必要的计算。例如,您可以更改内部循环以首先检查一些关键条件;如果分数很低(即它显然不匹配),您不需要计算其余属性的相似性分数。

例如:

for(i in 1:rcount_cmdnewleads)
{

    cmd_project_name <- cmd_newleads_selcols[i,1]
    ...

for(j in 1:rcount_c4cleads)
  {

  c4cleads_project_name <- c4cleads_selcols[j,1]
  project_percent <- stringsim(cmd_project_name,c4cleads_project_name, method="dl", p=0.1)
  if (project_percent < .83) {

      # you already know that this is a non-match, so go to the next one
      next

    } else {

      # check the rest of the values!
      ...

    }
  }
}

我不熟悉R RecordLinkage软件包,但Python recordlinkage软件包具有在过程早期排除明显不匹配以提高效率的工具。考虑通过排除明显的不匹配来查看this tutorial以了解有关加快记录链接的更多信息。

答案 1 :(得分:0)

您可能希望查看包RecordLinkage,它允许您执行语音匹配,概率记录链接和机器学习方法。