Question

在上一个问题（R function to failback in a left_join?）的基础上，我有24个不同的数据表，每个数据表都使用称为NAICS的行业分类系统，我希望在每个表中找到给定行业列表的最佳行业匹配。

行业代码越短，分类就越粗略。因此，如果没有完全匹配的代码，我希望退而匹配目标代码的较短版本（即去掉末尾若干位后的前缀）。例如，使用分类代码311111作为目标：

一个表可能具有完全匹配的内容：311111
一个表的详细程度可能要低一级：31111
一个表可能只包含不太详细的匹配项：31

当前方法（有关代码，请参见下文）： 遍历所有表，然后遍历每个代码长度（311111、31111、3111、311、31、3），然后尝试在该表中找到匹配项。

我的问题：

如何调整代码，使得同一个行业代码在表中匹配到多行时不再报错（如Supplied 261022 items to be assigned to 360 items of column 'match'）？部分数据是时间序列数据，同一个行业代码会对应100个或更多的观测值；另一些数据是横截面数据，每个行业代码只出现一次。

以下是完整的上下文代码，但问题只涉及步骤4：

library(data.table)

# Step 1: Load Table Data -------------------------------------------------
# Every entry is both the name of the variable to create and the stem of the
# .rds file it is read from.
v_tablenames <- c(
  "t_naics17index", "t_naics17def", "t_naics17cross", "t_naics17tree",
  "t_naics17isic4cross", "t_ios_2012", "t_iou_2012", "t_regdata6dig_2017",
  "t_brdis_2015", "t_mrkcon_2012", "t_matkind_2012", "t_ppiprice",
  "t_eximprice", "t_oes", "t_ces", "t_cps", "t_fed", "t_asm", "t_vps",
  "t_cbp", "t_exports", "t_imports", "t_expartner", "t_impartner"
)

# Read DataStore/<name>.rds and bind the result to a variable called <name>.
for (tablename in v_tablenames) {
  assign(
    x = tablename,
    value = readRDS(file = paste0("DataStore/", tablename, ".rds"))
  )
}

# Step 2: Turn all of the tibbles into data.tables ------------------------
# Data wrangling done in the tidyverse; tibbles converted to data.tables.
# The list is built from v_tablenames, so the set of tables is spelled out in
# one place only, and l_tables is named by table.
l_tables <- lapply(mget(v_tablenames, envir = environment()), setDT)

# Rebind every table name to its converted table, so that a later
# get(tablename) is certain to return the data.table.
# NOTE(review): relying on setDT() applied to a list element to also convert
# the variable that element came from looks fragile; the explicit rebinding
# removes that dependency. invisible() suppresses top-level printing.
invisible(list2env(l_tables, envir = environment()))

# Step 3: Build Master Lookup Table ---------------------------------------
# Subset of classification codes I care about falls between 3----- and 4-----;
# pulled from t_naics17index, which has a complete list of codes.
# The range test compares NAICS17 against string bounds.
# The column is selected by name rather than by position, because the later
# steps refer to it as t_match$NAICS17.
t_match <- unique(
  t_naics17index[NAICS17 >= "300000" & NAICS17 < "400000", .(NAICS17)]
)

# Step 4: Connect Data Tables ---------------------------------------------
# For every table, record in t_match the best-matching NAICS17 code found in
# that table: the full code is tried first, then progressively shorter
# prefixes of it.

# Prefix lengths to try, longest first (e.g. 6, 5, ..., 1 for 6-character codes).
code_len_count <- rev(seq_len(max(nchar(t_match$NAICS17))))

for (tablename in v_tablenames){
  # Start each table with an empty result column.
  t_match[, match := NA_character_]
  for (i in code_len_count){
    # Rows that are still unmatched get the next (shorter) prefix as target.
    t_match[is.na(match), target := substr(NAICS17, 1, i)]
    # Join the unmatched targets to the table named by tablename and assign
    # the NAICS17 column of the join result.
    # NOTE: the join returns one row per matching row of the table, so a table
    # that repeats a code (e.g. time-series data) yields more values than
    # there are rows to assign -- this is the reported
    # "Supplied ... items to be assigned to ... items of column 'match'" error.
    t_match[is.na(match), match := get(tablename)[.SD, on=.(NAICS17 = target), mget("NAICS17")][]]
  }
  # Keep the result under a per-table column name: "m_" followed by the table name.
  setnames(t_match, "match", paste0("m_", tablename))
}

数据示例：

# Table of target industry codes
# This sample is a plain data.frame; the matching loop uses `:=`, so convert
# it with data.table::setDT() before running the loop on it.
t_match <- structure(list(NAICS17 = c("311111", "311119", "311211", "311212", 
"311213", "311221", "311224", "311225", "311230", "311313")), row.names = c(NA, 
-10L), class = "data.frame")

# NAICS17 column is unique:
# Each code appears once, at several levels of detail (3 to 6 digits, plus the
# range code "31-33").
t_naics17tree <- structure(list(NAICS17 = c("31-33", "311", "3111", "31111", "311111", 
"311119", "3112", "31121", "311211", "311212"), NAICS17Title = c("Manufacturing", 
"Food Manufacturing", "Animal Food Manufacturing", "Animal Food Manufacturing", 
"Dog and Cat Food Manufacturing", "Other Animal Food Manufacturing", 
"Grain and Oilseed Milling", "Flour Milling and Malt Manufacturing", 
"Flour Milling", "Rice Milling")), row.names = c(NA, 10L), class = "data.frame")

# NAICS17 column is NOT unique:
# The same code is repeated once per dated observation.
# NOTE(review): the codes here are padded with "-" ("311---"), so they are not
# equal to a plain prefix such as "311"; presumably the real table is
# normalised before matching -- confirm.
t_ppiprice <- structure(list(NAICS17 = c("311---", "311---", "311---", "311---", 
"311---", "311---", "311---", "311---", "311---", "311---"), 
    seriesID = c("PCU311---311---", "PCU311---311---", "PCU311---311---", 
    "PCU311---311---", "PCU311---311---", "PCU311---311---", 
    "PCU311---311---", "PCU311---311---", "PCU311---311---", 
    "PCU311---311---"), date = structure(c(17956, 17928, 17897, 
    17866, 17836, 17805, 17775, 17744, 17713, 17683), class = "Date"), 
    value = c(199.2, 198.9, 198.3, 197.9, 197.2, 197.4, 197.1, 
    197.7, 198.8, 200.2)), class = "data.frame", row.names = c(NA, 
-10L))

Answer 1

留给后来者参考：我想通了……

# Step 4 (working version): for every table, store in t_match the best
# NAICS17 match for each target code, trying the full code first and then
# progressively shorter prefixes.
# Uses t_match (a data.table), v_tablenames and code_len_count (prefix
# lengths, longest first) as defined in the question's script.
for (tablename in v_tablenames){
  # Start each table with an empty result column.
  t_match[, match := NA_character_]
  for (i in code_len_count){
    # Rows that are still unmatched get the next (shorter) prefix as target.
    t_match[is.na(match), target := substr(NAICS17, 1, i)]
    # v_tablenames already holds the full object names (including the "t_"
    # prefix), so the table is looked up under exactly that name.
    # mult = "first" keeps a single row per target even when the table repeats
    # a code, so the result has the same length as the rows being assigned.
    # x.NAICS17 is the looked-up table's own code column; it is NA where the
    # target was not found, which leaves the row unmatched for the next pass.
    t_match[is.na(match), match := get(tablename)[.SD, on=.(NAICS17 = target), mult = "first", mget("x.NAICS17")][]]
  }
  # Keep the result under a per-table column name: "m_" followed by the table name.
  setnames(t_match, "match", paste0("m_", tablename))
}

用 get() 包裹表名，可以让循环既使用表名字符串（用于给结果列命名），又能取到该名称对应的表对象（用于联接）。

添加 mult = "first" 后，联接对每个目标代码只返回第一条匹配记录；这样即使表中同一代码出现多次，返回结果的长度也与待赋值的行数一致，不会再报错。

感谢@Cole的帮助！

匹配不同长度的data.tables循环

1 个答案: