具有链式连接的复杂条件

时间:2017-01-30 05:15:15

标签: r

我需要根据另外两个数据框中的值来更新一个数据框,这两个数据框与目标数据框依次(链式)关联。

目标数据框 t_offices 中与本问题相关的有 4 个字段:

       administrative_area_level_1 administrative_area_level_2       country     locality
     1                     Arizona             Maricopa County United States      Phoenix
     2        District of Columbia                        <NA> United States   Washington
     3                        <NA>                        <NA>         India         <NA>
     4                    New York               Albany County United States       Albany
     5                     Utrecht                  Nieuwegein   Netherlands   Nieuwegein
     6                 Connecticut            Fairfield County United States     Stamford
   707                    Illinois                        <NA> United States         <NA>
  4241                    Illinois                        <NA> United States West Chicago
999998                     Alabama                        <NA> United States      Altoona
999999                Pennsylvania                        <NA> United States   Washington

对于美国的记录,我需要把 administrative_area_level_2 中的 NA 值更新为对应的县。县的取值来自数据框 t_places:

      state_ab           place_name                  county_name place_nameshort
     1      AL           Abanda CDP              Chambers County          Abanda
     2      AL       Abbeville city                 Henry County       Abbeville
     3      AL      Adamsville city             Jefferson County      Adamsville
     4      AL         Addison town               Winston County         Addison 
     5      AL           Akron town                  Hale County           Akron
     6      AL       Alabaster city                Shelby County       Alabaster
    12      AL         Altoona town Blount County, Etowah County         Altoona
  4298      DC      Washington city         District of Columbia      Washington
   7527      IL    West Chicago city                DuPage County    West Chicago
  32611      PA  Washington township             Armstrong County      Washington
 32612      PA  Washington township                 Berks County      Washington

place_nameshort 是 place_name 的截断版本,去掉了类型后缀(例如 “city”、“town” 等)。

我按州和地点将 t_offices 与 t_places 连接,以获得正确的县。这可能会返回多个县:1)county_name 可能包含以逗号分隔的多个县;2)截断后的 place_nameshort 在同一个州内可能出现同名地点。我只需要县是明确的那些记录(即只返回一个县)。

由于 t_places 只包含州缩写 state_ab,我还需要第三个数据框 r_states 来获取州全名 state_name:

   state_ab             state_name
 1       AL                Alabama
 2       AK                 Alaska
 3       AZ                Arizona
 4       AR               Arkansas
 5       CA             California
 6       CO               Colorado
 9       DC   District of Columbia
17       IL               Illinois
42       PA           Pennsylvania

通过 state_ab 将 t_places 与 r_states 连接后,我就可以用 state_name 去匹配 t_offices$administrative_area_level_1。

下面是我的尝试。它并不完整——没有处理同一州内同名地点导致返回多个县的情况——而且无论如何都不起作用。

# First attempt (reported by the author as not working; kept as posted).
# Logical mask over t_offices rows: country is "United States", state and
# locality are present, and the county (administrative_area_level_2) is NA.
no_county <- (!is.na(t_offices$country) 
          & t_offices$country == "United States" 
          & !is.na(t_offices$administrative_area_level_1) 
          & is.na(t_offices$administrative_area_level_2) 
          & !is.na(t_offices$locality))

# Tries to fill the NA counties with values taken from t_places$county_name.
# NOTE(review): match() returns integer positions (NA when nothing matches),
# not a per-row TRUE/FALSE, so combining its results with `&` does not express
# "same place AND same state". The index is also built over t_places rows, so
# the selected values are not aligned one-to-one with t_offices[no_county].
t_offices$administrative_area_level_2[no_county] <-
  # keep only county_name entries without a comma
  t_places$county_name[!grepl(",", t_places$county_name) 
                       # position of each t_places short name among the selected localities
                       & match(t_places$place_nameshort, t_offices$locality[no_county]) 
                       # position of each t_places state among the state codes looked up via r_states
                       & match(t_places$state_ab, 
                               r_states$state_ab[match(r_states$state_name, 
                                                       t_offices$administrative_area_level_1[no_county])])]

编辑:按照 @r2evans 的建议,下面是我新的尝试,但仍然不起作用:

# split multiple counties into columns (county_name_1, county_name_2, ...)
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so the shorthand is not safe.
library(splitstackshape)
t_places <- cSplit(t_places, "county_name", sep = ", ", drop = FALSE,
                   type.convert = FALSE)

# merge state names into places
# merge() joins on the columns the two tables have in common.
# NOTE(review): columns 2:3 of r_states are presumably state_ab and
# state_name — confirm against the real column order.
places_statename <- merge(t_places, r_states[, 2:3])

# Flag the t_offices rows that are in the U.S. and have a state and a place
# but no county yet.
no_county <- with(
  t_offices,
  # country is known and is the U.S.
  !is.na(country) & country == "United States" &
    # state is present
    !is.na(administrative_area_level_1) &
    # county is missing
    is.na(administrative_area_level_2) &
    # place is present
    !is.na(locality)
)

# update blank counties
# NOTE(review): reported by the author as still not working. is.na(...) has
# one element per places_statename row, while each match() call returns one
# integer position (or NA) per selected t_offices row; `&` recycles these
# vectors against each other, so the resulting index does not pair an office
# with its own place and state.
t_offices$administrative_area_level_2[no_county] <-
  # unambiguous counties
  places_statename$county_name_1[is.na(places_statename$county_name_2)
                                 # locality matches place
                                 & match(t_offices$locality[no_county], places_statename$place_nameshort)
                                 # administrative_area_level_1 matches state
                                 & match(t_offices$administrative_area_level_1[no_county],places_statename$state_name)]

1 个答案:

答案 0 :(得分:0)

这是我的解决方案,写得比较长,或许还可以更短、更优雅。

# Build places_statename: a lookup of (state, truncated place name) pairs
# that resolve to exactly one county, with the full state name attached.

# split multiple counties into columns (county_name_1, county_name_2, ...)
library(splitstackshape)
t_places <- cSplit(t_places, "county_name", sep = ", ", drop = FALSE,
                   type.convert = FALSE)
# subset original places with single county
# The mask is computed on t_places itself: places_statename is only created
# further down, and a mask taken from a different (merged, possibly
# re-ordered) table is not guaranteed to line up row-for-row with t_places.
# NOTE(review): the later steps need state_ab, place_nameshort and
# county_name_1, so columns 1, 8 and 9 are presumed to be these — confirm
# against the real column order of t_places.
places_singlecounty <- t_places[is.na(t_places$county_name_2), c(1, 8, 9)]
# subset truncated places with single county
library(data.table)
setDT(places_singlecounty)
# keep only the (state_ab, place_nameshort) pairs that occur exactly once,
# i.e. drop short names that are ambiguous within a state
places_singlecounty <- merge(places_singlecounty,
                             places_singlecounty[, .N, by = c("state_ab", "place_nameshort")][N == 1, 1:2])
# merge state names into single-county truncated places
places_statename <- merge(places_singlecounty, r_states[, 2:3], by = "state_ab")

# Select the t_offices records in the U.S. that have a state and a place but
# no county. Written as the negation of "any disqualifying condition holds".
no_county <- !(
  # country missing or not the U.S.
  is.na(t_offices$country)
  | t_offices$country != "United States"
  # no state
  | is.na(t_offices$administrative_area_level_1)
  # county already filled in
  | !is.na(t_offices$administrative_area_level_2)
  # no place
  | is.na(t_offices$locality)
)

# update t_offices NA counties based on single-county truncated places
# setDT() converts t_offices to a data.table by reference so := can be used.
setDT(t_offices)
# For the rows flagged by no_county, look each (state, locality) pair up in
# places_statename and assign the resulting county_name_1 in place.
# NOTE(review): rows without a match in the lookup presumably stay NA (the
# join's default for unmatched rows) — confirm.
t_offices[no_county, administrative_area_level_2 := 
           # .SD is the no_county subset of t_offices, used as the join's i table
           places_statename[.(.SD), county_name_1,
                            # lookup column on the left, t_offices column on the right
                            on = c(state_name = "administrative_area_level_1",  
                                   place_nameshort = "locality")]]