我正在根据不同的唯一标识符逐步匹配两个数据集。我想在每个步骤(迭代)中存储匹配和不匹配的行数。由于这个过程非常重复,我认为可以通过循环购买完成我无法做到(仍然是初学者)。这是当前(不是很优雅)的代码:
# Data
dput(a)
structure(list(V1 = c("96099", "14072", "177877", "192619", "3554",
"181507", "76016", "192123"), WAPC_gz = c("04LODGEAVENUERM82JA",
"06RIVERROADIG110EY", "07SPRINGPONDROADRM95DP", "09DAGENHAMAVENUERM96LD",
"09SKIPPERCOURTABBEYROADIG117GW", "1-11VINERIESCLOSERM95DA",
"1-2MORLANDROADRM109HW", "10-11THEVINERIESVINERIESCLOSERM95DA"
), WAWPC_gz = c("04LODGEAVENUE", "06RIVERROAD", "07SPRINGPONDROAD",
"09DAGENHAMAVENUE", "09SKIPPERCOURTABBEYROAD", "1-11VINERIESCLOSE",
"1-2MORLANDROAD", "10-11THEVINERIESVINERIESCLOSE"), T8PC_gz = c("04LODGERM82JA",
"06RIVERIG110EY", "07SPRINRM95DP", "09DAGENRM96LD", "09SKIPPIG117GW",
"1-11VINRM95DA", "1-2MORLRM109HW", "10-11THRM95DA"), T8_FPC_gz = c("04LODGERM82JA",
"06RIVERIG110EY", "07SPRINRM95DP", "09DAGENRM96LD", "09SKIPPIG117GW",
"1-11VINRM95DA", "1-2MORLRM109HW", "10-11THRM95DA"), PPNPC_gz = c("04RM82JA",
"06IG110EY", "07RM95DP", "09RM96LD", "09IG117GW", "111RM95DA",
"12RM109HW", "1011RM95DA"), adr_gz = c(" 04 LODGE AVENUE RM82JA",
" 06 RIVER ROAD IG110EY", " 07 SPRINGPOND ROAD RM95DP", " 09 DAGENHAM AVENUE RM96LD",
"09 SKIPPER COURT ABBEY ROAD IG117GW", " 1-11 VINERIES CLOSE RM95DA",
" 1-2 MORLAND ROAD RM109HW", "10-11 THE VINERIES VINERIES CLOSE RM95DA"
)), class = "data.frame", row.names = c(NA, -8L), .internal.selfref = <pointer: 0x00000000001f0788>, .Names = c("V1",
"WAPC_gz", "WAWPC_gz", "T8PC_gz", "T8_FPC_gz", "PPNPC_gz", "adr_gz"
))
dput(b)
structure(list(V1 = c("192619", "3554", "181507", "76016", "192123",
"121768", "8355", "124567"), WAPC_gp = c("09DAGENHAMAVENUERM96LD",
"09SKIPPERCOURTABBEYROADIG117GW", "1-11VINERIESCLOSERM95DA",
"1-2MORLANDROADRM109HW", "10-11THEVINERIESVINERIESCLOSERM95DA",
"1000GREENLANERM81BT", "1001LEMONADEBUILDING3ARBORETUMPLACEIG117PY",
"1002GREENLANERM81BT"), WAWPC_gp = c("09DAGENHAMAVENUE", "09SKIPPERCOURTABBEYROAD",
"1-11VINERIESCLOSE", "1-2MORLANDROAD", "10-11THEVINERIESVINERIESCLOSE",
"1000GREENLANE", "1001LEMONADEBUILDING3ARBORETUMPLACE", "1002GREENLANE"
), T8PC_gp = c("09DAGENRM96LD", "09SKIPPIG117GW", "1-11VINRM95DA",
"1-2MORLRM109HW", "10-11THRM95DA", "1000GRERM81BT", "1001LEMIG117PY",
"1002GRERM81BT"), T8_FPC_gp = c("09DAGENRM96LD", "09SKIPPIG117GW",
"1-11VINRM95DA", "1-2MORLRM109HW", "10-11THRM95DA", "1000GRERM81BT",
"1001LEMIG117PY", "1002GRERM81BT"), PPNPC_gp = c("09RM96LD",
"09IG117GW", "111RM95DA", "12RM109HW", "1011RM95DA", "1000RM81BT",
"10013IG117PY", "1002RM81BT"), adr_gp = c(" 09 DAGENHAM AVENUE RM96LD",
"09 SKIPPER COURT ABBEY ROAD IG117GW", " 1-11 VINERIES CLOSE RM95DA",
" 1-2 MORLAND ROAD RM109HW", "10-11 THE VINERIES VINERIES CLOSE RM95DA",
" 1000 GREEN LANE RM81BT", "1001 LEMONADE BUILDING 3 ARBORETUM PLACE IG117PY",
" 1002 GREEN LANE RM81BT")), class = "data.frame", row.names = c(NA,
-8L), .internal.selfref = <pointer: 0x00000000001f0788>, .Names = c("V1",
"WAPC_gp", "WAWPC_gp", "T8PC_gp", "T8_FPC_gp", "PPNPC_gp", "adr_gp"
))
## Steps order (these are the columns containing unique identifiers)
step1<-2
step2<-4
step3<-5
step4<-6
step5<-3
steps<-(c(2,4,5,6,3))
matching <- function (a,b) {
first_join<-merge(b,a,by.y=names(a)[step1], by.x=names(b)[step1], all.x = TRUE)
unmatch_1<- first_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_1<- first_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))
second_join<-merge(unmatch_1,a,by.y=names(a)[step2], by.x=names(b)[step2], all.x = TRUE)
unmatch_2<- second_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_2<- second_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))
third_join<-merge(unmatch_2,a,by.y=names(a)[step3], by.x=names(b)[step3], all.x = TRUE)
unmatch_3<- third_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_3<- third_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))
fourth_join<-merge(unmatch_3,a,by.y=names(a)[step4], by.x=names(b)[step4], all.x = TRUE)
unmatch_4<- fourth_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_4<- fourth_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))
fifth_join<-merge(unmatch_4,a,by.y=names(a)[step5], by.x=names(b)[step5], all.x = TRUE)
unmatch_5<- fifth_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_5<- fifth_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))
matches<-c((nrow(match_1)),(nrow(match_2)),(nrow(match_3)),
(nrow(match_4)),(nrow(match_5)))
unmatches<-c((nrow(unmatch_1)),(nrow(unmatch_2)),(nrow(unmatch_3)),
(nrow(unmatch_4)),(nrow(unmatch_5)))
df<-as.data.frame(cbind(names(a)[steps],matches,unmatches))
print(df)
}
我认为代码可以更简单,做一个看起来像我想的循环:
`steps<-(c(2,4,5,6,3))
matching2 <- function (a,b) {
join<-NULL
unmatch<-NULL
match<-NULL
result<-NULL
match_row<- NULL
for(i in 1:length(steps))
join<-merge(b,a,by.y=names(a)[steps[i]], by.x=names(b)[steps[i]], all.x = TRUE)
unmatch<- join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match<- join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))
match_row<-nrow(match)
result= rbind(match_row,result)
print(result)
}
任何意见/建议都将非常感谢。