R - 在每次迭代中保存循环的输出

时间:2017-07-31 16:46:35

标签: r

我正在根据不同的唯一标识符逐步匹配两个数据集。我想在每个步骤(迭代)中存储匹配和不匹配的行数。由于这个过程非常重复,我认为可以通过循环购买完成我无法做到(仍然是初学者)。这是当前(不是很优雅)的代码:

# Data

dput(a)
structure(list(V1 = c("96099", "14072", "177877", "192619", "3554", 
"181507", "76016", "192123"), WAPC_gz = c("04LODGEAVENUERM82JA", 
"06RIVERROADIG110EY", "07SPRINGPONDROADRM95DP", "09DAGENHAMAVENUERM96LD", 
"09SKIPPERCOURTABBEYROADIG117GW", "1-11VINERIESCLOSERM95DA", 
"1-2MORLANDROADRM109HW", "10-11THEVINERIESVINERIESCLOSERM95DA"
), WAWPC_gz = c("04LODGEAVENUE", "06RIVERROAD", "07SPRINGPONDROAD", 
"09DAGENHAMAVENUE", "09SKIPPERCOURTABBEYROAD", "1-11VINERIESCLOSE", 
"1-2MORLANDROAD", "10-11THEVINERIESVINERIESCLOSE"), T8PC_gz = c("04LODGERM82JA", 
"06RIVERIG110EY", "07SPRINRM95DP", "09DAGENRM96LD", "09SKIPPIG117GW", 
"1-11VINRM95DA", "1-2MORLRM109HW", "10-11THRM95DA"), T8_FPC_gz = c("04LODGERM82JA", 
"06RIVERIG110EY", "07SPRINRM95DP", "09DAGENRM96LD", "09SKIPPIG117GW", 
"1-11VINRM95DA", "1-2MORLRM109HW", "10-11THRM95DA"), PPNPC_gz = c("04RM82JA", 
"06IG110EY", "07RM95DP", "09RM96LD", "09IG117GW", "111RM95DA", 
"12RM109HW", "1011RM95DA"), adr_gz = c(" 04 LODGE AVENUE RM82JA", 
" 06 RIVER ROAD IG110EY", " 07 SPRINGPOND ROAD RM95DP", " 09 DAGENHAM AVENUE RM96LD", 
"09 SKIPPER COURT ABBEY ROAD IG117GW", " 1-11 VINERIES CLOSE RM95DA", 
" 1-2 MORLAND ROAD RM109HW", "10-11 THE VINERIES VINERIES CLOSE RM95DA"
 )), class = "data.frame", row.names = c(NA, -8L), .internal.selfref = <pointer: 0x00000000001f0788>, .Names = c("V1", 
 "WAPC_gz", "WAWPC_gz", "T8PC_gz", "T8_FPC_gz", "PPNPC_gz", "adr_gz"
 ))

dput(b)
structure(list(V1 = c("192619", "3554", "181507", "76016", "192123", 
"121768", "8355", "124567"), WAPC_gp = c("09DAGENHAMAVENUERM96LD", 
"09SKIPPERCOURTABBEYROADIG117GW", "1-11VINERIESCLOSERM95DA", 
"1-2MORLANDROADRM109HW", "10-11THEVINERIESVINERIESCLOSERM95DA", 
"1000GREENLANERM81BT", "1001LEMONADEBUILDING3ARBORETUMPLACEIG117PY", 
"1002GREENLANERM81BT"), WAWPC_gp = c("09DAGENHAMAVENUE", "09SKIPPERCOURTABBEYROAD", 
"1-11VINERIESCLOSE", "1-2MORLANDROAD", "10-11THEVINERIESVINERIESCLOSE", 
"1000GREENLANE", "1001LEMONADEBUILDING3ARBORETUMPLACE", "1002GREENLANE"
 ), T8PC_gp = c("09DAGENRM96LD", "09SKIPPIG117GW", "1-11VINRM95DA", 
 "1-2MORLRM109HW", "10-11THRM95DA", "1000GRERM81BT", "1001LEMIG117PY", 
"1002GRERM81BT"), T8_FPC_gp = c("09DAGENRM96LD", "09SKIPPIG117GW", 
"1-11VINRM95DA", "1-2MORLRM109HW", "10-11THRM95DA", "1000GRERM81BT", 
 "1001LEMIG117PY", "1002GRERM81BT"), PPNPC_gp = c("09RM96LD", 
"09IG117GW", "111RM95DA", "12RM109HW", "1011RM95DA", "1000RM81BT", 
"10013IG117PY", "1002RM81BT"), adr_gp = c(" 09 DAGENHAM AVENUE RM96LD", 
"09 SKIPPER COURT ABBEY ROAD IG117GW", " 1-11 VINERIES CLOSE RM95DA", 
" 1-2 MORLAND ROAD RM109HW", "10-11 THE VINERIES VINERIES CLOSE RM95DA", 
" 1000 GREEN LANE RM81BT", "1001 LEMONADE BUILDING 3 ARBORETUM PLACE IG117PY", 
" 1002 GREEN LANE RM81BT")), class = "data.frame", row.names = c(NA, 
-8L), .internal.selfref = <pointer: 0x00000000001f0788>, .Names = c("V1", 
"WAPC_gp", "WAWPC_gp", "T8PC_gp", "T8_FPC_gp", "PPNPC_gp", "adr_gp"
))

功能

## Steps order (these are the columns containing unique identifiers)
step1<-2
step2<-4
step3<-5
step4<-6
step5<-3

steps<-(c(2,4,5,6,3))
matching <- function (a,b) {

first_join<-merge(b,a,by.y=names(a)[step1], by.x=names(b)[step1], all.x =   TRUE)
unmatch_1<- first_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_1<- first_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))

second_join<-merge(unmatch_1,a,by.y=names(a)[step2], by.x=names(b)[step2], all.x = TRUE)
unmatch_2<- second_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_2<- second_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))

third_join<-merge(unmatch_2,a,by.y=names(a)[step3], by.x=names(b)[step3], all.x = TRUE)
unmatch_3<- third_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_3<- third_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))

fourth_join<-merge(unmatch_3,a,by.y=names(a)[step4], by.x=names(b)[step4], all.x = TRUE)
unmatch_4<- fourth_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_4<- fourth_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))

fifth_join<-merge(unmatch_4,a,by.y=names(a)[step5], by.x=names(b)[step5], all.x = TRUE)
unmatch_5<- fifth_join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
match_5<- fifth_join %>% filter(!is.na(adr_gz)) %>%
select(grep("adr|UPRN|id|V1.x",names(first_join)))

matches<-c((nrow(match_1)),(nrow(match_2)),(nrow(match_3)),
           (nrow(match_4)),(nrow(match_5)))
unmatches<-c((nrow(unmatch_1)),(nrow(unmatch_2)),(nrow(unmatch_3)),
           (nrow(unmatch_4)),(nrow(unmatch_5)))
df<-as.data.frame(cbind(names(a)[steps],matches,unmatches))
print(df)
  }

我认为代码可以更简单,做一个看起来像我想的循环:

`steps<-(c(2,4,5,6,3))

matching2 <- function (a,b) {

join<-NULL
unmatch<-NULL
match<-NULL
result<-NULL
match_row<- NULL

    for(i in 1:length(steps))

join<-merge(b,a,by.y=names(a)[steps[i]], by.x=names(b)[steps[i]], all.x = TRUE)
  unmatch<- join %>% filter(is.na(adr_gz)) %>% select(1:length(names(b)))
  match<- join %>% filter(!is.na(adr_gz)) %>%
  select(grep("adr|UPRN|id|V1.x",names(first_join)))

  match_row<-nrow(match)

  result= rbind(match_row,result)
  print(result)

}

任何意见/建议都将非常感谢。

0 个答案:

没有答案