使用循环进行匹配编码的有效方法

时间:2013-11-04 15:27:50

标签: r for-loop performance

我有以下数据和代码(用于匹配两个数据集data1和data2),我想知道是否可以改进代码以实现效率。

kk<-structure(list(dummy = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), exact = c(4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), common1 = c(87L, 
79L, 82L, 87L, 94L, 68L, 67L, 83L, 73L, 83L, 83L, 87L, 66L, 87L, 
77L, 84L, 62L, 80L, 75L, 76L, 80L, 74L, 83L, 81L, 93L, 81L, 76L, 
84L, 73L, 52L, 73L, 87L, 69L, 81L, 87L, 79L, 66L, 63L, 63L, 83L, 
69L, 70L, 44L, 71L, 72L, 80L, 84L, 47L), common2 = c(5.70294879390762, 
9.13248693277132, 9.24850283307053, 9.525315331908, 9.7981270368783, 
10.2750511089686, 10.5186731916264, 10.2750511089686, 9.30565055178051, 
9.47270463644367, 9.74096862303835, 10.3417424834673, 10.0432494949113, 
9.99879773234045, 9.99879773234045, 9.30565055178051, 9.03598698483141, 
8.88183630500415, 9.74096862303835, 9.5468126085974, 9.90348755253613, 
8.9226582995244, 10.1266311038503, 9.7981270368783, 9.39266192877014, 
9.7981270368783, 9.21034037197618, 9.5468126085974, 10.3417424834673, 
9.5468126085974, 9.62362482913648, 9.61748739820009, 9.21830854162536, 
9.2259184019395, 8.75384509275524, 10.4777385781522, 9.51247992951689, 
9.07268620667739, 8.06463647577422, 9.7981270368783, 9.5468126085974, 
9.68034400122192, 9.04782144247841, 10.4631033404715, 9.21034037197618, 
10.2750511089686, 9.10497985631836, 9.04782144247841), y1 = c(NA, 
NA, NA, NA, 4400, 1000, 30150, 100, 30, 249000, 38400, 857000, 
1930, 18100, 5030, 140000, 380, 300, 120700, 2500, 35500, 200, 
500, 6600, 129000, 44000, 1000, 162230, 174010, 700, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 810, 0, 250, 0, 14300, 5200, 19400, 
0, 0), y2 = c(NA, NA, NA, NA, 1e+05, 2e+05, 1e+05, 150000, 95000, 
1e+05, 50000, 1e+05, 51000, 1e+05, 73000, 125000, 55000, 17000, 
3e+05, 3000, 106000, 80000, 150000, 44000, 50000, 55000, 60000, 
4e+05, 130000, 60000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 45000, 
3000, 45000, 7500, 60000, 120000, 1e+05, 40000, 10000)), .Names = c("dummy", 
"exact", "common1", "common2", "y1", "y2"), row.names = 65:112, class = "data.frame")

head(kk)
    dummy exact common1   common2   y1    y2
65     1     4      87  5.702949   NA    NA
66     1     4      79  9.132487   NA    NA
67     1     4      82  9.248503   NA    NA
68     1     4      87  9.525315   NA    NA
69     0     4      94  9.798127 4400 1e+05
70     0     4      68 10.275051 1000 2e+05

列:Dummy是一个变量,如果data为data1则取值为1,如果data为data2则取值为0。 Common1和Common2是data1和data2中常见的变量。 y1和y2是data2独有的变量,因此对于data1(dummy == 1),这些是NA。我试图使用StatMatch包中的mahalanobis距离(md)来找到基于变量“exact”给出的每个组的公共变量common1和common2的距离。之后,我试图找到(md.dif)的最小距离,然后选择data2(dummy == 0)的行,它具有数据的最小值(dummy == 0)。如果有平局,我将从最小值开始采样。

代码如下:

library(Statmatch) # for mahalanobis distance 


for (i in unique(kk$exact)){
cat("number of individuals in data1","\n")
   data1.length<-nrow(kk[kk$dummy==1 & kk$exact==i ,])
   show(data1.length)


  cat("number of individuals in data2","\n")
  data2.length<-nrow(kk[kk$dummy==0 & kk$exact==i ,])
  show(data2.length)

  cat("mahalanobis distance for individuals (data1 and data2) in each exact","\n")
  md<-mahalanobis.dist(kk[kk$dummy==1 & kk$exact==i,c("common1","common2")],kk[kk$dummy==0  & kk$exact==i,c("common1","common2")])
  show(md)     

  cat("minimum mahalanobis distance for individuals (data1 and data2) in each exact","\n")
  md.dif <-sapply(as.list(rownames(md)),function(x) min(md[x,]))
  show(md.dif)     

  #For each data1 individuals in each exact, there may be more than individuals in data2 that has the same minimum distance (or has same min). 
  # This reflects the ties 
  cat("matched data2 individuals for each individuals in data1 in each exact","\n") 
  nn<-lapply(as.list(rownames(md)),function(x) which(md[x,]==min(md[x,])))
  show(nn) 

  # If there is a tie (more than one individuals in data2 for each individual in data1), sample one of these; if there is no tie, then we have one data2 individual for each data1 individual
  cat("matched data2 individuals for each indiviudal in data1 in each exact with sample","\n") 
  set.seed(123) # for reproducibility
  mm<-list()
  for (j in (1:length(nn))){
    if (length(nn[[j]])>1)
      mm[[j]]<-sample(nn[[j]],1,replace=FALSE)
    else  mm[[j]]<-nn[[j]]
  }
  #names of mm gives the row index of matched data2 individual for each data1
  ss<-sapply(mm,names)
  show(ss)

  kk[kk$dummy==1 & kk$exact==i ,"data2row"]<-as.numeric(ss)
  kk[kk$dummy==1 & kk$exact==i,"md.dif"]<-md.dif

  # Imputting the data2 vars (y1 and y2) for matched individuals by creating the new vars; e.g. if data2 var is y1 then data2.y1
  # gives imputted y1 for matched data1 individuals      
 data2vars<-names(kk)[5:6]
  cat("imputting the data2 vars (y1 and y2)","\n") 

  for (k in data2vars){  
    kk[kk$dummy==1 & kk$exact==i, paste0("data2.",k)]<-kk[[k]][match(as.numeric(ss),rownames(kk))]
  }
}

上面的代码生成输出如下:

     dummy exact common1   common2   y1    y2 data2row    md.dif data2.y1 data2.y2
65     1     4      87  5.702949   NA    NA       82 3.7385027      300    17000
66     1     4      79  9.132487   NA    NA       82 0.3018370      300    17000
67     1     4      82  9.248503   NA    NA       80 0.2422656   140000   125000
68     1     4      87  9.525315   NA    NA       92 0.3312446   162230   400000
69     0     4      94  9.798127 4400 1e+05       NA        NA       NA       NA
70     0     4      68 10.275051 1000 2e+05       NA        NA       NA       NA

1 个答案:

答案 0 :(得分:1)

我会做这样的事情,希望有用的评论符合:

# useful function, sample on its own gets confused
resample  <- function(x, ...) x[sample.int(length(x), ...)]
# from OP's code
data2vars <- names(kk)[5:6]
# columns to compare
cmpcols <- c("common1","common2")
# doing a single write later should save memory, lets define the columns here
data1vars <- c("data2row","md.dif",paste0("data2.",data2vars))
rownums <- as.numeric(rownames(kk))

# preallocate the columns
kk[,data1vars] <- NA

# loop through every "exact" match
for (i in unique(kk$exact)) {
  # pull out all values that match this one, then just data1 and data2 items
  px <- kk$exact==i
  ix1 <- which(px & kk$dummy==1)
  ix2 <- which(px & kk$dummy==0)

  # calculate all pairwise distances
  md <- mahalanobis.dist(kk[ix1,cmpcols], kk[ix2,cmpcols])

  # sample the indexes we want to pick
  nn <- apply(md, 1, function(x) resample(which(x == min(x)))[[1]])

  # pull out the row indexes for these items
  ii <- ix2[nn]

  # write the data out
  kk[ix1, data1vars] <- cbind(data2row=rownums[ii],md.dif=apply(md,1,min),kk[ii,data2vars])
}

不确定它会有多大差异!