我有以下数据和代码(用于匹配两个数据集data1和data2),我想知道是否可以改进代码以实现效率。
kk<-structure(list(dummy = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), exact = c(4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), common1 = c(87L,
79L, 82L, 87L, 94L, 68L, 67L, 83L, 73L, 83L, 83L, 87L, 66L, 87L,
77L, 84L, 62L, 80L, 75L, 76L, 80L, 74L, 83L, 81L, 93L, 81L, 76L,
84L, 73L, 52L, 73L, 87L, 69L, 81L, 87L, 79L, 66L, 63L, 63L, 83L,
69L, 70L, 44L, 71L, 72L, 80L, 84L, 47L), common2 = c(5.70294879390762,
9.13248693277132, 9.24850283307053, 9.525315331908, 9.7981270368783,
10.2750511089686, 10.5186731916264, 10.2750511089686, 9.30565055178051,
9.47270463644367, 9.74096862303835, 10.3417424834673, 10.0432494949113,
9.99879773234045, 9.99879773234045, 9.30565055178051, 9.03598698483141,
8.88183630500415, 9.74096862303835, 9.5468126085974, 9.90348755253613,
8.9226582995244, 10.1266311038503, 9.7981270368783, 9.39266192877014,
9.7981270368783, 9.21034037197618, 9.5468126085974, 10.3417424834673,
9.5468126085974, 9.62362482913648, 9.61748739820009, 9.21830854162536,
9.2259184019395, 8.75384509275524, 10.4777385781522, 9.51247992951689,
9.07268620667739, 8.06463647577422, 9.7981270368783, 9.5468126085974,
9.68034400122192, 9.04782144247841, 10.4631033404715, 9.21034037197618,
10.2750511089686, 9.10497985631836, 9.04782144247841), y1 = c(NA,
NA, NA, NA, 4400, 1000, 30150, 100, 30, 249000, 38400, 857000,
1930, 18100, 5030, 140000, 380, 300, 120700, 2500, 35500, 200,
500, 6600, 129000, 44000, 1000, 162230, 174010, 700, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 810, 0, 250, 0, 14300, 5200, 19400,
0, 0), y2 = c(NA, NA, NA, NA, 1e+05, 2e+05, 1e+05, 150000, 95000,
1e+05, 50000, 1e+05, 51000, 1e+05, 73000, 125000, 55000, 17000,
3e+05, 3000, 106000, 80000, 150000, 44000, 50000, 55000, 60000,
4e+05, 130000, 60000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 45000,
3000, 45000, 7500, 60000, 120000, 1e+05, 40000, 10000)), .Names = c("dummy",
"exact", "common1", "common2", "y1", "y2"), row.names = 65:112, class = "data.frame")
head(kk)
dummy exact common1 common2 y1 y2
65 1 4 87 5.702949 NA NA
66 1 4 79 9.132487 NA NA
67 1 4 82 9.248503 NA NA
68 1 4 87 9.525315 NA NA
69 0 4 94 9.798127 4400 1e+05
70 0 4 68 10.275051 1000 2e+05
列:Dummy是一个变量,如果data为data1则取值为1,如果data为data2则取值为0。 Common1和Common2是data1和data2中常见的变量。 y1和y2是data2独有的变量,因此对于data1(dummy == 1),这些是NA。我试图使用StatMatch包中的mahalanobis距离(md)来找到基于变量“exact”给出的每个组的公共变量common1和common2的距离。之后,我试图找到(md.dif)的最小距离,然后选择data2(dummy == 0)的行,它具有数据的最小值(dummy == 0)。如果有平局,我将从最小值开始采样。
代码如下:
library(Statmatch) # for mahalanobis distance
for (i in unique(kk$exact)){
cat("number of individuals in data1","\n")
data1.length<-nrow(kk[kk$dummy==1 & kk$exact==i ,])
show(data1.length)
cat("number of individuals in data2","\n")
data2.length<-nrow(kk[kk$dummy==0 & kk$exact==i ,])
show(data2.length)
cat("mahalanobis distance for individuals (data1 and data2) in each exact","\n")
md<-mahalanobis.dist(kk[kk$dummy==1 & kk$exact==i,c("common1","common2")],kk[kk$dummy==0 & kk$exact==i,c("common1","common2")])
show(md)
cat("minimum mahalanobis distance for individuals (data1 and data2) in each exact","\n")
md.dif <-sapply(as.list(rownames(md)),function(x) min(md[x,]))
show(md.dif)
#For each data1 individuals in each exact, there may be more than individuals in data2 that has the same minimum distance (or has same min).
# This reflects the ties
cat("matched data2 individuals for each individuals in data1 in each exact","\n")
nn<-lapply(as.list(rownames(md)),function(x) which(md[x,]==min(md[x,])))
show(nn)
# If there is a tie (more than one individuals in data2 for each individual in data1), sample one of these; if there is no tie, then we have one data2 individual for each data1 individual
cat("matched data2 individuals for each indiviudal in data1 in each exact with sample","\n")
set.seed(123) # for reproducibility
mm<-list()
for (j in (1:length(nn))){
if (length(nn[[j]])>1)
mm[[j]]<-sample(nn[[j]],1,replace=FALSE)
else mm[[j]]<-nn[[j]]
}
#names of mm gives the row index of matched data2 individual for each data1
ss<-sapply(mm,names)
show(ss)
kk[kk$dummy==1 & kk$exact==i ,"data2row"]<-as.numeric(ss)
kk[kk$dummy==1 & kk$exact==i,"md.dif"]<-md.dif
# Imputting the data2 vars (y1 and y2) for matched individuals by creating the new vars; e.g. if data2 var is y1 then data2.y1
# gives imputted y1 for matched data1 individuals
data2vars<-names(kk)[5:6]
cat("imputting the data2 vars (y1 and y2)","\n")
for (k in data2vars){
kk[kk$dummy==1 & kk$exact==i, paste0("data2.",k)]<-kk[[k]][match(as.numeric(ss),rownames(kk))]
}
}
上面的代码生成输出如下:
dummy exact common1 common2 y1 y2 data2row md.dif data2.y1 data2.y2
65 1 4 87 5.702949 NA NA 82 3.7385027 300 17000
66 1 4 79 9.132487 NA NA 82 0.3018370 300 17000
67 1 4 82 9.248503 NA NA 80 0.2422656 140000 125000
68 1 4 87 9.525315 NA NA 92 0.3312446 162230 400000
69 0 4 94 9.798127 4400 1e+05 NA NA NA NA
70 0 4 68 10.275051 1000 2e+05 NA NA NA NA
答案 0 :(得分:1)
我会做这样的事情,希望有用的评论符合:
# useful function, sample on its own gets confused
resample <- function(x, ...) x[sample.int(length(x), ...)]
# from OP's code
data2vars <- names(kk)[5:6]
# columns to compare
cmpcols <- c("common1","common2")
# doing a single write later should save memory, lets define the columns here
data1vars <- c("data2row","md.dif",paste0("data2.",data2vars))
rownums <- as.numeric(rownames(kk))
# preallocate the columns
kk[,data1vars] <- NA
# loop through every "exact" match
for (i in unique(kk$exact)) {
# pull out all values that match this one, then just data1 and data2 items
px <- kk$exact==i
ix1 <- which(px & kk$dummy==1)
ix2 <- which(px & kk$dummy==0)
# calculate all pairwise distances
md <- mahalanobis.dist(kk[ix1,cmpcols], kk[ix2,cmpcols])
# sample the indexes we want to pick
nn <- apply(md, 1, function(x) resample(which(x == min(x)))[[1]])
# pull out the row indexes for these items
ii <- ix2[nn]
# write the data out
kk[ix1, data1vars] <- cbind(data2row=rownums[ii],md.dif=apply(md,1,min),kk[ii,data2vars])
}
不确定它会有多大差异!