我有一个大型数据集,我需要根据区域列拆分成多个数据集,然后为每个区域执行一组代码并获取每个区域的输出数据。我试图使用for循环这样做但它似乎不起作用。如何实现这一目标。下面是我正在尝试的代码 -
for (a in c('10','11','14','20','30','40','50','61','64'))
{
paste0("data3_add_area",a) <- data3_add[AREACODE == a,]
paste0("in_add_area",a) <- in_add[AREA_CODE == a,]
source1 <- paste0("data3_add_area",a)$name
source2 <- paste0("in_add_area",a)$name
.....
paste0("match_",a) <- output
}
我们可以在R做这样的事情。抱歉,我还在学习R并且不确定可行性。实现这一目标的最佳方法是什么?
编辑 - 示例数据
Address1 <- c("786, GALI NO 5, XYZ","rambo, 45, strret 4, atlast, pqr","23/4, 23RD FLOOR, STREET 2, ABC-E, PQR","45-B, GALI NO5, XYZ","HECTIC, 99 STREET, PQR")
AREACODE <- c('10','11','14','20','30')
Year1 <- c(2001:2005)
Address2 <- c("abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR","abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR")
Year2 <- c(2001:2010)
AREA_CODE <- c('10','11','14','20','30','40','50','61','64')
data3_add <- data.table(Address1,Year1,AREACODE)
in_add <- data.table(Address2,Year2,AREA_CODE)
in_add [,unique_id := sprintf("%06d", 1:nrow(in_add))]
我想为每个区域复制的代码(参考此链接 - https://www.r-bloggers.com/fuzzy-string-matching-a-survival-skill-to-tackle-unstructured-information/)
使用method2我想复制每个区域的代码 -
#install.packages('stringdist')
library(stringdist)
distance.methods<-c('osa','lv','dl','hamming','lcs','qgram','cosine','jaccard','jw')
dist.methods<-list()
for(m in 1:length(distance.methods))
{
dist.name.enh<-matrix(NA, ncol = length(source2.devices$name),nrow = length(source1.devices$name))
for(i in 1:length(source2.devices$name)) {
for(j in 1:length(source1.devices$name)) {
dist.name.enh[j,i]<-stringdist(tolower(source2.devices[i,]$name),tolower(source1.devices[j,]$name),method = distance.methods[m])
#adist.enhance(source2.devices[i,]$name,source1.devices[j,]$name)
}
}
dist.methods[[distance.methods[m]]]<-dist.name.enh
}
match.s1.s2.enh<-NULL
for(m in 1:length(dist.methods))
{
dist.matrix<-as.matrix(dist.methods[[distance.methods[m]]])
min.name.enh<-apply(dist.matrix, 1, base::min)
for(i in 1:nrow(dist.matrix))
{
s2.i<-match(min.name.enh[i],dist.matrix[i,])
s1.i<-i
match.s1.s2.enh<-rbind(data.frame(s2.i=s2.i,s1.i=s1.i,s2name=source2.devices[s2.i,]$name, s1name=source1.devices[s1.i,]$name, adist=min.name.enh[i],method=distance.methods[m]),match.s1.s2.enh)
}
}
# Let's have a look at the results
library(reshape2)
matched.names.matrix<-dcast(match.s1.s2.enh,s2.i+s1.i+s2name+s1name~method, value.var = "adist")
View(matched.names.matrix)
答案 0 :(得分:0)
这是一个完整的脚本,我稍微更改了输入集,以演示距离矩阵有多行和一列的情况:
library("stringdist")
library("data.table")
Address1 <- c("786, GALI NO 5, XYZ","rambo, 45, strret 4, atlast, pqr","23/4, 23RD FLOOR, STREET 2, ABC-E, PQR","45-B, GALI NO5, XYZ","HECTIC, 99 STREET, PQR")
AREACODE <- c('10','10','14','20','30')
Year1 <- c(2001:2005)
Address2 <- c("abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR","abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR")
Year2 <- c(2001:2010)
AREA_CODE <- c('10','10','10','20','30','40','50','61','64', '99')
data1 <- data.table(Address1, Year1, AREACODE)
data2 <- data.table(Address2, Year2, AREA_CODE)
data2[, unique_id := sprintf("%06d", 1:nrow(data2))]
methods <- c('osa','lv','dl','hamming','lcs','qgram','cosine','jaccard','jw')
# split data.table into list of data.tables by area code
sdata1 <- split(data1, data1$AREACODE)
sdata2 <- split(data2, data2$AREA_CODE)
# find the unique codes
codes <- union(names(sdata1), names(sdata2))
dist_calc <- function(x, y, methods) {
distance_methods <- list()
# calculate distance for each method
for(m in seq_along(methods)) {
output <- matrix(NA, ncol = length(x), nrow = length(y))
for(i in seq_along(x)) {
for(j in seq_along(y)) {
# calculate the matrix of distances for each pairing
output[j, i] <- stringdist(tolower(x[i]), tolower(y[j]), method = methods[m])
}
}
# store the output matrix in a named list element
distance_methods[[methods[m]]] <- output
}
# return a list of lists, with each leaf element being a matrix
return(distance_methods)
}
# iterate over codes, use names to retain codes and do not simplify -- return a list
distances <- sapply(codes, function(code) {
x <- sdata1[[code]]$Address1
y <- sdata2[[code]]$Address2
# do not compute if the code is not present in either one, the other or both
if (is.null(x) | is.null(y))
return(NULL)
# calculate all the distances
dist_mats <- dist_calc(x, y, methods)
# calculate the minimum value and indicie for each method
method_mins <- sapply(methods, function(meth) {
min_col <- apply(dist_mats[[meth]], 1, which.min)
min_val <- apply(dist_mats[[meth]], 1, min)
# list the minimum match value, the string, the matching string
data.table(name = y, match_name = x[min_col], adist = min_val)
}, USE.NAMES = TRUE, simplify = FALSE)
# combine into a single data.table
rbindlist(method_mins, idcol = "method")
}, USE.NAMES = TRUE, simplify = FALSE)
all_distances <- rbindlist(Filter(function(x) !is.null(x), distances), idcol = "AREACODE")
# AREACODE method name match_name adist
# 1: 10 osa abc, pqr, xyz 786, GALI NO 5, XYZ 12.00000000
# 2: 10 osa 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 2.00000000
# 3: 10 osa 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 3.00000000
# 4: 10 lv abc, pqr, xyz 786, GALI NO 5, XYZ 12.00000000
# 5: 10 lv 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 2.00000000
# 6: 10 lv 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 3.00000000
# 7: 10 dl abc, pqr, xyz 786, GALI NO 5, XYZ 12.00000000
# 8: 10 dl 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 2.00000000
# 9: 10 dl 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 3.00000000
#10: 10 hamming abc, pqr, xyz 786, GALI NO 5, XYZ Inf
#11: 10 hamming 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ Inf
#12: 10 hamming 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 3.00000000
#13: 10 lcs abc, pqr, xyz 786, GALI NO 5, XYZ 18.00000000
#14: 10 lcs 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 3.00000000
#15: 10 lcs 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 6.00000000
#16: 10 qgram abc, pqr, xyz 786, GALI NO 5, XYZ 16.00000000
#17: 10 qgram 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 3.00000000
#18: 10 qgram 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 6.00000000
#19: 10 cosine abc, pqr, xyz rambo, 45, strret 4, atlast, pqr 0.31400566
#20: 10 cosine 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 0.04653741
#21: 10 cosine 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 0.08784068
#22: 10 jaccard abc, pqr, xyz rambo, 45, strret 4, atlast, pqr 0.63157895
#23: 10 jaccard 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 0.12500000
#24: 10 jaccard 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 0.29411765
#25: 10 jw abc, pqr, xyz 786, GALI NO 5, XYZ 0.42535425
#26: 10 jw 786, GALI NO 4 XYZ 786, GALI NO 5, XYZ 0.05360624
#27: 10 jw 45B, GALI NO 5, XYZ 786, GALI NO 5, XYZ 0.10526316
#28: 20 osa del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 29.00000000
#29: 20 lv del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 29.00000000
#30: 20 dl del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 29.00000000
#31: 20 hamming del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ Inf
#32: 20 lcs del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 41.00000000
#33: 20 qgram del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 35.00000000
#34: 20 cosine del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 0.46394373
#35: 20 jaccard del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 0.72000000
#36: 20 jw del, 546, strret2, towards east, pqr 45-B, GALI NO5, XYZ 0.54863548
#37: 30 osa 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 11.00000000
#38: 30 lv 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 11.00000000
#39: 30 dl 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 11.00000000
#40: 30 hamming 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR Inf
#41: 30 lcs 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 15.00000000
#42: 30 qgram 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 13.00000000
#43: 30 cosine 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 0.21927994
#44: 30 jaccard 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 0.50000000
#45: 30 jw 23/4, STREET 2, PQR HECTIC, 99 STREET, PQR 0.31607428
# AREACODE method name match_name adist