我尝试使用 stringdist 找出同一向量中彼此距离不超过 1 的所有字符串,然后把匹配项填入结果列。以下是数据样本:
初始数据框:
# Sample names; column b is an all-NA placeholder to be filled with matches.
a <- c("tom", "tomm", "alex", "alexi", "chris", "jen", "jenn", "michell")
b <- NA
df <- data.frame(a = a, b = b)
期望的结果:
# Target output: each name paired with its distance-1 neighbour, "0" if none.
a <- c("tom", "tomm", "alex", "alexi", "chris", "jen", "jenn", "michell")
b <- c("tomm", "tom", "alexi", "alex", "0", "jenn", "jen", "0")
df <- data.frame(a = a, b = b)
我可以将 stringdist 用于两个向量,但将其用于单个向量时遇到了问题。感谢 R 社区的帮助。
答案 0 :(得分:3)
您可以使用 stringdistmatrix 和 which.min:
# Nearest-neighbour lookup inside a single vector via a full distance matrix.
# Requires the stringdist package; `a` and `b` are the vectors from the
# question's sample data.
df <- data.frame(a, b, stringsAsFactors = FALSE)
max_dist <- 1 # largest distance that still counts as a match

# Pairwise distances between every pair of elements of df$a.
mat <- stringdist::stringdistmatrix(df$a, df$a)
mat[mat == 0] <- NA # ignore self (and exact duplicates)
mat[mat > max_dist] <- NA # cut level

# Column index of the closest remaining candidate in each row. which.min()
# skips NA and returns integer(0) for an all-NA row, which is mapped to NA so
# the result always has one entry per row (no dimension dropping involved).
nearest <- vapply(
  seq_len(nrow(mat)),
  function(i) {
    j <- which.min(mat[i, ])
    if (length(j) == 0L) NA_integer_ else as.integer(j)
  },
  integer(1)
)

# Only rows with a match are written; the others keep their current b value.
has_match <- !is.na(nearest)
df$b[has_match] <- df$a[nearest[has_match]]
a b
1 tom tomm
2 tomm tom
3 alex alexi
4 alexi alex
5 chris <NA>
6 jen jenn
7 jenn jen
8 michell <NA>
答案 1 :(得分:3)
这是一种可行的方法:
a = c("tom", "tomm", "alex", "alexi", "chris", "jen", "jenn", "michell")
#' Closest other element for every string in a vector
#'
#' For each element of `x`, computes its distance to every other element
#' with `stringdist()` and keeps the nearest one, provided that distance does
#' not exceed `tol`.
#'
#' @param x Character vector to match against itself.
#' @param method Distance method passed on to `stringdist()`.
#' @param tol Largest distance still accepted as a match.
#' @return Character vector the same length as `x`: the nearest other
#'   element, or `"0"` where none lies within `tol`.
min_dist <- function(x, method = "cosine", tol = .5) {
  y <- vector(mode = "character", length = length(x))
  for (i in seq_along(x)) {
    others <- x[-i]
    dis <- stringdist(x[i], others, method)
    # which.min() skips NA and returns integer(0) when nothing is comparable
    # (single-element input or NA strings), so the decision never depends on
    # min() of an empty or NA-containing vector.
    best <- which.min(dis)
    if (length(best) == 0L || dis[best] > tol) {
      y[i] <- "0"
    } else {
      y[i] <- others[best]
    }
  }
  y
}
min_dist(a, 'cosine', .4)
## [1] "tomm" "tom" "alexi" "alex" "0" "jenn" "jen" "0"
答案 2 :(得分:0)
我们也可以使用 base R 中的 adist:
library(reshape2)
# Pairwise edit distances from base R's adist(); the stringdist equivalent
# would be as.matrix(stringdistmatrix(df[, 1])).
out <- as.data.frame(adist(df$a))
# Label the columns with the strings, then add the same strings as an id column.
names(out) <- df$a
out$names <- df$a
# Long format; keep only pairs at distance exactly 1 and drop the distance.
out <- subset(melt(out, id = "names"), value == 1)[1:2]
names(out) <- names(df)
# Append strings that have no distance-1 partner, flagged with "0". Sizing b
# with rep() keeps data.frame() from failing when every string has a partner.
unmatched <- setdiff(unique(df[, 1]), out$a)
out <- rbind(out, data.frame(a = unmatched, b = rep("0", length(unmatched))))
out
# a b
#2 tomm tom
#9 tom tomm
#20 alexi alex
#27 alex alexi
#47 jenn jen
#54 jen jenn
#7 chris 0
#8 michell 0
答案 3 :(得分:0)
这是一个简短的解决方案:
# For every name, look up the closest *other* name within distance 1.
# Requires the stringdist package. Reads df$a throughout, so the result does
# not depend on a separate global vector, and vapply() guarantees a character
# result even for a zero-row data frame.
df$b <- local({
  x <- as.character(df$a)
  vapply(seq_along(x), function(i) {
    lookup <- x[-i] # candidates exclude the element itself
    j <- stringdist::amatch(x[i], lookup, maxDist = 1)
    if (is.na(j)) NA_character_ else lookup[j]
  }, character(1))
})