Question

我尝试使用stringdist在同一个向量中找出彼此距离不超过1的所有字符串，然后把匹配到的字符串填入另一列。以下是数据样本：

初始数据框：

# Starting data: the strings to compare and an empty placeholder column.
a <- c("tom", "tomm", "alex", "alexi", "chris", "jen", "jenn", "michell")
b <- NA
df <- data.frame(a = a, b = b)

期望的结果：

# Desired result: each string paired with its distance-1 neighbour,
# or "0" when no such neighbour exists.
a <- c("tom", "tomm", "alex", "alexi", "chris", "jen", "jenn", "michell")
b <- c("tomm", "tom", "alexi", "alex", "0", "jenn", "jen", "0")
df <- data.frame(a = a, b = b)

我可以将stringdist用于两个向量，但是在将其用于一个向量时遇到了问题。感谢您的帮助，R社区。

Answer 1

您可以使用stringdistmatrix和which.min：

# Keep the strings as character so matches can be written straight into df$b.
df <- data.frame(a, b, stringsAsFactors = FALSE)
max_dist <- 1  # the question asks for matches at a maximum distance of 1
# Pairwise distances between every element of df$a and every other element.
mat <- stringdistmatrix(df$a, df$a)
mat[mat == 0] <- NA        # ignore self (this also ignores exact duplicates)
mat[mat > max_dist] <- NA  # cut level
amatch <- rowSums(mat, na.rm = TRUE) > 0  # rows with at least one match left
# drop = FALSE keeps a matrix even when exactly one row matches; without it
# the subset collapses to a vector and apply() fails.
df$b[amatch] <- df$a[apply(mat[amatch, , drop = FALSE], 1, which.min)]
        a     b
1     tom  tomm
2    tomm   tom
3    alex alexi
4   alexi  alex
5   chris  <NA>
6     jen  jenn
7    jenn   jen
8 michell  <NA>

Answer 2

这是一种可行的方法：

# Sample strings used to demonstrate min_dist().
a <- c("tom", "tomm", "alex", "alexi", "chris", "jen", "jenn", "michell")

#' Find the closest other string for every element of a character vector
#'
#' For each element of `x`, computes its distance to all *other* elements
#' with `stringdist()` and returns the nearest one, provided that distance
#' does not exceed `tol`.
#'
#' @param x Character vector of strings to compare against each other.
#' @param method Distance method passed on to `stringdist()`.
#' @param tol Largest distance that still counts as a match.
#' @return Character vector of the same length as `x`; element `i` is the
#'   closest other string, or `"0"` when there is no other string within
#'   `tol` (including when no distance can be computed, e.g. a length-1
#'   input or missing values).
min_dist <- function(x, method = "cosine", tol = .5) {
    vapply(seq_along(x), function(i) {
        candidates <- x[-i]
        dis <- stringdist(x[i], candidates, method = method)
        # No usable distance: nothing to compare against, or every
        # comparison involves a missing value.
        if (all(is.na(dis))) {
            return("0")
        }
        nearest <- which.min(dis)  # which.min() skips NA distances
        if (dis[nearest] > tol) "0" else candidates[nearest]
    }, character(1))
}

# Closest neighbour for each sample string, using cosine distance capped at 0.4.
min_dist(a, method = "cosine", tol = 0.4)

## [1] "tomm"  "tom"   "alexi" "alex"  "0"      "jenn"  "jen"   "0"

Answer 3

我们也可以使用base R中的adist（下面的代码还用到了reshape2包的melt）：

library(reshape2)
# Pairwise edit distances between all strings, one column per string.
out <- as.data.frame(adist(df$a)) # alternative: as.matrix(stringdistmatrix(df[,1]))
# Label the columns with the strings and keep the row's string in `names`.
out$names <- names(out) <- df$a
# Long format, keeping only the pairs that are exactly one edit apart.
out <- subset(melt(out, id = 'names'), value == 1)[1:2]
names(out) <- names(df)
# Strings without any distance-1 partner get "0". rep() keeps both columns
# the same length, so this also works when every string found a match
# (a bare b = '0' would fail against a zero-length `a`).
unmatched <- setdiff(unique(df[, 1]), out$a)
out <- rbind(out, data.frame(a = unmatched, b = rep('0', length(unmatched))))
out
#   a     b
#2     tomm   tom
#9      tom  tomm
#20   alexi  alex
#27    alex alexi
#47    jenn   jen
#54     jen  jenn
#7    chris     0
#8  michell     0

Answer 4

这是一个简短的解决方案：

# For every string in df$a, look up the closest *other* string within an
# edit distance of 1; NA when there is none.
df$b <- local({
  # Read from df$a rather than a separate global `a`, so the snippet only
  # depends on the data frame it updates.
  strings <- as.character(df$a)
  # vapply() guarantees a character vector, even for zero rows.
  vapply(seq_along(strings), function(i) {
    lookup <- strings[-i]
    j <- stringdist::amatch(strings[i], lookup, maxDist = 1)
    if (is.na(j)) NA_character_ else lookup[j]
  }, character(1))
})

在单个向量上使用stringdist

4 个答案: