我正在使用Levenshtein距离测量进行大量的字符串比较,但因为我需要能够解释字符串潜在结构中的空间邻接,所以我必须制作自己的脚本,包括权重函数
我现在的问题是我的脚本非常低效。我必须做大约600,000次比较,脚本需要几个小时才能完成。我在那里寻求一种方法来使我的脚本更有效率,但作为一个自学成才,我不知道如何解决这个问题。
这是功能:
zeros <- function(lengthA,lengthB){
m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights == TRUE){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
indexA <- which(cost_weight==A, arr.ind=TRUE)
indexB <- which(cost_weight==B, arr.ind=TRUE)
walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
w <- walk/max_walk
}
else {w <- 1}
return(w)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
for (i in seq(to=nchar(A))){
D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
}
for (j in seq(to=nchar(B))){
D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
}
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
D[i,j] + substitution * weight(As[i],Bs[j], weights))
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}
将我的Levenshtein测量值与stringdist测量值的性能进行比较,性能差了83倍。
library (stringdist)
library(rbenchmark)
A <-"abcdefghijklmnopqrstuvwx"
B <-"xwvutsrqponmlkjihgfedcba"
benchmark(levenshtein(A,B), stringdist(A,B,method="lv"),
columns=c("test", "replications", "elapsed", "relative"),
order="relative", replications=10)
test replications elapsed relative
2 stringdist(A, B, method = "lv") 10 0.01 1
1 levenshtein(A, B) 10 0.83 83
有没有人有改进我的脚本的想法?
答案 0 :(得分:1)
以下代码已经有了一些改进(代码中的代码;与之前的代码计算方式相同,与stringdist
不同),但我确信它可以更加简化和加快。
zeros <- function(lengthA,lengthB){
m <- matrix(0, nrow=lengthA, ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
amats <- lapply(A, `==`, y=cost_weight)
bmats <- lapply(B, `==`, y=cost_weight)
walk <- mapply(function(a, b){
sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
}, amats, bmats)
return(walk/max_walk)
}
else return(1)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
#browser()
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
weight.mat <- outer(As, Bs, weight, weights=weights)
D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
D[i,j + 1] + deletion * weight.mat[i, j],
D[i,j] + substitution * weight.mat[i, j])
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}