提高R中脚本(Levenshtein距离与权重)的性能

时间:2014-05-09 09:12:16

标签: r performance levenshtein-distance stringdist

我正在使用Levenshtein距离测量进行大量的字符串比较,但因为我需要能够解释字符串潜在结构中的空间邻接,所以我必须制作自己的脚本,包括权重函数

我现在的问题是我的脚本非常低效。我必须做大约600,000次比较,脚本需要几个小时才能完成。我在那里寻求一种方法来使我的脚本更有效率,但作为一个自学成才,我不知道如何解决这个问题。

这是功能:

zeros <- function(lengthA,lengthB){
  m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights == TRUE){

    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                           "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
                      # the cost_weight matrix
    indexA <- which(cost_weight==A, arr.ind=TRUE)
    indexB <- which(cost_weight==B, arr.ind=TRUE)
    walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
    w <- walk/max_walk
  }

  else {w <- 1}

  return(w)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  for (i in seq(to=nchar(A))){ 
    D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
  }
  for (j in seq(to=nchar(B))){ 
    D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
  }
  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
        } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
                              D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
                              D[i,j]     + substitution * weight(As[i],Bs[j], weights))
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

将我的Levenshtein测量值与stringdist测量值的性能进行比较,性能差了83倍。

library (stringdist)
library(rbenchmark)

A <-"abcdefghijklmnopqrstuvwx"
B <-"xwvutsrqponmlkjihgfedcba"

benchmark(levenshtein(A,B), stringdist(A,B,method="lv"),
          columns=c("test", "replications", "elapsed", "relative"),
          order="relative", replications=10) 


                             test replications elapsed relative
2 stringdist(A, B, method = "lv")           10    0.01        1
1               levenshtein(A, B)           10    0.83       83

有没有人有改进我的脚本的想法?

1 个答案:

答案 0 :(得分:1)

以下代码已经有了一些改进(代码中的代码;与之前的代码计算方式相同,与stringdist不同),但我确信它可以更加简化和加快。

zeros <- function(lengthA,lengthB){
  m <- matrix(0, nrow=lengthA, ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights){
    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                            "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
    # the cost_weight matrix
    amats <- lapply(A, `==`, y=cost_weight)
    bmats <- lapply(B, `==`, y=cost_weight)
    walk <- mapply(function(a, b){
      sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
    }, amats, bmats)
    return(walk/max_walk)
  }
  else return(1)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  #browser()
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  weight.mat <- outer(As, Bs, weight, weights=weights)
  D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
  D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))

  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
      } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
                              D[i,j + 1] + deletion * weight.mat[i, j],
                              D[i,j]     + substitution * weight.mat[i, j])
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}