计算在两个数据帧之间的汉明距离方面匹配行的比特数

时间:2016-06-21 00:18:21

标签: r apply sapply tapply hamming-distance

我有两个相同列数(但不是行)df1df2的数据框。对于df2中的每一行,我能够在my previous post中找到df1中与汉明距离相关的最佳(和次佳)匹配行。在该帖子中,我们一直在使用以下示例数据:

set.seed(0)
df1 <- as.data.frame(matrix(sample(1:10), ncol = 2))  ## 5 rows 2 cols
df2 <- as.data.frame(matrix(sample(1:6), ncol = 2))  ## 3 rows 2 cols

我现在需要计算等于1的位数:

  1. df2
  2. 中的每一行
  3. df1
  4. 中最匹配的行
  5. df1
  6. 中的第二个匹配行

    等于整数a的1的位数可以计算为

    sum(as.integer(intToBits(a)))
    

    我已将此应用于@ ZheyuanLi的原始功能,因此我有第1项&gt;。但是,我无法应用相同的逻辑来获取第2项&gt;和@ 3>,通过简单修改@ ZheyuanLi的功能。

    以下是@ ZheyuanLi的修改功能:

    hmd <- function(x,y) {
        rawx <- intToBits(x)
        rawy <- intToBits(y)
        nx <- length(rawx)
        ny <- length(rawy)
        if (nx == ny) {
            ## quick return
            return (sum(as.logical(xor(rawx,rawy))))
        } else if (nx < ny) {
            ## pivoting
            tmp <- rawx; rawx <- rawy; rawy <- tmp
            tmp <- nx; nx <- ny; ny <- tmp
        }
        if (nx %% ny) stop("unconformable length!") else {
            nc <- nx / ny  ## number of cycles
            return(unname(tapply(as.logical(xor(rawx,rawy)), rep(1:nc, each=ny), sum)))
        }
    }
    
    foo <- function(df1, df2, p = 2) {
        ## check p
        if (p > nrow(df2)) p <- nrow(df2)
        ## transpose for CPU cache friendly code
        xt <- t(as.matrix(df1))
        yt <- t(as.matrix(df2))
        ## after transpose, we compute hamming distance column by column
        ## a for loop is decent; no performance gain from apply family
        n <- ncol(yt)
        id <- integer(n * p)
        d <- numeric(n * p)
        sb <- integer(n)
        k <- 1:p
        for (i in 1:n) {
            set.bits <- sum(as.integer(intToBits(yt[,i])))
            distance <- hmd(xt, yt[,i])
            minp <- order(distance)[1:p]
            id[k] <- minp
            d[k] <- distance[minp]
            sb[i] <- set.bits
            k <- k + p
        }
        ## recode "id", "d" and "sb" into data frame and return
        id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
        colnames(id) <- paste0("min.", 1:p)
        d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
        colnames(d) <- paste0("mindist.", 1:p)
        sb <- as.data.frame(matrix(sb, ncol = 1))  ## no need for byrow as you have only 1 column
        colnames(sb) <- "set.bits.1"
        list(id = id, d = d, sb = sb)
    }
    

    运行这些给出:

    > foo(df1, df2)
    $id
      min1 min2  ## row id for best/second best match in df1
    1    1    4
    2    2    3
    3    5    2
    
    $d
      mindist.1 mindist.2  ## minimum 2 hamming distance
    1         2         2
    2         1         3
    3         1         3
    
    $sb
      set.bits.1  ## number of bits equal to 1 for each row of df2
    1          3
    2          2
    3          4
    

1 个答案:

答案 0 :(得分:1)

好的,在重新编辑你的问题后阅读(很多次!),我想我知道你想要什么。基本上我们不需要改变hmd()。您需要的商品1&gt;,2&gt;,3&gt;可以在for中的foo()循环之后计算。

要获取您调用sb的第1项&gt;项,我们可以使用tapply()。但是,沿着sb循环计算for很好,所以我不会改变它。在下文中,我将演示获得第2项的基本程序&gt;和第3项&gt;。

id内的foo()向量存储df1中所有匹配的行:

id <- c(1, 4, 2, 3, 5, 2)

所以我们可以简单地提取df1的那些行(实际上是xt的列)来计算等于1的位数。正如你所看到的,id中存在许多重复{1}},因此我们只能在unique(id)上计算:

id0 <- sort(unique(id))
## [1] 1 2 3 4 5

我们现在提取xt的子集列:

sub_xt <- xt[, id0]
##    [,1] [,2] [,3] [,4] [,5]
## V1    9    3   10    5    6
## V2    2    4    8    7    1

要为sub_xt的每列计算等于1的位数,我们再次使用tapply()和矢量化方法。

rawbits <- as.integer(intToBits(as.numeric(sub_xt)))  ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
                      INDEX =  rep(1:length(id0), each = length(rawbits) / length(id0)),
                      FUN = sum))
## [1] 3 3 3 5 3

现在我们需要将sbxt0映射到sbxt

sbxt <- sbxt0[match(id, id0)]
## [1] 3 5 3 3 3 3

然后我们可以将sbxt转换为数据框sb1

sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
##   min.1.set.bits.1 min.2.set.bits.1
## 1                3                5
## 2                3                3
## 3                3                3

最后我们可以把这些东西组装起来:

foo <- function(df1, df2, p = 2) {
    ## check p
    if (p > nrow(df2)) p <- nrow(df2)
    ## transpose for CPU cache friendly code
    xt <- t(as.matrix(df1))
    yt <- t(as.matrix(df2))
    ## after transpose, we compute hamming distance column by column
    ## a for loop is decent; no performance gain from apply family
    n <- ncol(yt)
    id <- integer(n * p)
    d <- numeric(n * p)
    sb2 <- integer(n)
    k <- 1:p
    for (i in 1:n) {
        set.bits <- sum(as.integer(intToBits(yt[,i])))
        distance <- hmd(xt, yt[,i])
        minp <- order(distance)[1:p]
        id[k] <- minp
        d[k] <- distance[minp]
        sb2[i] <- set.bits
        k <- k + p
    }
    ## compute "sb1"
    id0 <- sort(unique(id))
    sub_xt <- xt[, id0]
    rawbits <- as.integer(intToBits(as.numeric(sub_xt)))  ## convert sub_xt to binary
    sbxt0 <- unname(tapply(X = rawbits,
                           INDEX =  rep(1:length(id0), each = length(rawbits) / length(id0)),
                           FUN = sum))
    sbxt <- sbxt0[match(id, id0)]
    sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
    colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
    ## recode "id", "d" and "sb2" into data frame and return
    id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
    colnames(id) <- paste0("min.", 1:p)
    d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
    colnames(d) <- paste0("mindist.", 1:p)
    sb2 <- as.data.frame(matrix(sb2, ncol = 1))  ## no need for byrow as you have only 1 column
    colnames(sb2) <- "set.bits.1"
    list(id = id, d = d, sb1 = sb1, sb2 = sb2)
}

现在,运行foo(df1, df2)会给出:

> foo(df1,df2)
$id
   min.1 min.2
 1     1     4
 2     2     3
 3     5     2

 $d
  mindist.1 mindist.2
1         2         2
2         1         3
3         1         3

$sb1
   min.1.set.bits.1 min.2.set.bits.1
 1                3                5
 2                3                3
 3                3                3

$sb2
  set.bits.1
1          3
2          2
3          4

请注意,我已将sb重命名为sb2