将“dist”类的对象转换为 R 中的数据框

时间:2014-05-05 14:16:16

标签: r dataframe distance

既然可以将数据框转换为“dist”类的对象,那么能否反过来操作,将“dist”类的对象转换为数据框?例如:

> dist(hasil)

   1            2            3           4
2  0.088814413                                    
3  0.084929382  0.030413813                        
4  0.063245553  0.029120440 0.044418465            
5  0.061983869  0.027018512 0.036400549 0.009055385

希望得到如下数据框形式的结果:

>

   col          row          distance
   1            2            0.088814413
   1            3            0.084929382          
   1            4            0.063245553
   1            5            0.061983869
   2            3            0.030413813
   2            4            0.029120440
   2            5            0.027018512
   3            4            0.044418465
   3            5            0.036400549
   4            5            0.009055385

4 个答案:

答案 0 :(得分:13)

# Load the example data set (maps) and the reshaping helper (reshape2).
library(maps)
library(reshape2)
data(us.cities)

# Pairwise distances between the coordinates of the first few cities.
d <- dist(head(us.cities[, c("lat", "long")]))

# Printed form of d:
##           1         2         3         4         5
## 2 20.160489                                        
## 3 23.139853 40.874243                              
## 4 15.584303  9.865374 38.579820                    
## 5 27.880674  7.882037 48.707100 15.189882          
## 6 26.331187 41.720457  6.900101 41.036931 49.328558

# Expand to the full square matrix and melt it into long form,
# one (row, col, value) record per matrix cell.
df <- melt(
  as.matrix(d),
  varnames = c("row", "col")
)

# Keep only the cells below the diagonal, i.e. each pair exactly once.
subset(df, row > col)
##    row col     value
## 2    2   1 20.160489
## 3    3   1 23.139853
## 4    4   1 15.584303
## 5    5   1 27.880674
## 6    6   1 26.331187
## 9    3   2 40.874243
## 10   4   2  9.865374
## 11   5   2  7.882037
## 12   6   2 41.720457
## 16   4   3 38.579820
## 17   5   3 48.707100
## 18   6   3  6.900101
## 23   5   4 15.189882
## 24   6   4 41.036931
## 30   6   5 49.328558

答案 1 :(得分:6)

我实际上会写一个像这样的函数:

# Convert a "dist" object into a long data frame with one row per pair.
#
# Args:
#   inDist: an object inheriting from class "dist".
#
# Returns:
#   A data frame with columns `row`, `col` and `value`. `row` and `col` hold
#   the "Labels" attribute entries of the pair when labels are present, and
#   integer positions otherwise. Rows follow the storage order of the dist
#   vector (lower triangle, column by column). An input with fewer than two
#   observations yields a zero-row data frame.
#
# Raises:
#   An error ("wrong input type") when inDist does not inherit from "dist".
myFun <- function(inDist) {
  # inherits() also accepts objects carrying extra classes, for which
  # `class(x) != "dist"` would produce a condition of length > 1.
  if (!inherits(inDist, "dist")) {
    stop("wrong input type", call. = FALSE)
  }

  n <- attr(inDist, "Size")
  labs <- attr(inDist, "Labels")
  if (is.null(labs)) {
    labs <- seq_len(n)
  }

  # No pairs exist for fewer than two observations; return an empty frame
  # instead of failing on a descending `times` sequence in rep().
  if (n < 2L) {
    return(data.frame(row = labs[0L], col = labs[0L], value = numeric(0)))
  }

  first <- seq_len(n - 1L)
  # For column j the lower triangle holds rows (j + 1)..n.
  row_idx <- unlist(lapply(first, function(j) (j + 1L):n))
  # Column j therefore repeats (n - j) times.
  col_idx <- rep(first, times = rev(first))

  data.frame(
    row = labs[row_idx],
    col = labs[col_idx],
    value = as.vector(inDist)
  )
}

现在,假设我们从下面这个对象开始(注意行名和列名都不是数字):

# Build a labelled "dist" object: take the correlation matrix of the first
# five USJudgeRatings variables and use (1 - correlation) / 2 as the
# dissimilarity, so the labels are variable names rather than numbers.
dd <- as.dist((1 - cor(USJudgeRatings)[1:5, 1:5])/2)
# Printed form of dd:
#            CONT       INTG       DMNR       DILG
# INTG 0.56659545                                 
# DMNR 0.57684427 0.01769236                      
# DILG 0.49380400 0.06424445 0.08157452           
# CFMG 0.43154385 0.09295712 0.09332092 0.02060062

我们可以很容易地将它转换为数据框:

# Convert the labelled dist object; each pair of labels appears exactly once,
# in the same order as the lower triangle printed for dd (column by column).
myFun(dd)
#     row  col      value
# 1  INTG CONT 0.56659545
# 2  DMNR CONT 0.57684427
# 3  DILG CONT 0.49380400
# 4  CFMG CONT 0.43154385
# 5  DMNR INTG 0.01769236
# 6  DILG INTG 0.06424445
# 7  CFMG INTG 0.09295712
# 8  DILG DMNR 0.08157452
# 9  CFMG DMNR 0.09332092
# 10 CFMG DILG 0.02060062

快速的性能比较:

# Benchmark input: a 1000 x 1000 matrix of standard-normal draws (seeded for
# reproducibility) and the distances between its 1000 rows.
set.seed(1)
x <- matrix(rnorm(1000*1000), nrow = 1000)
dd <- dist(x)

## Jake's function
# Melt the full square distance matrix into long form, then keep only the
# cells whose row index is greater than their column index.
fun2 <- function(inDist) {
  long <- melt(
    as.matrix(inDist),
    varnames = c("row", "col")
  )
  below_diag <- as.numeric(long$row) > as.numeric(long$col)
  long[below_diag, ]
}

# Sanity check: both functions return the same cells for this input.
all(fun2(dd) == myFun(dd))
# [1] TRUE
# Timings as recorded by the answer's author. fun2 goes through
# as.matrix() and melt(); myFun indexes the dist vector directly.
system.time(fun2(dd))
#    user  system elapsed 
#   0.346   0.002   0.349 
system.time(myFun(dd))
#    user  system elapsed 
#   0.012   0.000   0.015

答案 2 :(得分:0)

我会做这样的事情......

library(reshape)
set.seed(123)

# Creating a 5 x 5 matrix
mat <- matrix(rnorm(25), ncol = 5, nrow = 5, byrow = TRUE)
d   <- dist(mat)

# Converting the dist object to a matrix while using melt
df  <- melt(as.matrix(d))

# Sort each (X1, X2) index pair so that (i, j) and (j, i) become identical,
# then locate the self distances (both indices equal).
p    <- t(apply(df[, c(1, 2)], 1, FUN = sort))
rmv1 <- which(p[, 1] == p[, 2])

# Collapse each sorted pair into a single key and locate the repeated keys.
p    <- paste(p[, 1], p[, 2], sep = "|")
rmv2 <- which(duplicated(p))

# Removing self distances and duplicated distances. Selecting the rows to
# keep (rather than negating the rows to drop) stays correct even when there
# is nothing to drop: df[-integer(0), ] would return zero rows.
df   <- df[setdiff(seq_len(nrow(df)), c(rmv1, rmv2)), ]

df
## X1 X2    value
## 2  1 3.812287
## 3  1 2.311832
## 4  1 4.385048
## 5  1 2.854179
## 3  2 1.916895
## 4  2 1.557744
## 5  2 2.880357
## 4  3 2.509214
## 5  3 2.886526
## 5  4 3.408147

答案 3 :(得分:0)

这是另一种避免使用 melt() 和 as.matrix() 的方法……最好也避免使用 subset(),不过这一点我留作课后练习。

# Convert a "dist" object into a long data frame with one row per pair.
#
# Args:
#   d: a distance object carrying a "Size" attribute (as produced by dist()).
#
# Returns:
#   A data frame with integer columns `row` and `col` (positions of the two
#   observations, row > col) and a numeric column `distance`, ordered like
#   the dist vector (lower triangle, column by column). An input with fewer
#   than two observations yields a zero-row data frame.
dist.to.df <- function(d) {
  size <- attr(d, "Size")

  # With fewer than two observations there are no pairs. The sequences
  # 2:size and 1:(size - 1) would count downwards here and generate bogus
  # index rows, so return an empty result explicitly.
  if (size < 2L) {
    return(
      data.frame(row = integer(0), col = integer(0), distance = numeric(0))
    )
  }

  # expand.grid() varies `row` fastest, which matches the column-by-column
  # storage order of the dist vector once the cells on or above the diagonal
  # are filtered out.
  pairs <- subset(
    expand.grid(row = seq_len(size)[-1L], col = seq_len(size - 1L)),
    row > col
  )

  data.frame(pairs, distance = as.numeric(d), row.names = NULL)
}

结果如下:

# Usage example: distances between the lat/long coordinates of the first
# rows of the us.cities data set (from the maps package), converted to long
# form with dist.to.df().
library(maps)
data(us.cities)
d <- dist(head(us.cities[c("lat", "long")]))
dist.to.df(d)
##    row col  distance
## 1    2   1 20.160489
## 2    3   1 23.139853
## 3    4   1 15.584303
## 4    5   1 27.880674
## 5    6   1 26.331187
## 6    3   2 40.874243
## 7    4   2  9.865374
## 8    5   2  7.882037
## 9    6   2 41.720457
## 10   4   3 38.579820
## 11   5   3 48.707100
## 12   6   3  6.900101
## 13   5   4 15.189882
## 14   6   4 41.036931
## 15   6   5 49.328558