Question

此代码：

library(dplyr)
library(stringdist)

# Reproducible setup for the example. The workspace is deliberately NOT cleared
# with rm(list = ls()): that would silently delete every object the person
# running this snippet already has in their session.
set.seed(42)
options(scipen = 999)  # strongly prefer fixed over scientific notation in output

# Example strings to cluster. stringsAsFactors = FALSE keeps the column as
# character on every R version, so no separate as.character() step is needed.
data <- data.frame(
    string = c(
        "world hello",
        "hello world",
        "hello vorld",
        "hello world 1",
        "hello world",
        "hello world hello world"
    ),
    stringsAsFactors = FALSE
)

# Distance between two strings, measured with the "qgram" method of
# stringdist() and that function's default settings for everything else.
#
# string_1, string_2: the strings to compare.
# Returns the value stringdist() produces for the pair.
distance_function <- function(string_1, string_2) {
    qgram_distance <- stringdist(a = string_1, b = string_2, method = "qgram")
    qgram_distance
}

# All unordered pairs of row indices of `data`: one column per pair.
combinations <- combn(nrow(data), 2)

# Square matrix of pairwise string distances, labelled on both axes by the
# strings themselves. Cells not filled below (the diagonal) stay at 0.
distance_matrix <- matrix(
    0,
    nrow = nrow(data),
    ncol = nrow(data),
    dimnames = list(data$string, data$string)
)

# seq_len() rather than 1:ncol(): with zero columns the loop is skipped
# instead of iterating over c(1, 0).
for (i in seq_len(ncol(combinations))) {
    pair <- combinations[, i]

    distance <- distance_function(data[pair[1], 1], data[pair[2], 1])

    # Write the value into both triangles so the matrix stays symmetric.
    distance_matrix[pair[1], pair[2]] <- distance
    distance_matrix[pair[2], pair[1]] <- distance
}

# NOTE(review): distance_matrix already holds distances, yet dist() is applied
# to 1 - distance_matrix, i.e. it computes distances between the ROWS of that
# transformed matrix. as.dist(distance_matrix) may be what was intended --
# confirm before changing, since it would alter the grouping printed below.
dendo <- hclust(dist(1 - distance_matrix), method = "ward.D2")

# Cut the tree into 3 groups: an integer vector of group ids whose names are
# the input strings.
grp <- cutree(dendo, k = 3)

# Group ids listed in the dendrogram's leaf order.
grp[dendo$order]

结果：

hello world hello world             hello vorld           hello world 1             hello world             world hello             hello world 
                      3                       2                       1                       1                       1                       1

如何将其转换为像这样的数据帧（按“相似性”排序）：

hello world hello world 3
hello vorld 2
hello world 1 1
hello world 1
world hello 1
hello world 1

顺便说一句，为什么：

class(grp[dendo$order])

导致：

[1] "integer"

这肯定不是整数吧？

透视（pivot）hclust 和 cutree 的结果

0 个答案: