我有一些看起来像下面的数据;
Docs annual appendix attach begin caption contain exhibit forth head
12355_2015 0.1385056 0.0000000 0.0000000 0.3203238 0.0000000 0.3203238 0.0000000 0.0000000 0.0000000
29905_2015 0.1269635 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3769635 0.3769635 0.0000000
51143_2015 0.1385056 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
78003_2015 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3265111
875570_2015 0.0000000 0.5872603 0.5872603 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
885639_2015 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1166691_2015 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
12355_2016 0.1385056 0.0000000 0.0000000 0.3203238 0.0000000 0.3203238 0.0000000 0.0000000 0.0000000
51143_2016 0.1385056 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
78003_2016 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3265111
875570_2016 0.0000000 0.5872603 0.5872603 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1166691_2016 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1341439_2016 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
51143_2017 0.1385056 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
315189_2017 0.0000000 0.0000000 0.0000000 0.0000000 0.5872603 0.0000000 0.0000000 0.0000000 0.0000000
773910_2017 0.1904452 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1166691_2017 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1341439_2017 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
51143_2018 0.1385056 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
78003_2018 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3265111
315189_2018 0.0000000 0.0000000 0.0000000 0.0000000 0.5872603 0.0000000 0.0000000 0.0000000 0.0000000
1166691_2018 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
行名(rownames)由 ID 和年份(Year)组成。我想针对每个 ID,分别计算其第 t 年与上一年(第 t-1 年)之间的余弦距离。例如,计算 12355_2015 行与 12355_2016 行之间的余弦距离。
我可以使用下面的代码来计算某些行之间的余弦距离:
proxy::dist(dtms_matrix[1:3,], dtms_matrix[4:6,], pairwise=TRUE, method="cosine")
但这是错误的,因为这样计算的是观察值 1:3(12355_2015、29905_2015、51143_2015)与观察值 4:6(78003_2015、875570_2015、885639_2015)之间的距离。
我想计算的是每个 ID 在不同年份之间的余弦相似度(前提是相应年份的数据存在,而它并不总是存在)。
我可以使用以下方法“尝试”计算所有行的余弦距离:
proxy::dist(dtms_matrix[1:7,], dtms_matrix[8:13,],dtms_matrix[14:18,],dtms_matrix[19:23,], pairwise=TRUE, method="cosine")
出现以下错误;
Error in do.call(".Call", c(list(method), list(x), list(y), pairwise, :
invalid number of rows for pairwise mode
我认为这是因为某些“文档”缺少某些年份的数据……
我可以使用下面的代码来计算整个距离矩阵:
# Cosine distance between every pair of rows of dtms_matrix.
dist.matrix <- proxy::dist(x = dtms_matrix, method = "cosine")
并且我也有办法从中提取相关信息,但是我的数据非常大,而且我并不想计算所有文档/年份两两之间的余弦距离:不同文档之间的余弦结果对我没有意义,例如 875570_2015 和 1166691_2016 之间的结果。(我只对 875570_2015 和 875570_2016 之间,或 1166691_2015 和 1166691_2016 之间的距离感兴趣。)
我当前的方法是先创建一个数据框,再从 8000 × 8000 的矩阵(即 6400 万个结果,其中 99.99% 都没有用)中过滤/提取所需的信息。
如果您知道如何高效地计算同一文档在第 t 年与第 t-1 年之间的成对相似度,同时保留行名和列名中的重要信息,请告诉我。
数据:
# Reproducible example data: a 23 x 24 numeric matrix (see `.Dim`).
# Rows (dimension "Docs") are named "<ID>_<year>"; columns (dimension "Terms")
# are named by term. The values are per-document term weights, most of them 0.
dtms_matrix <- structure(c(0.138505632368819, 0.126963496338084, 0.138505632368819,
0, 0, 0, 0, 0.138505632368819, 0.138505632368819, 0, 0, 0, 0,
0.138505632368819, 0, 0.190445244507127, 0, 0, 0.138505632368819,
0, 0, 0, 0, 0, 0, 0, 0, 0.587260326009502, 0, 0, 0, 0, 0, 0.587260326009502,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.587260326009502,
0, 0, 0, 0, 0, 0.587260326009502, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.320323814187001, 0, 0, 0, 0, 0, 0, 0.320323814187001,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.587260326009502, 0, 0, 0, 0, 0, 0.587260326009502,
0, 0, 0.320323814187001, 0, 0, 0, 0, 0, 0, 0.320323814187001,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.376963496338084,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.376963496338084, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.326511050592873, 0, 0, 0,
0, 0, 0.326511050592873, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.326511050592873,
0, 0, 0, 0.176236314121442, 0, 0.176236314121442, 0, 0, 0, 0,
0.176236314121442, 0.176236314121442, 0, 0, 0, 0, 0.176236314121442,
0, 0, 0, 0, 0.176236314121442, 0, 0, 0, 0, 0, 0, 0.22941472327791,
0, 0, 0, 0, 0, 0.22941472327791, 0, 0, 0, 0, 0.22941472327791,
0, 0, 0, 0, 0.22941472327791, 0, 0, 0, 0, 0.0967391215836105,
0.0886775281183096, 0.0967391215836105, 0.118236704157746, 0,
0, 0, 0.0967391215836105, 0.0967391215836105, 0.118236704157746,
0, 0, 0, 0.0967391215836105, 0, 0.133016292177464, 0, 0, 0.0967391215836105,
0.118236704157746, 0, 0, 0, 0.109239441924514, 0.100136155097471,
0, 0.133514873463294, 0.200272310194942, 0, 0, 0.109239441924514,
0, 0.133514873463294, 0.200272310194942, 0, 0, 0, 0.200272310194942,
0, 0, 0, 0, 0.133514873463294, 0.200272310194942, 0, 0, 0.320323814187001,
0, 0, 0, 0, 0, 0, 0.320323814187001, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.210296829671418, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.420593659342835, 0, 0, 0, 0, 0.420593659342835,
0, 0, 0, 0, 0.420593659342835, 0, 0, 0.176236314121442, 0, 0,
0, 0, 0, 0.176236314121442, 0, 0, 0, 0, 0.176236314121442, 0.323099909222643,
0, 0, 0, 0.176236314121442, 0, 0.323099909222643, 0, 0, 0.0967391215836105,
0.0886775281183096, 0.193478243167221, 0.118236704157746, 0,
0, 0, 0.0967391215836105, 0.193478243167221, 0.118236704157746,
0, 0, 0, 0.193478243167221, 0, 0.133016292177464, 0, 0, 0.193478243167221,
0.118236704157746, 0, 0, 0, 0.0967391215836105, 0.0886775281183096,
0.0967391215836105, 0.118236704157746, 0, 0, 0, 0.0967391215836105,
0.0967391215836105, 0.118236704157746, 0, 0, 0, 0.0967391215836105,
0, 0.133016292177464, 0, 0, 0.0967391215836105, 0.118236704157746,
0, 0, 0, 0, 0, 0, 0.244625984574406, 0.366938976861608, 0, 0,
0, 0, 0.244625984574406, 0.366938976861608, 0, 0, 0, 0, 0, 0,
0, 0, 0.244625984574406, 0, 0, 0, 0, 0, 0, 0.326511050592873,
0, 0, 0, 0, 0, 0.326511050592873, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.326511050592873, 0, 0, 0, 0, 0.376963496338084, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.4512123182049, 0, 0, 0, 0, 0.4512123182049, 0.22560615910245,
0, 0.22560615910245, 0, 0.4512123182049, 0.22560615910245, 0,
0, 0.22560615910245, 0.4512123182049, 0.22560615910245, 0, 0.376963496338084,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.22941472327791, 0, 0, 0, 0, 0, 0.22941472327791, 0, 0,
0, 0, 0.22941472327791, 0, 0, 0, 0, 0.22941472327791, 0, 0, 0,
0), .Dim = 23:24, .Dimnames = list(Docs = c("12355_2015", "29905_2015",
"51143_2015", "78003_2015", "875570_2015", "885639_2015", "1166691_2015",
"12355_2016", "51143_2016", "78003_2016", "875570_2016", "1166691_2016",
"1341439_2016", "51143_2017", "315189_2017", "773910_2017", "1166691_2017",
"1341439_2017", "51143_2018", "78003_2018", "315189_2018", "1166691_2018",
"1341439_2018"), Terms = c("annual", "appendix", "attach", "begin",
"caption", "contain", "exhibit", "forth", "head", "herein", "ibm",
"incorpor", "inform", "mda", "page", "pages", "refer", "report",
"requir", "review", "section", "see", "set", "stockhold")))
编辑:一个自定义函数,用于测量两个文本之间的余弦相似度
#' Cosine similarity between the dictionary-word counts of two texts
#'
#' Splits `x` and `y` into runs of word characters (`\\w+`), keeps only the
#' tokens that appear verbatim in `qdapDictionaries::GradyAugmented`, counts
#' each remaining token, and returns the cosine of the angle between the two
#' count vectors taken over the combined vocabulary.
#'
#' @param x,y Character vectors. All elements of a vector are pooled into a
#'   single bag of tokens.
#' @return A single number, or `NA` when either text has no token left after
#'   the dictionary filter.
cos_text <- function(x, y) {
  # Namespace-qualified lookups: the function no longer relies on the caller
  # having attached stringr, and no longer alters the search path via library().
  dictionary <- qdapDictionaries::GradyAugmented

  tokens_x <- unlist(stringr::str_extract_all(x, "\\w+"))
  tokens_y <- unlist(stringr::str_extract_all(y, "\\w+"))
  tokens_x <- tokens_x[tokens_x %in% dictionary]
  tokens_y <- tokens_y[tokens_y %in% dictionary]

  # With no dictionary word on one side the similarity is undefined.
  if (length(tokens_x) == 0 || length(tokens_y) == 0) {
    return(NA)
  }

  # Count both texts over the same ordered vocabulary; factor levels make
  # table() emit a 0 for words absent from one side.
  vocab <- sort(unique(c(tokens_x, tokens_y)))
  counts_x <- as.numeric(table(factor(tokens_x, levels = vocab)))
  counts_y <- as.numeric(table(factor(tokens_y, levels = vocab)))

  l2_norm <- function(v) {
    sqrt(sum(v^2))
  }

  sum(counts_x * counts_y) / l2_norm(counts_x) / l2_norm(counts_y)
}
cos_text(string1, string2)