(抱歉,我不确定这篇帖子的最佳标题是什么,欢迎随时编辑。)
假设我在单词及其类型(即字典)之间有以下关系结构:
# Lookup table pairing every word (level2) with its sentiment type (level1):
# the first three words are "Positive", the last three "Negative".
dictionary <- data.frame(
  level1 = rep(c("Positive", "Negative"), each = 3),
  level2 = c("happy", "fantastic", "great", "sad", "rubbish", "awful")
)
# level1 level2
# 1 Positive happy
# 2 Positive fantastic
# 3 Positive great
# 4 Negative sad
# 5 Negative rubbish
# 6 Negative awful
我们已经统计了这些单词在七个文档中的出现次数(即词项-文档矩阵):
# Simulated term-document matrix: one row per word, one column per document,
# each cell a count drawn uniformly (with replacement) from `range`.
set.seed(42)
range <- 0:3
words <- c("happy", "fantastic", "great", "sad", "rubbish", "awful")
doc_names <- paste0("doc", 1:7)
# lapply() visits the documents in order, so sample() is called once per
# document in the same sequence (doc1 first, doc7 last).
counts <- lapply(doc_names, function(doc) {
  sample(x = range, size = length(words), replace = TRUE)
})
names(counts) <- doc_names
df <- data.frame(counts, row.names = words)
# doc1 doc2 doc3 doc4 doc5 doc6 doc7
# happy 3 2 3 1 0 2 0
# fantastic 3 0 1 2 2 3 0
# great 1 2 1 3 1 1 3
# sad 3 2 3 0 3 2 2
# rubbish 2 1 3 3 1 0 1
# awful 2 2 0 3 3 3 1
然后我可以很容易地计算出两个单词出现在同一个文档中的频率(即共现或邻接矩阵):
# Collapse counts to presence indicators: every positive count becomes 1,
# zeros are left as they are.
df[df > 0] <- 1
# Word-by-word co-occurrence counts. tcrossprod(x) computes x %*% t(x), so
# cell [i, j] is the number of documents containing both word i and word j.
m <- tcrossprod(as.matrix(df))
# happy fantastic great sad rubbish awful
# happy 5 4 5 4 4 4
# fantastic 4 5 5 4 4 4
# great 5 5 7 6 6 6
# sad 4 4 6 6 5 5
# rubbish 4 4 6 5 6 5
# awful 4 4 6 5 5 6
问题:我如何重构我的共现矩阵,以便按字典中的单词类型(level1)而不是单词本身(level2)来汇总?
即,我想得到:
# Target result: each cell is the total of the matching 3 x 3 block of the
# word-level co-occurrence matrix shown above (values copied from it).
data.frame(
  row.names = c("Positive", "Negative"),
  Positive = c(
    sum(5, 4, 5, 4, 5, 5, 5, 5, 7),
    sum(4, 4, 6, 4, 4, 6, 4, 4, 6)
  ),
  Negative = c(
    sum(4, 4, 4, 4, 4, 4, 6, 6, 6),
    sum(6, 5, 5, 5, 6, 5, 5, 5, 6)
  )
)
# Positive Negative
# Positive 45 42
# Negative 42 48
到目前为止我做了什么:我原本希望能从这个问题中推导出做法:“Sum together columns of data.frame based on name type”。
然而,虽然我可以减少行数:
# Row-wise reduction only: sums the rows of the co-occurrence matrix `m` by
# word type. The columns are still individual words afterwards.
# library() stops with an error if data.table is missing; require() would only
# return FALSE and let the next line fail with a less helpful message.
library(data.table)
dt <- data.table(m)
# Look up each row word's type in `dictionary` rather than hard-coding the
# labels by position, so the grouping stays right if the word order changes.
word_type <- as.character(dictionary$level1)[match(rownames(m), dictionary$level2)]
dt[, level1 := word_type]
dt[, lapply(.SD, sum), by = "level1"]
# level1 happy fantastic great sad rubbish awful
# 1: Positive 14 14 17 14 14 14
# 2: Negative 12 12 18 16 16 16
我无法确定如何根据需要减少列。
答案 0 :(得分:6)
接着 df[df > 0] <- 1 这一步继续:
library(reshape)
library(reshape2)
library(data.table)
# Build the word-level co-occurrence matrix as a data.table.
# incorporating @RicardoSaporta's suggestion of using data.table(keep.rownames = TRUE),
# which stores the row words in a column named "rn".
dt <- data.table(as.matrix(df) %*% t(as.matrix(df)), keep.rownames = TRUE)
# Reduce the matrix layout to long format: one row per (rn, variable) word
# pair with its count in `value`. Inspect dt to see the change.
dt <- melt(dt, "rn")
# Attach the Positive/Negative type of the first word (rn) ...
dt <- merge(dt, dictionary, all.x = TRUE, by.y = "level2", by.x = "rn")
# ... and of the second word (variable). Both tables now carry a `level1`
# column, so the suffixes yield level1_1 (type of rn) and level1_2 (type of
# variable).
dt <- merge(dt, dictionary, all.x = TRUE, by.y = "level2", by.x = "variable", suffixes = c("_1", "_2"))
# Total count for each Positive/Negative - Positive/Negative combination.
# The result is assigned back to dt: previously it was computed and discarded,
# so this step had no effect on what followed.
dt <- data.table(dt)
dt <- dt[, list(value = sum(value)), by = c("level1_1", "level1_2")]
# Spread back into a type-by-type table (rows: level1_1, columns: level1_2).
cast(dt,level1_1~level1_2, fun.aggregate=sum)
输出
> cast(dt,level1_1~level1_2, fun.aggregate=sum)
level1_1 Negative Positive
1 Negative 48 42
2 Positive 42 45
答案 1 :(得分:5)
与目前为止的其他两个基本相同的解决方案,只是更紧凑,可能更快一点:
library(reshape2)
library(data.table)
# Long form of the co-occurrence matrix `m`: one row per (Var1, Var2) word
# pair with its count in `value`. reshape2::melt is called explicitly because
# data.table also exports a `melt`.
mdt <- data.table(reshape2::melt(m), key = "Var1")
dic <- data.table(dictionary, key = "level2")

# Map a vector of words (level2) to their types (level1) via the dictionary.
type_of <- function(words) {
  as.character(dic$level1)[match(as.character(words), as.character(dic$level2))]
}

# Name both type columns explicitly instead of relying on the name data.table
# generates for a duplicated column in a chained join; that generated name
# differs between data.table versions.
typed <- mdt[, list(level1 = type_of(Var1), level1.1 = type_of(Var2), value = value)]

# Sum the counts for every (row type, column type) combination.
dcast(typed, level1 ~ level1.1, value.var = "value", fun.aggregate = sum)
# level1 Negative Positive
#1 Negative 48 42
#2 Positive 42 45
答案 2 :(得分:4)
您可以退回一步,在创建共现矩阵之前先对邻接矩阵进行聚合:
# Aggregate the word-by-document table to type-by-document first, then form
# the co-occurrence product on the smaller matrix.
# NOTE(review): presumably `df` already holds the 0/1 indicators produced by
# df[df > 0] <- 1 -- confirm before running this on raw counts.
dict <- data.table(dictionary, key = "level2")
adj2 <- data.table(df, keep.rownames = TRUE)
# dict[rn] looks each word up by the key of `dict`; the per-document sums are
# then taken within each level1 group.
adj1 <- adj2[, lapply(.SD, sum), by = dict[rn]$level1]
# One tedious step: drop the leading grouping column and move its labels into
# the row names (the grouping column is read back as `dict` here).
adj1mat <- as.matrix(adj1[, -1L, with = FALSE])
rownames(adj1mat) <- as.character(adj1[["dict"]])
# tcrossprod(x) computes x %*% t(x): the type-by-type co-occurrence matrix.
m1 <- tcrossprod(adj1mat)
# Positive Negative
# Positive 45 42
# Negative 42 48
将字典存储为带键(keyed)的 data.table 是有意义的——至少我希望如此。
答案 3 :(得分:3)
我们可以使用 by 对您的矩阵 m 做两次聚合(aggregate)。我们必须首先使用 match 将 level2 单词转换为 level1 单词。我很确定可以在一次调用中做到这一点,但我没能想出来。两次调用也不算太糟。
# Map the row and column words of `m` to Positive / Negative. The labels are
# kept in separate vectors so the word dimnames of `m` are not overwritten,
# and rows and columns each get their own lookup.
row_type <- as.character(dictionary$level1)[match(rownames(m), dictionary$level2)]
col_type <- as.character(dictionary$level1)[match(colnames(m), dictionary$level2)]
# Aggregate down to the desired result.
# First pass: group the rows of m by row type and sum within each group. tmp
# has one row per original column of m and one column per row type.
tmp <- do.call(cbind, by(m, INDICES = row_type, FUN = colSums))
# Second pass: the rows of tmp are the columns of m, so group them by column
# type. The result has row types as rows and column types as columns.
do.call(cbind, by(tmp, INDICES = col_type, FUN = colSums))
# Negative Positive
#Negative 48 42
#Positive 42 45