Question

我正在研究文本挖掘项目，并且我使用tm包在R中创建了一个稀疏矩阵。数据采用以下格式：

我希望它采用以下格式： Resultant Data Format

需要数据整理（data wrangling）方面的帮助。

Answer 1

一个使用 dplyr 和 tidyr 的思路：

library(dplyr)
library(tidyr)
# Sum every word-count column within each (C1, C2, C3) combination, then
# reshape the per-word totals from wide to long: one row per group and word.
df %>%
  group_by(C1, C2, C3) %>%
  # across() replaces the deprecated summarise_each(funs(sum)); "drop_last"
  # keeps the result grouped by C1 and C2, exactly as summarise_each() did.
  summarise(across(everything(), sum), .groups = "drop_last") %>%
  # `word` receives the former column name, `freq` the summed count.
  # gather() stacks whole columns, so rows come out ordered by word.
  gather(word, freq, not:great)

#Source: local data frame [24 x 5]
#Groups: C1, C2 [4]

#      C1     C2    C3  word  freq
#   <dbl> <fctr> <dbl> <chr> <dbl>
#1      1      a     1   not     0
#2      1      a     2   not     1
#3      2      b     3   not     2
#4      2      d     2   not     0
#5      3      c     1   not     1
#6      3      c     2   not     0
#7      1      a     1  cant     1
#8      1      a     2  cant     0
#9      2      b     3  cant     0
#10     2      d     2  cant     0

数据

# Sample data: the output of dput(df), assigned to `df` so that the block can
# be pasted into an R session as-is.
df <- structure(
  list(
    C1 = c(1, 2, 3, 2, 3, 2, 1),
    C2 = structure(
      c(1L, 2L, 3L, 2L, 3L, 4L, 1L),
      .Label = c("a", "b", "c", "d"),
      class = "factor"
    ),
    C3 = c(2, 3, 2, 3, 1, 2, 1),
    not = c(1, 1, 0, 1, 1, 0, 0),
    cant = c(0, 0, 0, 0, 1, 0, 1),
    able = c(1, 0, 0, 0, 0, 0, 0),
    great = c(0, 0, 0, 0, 0, 1, 1)
  ),
  .Names = c("C1", "C2", "C3", "not", "cant", "able", "great"),
  row.names = c(NA, -7L),
  class = "data.frame"
)

如何获得R

1 个答案: