我刚开始使用R进行文本挖掘并遇到了问题。
我已成功为我的数据集中的单个单词绘制了tf_idf,其中包含3个不同的列(正数,负数和库) - 列名称为“Box'”。
我正在尝试为bigrams和trigrams做同样的事情并使用相同的代码:
Trigram_tibble %>%
arrange(desc(tf_idf)) %>%
mutate(trigram = factor(trigram, levels = rev(unique(trigram)))) %>%
group_by(Box) %>%
top_n(10, tf_idf) %>%
ungroup %>%
ggplot(aes(trigram, tf_idf, fill = Box)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~Box, ncol = 2, scales = "free") +
coord_flip()
我发现(我认为)' top_n'函数返回具有最高排名的三元组,并且它自动使用tibble中的最后一个变量(在我的例子中,这是tf_idf,我选择了n-10)。然而,当为bigrams运行它时,我只能生成沿y轴似乎有几百(千?)个bigrams的图表。
在图片中你可以看到负面变量看起来很好(我已经将它编辑为数据保护),但其他两个都不是!
我最初从整洁的文本挖掘书中获取了这段代码。
编辑 - 添加数据样本我现在最好的猜测是' top_n' tf_idf得分碰巧有许多完全相同。在这种情况下,我现在不确定这是一个有用的计算,我想知道为什么它在整洁的教科书中工作得很好,但不是我的数据。
我将Trigram_tibble减少到50个观察值,这是dput的输出(Trigram_tibble)(我已经模糊了调查回复文本三元组)
a< -Trigram_tibble [1:50,1:8] dput(a)中 结构(答复者= c(1294L,2693L,42L,463L,463L, 1481L,1706L,1891L,1917L,2442L,2693L,3590L,3590L,3916L, 4454L,4682L,5996L,6283L,6283L,6568L,9101L,2L,3L,4L, 4L,4L,8L,12L,12L,13L,13L,13L,13L,13L,13L,13L,13L, 18L,18L,18L,18L,20L,21L,21L,21L,21L,21L,21L,21L,21L ),Box = c("肯定","否定","否定","否定","否定" , "否定","银行","肯定","否定","否定","否定& #34 ;, " Bank"," Bank"," Negative"," Positive"," Negative"," Negative& #34 ;, "否定","否定","否定","否定","否定","否定& #34 ;, "否定","否定","否定","否定","银行","银行& #34 ;, "否定","否定","否定","否定","否定","肯定& #34 ;, "肯定","肯定","否定","否定","否定","否定& #34 ;, " Negative"," Bank"," Bank"," Bank"," Negative"," Negative& #34;,"否定", "否定","否定"),trigram = c(" xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"," xxx xxx xxx", " xxx xxx xxx"," xxx xxx xxx"),n = c(4L, 3L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L, 2L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L),总= c(1714L,2899L,2899L,2899L,2899L,2899L,836L, 1714L,2899L,2899L,2899L,836L,836L,2899L,1714L,2899L, 2899L,2899L,2899L,2899L,2899L,2899L,2899L,2899L,2899L, 2899L,2899L,836L,836L,2899L,2899L,2899L,2899L,2899L, 1714L,1714L,1714L,2899L,2899L,2899L,2899L,2899L,836L, 836L,836L,2899L,2899L,2899L,2899L,2899L),tf = c(0.00233372228704784, 0.00103483959986202,0.000689893066574681,0.000689893066574681, 0.000689893066574681,0.000689893066574681,0.00239234449760766, 0.00116686114352392,0.000689893066574681,0.000689893066574681, 0.000689893066574681,0.00239234449760766,0.00239234449760766, 0.000689893066574681,0.00116686114352392,0.000689893066574681, 0.000689893066574681,0.000689893066574681,0.000689893066574681, 0.000689893066574681,0.000689893066574681,0.00034494653328734, 0.00034494653328734,0.00034494653328734,0.00034494653328734, 0.00034494653328734,0.00034494653328734,0.00119617224880383, 0.00119617224880383,0.00034494653328734,0.00034494653328734, 0.00034494653328734,0.00034494653328734,0.00034494653328734, 0.00058343057176196,0.00058343057176196,0.00058343057176196, 0.00034494653328734,0.00034494653328734,0.00034494653328734, 0.00034494653328734,0.00034494653328734,0.00119617224880383, 0.00119617224880383,0.00119617224880383,0.00034494653328734, 0.00034494653328734,0.00034494653328734,0.00034494653328734, 0.00034494653328734),idf = c(-2.07944154167984,0.405465108108164, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 0,1.09861228866811,1.09861228866811,-2.07944154167984,1.09861228866811, 0.405465108108164,1.09861228866811,-0.693147180559945,1.09861228866811, 1.09861228866811,0,1.09861228866811,1.09861228866811,-1.29928298413026, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811,1.09861228866811,1.09861228866811, 1.09861228866811,1.09861228866811),tf_idf = c(-0.00485283907043135, 0.000419591350232664,0.000757925000805871,0.000757925000805871, 0.000757925000805871,0.000757925000805871,0,0.0012819279914447, 0.000757925000805871,-0.00143459230195228,0.000757925000805871, 0.00097001222035446,0.00262825906379931,-0.000478197433984095, 0.0012819279914447,0.000757925000805871,0,0.000757925000805871, 0.000757925000805871,-0.000896366322269928,0.000757925000805871, 0.000378962500402935,0.000378962500402935,0.000378962500402935, 0.000378962500402935,0.000378962500402935,0.000378962500402935, 0.00131412953189965,0.00131412953189965,0.000378962500402935, 0.000378962500402935,0.000378962500402935,0.000378962500402935, 0.000378962500402935,0.000640963995722351,0.000640963995722351, 0.000640963995722351,0.000378962500402935,0.000378962500402935, 0.000378962500402935,0.000378962500402935,0.000378962500402935, 0.00131412953189965,0.00131412953189965,0.00131412953189965, 0.000378962500402935,0.000378962500402935,0.000378962500402935, 0.000378962500402935,0.000378962500402935)),row.names = c(NA, -50L),class = c(" tbl_df"," tbl"," data.frame"))