Referencing document names for ngrams in quanteda

Date: 2018-12-17 10:25:55

Tags: r quanteda dfm

I am trying to create a data table similar to the output of quanteda::textstat_frequency, but with an additional column, docnames, which is a string of the names of the documents that contain the particular token. For example:

a_corpus <- quanteda::corpus(c("some corpus text of no consequence that in practice is going to be very large",
                                   "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
                                   "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working"))

ngrams_dfm <- quanteda::dfm(a_corpus, tolower = T, stem = F, ngrams = 2)
freq = textstat_frequency(ngrams_dfm)
# freq's header has feature, frequency, rank, docfreq, group

data.table(feature = featnames(ngrams_dfm)[1:50],
           frequency = colSums(ngrams_dfm)[1:50],
           doc_names = paste(docnames, collapse = ',')?, # what should be here?
           keep.rownames = F,
           stringsAsFactors = F)

3 Answers:

Answer 0 (score: 2)

Another option is to use the udpipe R package. An example is below; it has the advantage that you can easily select on the basis of part-of-speech tags, or also use it to select specific dependency-parsing results, which works far better than plain bigrams (but that is another question).

library(udpipe)
library(data.table)
txt <- c("some corpus text of no consequence that in practice is going to be very large",
       "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
       "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working")
x <- udpipe(txt, "english", trace = TRUE) ## rich output, but takes a while for large volumes of text
x <- setDT(x)
x <- x[, bigram_lemma := txt_nextgram(lemma, n = 2, sep = "-"), by = list(doc_id, paragraph_id, sentence_id)]
x <- x[, upos_next := txt_next(upos, n = 1), by = list(doc_id, paragraph_id, sentence_id)]
x_nouns <- subset(x, upos %in% c("ADJ") & upos_next %in% c("NOUN"))
View(x)
freqs <- document_term_frequencies(x, document = "doc_id", term = c("bigram_lemma", "lemma"))
dtm <- document_term_matrix(freqs)
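
If you also want the comma-separated doc_names column the question asks for, a minimal sketch (not part of the original answer, assuming the data.table x built above) is to aggregate the bigrams by document id:

# sketch only: for each bigram, count occurrences and collapse the ids of the
# documents containing it into one comma-separated string
bigram_docs <- x[!is.na(bigram_lemma),
                 .(frequency = .N,
                   doc_names = paste(sort(unique(doc_id)), collapse = ",")),
                 by = bigram_lemma]
head(bigram_docs[order(-frequency)], 10)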

Answer 1 (score: 1)

First, you can add the document names to the corpus:

document_names <- c("doc1", "doc2", "doc3")

a_corpus <- quanteda::corpus(x = c("some corpus text of no consequence that in practice is going to be very large",
                               "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
                               "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working"),
                             docnames = document_names)

a_corpus
# Corpus consisting of 3 documents and 0 docvars.

Now you can use the document names in subsequent quanteda function calls:

ngrams_dfm <- quanteda::dfm(a_corpus, tolower = T, stem = F, ngrams = 2)

ngrams_dfm 
# Document-feature matrix of: 3 documents, 43 features (63.6% sparse).

You can also use the groups option of textstat_frequency to get the document names in the frequency results:

freq = textstat_frequency(ngrams_dfm, groups = docnames(ngrams_dfm))
head(freq)
           feature frequency rank docfreq group
1      some_corpus         1    1       1  doc1
2      corpus_text         1    2       1  doc1
3          text_of         1    3       1  doc1
4            of_no         1    4       1  doc1
5   no_consequence         1    5       1  doc1
6 consequence_that         1    6       1  doc1

If you want to get the data from ngrams_dfm into a data.frame, use quanteda's convert function:

convert(ngrams_dfm, to = "data.frame")

 document some_corpus corpus_text text_of of_no no_consequence consequence_that that_in in_practice practice_is is_going going_to to_be
1     doc1           1           1       1     1              1                1       1           1           1        1        1     1
2     doc2           0           0       0     0              0                0       0           0           0        0        0     0
3     doc3           1           1       0     0              0                0       0           0           0        0        0     0

You can reshape that to get what you want; here is a dplyr/tidyr example:

library(dplyr)

 convert(ngrams_dfm, to = "data.frame") %>% 
  tidyr::gather(feature, frequency, -document) %>% 
  group_by(document, feature) %>% 
  summarise(frequency = sum(frequency)) 

# A tibble: 129 x 3
# Groups:   document [?]
   document feature          frequency
   <chr>    <chr>                <dbl>
 1 doc1     a_very                   0
 2 doc1     about_top                0
 3 doc1     adding_some              0
 4 doc1     and_so                   0
 5 doc1     approaches_are           0
 6 doc1     are_working              0
 7 doc1     be_very                  1
 8 doc1     but_for                  0
 9 doc1     care_about               0
10 doc1     consequence_that         1
# ... with 119 more rows
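
To go all the way to the doc_names column the question asks for, a sketch (not in the original answer) that continues the same pipeline could drop the zero counts and then collapse the remaining documents per feature:

convert(ngrams_dfm, to = "data.frame") %>% 
  tidyr::gather(feature, frequency, -document) %>% 
  dplyr::filter(frequency > 0) %>%   # keep only documents that actually contain the feature
  group_by(feature) %>% 
  summarise(frequency = sum(frequency),
            doc_names = paste(document, collapse = ","))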

Or with data.table:

library(data.table)

out <- data.table(convert(ngrams_dfm, to = "data.frame"))
melt(out, id.vars = "document", 
     variable.name = "feature", value.name = "freq")
     document     feature freq
  1:     doc1 some_corpus    1
  2:     doc2 some_corpus    0
  3:     doc3 some_corpus    1
  4:     doc1 corpus_text    1
  5:     doc2 corpus_text    0
 ---                          
125:     doc2  care_about    1
126:     doc3  care_about    0
127:     doc1   about_top    0
128:     doc2   about_top    1
129:     doc3   about_top    0

Answer 2 (score: 1)

Interesting answers... but not what the OP asked. Without judging why you want this, here is exactly what you asked for, using data.table.

library(data.table)
library(magrittr)  # for the %>% pipe used below

# set up the data.table without the doc_names
freq_dt <- textstat_frequency(ngrams_dfm) %>%
  data.table()
setkey(freq_dt, feature)

# do the docnames collapsing as a separate data.table
docnames_dt <-
  textstat_frequency(ngrams_dfm, groups = docnames(ngrams_dfm))[, c("feature", "group")] %>%
  data.table()
docnames_dt <- docnames_dt[, doc_names := paste(group, collapse = ","), by = feature]
docnames_dt <- unique(docnames_dt[, c("feature", "doc_names")])
setkey(docnames_dt, feature)

# quick merge
answerdt <- freq_dt[docnames_dt][, c("feature", "frequency", "doc_names")]

# show the results
setorder(answerdt, -frequency)
head(answerdt, 10)
##            feature frequency   doc_names
##  1:    corpus_text         2 text1,text3
##  2:    some_corpus         2 text1,text3
##  3:        top_ten         2 text2,text3
##  4:     very_large         2 text1,text2
##  5:         a_very         1       text2
##  6:      about_top         1       text2
##  7:    adding_some         1       text3
##  8:         and_so         1       text2
##  9: approaches_are         1       text3
## 10:    are_working         1       text3