I am trying to create a data table similar to the output of quanteda::textstat_frequency, but with one additional column, docnames, which is a comma-separated string of the names of the documents that contain each token.
For example:
a_corpus <- quanteda::corpus(c("some corpus text of no consequence that in practice is going to be very large",
                               "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
                               "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working"))
ngrams_dfm <- quanteda::dfm(a_corpus, tolower = T, stem = F, ngrams = 2)
freq = textstat_frequency(ngrams_dfm)
# freq's header has feature, frequency, rank, docfreq, group
data.table(feature = featnames(ngrams_dfm)[1:50],
           frequency = colSums(ngrams_dfm)[1:50],
           doc_names = paste(docnames, collapse = ','),  # what should be here?
           keep.rownames = FALSE,
           stringsAsFactors = FALSE)
Answer 0 (score: 2)
An alternative is the udpipe R package; an example follows. Its advantage is that you can easily select n-grams based on part-of-speech tags, or use the dependency parse to pick out specific relations, which works much better than raw bigrams (but that is another question); see the sketch after the example.
library(udpipe)
library(data.table)
txt <- c("some corpus text of no consequence that in practice is going to be very large",
         "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
         "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working")
## annotate: tokenisation, lemmatisation, POS tagging and dependency parsing
x <- udpipe(txt, "english", trace = TRUE) ## rich output, but takes a while for large volumes of text
x <- setDT(x)
## bigrams of lemmas, formed within each sentence
x <- x[, bigram_lemma := txt_nextgram(lemma, n = 2, sep = "-"), by = list(doc_id, paragraph_id, sentence_id)]
## POS tag of the next token, so bigrams can be filtered by tag pattern
x <- x[, upos_next := txt_next(upos, n = 1), by = list(doc_id, paragraph_id, sentence_id)]
## e.g. keep only adjective-noun combinations
x_nouns <- subset(x, upos %in% c("ADJ") & upos_next %in% c("NOUN"))
View(x)
freqs <- document_term_frequencies(x, document = "doc_id", term = c("bigram_lemma", "lemma"))
dtm <- document_term_matrix(freqs)
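As a minimal sketch of the dependency-parse selection mentioned above (assuming the standard udpipe output column dep_rel and the *_parent columns that udpipe::cbind_dependencies() adds), one could keep adjectives together with the nouns they modify:
x_dep <- cbind_dependencies(x, type = "parent")  # attach each token's syntactic parent
subset(x_dep, dep_rel == "amod" & upos_parent == "NOUN",
       select = c("doc_id", "token", "token_parent"))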
Answer 1 (score: 1)
First, you can add the document names when you create the corpus:
document_names <- c("doc1", "doc2", "doc3")
a_corpus <- quanteda::corpus(x = c("some corpus text of no consequence that in practice is going to be very large",
                                   "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
                                   "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working"),
                             docnames = document_names)
a_corpus
# Corpus consisting of 3 documents and 0 docvars.
Now you can use the document names in subsequent quanteda function calls:
ngrams_dfm <- quanteda::dfm(a_corpus, tolower = T, stem = F, ngrams = 2)
ngrams_dfm
# Document-feature matrix of: 3 documents, 43 features (63.6% sparse).
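(A side note: in quanteda version 3 and later, dfm() no longer accepts the stem and ngrams arguments; assuming a current quanteda, an equivalent tokens-based pipeline would be:)
toks <- quanteda::tokens(a_corpus)
ngrams_dfm <- quanteda::dfm(quanteda::tokens_ngrams(toks, n = 2), tolower = TRUE)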
You can also use the groups option of textstat_frequency to get the document names in the frequency results:
freq = textstat_frequency(ngrams_dfm, groups = docnames(ngrams_dfm))
head(freq)
feature frequency rank docfreq group
1 some_corpus 1 1 1 doc1
2 corpus_text 1 2 1 doc1
3 text_of 1 3 1 doc1
4 of_no 1 4 1 doc1
5 no_consequence 1 5 1 doc1
6 consequence_that 1 6 1 doc1
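From that grouped result, the documents containing any given feature are only a subset away; for example, this should return something like c("doc1", "doc3") here:
subset(freq, feature == "corpus_text")$group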
If you want to get the data from ngrams_dfm into a data.frame, use quanteda's convert function:
convert(ngrams_dfm, to = "data.frame")
document some_corpus corpus_text text_of of_no no_consequence consequence_that that_in in_practice practice_is is_going going_to to_be
1 doc1 1 1 1 1 1 1 1 1 1 1 1 1
2 doc2 0 0 0 0 0 0 0 0 0 0 0 0
3 doc3 1 1 0 0 0 0 0 0 0 0 0 0
You can reshape that to get what you want; here is a dplyr/tidyr example.
library(dplyr)
convert(ngrams_dfm, to = "data.frame") %>%
  tidyr::gather(feature, frequency, -document) %>%
  group_by(document, feature) %>%
  summarise(frequency = sum(frequency))
# A tibble: 129 x 3
# Groups: document [?]
document feature frequency
<chr> <chr> <dbl>
1 doc1 a_very 0
2 doc1 about_top 0
3 doc1 adding_some 0
4 doc1 and_so 0
5 doc1 approaches_are 0
6 doc1 are_working 0
7 doc1 be_very 1
8 doc1 but_for 0
9 doc1 care_about 0
10 doc1 consequence_that 1
# ... with 119 more rows
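(gather() is superseded in current tidyr; assuming tidyr >= 1.0, the same reshape with pivot_longer() is:)
convert(ngrams_dfm, to = "data.frame") %>%
  tidyr::pivot_longer(-document, names_to = "feature", values_to = "frequency")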
Or with data.table:
out <- data.table(convert(ngrams_dfm, to = "data.frame"))
melt(out, id.vars = "document",
     variable.name = "feature", value.name = "freq")
document feature freq
1: doc1 some_corpus 1
2: doc2 some_corpus 0
3: doc3 some_corpus 1
4: doc1 corpus_text 1
5: doc2 corpus_text 0
---
125: doc2 care_about 1
126: doc3 care_about 0
127: doc1 about_top 0
128: doc2 about_top 1
129: doc3 about_top 0
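From that long table, the doc_names column the question asks for is one aggregation away; a sketch, keeping only documents where the feature actually occurs:
long <- melt(out, id.vars = "document", variable.name = "feature", value.name = "freq")
long[freq > 0, .(frequency = sum(freq), doc_names = paste(document, collapse = ",")), by = feature]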
Answer 2 (score: 1)
Interesting answers... but neither is what the OP asked for. Without second-guessing the why, here is exactly what you want, using data.table.
# set up the data.table without the doc_names
freq_dt <- textstat_frequency(ngrams_dfm) %>%
  data.table()
setkey(freq_dt, feature)
# collapse the docnames per feature in a separate data.table
docnames_dt <-
  textstat_frequency(ngrams_dfm, groups = docnames(ngrams_dfm))[, c("feature", "group")] %>%
  data.table()
docnames_dt <- docnames_dt[, doc_names := paste(group, collapse = ","), by = feature]
docnames_dt <- unique(docnames_dt[, c("feature", "doc_names")])
setkey(docnames_dt, feature)
# quick keyed merge on feature
answerdt <- freq_dt[docnames_dt][, c("feature", "frequency", "doc_names")]
# show the results
setorder(answerdt, -frequency)
head(answerdt, 10)
## feature frequency doc_names
## 1: corpus_text 2 text1,text3
## 2: some_corpus 2 text1,text3
## 3: top_ten 2 text2,text3
## 4: very_large 2 text1,text2
## 5: a_very 1 text2
## 6: about_top 1 text2
## 7: adding_some 1 text3
## 8: and_so 1 text2
## 9: approaches_are 1 text3
## 10: are_working 1 text3
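Equivalently, one could build the doc_names column straight from the dfm without a second textstat_frequency() call; a minimal sketch:
m <- as.matrix(ngrams_dfm) > 0   # logical document-by-feature matrix
feature_docs <- apply(m, 2, function(has) paste(rownames(m)[has], collapse = ","))
head(feature_docs)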