使用PoS标记的文本相似性

时间:2018-05-16 19:31:02

标签: r quanteda udpipe

我想通过仅使用特定POS标签的单词来计算文本相似度。目前我正在使用余弦方法计算相似度,但它没有考虑到POS标记。

# Example service descriptions, data set A. The names are kept as plain
# character strings (not factors); FALSE is spelled out because the shorthand
# `F` is an ordinary variable that can be reassigned.
A <- data.frame(name = c(
  "X-ray right leg arteries",
  "consultation of gynecologist",
  "x-ray leg arteries",
  "x-ray leg with 20km distance"
), stringsAsFactors = FALSE)

# Example service descriptions, data set B, to be compared against A.
B <- data.frame(name = c(
  "X-ray left leg arteries",
  "consultation (inspection) of gynecalogist",
  "MRI right leg arteries",
  "X-ray right leg arteries with special care"
), stringsAsFactors = FALSE)

# One corpus per data set, with document names prefixed "A." / "B." so the
# rows of the combined matrix can be told apart.
corp1 <- corpus(A, text_field = "name")
docnames(corp1) <- paste("A", seq_len(ndoc(corp1)), sep = ".")
corp2 <- corpus(B, text_field = "name")
docnames(corp2) <- paste("B", seq_len(ndoc(corp2)), sep = ".")

# Document-feature matrices built with ngrams = 2, stacked into one matrix.
dfm_a <- dfm(corp1, ngrams = 2)
dfm_b <- dfm(corp2, ngrams = 2)
dtm3 <- rbind(dfm_a, dfm_b)

# Cosine similarity of one B document against every A document.
# The selected B document is placed first in the subset, so the first row of
# the result is dropped (presumably its similarity with itself -- confirm
# against the installed quanteda version).
cosine_against_a <- function(b_doc) {
  subset_rows <- c(b_doc, docnames(corp1))
  similarities <- textstat_simil(dtm3[subset_rows, ],
                                 method = "cosine",
                                 selection = b_doc)
  similarities[-1, , drop = FALSE]
}

cosines <- lapply(docnames(corp2), cosine_against_a)

# One column per B document, one row per A document.
do.call(cbind, cosines)

在上面的例子中,"X-ray right leg arteries" 不应该匹配到 "MRI right leg arteries",因为这是两类不同的服务。不幸的是,我没有明确的服务分类,只有服务的文本描述。是否可以借助POS标签,给 "x-ray"、"consultation"、"leg" 和 "arteries" 这类词更高的权重?代码中提到的服务只是示例,实际上我有超过1万条服务。我尝试过用于PoS标记的udpipe包,但没有取得多大成功。

1 个答案:

答案 0 :(得分:2)

为了使用udpipe进行pos标记,您可以执行以下操作(根据您的示例数据A和B)。

library(udpipe)
library(magrittr)
library(data.table)
# Stack both data sets into one table; the "dataset" column holds the list
# name ("A" or "B") each row came from.
txt <- rbindlist(list(A = A, B = B), idcol = "dataset")
# Document identifier: data set label plus the row number in the stacked table.
txt[, id := sprintf("dataset%s_id%s", dataset, seq_len(.N))]

# Tag using udpipe version 0.6 on CRAN which allows to show annotation progress
english_model <- udpipe_download_model("english")
udmodel <- udpipe_load_model(english_model$file_model)
# trace = 5 is passed through to udpipe_annotate for progress reporting.
annotation <- udpipe_annotate(udmodel, x = txt$name, doc_id = txt$id, trace = 5)
txt_anno <- as.data.table(annotation)

如果您想基于词元(lemma)的文档-词项矩阵计算相似度,请执行以下操作(使用 R 包 text2vec 中的 sim2 函数)。

# Build a document-term matrix from the lemmas of nouns only.
# Step 1: keep the noun rows and the two columns needed for counting.
noun_lemmas <- subset(txt_anno, upos %in% c("NOUN"), select = c("doc_id", "lemma"))
# Step 2: count lemma occurrences per document, then cast to a matrix.
noun_counts <- document_term_frequencies(noun_lemmas)
dtm1 <- document_term_matrix(noun_counts)

library(text2vec)
# Pairwise cosine similarity between the rows (documents) of dtm1.
sim2(x = dtm1, y = dtm1, method = "cosine")

如果您还想把名词的n-gram也纳入计算,请执行以下操作:提取彼此相邻的名词,为这些新的复合词创建文档-词项矩阵,并将其与现有的文档-词项矩阵合并,这样就可以方便地计算文档相似度。

# Add noun n-grams: two nouns following one another, with optional
# punctuation in between, detected separately within each document.
phrases_by_doc <- txt_anno[
  ,
  keywords_phrases(x = upos, term = lemma, pattern = "NOUN(PUNCT)*NOUN", is_regex = TRUE),
  by = "doc_id"
]
# Collapse to one row per distinct keyword/ngram pair with its number of rows.
keyw <- phrases_by_doc[, list(freq = .N), by = c("keyword", "ngram")]

# Add a `term` column in which the detected compounds are recoded as single
# terms; `:=` updates txt_anno in place, so no re-assignment is needed.
txt_anno[
  ,
  term := txt_recode_ngram(x = lemma, compound = keyw$keyword, ngram = keyw$ngram),
  by = "doc_id"
]

# Document-term matrix restricted to the compound terms.
compound_terms <- subset(txt_anno, term %in% keyw$keyword, select = c("doc_id", "term"))
dtm2 <- document_term_matrix(document_term_frequencies(compound_terms))

# Combine the noun DTM and the compound DTM, then compare all documents.
dtmcombined <- dtm_cbind(dtm1, dtm2)
colnames(dtmcombined)
sim2(x = dtmcombined, y = dtmcombined, method = "cosine")