The fastrtext package has a get_dictionary() function, which I thought would return all the words in the dictionary. However, when I set wordNgrams to 2 or 3, it returns exactly the same word list as when wordNgrams is set to 1. Can someone tell me what is going on here? Thanks!
Answer 0 (score: 0)
When you increase n in n-grams, your fastText classification algorithm still operates on the same dictionary in every case. Instead of training only on individual words ("I", "love", "NY"), it also trains on concatenations of consecutive words ("I love", "love NY" is a bigram example). Internally, fastText does not add these n-grams to the dictionary; it hashes them into a fixed number of buckets (the -bucket option), which is why get_dictionary() returns the same word list regardless of wordNgrams. To demonstrate, I trained with 5-grams; of course, the larger n is, the longer the training takes, but syntactic structure is captured better.
library(fastrtext)
data("train_sentences")
data("test_sentences")
# prepare data
tmp_file_model <- tempfile()
train_labels <- paste0("__label__", train_sentences[,"class.text"])
train_texts <- tolower(train_sentences[,"text"])
train_to_write <- paste(train_labels, train_texts)
train_tmp_file_txt <- tempfile()
writeLines(text = train_to_write, con = train_tmp_file_txt)
test_labels <- paste0("__label__", test_sentences[,"class.text"])
test_texts <- tolower(test_sentences[,"text"])
test_to_write <- paste(test_labels, test_texts)
# learn model 1 (1-grams)
library(microbenchmark)
microbenchmark(execute(commands = c("supervised", "-input", train_tmp_file_txt,
"-output", tmp_file_model, "-dim", 20, "-lr", 1,
"-epoch", 20, "-wordNgrams", 1, "-verbose", 1)), times = 5)
# mean time: 1.229228 seconds
model1 <- load_model(tmp_file_model)
# learn model 2 (5-grams)
microbenchmark(execute(commands = c("supervised", "-input", train_tmp_file_txt,
"-output", tmp_file_model, "-dim", 20, "-lr", 1,
"-epoch", 20, "-wordNgrams", 5, "-verbose", 1)), times = 5)
# mean time: 2.659191 seconds
model2 <- load_model(tmp_file_model)
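Even though the dictionaries will match, the two model files were trained with different settings. As a quick sanity check, here is a minimal sketch assuming fastrtext's get_parameters(), which reports the hyper-parameters stored in a loaded model; the n-gram order should be the only difference between the two:

# inspect the stored hyper-parameters; everything should be identical
# except the n-gram order (1 vs. 5)
str(get_parameters(model1))
str(get_parameters(model2))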
str(get_dictionary(model1))
# chr [1:5060] "the" "</s>" "of" "to" "and" "in" "a" "that" "is" "for" ...
str(get_dictionary(model2))
# chr [1:5060] "the" "</s>" "of" "to" "and" "in" "a" "that" "is" "for" ...
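So the n-grams never show up in the dictionary, but they do change the model. One way to see the difference is to compare the two models on the held-out test sentences. This is a rough sketch following the prediction pattern in the fastrtext README, assuming predict() returns, for each sentence, a named probability vector whose name is the predicted class (without the __label__ prefix):

# compare test accuracy of the 1-gram and 5-gram models
predictions1 <- predict(model1, sentences = test_to_write)
predictions2 <- predict(model2, sentences = test_to_write)
# fraction of test sentences where the predicted class matches the truth
mean(names(unlist(predictions1)) == test_sentences[, "class.text"])
mean(names(unlist(predictions2)) == test_sentences[, "class.text"])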