在 R 的 rword2vec 包中,我想直接使用已经存在于全局环境中的语料库作为训练数据(train_file)。这看起来应该是可行的,但我不知道该怎么做。目前,我必须先把语料库写入文本文件,再把该文件的路径传给 word2vec 函数,如下所示:
library(tm)
library(rword2vec)
#' Remove http/https URLs from text
#'
#' Deletes every run of non-whitespace characters that starts with `http:` or
#' `https:` (case-insensitive). The match runs up to the next whitespace so the
#' `//host/path?query` part of a URL is removed too, not just the scheme.
#'
#' @param x A character vector.
#' @return A character vector the same length as `x` with URLs removed.
removeURL <- function(x) {
  gsub("https?:[^[:space:]]*", "", x, ignore.case = TRUE)
}
#' Clean a tm corpus before training
#'
#' Applies, in order: URL removal, lower-casing, number removal, punctuation
#' removal (keeping intra-word dashes), English stop-word removal, English
#' stemming, whitespace collapsing, removal of stand-alone digit runs, and
#' trimming of leading/trailing whitespace.
#'
#' @param corpus A tm corpus.
#' @return The transformed corpus.
processCorpus <- function(corpus) {
  # Plain string functions are wrapped in content_transformer() so tm_map()
  # applies them to the document text and keeps the corpus structure intact.
  strip_standalone_digits <- content_transformer(function(x) {
    gsub("\\s*(?<!\\B|-)\\d+(?!\\B|-)\\s*", "", x, perl = TRUE)
  })

  corpus <- tm_map(corpus, content_transformer(removeURL))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stemDocument, language = "english")
  corpus <- tm_map(corpus, stripWhitespace)
  # NOTE(review): removeNumbers has already run, so this step is presumably a
  # no-op; it is kept to preserve the original pipeline.
  corpus <- tm_map(corpus, strip_standalone_digits)
  # Base trimws() replaces str_trim(): stringr is never loaded in this script,
  # so the original call failed with "could not find function".
  corpus <- tm_map(corpus, content_transformer(trimws))
  corpus
}
# corpus_file_path <- ...
# corpus_file_path_2 <- ...
# Read the raw documents; column `x` supplies one document per element.
# stringsAsFactors = FALSE keeps `x` as character on R versions before 4.0.
df_corpus <- read.table(corpus_file_path, stringsAsFactors = FALSE)
corpus <- Corpus(VectorSource(df_corpus$x))
corpus <- processCorpus(corpus)

# Flatten each document to a single line of plain text.
cleaned_text <- vapply(
  content(corpus),
  function(doc) paste(as.character(doc), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)

# Write plain text, one document per line. write.table() would add a header
# line, row numbers and double quotes, all of which would end up as tokens in
# the training data.
writeLines(cleaned_text, corpus_file_path_2)

model <- word2vec(
  train_file = corpus_file_path_2,
  output_file = "vec.bin",
  binary = 1,
  num_threads = 128
)
与其这样调用:
# Current approach: train from the text file previously written to disk.
model <- word2vec(train_file = corpus_file_path_2, output_file = "vec.bin", binary = 1, num_threads = 128)
我觉得应该可以像下面这样直接传入内存中的语料库:
# Desired (illustrative) call: pass the in-memory corpus text as train_file.
# NOTE(review): the working call passes a file path here; presumably
# train_file must be a path on disk -- confirm against the rword2vec docs.
model <- word2vec(train_file = corpus$content, output_file = "vec.bin", binary = 1, num_threads = 128)