我正在做一个预测文本中下一个单词的项目。我在 R 中使用 quanteda
包生成了三元组(trigram)和二元组(bigram)。我知道需要最大化留出(held-out)测试集中句子的概率,但不知道具体该怎么做。任何帮助都将不胜感激 :)
答案 0 :(得分:0)
用 R 实现下一个单词预测是 Coursera 上约翰斯·霍普金斯大学(JHU)数据科学系列课程的结业(Capstone)项目,因此您可以在网上找到往届学生发布的若干 R 实现示例。
以下是其中一个示例的代码摘录(使用 data.table 包):
predict_trigrams <- function(input.dt, ngram4.dt) {
  # Look up the three input words (w1, w2, w3) in the 4-gram table and
  # return the matching fourth words (column w4), highest `tots` first.
  #
  # input.dt  : table holding the input words in columns w1, w2, w3.
  # ngram4.dt : data.table of 4-grams with columns w1..w4 and `tots`.
  #
  # Side effect: `ngram4.dt` is re-keyed by reference on (w1, w2, w3).
  setkeyv(ngram4.dt, c("w1", "w2", "w3"))

  # Keyed join on the three context words; list() is what .() abbreviates.
  matches <- ngram4.dt[list(input.dt$w1, input.dt$w2, input.dt$w3)]

  matches[order(-tots)]$w4
}
predict_bigrams <- function(input.dt, ngram4.dt) {
  # Match the input's last two words (w2, w3) against two adjacent column
  # pairs of the 4-gram table -- (w1, w2) and (w2, w3) -- and return the
  # words that follow them, ranked by their pooled `tots`.
  #
  # input.dt  : table holding the input words; columns w2 and w3 are read.
  # ngram4.dt : data.table of 4-grams with columns w1..w4, `tots`, `ngrams`.
  #
  # Side effect: `ngram4.dt` is re-keyed by reference, ending on (w2, w3).

  # Sum `tots` per following word for one pair of key columns.
  #   pair     : the two columns matched against the input bigram
  #   follower : the column right after the pair (the predicted word)
  #   unused   : the remaining word column, dropped together with `ngrams`
  tally_followers <- function(pair, follower, unused) {
    setkeyv(ngram4.dt, pair)
    hits <- ngram4.dt[list(input.dt$w2, input.dt$w3)]
    discard <- c("ngrams", unused)
    hits[, (discard) := NULL]
    # Sorting the subset by the follower column fixes the group order, and
    # therefore the order of ties after the sort on V1 below.
    setkeyv(hits, c(pair, follower))
    totals <- hits[, sum(tots), by = follower]
    totals[order(-V1)]
  }

  # Input (w2, w3) seen in columns (w1, w2): the follower is w3.
  after_12 <- tally_followers(c("w1", "w2"), "w3", "w4")
  # Input (w2, w3) seen in columns (w2, w3): the follower is w4.
  after_23 <- tally_followers(c("w2", "w3"), "w4", "w1")

  # Give both tallies the same column name so they can be stacked and pooled.
  setnames(after_12, "w3", "w4")
  pooled <- rbind(after_12, after_23)[, sum(V1), by = w4]

  pooled[order(-V1)]$w4
}
pred1 <- function(w, input.dt, ngram4.dt) {
  # Helper for `predict_unigrams()`.
  #
  # Matches the input's last word (`input.dt$w3`) against column `w` of the
  # 4-gram table and totals `tots` for each word in the column that follows.
  #
  # w         : column to match on: "w1", "w2" or "w3".
  # input.dt  : table holding the input words; only column w3 is read.
  # ngram4.dt : data.table of 4-grams with columns w1..w4 and `tots`.
  #
  # Returns a data.table with columns `w4` (the following word, renamed
  # from w2/w3 where necessary) and `V1` (summed `tots`), largest V1 first.
  #
  # Side effect: `ngram4.dt` is re-keyed by reference on column `w`.

  # Column that follows each supported key column.
  next_col <- c(w1 = "w2", w2 = "w3", w3 = "w4")

  # Validate before touching the table: an unsupported `w` used to re-key
  # the caller's table and only then fail with "object 'ww' not found".
  if (!is.character(w) || length(w) != 1L || !(w %in% names(next_col))) {
    stop('`w` must be one of "w1", "w2" or "w3"', call. = FALSE)
  }
  ww <- next_col[[w]]

  setkeyv(ngram4.dt, w)  # setkeyv() takes the key as a character vector

  # Keyed join on the last input word. `$` is used like in the sibling
  # predict_* functions, so a plain data.frame input works as well.
  top1grams <- ngram4.dt[list(input.dt$w3)]

  # Only `tots` and the follower column feed the aggregate, so the other
  # columns do not need to be dropped first.
  # `ww` is not a column name, so data.table uses its character value.
  top1s <- top1grams[, sum(tots), by = ww]
  top1s <- top1s[order(-V1)]
  setnames(top1s, ww, "w4")
  top1s
}
predict_unigrams <- function(input.dt, ngram4.dt) {
  # Depends on `pred1()`.
  #
  # Matches the input's last word against columns w3, w2 and w1 of the
  # 4-gram table (in that order), pools the words found in the column that
  # follows each of them, and returns those words ranked by pooled count.
  #
  # input.dt  : table holding the input words (passed through to pred1()).
  # ngram4.dt : data.table of 4-grams (passed through to pred1()).

  # w3 predicts w4, w2 predicts w3, w1 predicts w2; pred1() already renames
  # the predicted column to "w4" in every case.
  source_cols <- c("w3", "w2", "w1")
  per_column <- lapply(
    source_cols,
    function(col) pred1(col, input.dt, ngram4.dt)
  )

  # Stack the three tallies pairwise, left to right, then pool the counts.
  stacked <- Reduce(rbind, per_column)
  ranked <- stacked[, sum(V1), by = w4]

  ranked[order(-V1)]$w4
}
话虽如此,如果您是在为这门课程(或其他任何课程)做这个项目,那么根据所学内容自己动手想出解决方案,才会是更好的学习体验。