Question

我正在开发一个预测文本中下一个单词的项目。我在 R 中使用 quanteda 包生成了三元组（trigram）和二元组（bigram）。我知道需要最大化留出（held-out）测试集中句子的概率，但不知道具体该如何实现。任何帮助都将不胜感激 :)

Answer 1

用 R 实现下一个单词预测是 Coursera 上 JHU 数据科学 MOOC 系列的顶点（capstone）项目，因此您至少可以在往届学生在线发布的 R 代码中找到几个实现示例，例如：

predict_trigrams <- function(input.dt, ngram4.dt){
    ### Predict the fourth word from a full three-word context.
    ###   input.dt  : table whose w1, w2, w3 columns hold the input words.
    ###   ngram4.dt : 4-gram table (columns w1..w4 plus the count column
    ###               `tots`); setkey() keys it on w1, w2, w3 here.
    ### Returns the w4 column of the looked-up rows, highest `tots` first.
    setkey(ngram4.dt, w1, w2, w3)
    ### Keyed lookup; list() is what the .() shorthand stands for.
    matched <- ngram4.dt[list(input.dt$w1, input.dt$w2, input.dt$w3)]
    matched[order(-tots)]$w4
}

predict_bigrams <- function(input.dt, ngram4.dt){
    ### Predict the next word from the last two input words (input w2 & w3).
    ### The pair is looked up at two positions of the 4-gram table:
    ###   columns w1,w2 -> the candidate word is w3
    ###   columns w2,w3 -> the candidate word is w4
    ### `tots` is summed per candidate, both candidate lists are pooled under
    ### the name w4, and the candidates are returned by descending total.

    ### --- input pair matched against w1 & w2: candidate is w3 ---
    setkey(ngram4.dt, w1, w2)
    ### keyed lookup; list() is what the .() shorthand stands for
    hits12 <- ngram4.dt[list(input.dt$w2, input.dt$w3)]
    hits12 <- hits12[, ngrams := NULL][, w4 := NULL]
    setkey(hits12, w1, w2, w3)
    counts12 <- hits12[, sum(tots), by = w3][order(-V1)]

    ### --- input pair matched against w2 & w3: candidate is w4 ---
    setkey(ngram4.dt, w2, w3)
    hits23 <- ngram4.dt[list(input.dt$w2, input.dt$w3)]
    hits23 <- hits23[, ngrams := NULL][, w1 := NULL]
    setkey(hits23, w2, w3, w4)
    counts23 <- hits23[, sum(tots), by = w4][order(-V1)]

    ### --- pool both candidate lists and rank by combined count ---
    setnames(counts12, "w3", "w4")
    pooled <- rbind(counts12, counts23)
    ranked <- pooled[, sum(V1), by = w4][order(-V1)]
    ranked$w4
}

pred1 <- function(w,input.dt,ngram4.dt){
    ### To be used within `predict_unigrams()`.
    ### Looks up the last input word in one column of the 4-gram table and
    ### ranks the words found in the column immediately after it.
    ###   w         : column to match against: "w1", "w2", or "w3".
    ###   input.dt  : input table; only its `w3` column (the last input
    ###               word) is read.
    ###   ngram4.dt : 4-gram table with columns w1..w4, `ngrams` and `tots`;
    ###               it is re-keyed on `w` by setkeyv().
    ### Returns a data.table with the following word in column `w4`
    ### (renamed from the column after `w`) and the summed `tots` in `V1`,
    ### sorted by descending `V1`.

    ### Validate `w` first, so an unsupported column fails with a clear
    ### message before ngram4.dt has been re-keyed.
    if (!is.character(w) || length(w) != 1L || is.na(w) ||
        !(w %in% c("w1", "w2", "w3"))) {
        stop('`w` must be one of "w1", "w2" or "w3"', call. = FALSE)
    }
    ### The column right after `w` holds the word to predict.
    next_col <- switch(w, w1 = "w2", w2 = "w3", w3 = "w4")
    ### Every other word column is dropped before aggregating.
    drop_cols <- setdiff(c("w1", "w2", "w3", "w4"), c(w, next_col))

    setkeyv(ngram4.dt,w)  ## setkeyv() for variables
    ### binary search .() is alias for list()
    matches <- ngram4.dt[.(input.dt[,w3])]
    matches[, ngrams := NULL]
    matches[, (drop_cols) := NULL]

    ### `next_col` is not a column name, so `by` uses its value.
    totals <- matches[, sum(tots), by = next_col]
    totals <- totals[order(-V1)]
    setnames(totals, next_col, "w4")
    totals
}

predict_unigrams <- function(input.dt, ngram4.dt){
    ### Requires function `pred1()`.
    ### Predict the next word from the last input word alone: the word is
    ### looked up in columns w3, w2 and w1 of the 4-gram table (one
    ### `pred1()` call each), and the word that follows it in each case
    ### comes back under the common column name w4.
    ### Returns the pooled candidates by descending total count.

    ### one candidate table per matched column (w3 -> w4, w2 -> w3, w1 -> w2)
    from_w3 <- pred1("w3", input.dt, ngram4.dt)
    from_w2 <- pred1("w2", input.dt, ngram4.dt)
    from_w1 <- pred1("w1", input.dt, ngram4.dt)

    ### stack the three tables, add up counts per word, rank
    pooled <- rbind(from_w3, from_w2, from_w1)
    ranked <- pooled[, sum(V1), by = (w4)][order(-V1)]
    ranked$w4
}

话虽如此，如果你是在为这门课程或其他任何课程做这个项目，那么根据所学内容尝试自己想出解决方案，当然会是一次更好的学习经历。

如何训练用于下一个单词预测的 Kneser-Ney 模型，以估计其折扣参数

1 个答案: