In particular, I am searching for ngrams of length 3 or longer. My current implementation is shown further down: joinwords is a list of 40 words that can appear in the middle of a string but not at the end of one, and user_dict is a shelve module object. The data can be anywhere from 50 MB to over 3 GB of text, so I am trying to work out a simple way to scan through all of it. Does anyone have any suggestions? First, a small sketch of what I mean by an ngram, then the main part of the code.
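The sketch is only an illustration; the sentence, the value N = 3, and the list comprehension are made up for this example and are not part of the real pipeline.

# Illustration only: overlapping ngrams of length N from a list of tokens
tokens = "the quick brown fox jumps".split()
N = 3
ngrams = [' '.join(tokens[i:i + N]) for i in range(len(tokens) - N + 1)]
# -> ['the quick brown', 'quick brown fox', 'brown fox jumps']

The main part of the code: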
import re

ngram_dict = dict()

# Stopword lists: inner_stopwords holds the stopwords that are not joinwords;
# a lone space and the empty string (stray tokens from the split below) are
# added to both lists.
outer_stopwords = []
inner_stopwords = []
for word in stopwords:
    if word not in joinwords:
        inner_stopwords.append(word)
outer_stopwords.append(' ')
inner_stopwords.append(' ')
outer_stopwords.append('')
inner_stopwords.append('')
nusers = 0
ncomments = 0
for user in user_dict:
    nusers += 1
    comments = user_dict[user][0]
    for comment in comments:
        comment_text = comments[comment]['COMMENT_TEXT']
        comment_text = cleanse_text(comment_text)
        comment_text = str(comment_text.lower())
        words = re.split(' ', comment_text)
        comment_dict = dict()  # used to filter out replicates within a comment
        ncomments += 1
        nwords = len(words)
        for i in range(0, nwords - N + 1):
            current_ngram = []
            skip_ngram = False
            for u in range(i, i + N):
                current_word = words[u]
                if len(current_word) > MAX_WORD_LENGTH:
                    skip_ngram = True
                    break
                current_ngram.append(current_word)
                # skip ngrams whose first or last word is an outer stopword
                if u in [i, i + N - 1] and current_word in outer_stopwords:
                    skip_ngram = True
                    break
            if skip_ngram:
                continue
            ngram_string = ' '.join(current_ngram)
            if ngram_string in comment_dict:
                continue
            comment_dict[ngram_string] = 1
            if ngram_string in ngram_dict:
                ngram_dict[ngram_string] += 1
            else:
                ngram_dict[ngram_string] = 1
    if nusers % 100 == 0:
        print('Users parsed: ' + str(nusers))
    # this removes uncommon elements (reduce_dict is shown below)
    if len(ngram_dict) > 100000:
        ngram_dict = reduce_dict(ngram_dict, 2)
def reduce_dict(dictionary, nmin):
    # keep only entries seen at least nmin times, decrementing surviving counts
    newdict = dict()
    dk = dictionary.keys()
    for item in dk:
        if dictionary[item] >= nmin:
            newdict[item] = dictionary[item] - 1
    return newdict
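To show what the pruning step does, here is a tiny made-up example (the dictionary literal is invented for illustration):

counts = {'a b c': 5, 'd e f': 1, 'g h i': 2}
print(reduce_dict(counts, 2))  # -> {'a b c': 4, 'g h i': 1}

And for comparison, a condensed, self-contained version of the per-comment counting logic described above; collections.Counter, the set-based stopword lookup, and the count_ngrams name are simplifications for this sketch, not what the code above actually does:

import re
from collections import Counter

def count_ngrams(texts, n, outer_stopwords, max_word_length):
    # outer_stopwords is assumed to be a set here, so membership tests are cheap
    counts = Counter()
    for text in texts:
        words = re.split(' ', text.lower())
        seen = set()  # plays the role of comment_dict: one count per text
        for i in range(len(words) - n + 1):
            gram = words[i:i + n]
            if any(len(w) > max_word_length for w in gram):
                continue
            if gram[0] in outer_stopwords or gram[-1] in outer_stopwords:
                continue
            ngram_string = ' '.join(gram)
            if ngram_string not in seen:
                seen.add(ngram_string)
                counts[ngram_string] += 1
    return counts

The seen set does the same job as comment_dict above, so a phrase repeated inside a single comment is still only counted once per comment.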