In particular, I am searching for ngrams of length 3 or longer. My current implementation is shown further down: joinwords is a list of 40 words that can appear in the middle of a string but not at the end of one, and user_dict is a shelve module object. The data can be anywhere from 50 MB to over 3 GB of text, so I am trying to work out a simple way to scan through all of it. Does anyone have any suggestions? First, a small sketch of what I mean by an ngram, then the main part of the code.
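The sketch is only an illustration; the sentence, the value N = 3, and the list comprehension are made up for this example and are not part of the real pipeline.

# Illustration only: overlapping ngrams of length N from a list of tokens
tokens = "the quick brown fox jumps".split()
N = 3
ngrams = [' '.join(tokens[i:i + N]) for i in range(len(tokens) - N + 1)]
# -> ['the quick brown', 'quick brown fox', 'brown fox jumps']

The main part of the code: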
import re

ngram_dict = dict()

# Stopword lists: inner_stopwords holds the stopwords that are not joinwords;
# a lone space and the empty string (stray tokens from the split below) are
# added to both lists.
outer_stopwords = []
inner_stopwords = []
for word in stopwords:
    if word not in joinwords:
        inner_stopwords.append(word)
outer_stopwords.append(' ')
inner_stopwords.append(' ')
outer_stopwords.append('')
inner_stopwords.append('')
nusers = 0
ncomments = 0
for user in user_dict:
    nusers += 1
    comments = user_dict[user][0]
    for comment in comments:
        comment_text = comments[comment]['COMMENT_TEXT']
        comment_text = cleanse_text(comment_text)
        comment_text = str(comment_text.lower())
        words = re.split(' ', comment_text)
        comment_dict = dict()  # used to filter out replicates within a comment
        ncomments += 1
        nwords = len(words)
        for i in range(0, nwords - N + 1):
            current_ngram = []
            skip_ngram = False
            for u in range(i, i + N):
                current_word = words[u]
                if len(current_word) > MAX_WORD_LENGTH:
                    skip_ngram = True
                    break
                current_ngram.append(current_word)
                # skip ngrams whose first or last word is an outer stopword
                if u in [i, i + N - 1] and current_word in outer_stopwords:
                    skip_ngram = True
                    break
            if skip_ngram:
                continue
            ngram_string = ' '.join(current_ngram)
            if ngram_string in comment_dict:
                continue
            comment_dict[ngram_string] = 1
            if ngram_string in ngram_dict:
                ngram_dict[ngram_string] += 1
            else:
                ngram_dict[ngram_string] = 1
    if nusers % 100 == 0:
        print('Users parsed: ' + str(nusers))
    # this removes uncommon elements (reduce_dict is shown below)
    if len(ngram_dict) > 100000:
        ngram_dict = reduce_dict(ngram_dict, 2)
def reduce_dict(dictionary, nmin):
    # keep only entries seen at least nmin times, decrementing surviving counts
    newdict = dict()
    dk = dictionary.keys()
    for item in dk:
        if dictionary[item] >= nmin:
            newdict[item] = dictionary[item] - 1
    return newdict
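To show what the pruning step does, here is a tiny made-up example (the dictionary literal is invented for illustration):

counts = {'a b c': 5, 'd e f': 1, 'g h i': 2}
print(reduce_dict(counts, 2))  # -> {'a b c': 4, 'g h i': 1}

And for comparison, a condensed, self-contained version of the per-comment counting logic described above; collections.Counter, the set-based stopword lookup, and the count_ngrams name are simplifications for this sketch, not what the code above actually does:

import re
from collections import Counter

def count_ngrams(texts, n, outer_stopwords, max_word_length):
    # outer_stopwords is assumed to be a set here, so membership tests are cheap
    counts = Counter()
    for text in texts:
        words = re.split(' ', text.lower())
        seen = set()  # plays the role of comment_dict: one count per text
        for i in range(len(words) - n + 1):
            gram = words[i:i + n]
            if any(len(w) > max_word_length for w in gram):
                continue
            if gram[0] in outer_stopwords or gram[-1] in outer_stopwords:
                continue
            ngram_string = ' '.join(gram)
            if ngram_string not in seen:
                seen.add(ngram_string)
                counts[ngram_string] += 1
    return counts

The seen set does the same job as comment_dict above, so a phrase repeated inside a single comment is still only counted once per comment.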