这是我的代码:
def build_trigram_model(corpus):
corpus = lemmer(nlp(corpus))
bigram = phrases.Phrases(corpus, min_count=2, threshold=40)
bigram_phraser = phrases.Phraser(bigram)
trigram = phrases.Phrases(bigram_phraser[corpus], min_count=2, threshold=50)
trigram_phraser = phrases.Phraser(trigram)
return bigram_phraser, trigram_phraser
def punct_space(token):
"""
helper function to eliminate punctuation, spaces and numbers.
"""
return token.is_punct or token.is_space or token.like_num
def lemmer(tokens):
word_space = []
stemmer = SnowballStemmer("english")
for token in tokens:
if not punct_space(token):
word_space.append(stemmer.stem(str(token).lower()))
return word_space
bigram_phraser, trigram_phraser = build_trigram_model(corpus)
返回类似于:
的内容bigram_phraser.phrasegrams
{(b'\xef\xa3\xaf', b'\xef\xa3\xb0'): (189, 49.569954185342624),
(b'\xef\xa3\xba', b'\xef\xa3\xbb'): (189, 49.162979913552),
(b'\xce\xbf', b'\xcf\x82'): (11, 52.7203947368421),
(b'\xef\xa3\xb1', b'\xef\xa3\xb2'): (13, 78.54622410336378),
(b'\xef\xa3\xb2', b'\xef\xa3\xb3'): (12, 74.75279850746269),
(b'\xef\xa3\xbc', b'\xef\xa3\xbd'): (8, 73.94232987312573),
(b'\xef\xa3\xbd', b'\xef\xa3\xbe'): (7, 65.46977124183007),
(b'\xc9\xa1', b'\xcc\x8a'): (29, 64.73618071658315),
(b'\xef\x9d\xb4', b'\xef\x9d\xb2'): (51, 105.61094674556212),
(b'\xef\x9d\xa3', b'\xef\x9d\xac'): (26, 53.88736340711684)}
通过模型运行新文本时,找不到搭配。
但是,当我使用以下词干分析器时:
def lemmer(tokens):
"""lemmatize words"""
word_space = []
for sent in tokens.sents:
sentence = []
for token in sent:
if not punct_space(token):
if token.lemma_=='-PRON-':
sentence.append(token.lower_)
else:
sentence.append(token.lemma_)
word_space.append(sentence)
return word_space
一切都按照预期的方式运作。两个词干分析器返回的dtype是字符串列表,因此不能成为问题。任何想法为什么会这样?谢谢!