Question

import nltk
import string
from nltk.corpus import stopwords


from collections import Counter

def get_tokens():
    """Return the word tokens of 'comet_interest.xml'.

    The file content is lowercased and stripped of ASCII punctuation
    before being handed to ``nltk.word_tokenize``.
    """
    with open('comet_interest.xml', 'r') as bookmark:
        raw = bookmark.read()
    # translate(None, chars) deletes every character listed in `chars`;
    # here that is the full string.punctuation set.
    stripped = raw.lower().translate(None, string.punctuation)
    return nltk.word_tokenize(stripped)
# Remove stopwords, then print the ten most frequent remaining tokens.
tokens = get_tokens()
# Build the stopword collection once, as a set: evaluating
# stopwords.words('english') inside the comprehension repeats the call for
# every token and scans the resulting list linearly each time.
english_stopwords = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in english_stopwords]
count = Counter(filtered)
# Parenthesised single-argument print emits the same text as the
# statement form.
print(count.most_common(10))

#stemming
from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

# Stem the stopword-filtered tokens with a Porter stemmer and print the
# ten most common stems.
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
print(count.most_common(10))

结果显示如下：

[('analysis', 13), ('spatial', 11), ('feb', 8), ('cdata', 8), ('description', 7), ('item', 6), ('many', 6), ('pm', 6), ('link', 6), ('research', 5)]

[(u'analysi', 13), (u'spatial', 11), (u'use', 11), (u'feb', 8), (u'cdata', 8), (u'scienc', 7), (u'descript', 7), (u'item', 6), (u'inclu', 6), (u'mani', 6)]

第二个结果是怎么回事？为什么每个单词前面都有一个“u”前缀？

Answer 1

正如@kindall所说，这是因为unicode字符串。

但更具体地说，因为NLTK默认使用from __future__ import unicode_literals将所有字符串转换为unicode，请参阅https://github.com/nltk/nltk/blob/develop/nltk/stem/porter.py#L87

所以让我们在python 2.x中尝试一下实验：

$ python
>>> from nltk.stem import PorterStemmer
>>> porter = PorterStemmer()
>>> word = "analysis"
>>> word
'analysis'
>>> porter.stem(word)
u'analysi'

我们看到，经过词干提取之后，单词突然变成了unicode字符串。

然后，让我们尝试导入unicode_literals：

>>> from nltk.stem import PorterStemmer
>>> porter = PorterStemmer()
>>> word = "analysis"
>>> word
'analysis'
>>> porter.stem(word)
u'analysi'
>>> from __future__ import print_function, unicode_literals
>>> word
'analysis'
>>> word2 = "analysis"
>>> word2
u'analysis'

请注意，之前已经创建的字符串仍然保持为str类型，但在导入unicode_literals之后新建的任何字符串变量都将默认为unicode。

为什么NLTK中的PorterStemmer会把我的"string"转换为u"string"？

1 个答案: