Using nltk.stem.SnowballStemmer with sklearn.feature_extraction.text.TfidfVectorizer

Time: 2018-01-28 07:04:07

Tags: scikit-learn nltk tfidfvectorizer

I am using nltk.stem.SnowballStemmer inside sklearn.feature_extraction.text.TfidfVectorizer to improve the results, but there is a problem.

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(word) for word in analyzer(doc))


# list all the sentences
posts=["How to format my disks","hard disk formating at","How to formated my disks"]

# use the stemming TF-IDF vectorizer
vectorizer_tfidf=StemmedTfidfVectorizer(min_df=1,stop_words="english")

# fit the vectorizer and transform the posts into a TF-IDF matrix
x_tfidf=vectorizer_tfidf.fit_transform(posts)

print("feature_name:%s" % vectorizer_tfidf.get_feature_names())
# get the number of samples and the number of features
num_samples,num_features=x_tfidf.shape
print("samples_noroot: %d ,#features_noroot: %d" % (num_samples,num_features))

print(x_tfidf.toarray())

The output is:

feature_name:[u'disk', u'format', u'hard']
samples_noroot: 3 ,#features_noroot: 3
[[ 0.70710678  0.70710678  0.        ]
 [ 0.45329466  0.45329466  0.76749457]
 [ 0.70710678  0.70710678  0.        ]]

The word "disk" appears in every sentence, so its weight should be 0. How can I fix the code?
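For reference, a minimal sketch, assuming scikit-learn's documented default smooth_idf=True formula, of why a term that occurs in every document still gets a non-zero weight:

# With smooth_idf=True (the default), idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1,
# so a term found in all documents gets idf = 1 rather than 0.
import numpy as np
n_docs = 3
df_disk = 3   # "disk" occurs in all three posts
print(np.log((1 + n_docs) / (1 + df_disk)) + 1)   # 1.0 -> the 0.70710678 entries after L2 normalization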

1 answer:

Answer 0 (score: 0)

I found the method the sklearn library itself uses to split sentences into words, like this:

import re
def build_tokenizer(doc):
    token_pattern = r"(?u)\b\w\w+\b"
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(doc)
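For example, assuming the function above, the regex splits a post into words of two or more characters:

print(build_tokenizer("hard disk formating at"))   # expected: ['hard', 'disk', 'formating', 'at']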

After testing, the results are the same, so this question can be closed.

The whole code is:

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer,self).build_analyzer()
        return lambda doc:(english_stemmer.stem(word) for word in analyzer(doc))


# list all the posts
posts=["How to format my disks","hard disk formating at","How to formated my disks"]


import re
def build_tokenizer(doc):
    token_pattern=r"(?u)\b\w\w+\b"
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(doc)
posts_root=[]
for post in posts:
    #print build_tokenizer(post)
    #print " ".join([english_stemmer.stem(word) for word in build_tokenizer(post)])
    posts_root.append( " ".join([english_stemmer.stem(word) for word in build_tokenizer(post)]) )
print(posts)
print(posts_root)
#posts_root = [ " ".join(english_stemmer.stem(word)) for doc in posts for word in build_tokenizer(doc)]

# apply the TF-IDF processing
vectorizer_tfidf=StemmedTfidfVectorizer(min_df=1,stop_words="english")

# convert the posts into bag-of-words / TF-IDF vectors
x_tfidf=vectorizer_tfidf.fit_transform(posts_root)

print("feature_name:%s" % vectorizer_tfidf.get_feature_names())
# get the number of samples and the number of features
num_samples,num_features=x_tfidf.shape
print("samples_noroot: %d ,#features_noroot: %d" % (num_samples,num_features))

print(x_tfidf.toarray())

word=vectorizer_tfidf.get_feature_names()  # get all the words in the bag-of-words model
weight=x_tfidf.toarray()  # extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
for i in range(len(weight)):  # the outer loop iterates over documents, the inner loop over the word weights of one document
    print("------- tf-idf word weights for document", i, "-------")
    for j in range(len(word)):
        print(word[j], weight[i][j])
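For comparison, a short sketch (using only the names defined above, and assuming the stemmer runs inside build_analyzer as shown) of fitting the stemming vectorizer directly on the raw posts, which should produce the same features as the pre-stemmed posts_root:

x_direct = vectorizer_tfidf.fit_transform(posts)
print(vectorizer_tfidf.get_feature_names())   # expected: ['disk', 'format', 'hard']
print(x_direct.toarray())                     # should match the matrix printed above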