I am using nltk.stem.SnowballStemmer inside sklearn.feature_extraction.text.TfidfVectorizer to improve the results, but there is a problem.
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(word) for word in analyzer(doc))

# list all the posts
posts = ["How to format my disks", "hard disk formating at", "How to formated my disks"]
# use the tf-idf vectorizer class
vectorizer_tfidf = StemmedTfidfVectorizer(min_df=1, stop_words="english")
# transform the posts into the tf-idf representation
x_tfidf = vectorizer_tfidf.fit_transform(posts)
print("feature_name:%s" % vectorizer_tfidf.get_feature_names())
# get the number of samples and the number of features
num_samples, num_features = x_tfidf.shape
print("samples_noroot: %d ,#features_noroot: %d" % (num_samples, num_features))
print(x_tfidf.toarray())
The output is:
feature_name:[u'disk', u'format', u'hard']
samples_noroot: 3 ,#features_noroot: 3
[[ 0.70710678 0.70710678 0. ]
[ 0.45329466 0.45329466 0.76749457]
[ 0.70710678 0.70710678 0. ]]
单词" disk"在所有句子中," disk"的重量应该是0。 如何修复代码
Answer 0 (score: 0)
I found the method the sklearn lib uses to split a sentence into words. It looks like this:
import re

def build_tokenizer(doc):
    token_pattern = r"(?u)\b\w\w+\b"
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(doc)
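For example, a quick check of what this tokenizer returns on the first post (it keeps words of two or more characters, without lowercasing or stop-word removal):

print(build_tokenizer("How to format my disks"))
# expected: ['How', 'to', 'format', 'my', 'disks']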
After testing, the results are the same, so the question can be closed.
The whole code is:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(word) for word in analyzer(doc))

# list all the posts
posts = ["How to format my disks", "hard disk formating at", "How to formated my disks"]

import re

def build_tokenizer(doc):
    token_pattern = r"(?u)\b\w\w+\b"
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(doc)

# tokenize and stem every post by hand
posts_root = []
for post in posts:
    #print build_tokenizer(post)
    #print " ".join([english_stemmer.stem(word) for word in build_tokenizer(post)])
    posts_root.append(" ".join([english_stemmer.stem(word) for word in build_tokenizer(post)]))
print posts
print posts_root
#posts_root = [ " ".join(english_stemmer.stem(word)) for doc in posts for word in build_tokenizer(doc)]

# apply the tf-idf vectorizer
vectorizer_tfidf = StemmedTfidfVectorizer(min_df=1, stop_words="english")
# turn the pre-stemmed posts into the bag-of-words / tf-idf representation
x_tfidf = vectorizer_tfidf.fit_transform(posts_root)
print("feature_name:%s" % vectorizer_tfidf.get_feature_names())
# get the number of samples and the number of features
num_samples, num_features = x_tfidf.shape
print("samples_noroot: %d ,#features_noroot: %d" % (num_samples, num_features))
print(x_tfidf.toarray())

word = vectorizer_tfidf.get_feature_names()   # all words in the bag-of-words model
weight = x_tfidf.toarray()                    # the tf-idf matrix; weight[i][j] is the tf-idf weight of word j in document i
# print the tf-idf weight of every word for every document:
# the outer loop iterates over the documents, the inner loop over the words of one document
for i in range(len(weight)):
    print u"------- tf-idf word weights for document", i, u"-------"
    for j in range(len(word)):
        print word[j], weight[i][j]
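As for why the weight of "disk" is not 0: sklearn's TfidfVectorizer does not use the plain textbook idf. With the default smooth_idf=True it computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, so a term that occurs in every document gets an idf of 1.0 rather than 0 and therefore keeps a positive tf-idf weight. A small check of this, reusing the fitted vectorizer above:

# idf_ holds the learned idf value for each feature, in the order of get_feature_names()
print(vectorizer_tfidf.get_feature_names())
# for these three posts: 'disk' and 'format' appear in all documents -> idf = ln(4/4) + 1 = 1.0,
# 'hard' appears in one document -> idf = ln(4/2) + 1 ≈ 1.693
print(vectorizer_tfidf.idf_)

After L2-normalizing each row, these idf values reproduce the matrix shown in the question, so the vectorizer is behaving as documented.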