from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.feature_extraction.text import TfidfVectorizer
def number_normalizer(tokens):
""" Map all numeric tokens to a placeholder.
For many applications, tokens that begin with a number are not directly
useful, but the fact that such a token exists can be relevant. By applying
this form of dimensionality reduction, some methods may perform better.
"""
return ("#NUMBER" if token[0].isdigit() else token for token in tokens)
class NumberNormalizingVectorizer(TfidfVectorizer):
def build_tokenizer(self):
tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
return lambda doc: list(number_normalizer(tokenize(doc)))
vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=5, svd_method='arpack', random_state=0)
X = vectorizer.fit_transform(data)
cocluster.fit(X)
我选择SpectralCoclustering来聚集大约30k条推文,在将数据X放入“cocluster”之前一切顺利。
它会引发以下错误。
.env/lib/python3.5/site-packages/sklearn/utils/validation.py", line 43, in _assert_all_finite
" or a value too large for %r." % X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
当我输入代码时报告错误,但它是“假”。发生错误时应该是True,对吧?
那么还有什么可以找到这个bug吗?谢谢!
https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py#L43
X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) and not np.isfinite(X).all()
假