我参考 https://gist.github.com/zacstewart/5978000 中的示例创建了一个分类器。为了训练分类器,我使用了以下代码:
import os
import numpy
NEWLINE = '\n'
SKIP_FILES = set(['cmds'])


def read_files(path):
    """Yield ``(file_path, body_text)`` for every file under *path*.

    The directory tree rooted at *path* is traversed with ``os.walk``.
    Files whose name appears in ``SKIP_FILES`` are ignored. For each
    remaining file, everything up to and including the first empty line
    (the end of the header section) is dropped; the remaining lines are
    joined with ``NEWLINE`` and decoded from cp1252, ignoring bytes that
    cannot be decoded.

    Args:
        path: Root directory to scan.

    Yields:
        ``(file_path, text)`` tuples. ``text`` is empty when the file
        contains no empty line.
    """
    # Compare and join as bytes so the same code runs on Python 2 and 3;
    # decoding happens once, after the body has been assembled.
    newline = NEWLINE.encode('ascii')
    # os.walk already visits every sub-directory, so no explicit recursion
    # is needed here.
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            past_header, lines = False, []
            # The file is closed before the result is yielded, even if
            # reading raises.
            with open(file_path, 'rb') as f:
                for line in f:
                    if past_header:
                        lines.append(line)
                    elif line == newline:
                        past_header = True
            # Each collected line keeps its own terminator, so joining with
            # the newline leaves an empty line between body lines.
            yield file_path, newline.join(lines).decode('cp1252', 'ignore')
from pandas import DataFrame
def build_data_frame(path, classification):
    """Return a DataFrame with one row per file found under *path*.

    Args:
        path: Directory passed on to ``read_files``.
        classification: Label stored in the ``'class'`` column of every row.

    Returns:
        A DataFrame indexed by file path with the columns ``'text'`` (the
        text yielded by ``read_files``) and ``'class'``. It has no rows
        when ``read_files`` yields nothing.
    """
    # Collect the rows first and build the frame once: growing a DataFrame
    # with .append() copies it on every iteration, and DataFrame.append was
    # removed in pandas 2.0.
    file_names, texts = [], []
    for file_name, text in read_files(path):
        file_names.append(file_name)
        texts.append(text)
    return DataFrame(
        {'text': texts, 'class': [classification] * len(texts)},
        index=file_names)
# Numeric class labels used in the 'class' column.
HAM, SPAM = 0, 1

# (corpus directory, label) pairs, in the order they are loaded.
SOURCES = [
    ('data/spam', SPAM),
    ('data/easy_ham', HAM),
    ('data/hard_ham', HAM),
    ('data/beck-s', HAM),
    ('data/farmer-d', HAM),
    ('data/kaminski-v', HAM),
    ('data/kitchen-l', HAM),
    ('data/lokay-m', HAM),
    ('data/williams-w3', HAM),
    ('data/BG', SPAM),
    ('data/GP', SPAM),
    ('data/SH', SPAM),
]
from pandas import concat

# Load every corpus listed in SOURCES and stack the per-directory frames in
# a single concat call. DataFrame.append (removed in pandas 2.0) copied the
# accumulated frame once per source.
frames = [build_data_frame(path, classification)
          for path, classification in SOURCES]
# With no sources, fall back to the empty two-column frame.
data = concat(frames) if frames else DataFrame({'text': [], 'class': []})
# Shuffle the rows by reindexing on a random permutation of the index.
data = data.reindex(numpy.random.permutation(data.index))
import numpy
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training texts and convert them to a matrix of
# token counts (one row per text).
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(numpy.asarray(data['text']))
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the counts and the labels
# taken from the 'class' column.
classifier = MultinomialNB()
targets = numpy.asarray(data['class'])
clf = classifier.fit(counts, targets)
# NOTE(review): sklearn.externals.joblib was deprecated in scikit-learn 0.21
# and removed in 0.23; newer releases need the standalone joblib package.
from sklearn.externals import joblib
# Only the fitted classifier is written to disk. The fitted count_vectorizer
# is not saved here, so code that loads this file has no way to turn new
# text into the count features the classifier was trained on.
joblib.dump(clf, 'my_trained_data.pkl', compress=9)
如果我直接在这个文件中测试一个示例,它可以正常工作。但是,当我把分类器保存到 my_trained_data.pkl,然后在另一个脚本中按如下方式加载并调用它时:
from sklearn.externals import joblib
# Load the classifier saved by the training script.
clf = joblib.load('my_trained_data.pkl')
examples = ['Free Viagra call today!', "I'm going to attend the Linux users group tomorrow."]
# NOTE(review): `examples` is still a list of raw strings at this point.
# Nothing converts it to token counts before predict(), and this is the
# call that raises the TypeError reported below.
predictions = clf.predict(examples)
这会产生以下错误。
TypeError: Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe'
以下是完整的错误回溯(traceback):
In [12]: runfile('/home/harpreet/Machine_learning/untitled0.py', wdir='/home/harpreet/Machine_learning') MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) Traceback (most recent call last):
File "<ipython-input-12-521f3ed1e6da>", line 1, in <module>
runfile('/home/harpreet/Machine_learning/untitled0.py', wdir='/home/harpreet/Machine_learning')
File "/home/harpreet/anaconda/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 682, in runfile
execfile(filename, namespace)
File "/home/harpreet/anaconda/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 78, in execfile
builtins.execfile(filename, *where)
File "/home/harpreet/Machine_learning/untitled0.py", line 13, in <module>
clf.predict(examples)
File "/home/harpreet/anaconda/lib/python2.7/site-packages/sklearn/naive_bayes.py", line 62, in predict
jll = self._joint_log_likelihood(X)
File "/home/harpreet/anaconda/lib/python2.7/site-packages/sklearn/naive_bayes.py", line 441, in _joint_log_likelihood
return (safe_sparse_dot(X, self.feature_log_prob_.T)
File "/home/harpreet/anaconda/lib/python2.7/site-packages/sklearn/utils/extmath.py", line 180, in safe_sparse_dot
return fast_dot(a, b)
TypeError: Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe'
答案 0 :(得分:0)
您需要使用同一个(训练时已经拟合过的)vectorizer 实例先把测试文档转换成特征向量,然后再传给分类器:
# Reuse the count_vectorizer that was fitted on the training data, so the
# examples are mapped onto the same feature columns the classifier was
# trained with. predict() is then given that count matrix instead of raw
# strings.
examples_vectors = count_vectorizer.transform(examples)
clf.predict(examples_vectors)
通常,使用管道(Pipeline)会更简单:它把向量化器和分类器组合成一个对象,训练和预测时都会自动先做向量化:
from sklearn.pipeline import make_pipeline
# Chain a fresh vectorizer and classifier into one estimator; fit() trains
# both steps on the raw training texts and their labels.
pipeline = make_pipeline(CountVectorizer(), MultinomialNB())
pipeline.fit(data['text'].values, data['class'].values)
然后:
# The pipeline vectorizes the raw strings itself before classifying them.
pipeline.predict(examples)