我使用partial_fit()进行增量训练,代码为:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
import pandas as pd
import numpy as np
import os
import csv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def iter_minibatches(data_stream, minibatch_size=1024):
X = []
y = []
cur_line_num = 0
csvfile = open(data_stream,mode='r')
reader = csv.reader(csvfile)
index = 0
for line in reader:
if index == 0:
continue
index +=1
y.append(float(line[-1]))
X.append(line[:len(line)-1]) # 这里要将数据转化成float类型
cur_line_num += 1
if cur_line_num >= minibatch_size:
X, y = np.array(X), np.array(y) # 将数据转成numpy的array类型并返回
X = X.astype('float32')
y = y.astype('float32')
yield X, y
X, y = [], []
cur_line_num = 0
csvfile.close()
if __name__=="__main__":
......
# 生成测试文件
minibatch_test_iterators = iter_minibatches(test_file_name, minibatch_size=test_batch_size)
algorithms={'SGDClassifier':SGDClassifier(), # SGDClassifier的参数设置可以参考sklearn官网
'Perceptron':Perceptron(),
# 'PassiveAggressiveClassifier':PassiveAggressiveClassifier(),
'MultinomialNB':MultinomialNB(),
'BernoulliNB':BernoulliNB()
}
per_batch_scroes = {}
plt.figure(figsize=(10,10))
for algo_name in algorithms.keys():
model = algorithms[algo_name]
minibatch_train_iterators = iter_minibatches(train_file_name, minibatch_size=train_batch_size)
per_scroes =[]
for i, (X_train, y_train) in enumerate(minibatch_train_iterators):
# 得到一份测试文件
X_test, y_test = minibatch_test_iterators.__next__()
# 使用 partial_fit ,并在第一次调用 partial_fit 的时候指定 classes
model.partial_fit(X_train, y_train, classes=np.array([0, 1]))
# 当前次数
print("{} time".format(i))
# 在测试集上看效果
y_test.reshape(-1,1)
curr_score = model.score(X_test, y_test)
print("{} score".format(curr_score))
per_scroes.append(curr_score)
per_batch_scroes[algo_name] = per_scroes
sk_test_y = test['class']
sk_test_x = test.drop(['class'],axis=1)
test_class_preds = model.predict(sk_test_x.values)
fpr, tpr, thresh = metrics.roc_curve(sk_test_y.values,test_class_preds)
auc = metrics.roc_auc_score(sk_test_y.values,test_class_preds)
plt.plot(fpr,tpr,label=algo_name+", auc="+str(float('%.4f'%auc)))
但是我得到了错误:
Traceback (most recent call last):
File "skOnline_murray.py", line 106, in <module>
test_class_preds = model.predict(sk_test_x.values)
File "/usr/local/lib/python3.5/dist-packages/sklearn/naive_bayes.py", line 66, in predict
jll = self._joint_log_likelihood(X)
File "/usr/local/lib/python3.5/dist-packages/sklearn/naive_bayes.py", line 722, in _joint_log_likelihood
check_is_fitted(self, "classes_")
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py", line 768, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.