我正在编写代码,用朴素贝叶斯(Naive Bayes)分类器实现文本分类。我做了一个非常小的例子(请参考第 44 页),它似乎可以正常工作。
以下是代码。NB 头文件(NB.h):
#pragma once
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<map>
using namespace std;
/// Naive Bayes text-classification helper.
///
/// An instance holds the word counts of one corpus: a training class (built
/// from several files), a single labelled document, or the combination of two
/// classes. Every constructor reads file names from standard input and/or
/// files from disk and writes intermediate "<className>*.txt" files.
///
/// Standard-library names are spelled with std:: so the declaration does not
/// depend on a using-directive being in effect.
class NB
{
public:
    /// Builds the combined corpus of cl1 and cl2, writes it to
    /// "combineAll.txt" and counts its vocabulary.
    NB(NB& cl1, NB& cl2, std::string className);

    /// Builds a class corpus; prompts for file names until "q" is entered.
    NB(std::string className);

    /// Builds a single-document corpus; prompts for exactly one file name and
    /// records classType as the document's label.
    NB(std::string className, int classType);

    /// Concatenates every registered file into "<className>.txt" and returns
    /// the whitespace-separated tokens that were read.
    std::vector<std::string> combineClassText();

    /// Counts the words of the file classCombine into the bag of words and
    /// writes it to "<className>_bow.txt". With isTotal the vocabulary size is
    /// updated, otherwise the total word count of the class.
    void bagOfWords(std::string classCombine, bool isTotal = false);

    /// Computes the smoothed per-word probabilities of this class for every
    /// word of total's bag of words.
    void calcProb(NB& total);

    /// Scores this corpus against class prob and returns the result.
    float totalProb(NB& prob, NB& total);

    /// Label of the document. Only the (className, classType) constructor
    /// assigns it, so it is zero-initialised here to avoid reading an
    /// indeterminate value on objects built by the other constructors.
    int classType = 0;

private:
    int _len = 0;             // size of _combined (set by the combining constructor)
    float _prob = 1.0f;       // last score computed by totalProb()
    int _voc = 0;             // vocabulary size (distinct words), set when isTotal
    int _nOfClass = 0;        // number of files registered for this class
    int _tnClass = 0;         // number of files in total (summed when combining)
    int _totalWordsinC = 0;   // sum of all word counts in _bow
    int _wordCounter = 0;     // scratch counter used by totalProb()
    bool _isDone = false;     // set once the user ends file-name entry
    std::ifstream _in;        // reused input stream
    std::ofstream _out;       // reused output stream
    std::string _className;
    std::string _fileName;    // last file name read from standard input
    std::vector<std::string> _combined;            // tokens of both combined classes
    std::map<std::string, std::string> _category;  // file name -> class name
    std::map<std::string, int> _bow;               // word -> occurrence count
    std::map<std::string, float> _probCalc;        // word -> smoothed probability
};
NB.cpp 文件:
#include "NB.h"
#include<cmath>
// Builds the combined corpus of two classes.
//
// The tokens of cl1 followed by the tokens of cl2 are stored in _combined and
// written, one per line, to "combineAll.txt"; that file is then counted with
// bagOfWords(..., true) so that the vocabulary size is recorded. _tnClass
// becomes the sum of both classes' file counts.
NB::NB(NB& cl1, NB& cl2, string className)
{
    _className = className;
    _out.open("combineAll.txt");
    if (_out.fail()) {
        perror("cannot write to combineAll.txt");
    }

    // combineClassText() re-reads every input file and rewrites the class
    // file each time it runs, so call it exactly once per class instead of
    // once per token.
    const vector<string> firstTokens = cl1.combineClassText();
    const vector<string> secondTokens = cl2.combineClassText();
    _combined.insert(_combined.end(), firstTokens.begin(), firstTokens.end());
    _combined.insert(_combined.end(), secondTokens.begin(), secondTokens.end());
    _len = static_cast<int>(_combined.size());

    for (const string& token : _combined) {
        _out << token << '\n';
    }
    _out.close();

    _tnClass = cl1._tnClass + cl2._tnClass;
    bagOfWords("combineAll.txt", true);
}
// Single-document constructor: stores the label, asks for one file name on
// standard input, registers that file under this class name, then builds the
// "<className>.txt" corpus file and its bag of words.
NB::NB(string className, int classType)
    : classType(classType)
{
    _className = className;

    const string prompt = "Enter a filename for " + _className;
    cout << prompt << endl;
    cin >> _fileName;
    _category[_fileName] = _className;

    // The returned token list is not needed here; only the file it writes is.
    combineClassText();

    const string corpusFile = _className + ".txt";
    bagOfWords(corpusFile);
}
// Class constructor: repeatedly asks for a file name on standard input and
// registers it under this class name until "q" is entered, then builds the
// "<className>.txt" corpus file and its bag of words.
NB::NB(string className)
{
    _className = className;
    while (!_isDone) {
        cout << "Enter a filename for " + _className << endl;
        // A failed read (EOF, closed stdin) leaves _fileName unchanged, so it
        // must end the loop too; otherwise the loop would never terminate.
        if (!(cin >> _fileName) || _fileName == "q") {
            _isDone = true;
            continue;
        }
        // _category holds each file name once; count a document only the
        // first time it is entered so the file counts match the text read.
        const bool isNewFile = _category.find(_fileName) == _category.end();
        _category[_fileName] = _className;
        if (isNewFile) {
            _nOfClass++;
            _tnClass++;
        }
    }
    combineClassText();
    bagOfWords(_className + ".txt");
}
// Reads every file registered in _category token by token (whitespace
// separated), appends each token on its own line to "<className>.txt" and
// returns all tokens in the order they were read. Open failures are reported
// with perror and otherwise ignored.
vector<string> NB::combineClassText() {
    vector<string> words;
    const string outputName = _className + ".txt";

    _out.open(outputName);
    if (_out.fail()) {
        perror("cannot write to");
    }

    for (const auto& entry : _category) {
        // entry.first is the input file name, entry.second the class name.
        _in.open(entry.first);
        if (_in.fail()) {
            perror("cannot read from");
        }

        string token;
        while (_in >> token) {
            _out << token << endl;
            words.push_back(token);
        }
        _in.close();
    }

    _out.close();
    return words;
}
// Counts the words of the file `classCombine` into _bow and writes the
// resulting bag of words to "<className>_bow.txt" as "word: count" lines.
//
// classCombine: path of the token file to read.
// isTotal:      when true, _voc grows by the number of distinct words in
//               _bow; otherwise _totalWordsinC grows by the sum of the counts.
//
// '.' and ',' act as word separators, so "word." and "word" are counted as
// the same word and a token that is only punctuation yields no word at all.
// Words counted by this call overwrite existing _bow entries of the same key.
void NB::bagOfWords(string classCombine, bool isTotal) {
    _in.open(classCombine);
    if (_in.fail()) {
        perror("cannot read from");
    }
    const string name = _className + "_bow.txt";
    _out.open(name);
    if (_out.fail()) {
        perror("cannot write to");
    }

    // Count in a local map: one pass, O(n log n) instead of comparing every
    // pair of tokens.
    map<string, int> counts;
    string word;
    const auto flushWord = [&counts, &word]() {
        if (!word.empty()) {
            ++counts[word];
            word.clear();
        }
    };

    string rawToken;
    while (_in >> rawToken) {
        for (const char ch : rawToken) {
            if (ch == '.' || ch == ',') {
                flushWord();
            } else {
                word += ch;
            }
        }
        flushWord();
    }

    for (const auto& entry : counts) {
        _bow[entry.first] = entry.second;
    }

    for (const auto& entry : _bow) {
        _out << entry.first << ": " << entry.second << '\n';
    }

    if (isTotal) {
        _voc += static_cast<int>(_bow.size());
    } else {
        for (const auto& entry : _bow) {
            _totalWordsinC += entry.second;
        }
    }

    _in.close();
    _out.close();
}
// Computes, for every word in total's bag of words, the add-one smoothed
// probability of that word in this class:
//     (count in this class + 1) / (_totalWordsinC + total._voc)
// and stores it in _probCalc. All of _probCalc is then written to
// "<className>_prob.txt" as "word: probability" lines.
//
// total: the combined corpus; supplies the vocabulary and its size.
void NB::calcProb(NB& total) {
    _out.open(_className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const int denominator = _totalWordsinC + total._voc;
    // With a zero denominator no probability is defined; leave _probCalc as is.
    if (denominator > 0) {
        for (const auto& vocabEntry : total._bow) {
            // Direct lookup: a word missing from this class (including the
            // case of an empty class) simply has count 0.
            const auto found = _bow.find(vocabEntry.first);
            const int count = (found != _bow.end()) ? found->second : 0;
            _probCalc[vocabEntry.first] = static_cast<float>(count + 1) / denominator;
        }
    }

    for (const auto& entry : _probCalc) {
        _out << entry.first << ": " << entry.second << '\n';
    }
    _out.close();
}
// Scores this corpus (the document) against class `prob`.
//
// The score is the product, over every word of this corpus' bag of words, of
// the word's probability in `prob` raised to the word's count here, times the
// class prior prob._nOfClass / total._tnClass. A word that has no entry in
// prob._probCalc uses 1 / (prob._totalWordsinC + total._voc) instead.
// The score is stored in _prob, printed to standard output, written to
// "<className>_<prob className>_prob.txt" and returned.
//
// prob:  the class to score against (calcProb must have filled _probCalc).
// total: the combined corpus; supplies the vocabulary size and file count.
//
// NOTE(review): the product is accumulated in a float and can underflow to 0
// for long documents; a log-space sum would avoid that but would change the
// returned value's meaning.
float NB::totalProb(NB& prob, NB& total) {
    _out.open(_className + "_" + prob._className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    _prob = 1.0f;
    // Lookup uses map::find, so no running miss counter is needed; reset the
    // member so no state leaks from one call into the next.
    _wordCounter = 0;

    for (const auto& wordEntry : _bow) {
        const auto known = prob._probCalc.find(wordEntry.first);
        float wordProb;
        if (known != prob._probCalc.end()) {
            wordProb = known->second;
        } else {
            // Unseen word: same smoothed probability as a zero-count word.
            wordProb = (float)1 / (prob._totalWordsinC + total._voc);
        }
        // Every occurrence contributes one factor, for seen and unseen words.
        _prob = (_prob * pow(wordProb, wordEntry.second));
    }

    _prob = (_prob * ((float)(prob._nOfClass) / total._tnClass));
    cout << _prob << endl;
    _out << "The probalility of the " << _className << " beloning to " << prob._className << " is: " << _prob << endl;
    _out.close();
    return _prob;
}
最后是 main.cpp:
#include<iostream>
#include<vector>
#include"NB.h"
using namespace std;
// Trains two classes from user-supplied files, classifies one test document
// (labelled as class 1) and reports whether the prediction matches the label.
int main() {
    NB class1("class1");
    NB class2("class2");
    NB total(class1, class2, "all_combined");
    class1.calcProb(total);
    class2.calcProb(total);

    int nOfTestDocs = 0;
    int corrClass = 0;
    float accurancy = 0.0f;
    cout << "Enter the number of test documents\n";
    cin >> nOfTestDocs;

    NB test("test", 1);

    // Evaluate the two scores in a fixed order; each call prints and writes a
    // file, and the operand order of >= is unspecified.
    const float scoreClass1 = test.totalProb(class1, total);
    const float scoreClass2 = test.totalProb(class2, total);
    const int predictedClass = (scoreClass1 >= scoreClass2) ? 1 : 2;
    cout << "The test data belongs to class " << predictedClass << "\n";

    // The prediction is correct only when it equals the document's label
    // (a class-2 prediction must be compared against 2, not 1).
    if (test.classType == predictedClass) {
        corrClass++;
        // Guard against a zero or unreadable document count.
        if (nOfTestDocs > 0) {
            accurancy = (float)corrClass / nOfTestDocs;
            cout << "The accurancy is: " << accurancy << endl;
        }
    }

    // NOTE(review): "PAUSE" is a Windows shell command; on other systems this
    // call only prints an error.
    system("PAUSE");
    return 0;
}