用 C++ 实现朴素贝叶斯文本分类

时间:2015-12-29 16:26:43

标签: c++11 naivebayes

我正在编写代码,用朴素贝叶斯(Naive Bayes)分类器实现文本分类。我用一个非常小的例子(请参阅第 44 页)做了测试,它似乎能正常工作。

  1. 但我想知道这个实现是否正确,它是否也适用于其他训练集和测试集?我并不打算实现商业级别的朴素贝叶斯,这只是一个用来学习 C++ 的小任务。
  2. 我想知道代码写得怎么样?我编写代码的方式是否符合良好的 C++ 实践?
  3. 我知道还有很多可以改进的地方:例如目前我只测试一个测试文件,将来我想支持测试多个文件;目前我只做二分类,将来可能会做多分类。除此之外,代码还有其他值得改进的地方吗?
  4. 以下是代码。NB 头文件:

    #pragma once
    
    #include<iostream>
    #include<fstream>
    #include<string>
    #include<vector>
    #include<map>
    using namespace std;
    
    // Bag-of-words Naive Bayes helper. Depending on which constructor is used,
    // an instance holds the word counts of one training class, of two classes
    // combined, or of a single document to classify.
    class NB
    {
    public:
        // Combines the words of cl1 and cl2, writes them to "combineAll.txt" and
        // counts the distinct words of the combined text.
        NB(NB& cl1, NB& cl2, string className);
        // Prompts on stdin for file names until "q" is entered, then builds the
        // bag of words for those files.
        NB(string className);
        // Prompts on stdin for a single file name; classType is stored unchanged
        // in the public member of the same name.
        NB(string className, int classType);
        // Writes every word of the registered files to "<className>.txt" and
        // returns the words in the order they were read.
        vector <string> combineClassText();
        // Counts the words found in the file `classCombine` and writes them to
        // "<className>_bow.txt". With isTotal == true the number of distinct words
        // is added to _voc; otherwise the occurrences are added to _totalWordsinC.
        void bagOfWords(string classCombine, bool isTotal = false);
        // Fills _probCalc with one add-one-smoothed probability per word of
        // total's bag of words and writes them to "<className>_prob.txt".
        void calcProb(NB& total);
        // Multiplies the per-word probabilities of `prob` over this instance's
        // words, scales by prob._nOfClass / total._tnClass, and returns the result.
        float totalProb(NB& prob, NB& total);
        // Class label supplied to NB(string, int); not set by the other constructors.
        int classType;
    
    private:
        int _len = 0;              // scratch length used by the combining constructor
        float _prob = 1.0f;        // last value computed by totalProb
        int _voc = 0;              // distinct-word count, accumulated by bagOfWords(isTotal = true)
        int _nOfClass = 0;         // number of file names entered in NB(string)
        int _tnClass = 0;          // file count; the combining constructor stores the sum of both classes
        int _totalWordsinC = 0;    // word occurrences, accumulated by bagOfWords(isTotal = false)
        int _wordCounter = 0;      // miss counter used inside totalProb
        bool _isDone = false;      // loop flag of the interactive NB(string) constructor
        ifstream _in;              // input stream reused by the member functions
        ofstream _out;             // output stream reused by the member functions
        //string _classCombine;
        string _className;         // name used as the prefix of the generated files
        string _fileName;          // most recently entered file name
        vector <string> _combined; // words of both classes (combining constructor only)
        map<string, string> _category;  // file name -> class name
        map<string, int> _bow;          // word -> number of occurrences
        map<string, float> _probCalc;   // word -> probability computed by calcProb
    };
    

    NB.cpp 文件:

    #include "NB.h"
    #include<cmath>
    
    // Builds the combined corpus from two already-loaded classes.
    //
    // The words of cl1 followed by those of cl2 are collected in _combined,
    // written one per line to "combineAll.txt", and then counted with
    // bagOfWords("combineAll.txt", true) so that _voc receives the number of
    // distinct words. _tnClass becomes the sum of both classes' _tnClass.
    //
    // combineClassText() re-reads the class's input files and rewrites
    // "<className>.txt" on every call, so it is invoked exactly once per class
    // rather than once per copied word.
    NB::NB(NB& cl1, NB& cl2, string className)
    {
        _className = className;
        _out.open("combineAll.txt");
        if (_out.fail()) {
            perror("cannot write to combineAll.txt");
        }

        const vector<string> firstWords = cl1.combineClassText();
        _combined.insert(_combined.end(), firstWords.begin(), firstWords.end());

        const vector<string> secondWords = cl2.combineClassText();
        _combined.insert(_combined.end(), secondWords.begin(), secondWords.end());

        for (const string& word : _combined) {
            _out << word << '\n';
        }
        _out.close();

        _len = static_cast<int>(_combined.size());
        _tnClass = cl1._tnClass + cl2._tnClass;
        bagOfWords("combineAll.txt", true);
    }
    
    // Single-document constructor: stores the given class label, asks on stdin
    // for the one file that makes up this document, then produces its
    // "<className>.txt" word dump and its bag of words.
    NB::NB(string className, int classType) {
        this->classType = classType;
        _className = className;

        const string prompt = "Enter a filename for " + _className;
        cout << prompt << endl;
        cin >> _fileName;

        // Register the entered file under this instance's class name.
        _category[_fileName] = _className;

        combineClassText();

        const string combinedFile = _className + ".txt";
        bagOfWords(combinedFile);
    }
    
    // Training-class constructor: repeatedly prompts on stdin for file names
    // belonging to this class until "q" is entered, then produces the class's
    // "<className>.txt" word dump and its bag of words.
    //
    // Input also stops when stdin fails or reaches end-of-file: a failed
    // extraction leaves _fileName unchanged, which would otherwise loop forever.
    // A file name entered more than once is counted only once, because
    // _category (and therefore the text that is read) holds it only once.
    NB::NB(string className)
    {
        _className = className;
        while (!_isDone) {
            cout << "Enter a filename for " + _className << endl;
            if (!(cin >> _fileName) || _fileName == "q") {
                _isDone = true;
            } else if (_category.insert(make_pair(_fileName, _className)).second) {
                // Only newly registered files contribute to the document counts.
                _nOfClass++;
                _tnClass++;
            }
        }
        combineClassText();
        bagOfWords(_className + ".txt");
    }
    
    // Concatenates every file registered in _category into "<className>.txt",
    // one whitespace-separated token per line, and returns the same tokens in
    // the order they were read. Files are visited in _category's key order.
    // Open failures are reported through perror and otherwise ignored.
    vector<string> NB::combineClassText() {

        const string outputName = _className + ".txt";
        vector<string> words;

        _out.open(outputName);
        if (_out.fail()) {
            perror("cannot write to");
        }

        for (const auto& entry : _category) {
            const string& inputName = entry.first;
            _in.open(inputName);
            if (_in.fail()) {
                perror("cannot read from");
            }

            string token;
            while (_in >> token) {
                _out << token << endl;
                words.push_back(token);
            }
            _in.close();
        }

        _out.close();
        return words;
    }
    
    // Builds this instance's bag of words from the whitespace-separated tokens
    // in the file `classCombine` and writes it as "word: count" lines to
    // "<className>_bow.txt".
    //
    // classCombine  path of the token file to read.
    // isTotal       true  -> the number of distinct words in _bow is added to _voc;
    //               false -> the sum of all counts in _bow is added to
    //                        _totalWordsinC.
    //
    // '.' and ',' are removed from each token so that "word." and "word" are
    // counted as the same word (replacing them with a space would create the
    // distinct key "word "). Tokens that become empty are dropped.
    // Open failures are reported through perror and otherwise ignored.
    void NB::bagOfWords(string classCombine, bool isTotal) {

        const string name = _className + "_bow.txt";

        _in.open(classCombine);
        if (_in.fail()) {
            perror("cannot read from");
        }

        _out.open(name);
        if (_out.fail()) {
            perror("cannot write to");
        }

        // Count in a single pass; the map lookup replaces the pairwise comparison
        // of every token against every later token.
        map<string, int> counts;
        string token;
        while (_in >> token) {
            string word;
            for (char c : token) {
                if (c != '.' && c != ',') {
                    word += c;
                }
            }
            if (!word.empty()) {
                ++counts[word];
            }
        }

        // Assign (not add) so a word already present in _bow is overwritten with
        // the count from this file.
        for (const auto& entry : counts) {
            _bow[entry.first] = entry.second;
        }

        for (const auto& entry : _bow) {
            _out << entry.first << ": " << entry.second << '\n';
        }

        if (isTotal) {
            _voc += static_cast<int>(_bow.size());
        } else {
            for (const auto& entry : _bow) {
                _totalWordsinC += entry.second;
            }
        }
        _in.close();
        _out.close();
    }
    
    // Computes the add-one-smoothed probability of every word of total._bow for
    // this class and stores it in _probCalc:
    //
    //     _probCalc[word] = (count of word in _bow + 1) / (_totalWordsinC + total._voc)
    //
    // A word that does not occur in this class's _bow uses a count of 0, which
    // also covers the case of an empty _bow. The resulting table is written as
    // "word: probability" lines to "<className>_prob.txt".
    //
    // total  instance holding the combined bag of words and its size (_voc).
    void NB::calcProb(NB& total) {

        _out.open(_className + "_prob.txt");
        if (_out.fail()) {
            perror("cannot write to");
        }

        const int denominator = _totalWordsinC + total._voc;
        for (const auto& vocabEntry : total._bow) {
            // Direct map lookup instead of scanning _bow for every vocabulary word.
            const auto found = _bow.find(vocabEntry.first);
            const int countInClass = (found != _bow.end()) ? found->second : 0;
            _probCalc[vocabEntry.first] = static_cast<float>(countInClass + 1) / denominator;
        }

        for (const auto& entry : _probCalc) {
            _out << entry.first << ": " << entry.second << '\n';
        }
        _out.close();
    }
    
    // Scores this instance's bag of words against the class `prob`:
    //
    //     _prob = (prob._nOfClass / total._tnClass) * product over words of p(word)^count
    //
    // p(word) is prob._probCalc[word] when the word is known to `prob`;
    // otherwise the fallback 1 / (prob._totalWordsinC + total._voc) is used.
    // Every word is looked up independently and the fallback is raised to the
    // word's count like any other factor, so the result does not depend on
    // iteration order or on state left behind by an earlier call.
    //
    // The score is printed to stdout, written to
    // "<className>_<prob className>_prob.txt", stored in _prob and returned.
    // NOTE(review): a plain product of many small floats can underflow to 0 for
    // long documents; summing logarithms would avoid that but would change the
    // returned value's meaning.
    //
    // prob   class whose word probabilities (calcProb) are used.
    // total  combined instance supplying _voc and _tnClass.
    float NB::totalProb(NB& prob, NB& total) {

        _out.open(_className + "_" + prob._className + "_prob.txt");
        if (_out.fail()) {
            perror("cannot write to");
        }

        _prob = 1.0f;
        for (const auto& docEntry : _bow) {
            const auto known = prob._probCalc.find(docEntry.first);
            float wordProb;
            if (known != prob._probCalc.end()) {
                wordProb = known->second;
            } else {
                wordProb = 1.0f / (prob._totalWordsinC + total._voc);
            }
            _prob = _prob * pow(wordProb, docEntry.second);
        }

        _prob = _prob * (static_cast<float>(prob._nOfClass) / total._tnClass);
        cout << _prob << endl;
        _out << "The probalility of the " << _className << " beloning to " << prob._className << " is: " << _prob << endl;
        _out.close();
        return _prob;
    }
    

    最后 main.cpp

    #include<iostream>
    #include<vector>
    #include"NB.h"
    
    using namespace std;
    
    // Trains two classes from interactively entered files, classifies one test
    // document (labelled as class 1) and reports the accuracy when the
    // prediction matches the label.
    int main() {

        NB class1("class1");
        NB class2("class2");
        NB total(class1, class2, "all_combined");

        class1.calcProb(total);
        class2.calcProb(total);

        int nOfTestDocs = 0;
        int corrClass = 0;
        float accuracy = 0.0f;
        cout << "Enter the number of test documents\n";
        cin >> nOfTestDocs;

        NB test("test", 1);

        // totalProb prints and writes files, so the two calls are sequenced
        // explicitly; inside a single comparison their order is unspecified.
        const float scoreClass1 = test.totalProb(class1, total);
        const float scoreClass2 = test.totalProb(class2, total);
        const int predictedClass = (scoreClass1 >= scoreClass2) ? 1 : 2;

        cout << "The test data belongs to class " << predictedClass << "\n";

        // A prediction is correct only when it equals the document's own label.
        if (test.classType == predictedClass) {
            corrClass++;
            if (nOfTestDocs > 0) {
                accuracy = static_cast<float>(corrClass) / nOfTestDocs;
                cout << "The accurancy is: " << accuracy << endl;
            }
        }
        system("PAUSE");
        return 0;
    }
    

0 个答案:

没有答案