分层抽样/朴素贝叶斯(C++)

时间:2015-11-10 03:52:13

标签: c++ bayesian sampling

对于我的项目,我需要从一组约45000个元素(清理不完整元素后为32000个)创建三个训练数据集。每一行都是这样的:

38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K

我应该采用分层抽样对它们进行抽样,我相信我的实施是正确的,但是它很慢,因为完成分层抽样部分需要30秒以上。

之后,我必须从每个测试数据集中随机挑选出20个数据点(每个训练集对应一个测试数据集),并对这些数据点运行我的朴素贝叶斯分类器,以确定它是正类(即工资 >50K)还是负类(即工资 <=50K)。

我的实现如下。基本上,我想问的是:应该如何调整代码,才能让分层抽样部分运行得更快。

另外,我到底应该如何使用朴素贝叶斯进行分类?我的想法是:对于从测试数据集中随机选出的每个点,针对每个属性,在训练数据集中找出该属性取相同值的匹配点的数量;然后在这些匹配点中分别统计成功(正类)的点数和失败(负类)的点数,并据此计算该属性在每个类别下的概率。对每个属性都执行此操作之后,我将这些概率相乘,并查看结果更接近成功概率还是失败概率。最后检查我的猜测是否与实际结果相符。

#include <algorithm> //std::remove
#include <cstdlib> //rand, srand, RAND_MAX, EXIT_SUCCESS, EXIT_FAILURE
#include <ctime> //std::time
#include <fstream> //std::ifstream
#include <iostream> //std::cout
#include <sstream> //std::istringstream
#include <string> //std::string
#include <vector> //std::vector

//One record of the input file: fourteen attributes followed by the salary
//class label (compared against ">50K" and "<=50K" elsewhere in this file).
struct Person{
    int age;
    std::string workclass;
    int fnlwgt;
    std::string education;
    int educationNum;
    std::string maritalStatus;
    std::string occupation;
    std::string relationship;
    std::string race;
    std::string sex;
    int capitalGain;
    int capitalLoss;
    int hoursPerWeek;
    std::string nativeCountry;
    std::string salary; //class label

    //Zero the numeric fields so that a default-constructed Person never
    //carries indeterminate values; the strings start out empty on their own.
    Person()
        : age(0), fnlwgt(0), educationNum(0),
          capitalGain(0), capitalLoss(0), hoursPerWeek(0) {}
};

//Every complete record parsed from the input file. Despite its name this is
//the whole data set: setData() appends to it, and setPositive() and
//setNegative() read it to build the two class strata.
std::vector<Person> testData;

//Prints data of single person
void printPerson(Person person){
    std::cout << person.age << " " << person.workclass << 
        " " << person.fnlwgt << " " << person.education << 
        " " << person.educationNum << " " << person.maritalStatus <<
        " " << person.occupation << " " << person.relationship <<
        " " << person.race << " " << person.sex << 
        " " << person.capitalGain << " " << person.capitalLoss << 
        " " << person.hoursPerWeek << " " << person.nativeCountry <<
        std::endl;
}

//Returns the decimal text form of x, produced by streaming it into a
//std::ostringstream
std::string convertInt(int x){
    std::ostringstream formatter;
    formatter << x;
    return formatter.str();
}

//Generates a pseudo-random integer in the range [0, max).
//  max - exclusive upper bound; for max <= 0 there is no valid value and
//        0 is returned instead of dividing by zero.
//Uses rand(), so the sequence depends on the seed set with srand().
int randNumGenerator(int max){
    if(max <= 0){
        return 0;
    }

    //rand() only yields values in [0, RAND_MAX], and RAND_MAX is allowed to
    //be as small as 32767. When max is larger than that, chain several
    //rand() calls as base-(RAND_MAX + 1) digits so that every index below
    //max can actually be produced. For max <= RAND_MAX + 1 the loop does not
    //run and the result is plain rand() % max.
    const unsigned long long base = static_cast<unsigned long long>(RAND_MAX) + 1ULL;
    const unsigned long long bound = static_cast<unsigned long long>(max);

    unsigned long long value = static_cast<unsigned long long>(rand());
    unsigned long long span = base;
    while(span < bound){
        value = value * base + static_cast<unsigned long long>(rand());
        span *= base;
    }

    return static_cast<int>(value % bound);
}

//Parses one comma-separated record and appends it to testData when it is
//complete.
//  person - scratch record (a by-value copy) that is filled in and stored
//  line   - raw input line; taken by value because the commas are stripped
//           from it before parsing
//A line is dropped without being stored when its fifteen fields cannot all
//be read, or when any nominal attribute holds the missing-value marker "?".
void setData(Person person, std::string line){
    //With the commas removed the fields are separated by whitespace only
    line.erase(std::remove(line.begin(), line.end(), ','), line.end());

    std::stringstream s(line);

    //A "?" (or any other non-number) in a numeric column makes the
    //extraction itself fail, so the numeric fields need no separate "?"
    //test and no int-to-string round trip.
    if(!(s >> person.age >> person.workclass >> person.fnlwgt >>
            person.education >> person.educationNum >> person.maritalStatus >>
            person.occupation >> person.relationship >> person.race >>
            person.sex >> person.capitalGain >> person.capitalLoss >>
            person.hoursPerWeek >> person.nativeCountry >> person.salary)){
        return;
    }

    //Nominal attributes flag a missing value with "?"
    const bool hasMissingValue =
        person.workclass == "?" || person.education == "?" ||
        person.maritalStatus == "?" || person.occupation == "?" ||
        person.relationship == "?" || person.race == "?" ||
        person.sex == "?" || person.nativeCountry == "?";
    if(hasMissingValue){
        return;
    }

    testData.push_back(person);
}

//Sets up strata for positive values
std::vector<Person> setPositive(std::vector<Person> data){
    for(int i = 0; i < testData.size(); i++){
        if(testData[i].salary == ">50K"){
            data.push_back(testData[i]);
        }
    }
    return data;
}

//Sets up strata for negative values
std::vector<Person> setNegative(std::vector<Person> data){
    for(int i = 0; i < testData.size(); i++){
        if(testData[i].salary == "<=50K"){
            data.push_back(testData[i]);
        }
    }
    return data;
}

std::vector<Person> sample(std::vector<Person> &wholeDataSet, int percentage){
    int wholeDataSize = wholeDataSet.size();

    std::vector<Person> stratifiedSet;

    int limit = (wholeDataSize * percentage) / 100;

    int randNum= 0;

    std::vector<bool> numsUsedAlready(wholeDataSize);

    for(int i = 0; i < limit; i++){
        randNum = randNumGenerator(wholeDataSize);
        while(numsUsedAlready[randNum]){
            randNum = randNumGenerator(wholeDataSize);
        }
        numsUsedAlready[randNum] = true;
        stratifiedSet.push_back(wholeDataSet[randNum]); 
        wholeDataSet.erase(wholeDataSet.begin() + randNum);
        //std::cout << i << std::endl;
    } 

    //delete numsUsedAlready
    return stratifiedSet;
}

std::vector<Person> concatVectors(std::vector<Person> a, std::vector<Person> b){
    std::vector<Person> ab;
    ab.reserve(a.size() + b.size());
    ab.insert(ab.end(), a.begin(), a.end()); //Add a
    ab.insert(ab.end(), b.begin(), b.end()); //Add b
    return ab;
}

//Placeholder for the per-record classifier. Judging by its only (commented
//out) use in naiveBayesian(), the result is meant to say whether the person
//is predicted to earn more than 50K.
//TODO: not implemented yet. It returns false so that a call has a defined
//result; flowing off the end of a non-void function is undefined behavior.
bool classifier(Person person){
    (void)person; //unused until the classifier is implemented
    return false;
}

float naiveBayesian(std::vector<Person> trainingSet, std::vector<Person> testingSet){
    float accuracy = 0;
    int randNum = 0;

    std::vector<Person> sampleSet;
    std::vector<bool> numsUsedAlready(testingSet.size());

    for(int i = 0; i < 20; i++){
        randNum = randNumGenerator(testingSet.size());
        while(numsUsedAlready[randNum]){
            randNum = randNumGenerator(testingSet.size());
        }
        numsUsedAlready[randNum] = true;
        sampleSet.push_back(testingSet[randNum]);
    }

    float ageProb;
    float workclassProb;
    float fnlwgtProb;
    float educationProb;
    float educationNumProb;
    std::string maritalStatus;
    std::string occupation;
    std::string relationship;
    std::string race;
    std::string sex;
    int capitalGain;
    int capitalLoss;
    int hoursPerWeek;
    std::string nativeCountry;
    std::string salary;


    bool salaryGreaterThan50k = false;
    for(int i = 0; i < sampleSet.size(); i++){
        //salaryGreaterThan50k = classifier(sampleSet[i]);
    }

    return accuracy;
}

//Splits the global data set into the ">50K" and "<=50K" strata, draws 10%,
//30% and 50% of each stratum with sample(), joins the draws into three
//stratified sets and the left-overs into three testing sets, and finally
//runs naiveBayesian() on the 10% split.
void stratifiedSample(){
    srand(time(NULL));

    //The two class strata
    const std::vector<Person> positives = setPositive(std::vector<Person>());
    const std::vector<Person> negatives = setNegative(std::vector<Person>());

    //sample() removes what it draws from its argument, so every split works
    //on its own copy of a stratum; after the call that copy holds the
    //records that were not drawn.

    //10% split
    std::vector<Person> posRemainder10 = positives;
    std::vector<Person> negRemainder10 = negatives;
    const std::vector<Person> posDrawn10 = sample(posRemainder10, 10);
    const std::vector<Person> negDrawn10 = sample(negRemainder10, 10);
    std::cout << "Done w/ 10" << std::endl;

    //30% split
    std::vector<Person> posRemainder30 = positives;
    std::vector<Person> negRemainder30 = negatives;
    const std::vector<Person> posDrawn30 = sample(posRemainder30, 30);
    const std::vector<Person> negDrawn30 = sample(negRemainder30, 30);
    std::cout << "Done w/ 30" << std::endl;

    //50% split
    std::vector<Person> posRemainder50 = positives;
    std::vector<Person> negRemainder50 = negatives;
    const std::vector<Person> posDrawn50 = sample(posRemainder50, 50);
    const std::vector<Person> negDrawn50 = sample(negRemainder50, 50);
    std::cout << "Done w/ 50" << std::endl;

    //Positive draws first, then negative draws
    const std::vector<Person> stratified10 = concatVectors(posDrawn10, negDrawn10);
    const std::vector<Person> stratified30 = concatVectors(posDrawn30, negDrawn30);
    const std::vector<Person> stratified50 = concatVectors(posDrawn50, negDrawn50);

    const std::vector<Person> testing10 = concatVectors(posRemainder10, negRemainder10);
    const std::vector<Person> testing30 = concatVectors(posRemainder30, negRemainder30);
    const std::vector<Person> testing50 = concatVectors(posRemainder50, negRemainder50);

    //Only the 10% split is evaluated; the result is not used any further
    const float accuracy10 = naiveBayesian(stratified10, testing10);
    (void)accuracy10;
}

//Reads the file
void readInputFile(std::ifstream &file){
    std::string line;
    while(getline(file,line)){
        Person person;
        setData(person, line);
    }
}

//Writes the command-line usage hint to std::cout, followed by a newline
void usageString(){
    static const char usageText[] = "Usage: myProgram.exe <input_file>";
    std::cout << usageText << std::endl;
}


//Entry point: expects the path of the data file as the first argument,
//loads it with readInputFile() and runs stratifiedSample().
//Returns EXIT_FAILURE when the argument is missing or the file cannot be
//opened, and EXIT_SUCCESS otherwise.
int main(int argc, char** argv){
    if (argc < 2){
        usageString();
        return EXIT_FAILURE;
    }

    const char *inputfile = argv[1];

    std::ifstream input(inputfile);
    if(!input.is_open()){
        std::cerr << "Error: Data file doesn't exist" << std::endl;
        return EXIT_FAILURE;
    }

    readInputFile(input);
    stratifiedSample();

    //A non-zero exit status tells the caller that the run failed, so a
    //completed run must report success.
    return EXIT_SUCCESS;
}

0 个答案:

没有答案