对于我的项目,我需要从一组约45000个元素(清理不完整元素后为32000个)创建三个训练数据集。每一行都是这样的:
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
我需要用分层抽样对它们进行抽样。我相信我的实现是正确的,但它很慢:仅完成分层抽样部分就需要30秒以上。
之后,我必须从每个测试数据集中(对应每个训练集)随机挑选出20个数据点,并对这些数据点运行我的朴素贝叶斯分类器,以判断每个点属于正类(即工资 > 50K)还是负类(即工资 <= 50K)。
我的实现如下。我主要想问的是:应该如何调整代码,才能让分层抽样部分运行得更快?
另外,我究竟应该如何使用朴素贝叶斯进行分类?我的想法是:对于从测试数据集中随机选出的每个点,针对它的每个属性,在训练数据集中找出具有相同属性值的点;然后统计其中属于正类(成功)的点数,再统计属于负类(失败)的点数,并据此计算该属性值在每个类别下的概率。对每个属性都这样做之后,我把这些概率相乘,比较成功概率和失败概率哪一个更大,以此作为我的预测。最后检查我的预测是否与实际结果相符。
#include <algorithm> //std::remove
#include <cstdlib> //std::rand, std::srand, EXIT_SUCCESS, EXIT_FAILURE
#include <ctime> //std::time
#include <fstream> //std::ifstream
#include <iostream> //std::cout
#include <sstream> //std::istringstream
#include <string> //std::string
#include <vector> //std::vector
//One record of the input data set. The members appear in the same order as
//the columns of an input row, ending with the salary class label.
struct Person{
    //Numeric column 1.
    int age;
    //Nominal column 2.
    std::string workclass;
    //Numeric column 3.
    int fnlwgt;
    //Nominal column 4.
    std::string education;
    //Numeric column 5.
    int educationNum;
    //Nominal columns 6-10.
    std::string maritalStatus;
    std::string occupation;
    std::string relationship;
    std::string race;
    std::string sex;
    //Numeric columns 11-13.
    int capitalGain;
    int capitalLoss;
    int hoursPerWeek;
    //Nominal column 14.
    std::string nativeCountry;
    //Class label; the rest of the file compares it against ">50K" and "<=50K".
    std::string salary;
};
//Global store of every complete record parsed from the input file: setData()
//appends to it, and setPositive()/setNegative() read from it. Despite the
//name it holds the whole cleaned data set, not only a test split.
std::vector<Person> testData;
//Writes one person to std::cout on a single line: every attribute from age
//through nativeCountry, separated by single spaces. The salary label is not
//printed.
void printPerson(Person person){
    const char *const sep = " ";
    std::ostream &out = std::cout;

    out << person.age << sep << person.workclass << sep;
    out << person.fnlwgt << sep << person.education << sep;
    out << person.educationNum << sep << person.maritalStatus << sep;
    out << person.occupation << sep << person.relationship << sep;
    out << person.race << sep << person.sex << sep;
    out << person.capitalGain << sep << person.capitalLoss << sep;
    out << person.hoursPerWeek << sep << person.nativeCountry;
    out << std::endl;
}
//Returns the decimal text form of x, produced by streaming it into a
//std::ostringstream.
std::string convertInt(int x){
    std::ostringstream buffer;
    buffer << x;
    return buffer.str();
}
//Generates a pseudo-random integer in the half-open range [0, max) using rand().
//A non-positive max returns 0 instead of evaluating rand() % 0, which would be
//a division by zero.
//NOTE(review): rand() never exceeds RAND_MAX, so indices above RAND_MAX can
//never be produced; confirm RAND_MAX on the target platform is larger than the
//biggest container this is used to index.
int randNumGenerator(int max){
    if(max <= 0){
        return 0;
    }
    return rand() % max;
}
//Parses one comma-separated input row and, if it is complete, appends it to
//the global testData vector.
//  person - scratch record (taken by value); its fields are overwritten with
//           the parsed values before it is stored.
//  line   - one raw line of the input file, fields in Person member order.
//A row is dropped when:
//  * it has fewer than fifteen fields, or a numeric column holds something
//    that is not an integer (including the "?" missing-value marker) - the
//    stream extraction fails in both cases;
//  * any nominal attribute is the "?" missing-value marker.
//The salary label itself is not checked for "?", matching the earlier behaviour.
void setData(Person person, std::string line){
    //Turn every comma into a blank so the stream splits on whitespace. Replacing
    //(rather than deleting) the commas keeps "a,b" as two separate fields.
    std::replace(line.begin(), line.end(), ',', ' ');
    std::istringstream fields(line);

    if(!(fields >> person.age >> person.workclass >> person.fnlwgt
                >> person.education >> person.educationNum
                >> person.maritalStatus >> person.occupation
                >> person.relationship >> person.race >> person.sex
                >> person.capitalGain >> person.capitalLoss
                >> person.hoursPerWeek >> person.nativeCountry
                >> person.salary)){
        return; //short or malformed row
    }

    const std::string missing = "?";
    if(person.workclass == missing || person.education == missing ||
       person.maritalStatus == missing || person.occupation == missing ||
       person.relationship == missing || person.race == missing ||
       person.sex == missing || person.nativeCountry == missing){
        return; //incomplete row
    }

    testData.push_back(person);
}
//Sets up strata for positive values
std::vector<Person> setPositive(std::vector<Person> data){
for(int i = 0; i < testData.size(); i++){
if(testData[i].salary == ">50K"){
data.push_back(testData[i]);
}
}
return data;
}
//Sets up strata for negative values
std::vector<Person> setNegative(std::vector<Person> data){
for(int i = 0; i < testData.size(); i++){
if(testData[i].salary == "<=50K"){
data.push_back(testData[i]);
}
}
return data;
}
std::vector<Person> sample(std::vector<Person> &wholeDataSet, int percentage){
int wholeDataSize = wholeDataSet.size();
std::vector<Person> stratifiedSet;
int limit = (wholeDataSize * percentage) / 100;
int randNum= 0;
std::vector<bool> numsUsedAlready(wholeDataSize);
for(int i = 0; i < limit; i++){
randNum = randNumGenerator(wholeDataSize);
while(numsUsedAlready[randNum]){
randNum = randNumGenerator(wholeDataSize);
}
numsUsedAlready[randNum] = true;
stratifiedSet.push_back(wholeDataSet[randNum]);
wholeDataSet.erase(wholeDataSet.begin() + randNum);
//std::cout << i << std::endl;
}
//delete numsUsedAlready
return stratifiedSet;
}
std::vector<Person> concatVectors(std::vector<Person> a, std::vector<Person> b){
std::vector<Person> ab;
ab.reserve(a.size() + b.size());
ab.insert(ab.end(), a.begin(), a.end()); //Add a
ab.insert(ab.end(), b.begin(), b.end()); //Add b
return ab;
}
//Placeholder for the per-record classification step; the real logic is not
//implemented yet.
//NOTE(review): presumably meant to report whether `person` is predicted to
//earn more than 50K - confirm once the classifier is written.
//Returns false unconditionally so that calling the stub is well defined; the
//body previously ended without returning a value.
bool classifier(Person person){
    (void)person; //unused until the classifier is implemented
    return false;
}
float naiveBayesian(std::vector<Person> trainingSet, std::vector<Person> testingSet){
float accuracy = 0;
int randNum = 0;
std::vector<Person> sampleSet;
std::vector<bool> numsUsedAlready(testingSet.size());
for(int i = 0; i < 20; i++){
randNum = randNumGenerator(testingSet.size());
while(numsUsedAlready[randNum]){
randNum = randNumGenerator(testingSet.size());
}
numsUsedAlready[randNum] = true;
sampleSet.push_back(testingSet[randNum]);
}
float ageProb;
float workclassProb;
float fnlwgtProb;
float educationProb;
float educationNumProb;
std::string maritalStatus;
std::string occupation;
std::string relationship;
std::string race;
std::string sex;
int capitalGain;
int capitalLoss;
int hoursPerWeek;
std::string nativeCountry;
std::string salary;
bool salaryGreaterThan50k = false;
for(int i = 0; i < sampleSet.size(); i++){
//salaryGreaterThan50k = classifier(sampleSet[i]);
}
return accuracy;
}
//Seeds rand() from the current time, splits the global data into a positive
//and a negative stratum, and builds three train/test splits (10%, 30% and 50%
//training share). For each split the same percentage is drawn from both strata
//and the left-over records form the matching testing set.
//NOTE(review): only the 10% split is passed to naiveBayesian() and its result
//is discarded; the 30% and 50% splits are built but never evaluated.
void stratifiedSample(){
    srand(time(NULL));

    const std::vector<Person> positives = setPositive(std::vector<Person>());
    const std::vector<Person> negatives = setNegative(std::vector<Person>());

    //10% split: sample() removes the drawn records from the "rest" vectors.
    std::vector<Person> posRest10(positives);
    std::vector<Person> negRest10(negatives);
    const std::vector<Person> posTrain10 = sample(posRest10, 10);
    const std::vector<Person> negTrain10 = sample(negRest10, 10);
    std::cout << "Done w/ 10" << std::endl;

    //30% split
    std::vector<Person> posRest30(positives);
    std::vector<Person> negRest30(negatives);
    const std::vector<Person> posTrain30 = sample(posRest30, 30);
    const std::vector<Person> negTrain30 = sample(negRest30, 30);
    std::cout << "Done w/ 30" << std::endl;

    //50% split
    std::vector<Person> posRest50(positives);
    std::vector<Person> negRest50(negatives);
    const std::vector<Person> posTrain50 = sample(posRest50, 50);
    const std::vector<Person> negTrain50 = sample(negRest50, 50);
    std::cout << "Done w/ 50" << std::endl;

    //Merge the per-stratum pieces into full training and testing sets.
    const std::vector<Person> training10 = concatVectors(posTrain10, negTrain10);
    const std::vector<Person> training30 = concatVectors(posTrain30, negTrain30);
    const std::vector<Person> training50 = concatVectors(posTrain50, negTrain50);
    const std::vector<Person> testing10 = concatVectors(posRest10, negRest10);
    const std::vector<Person> testing30 = concatVectors(posRest30, negRest30);
    const std::vector<Person> testing50 = concatVectors(posRest50, negRest50);

    const float accuracy10 = naiveBayesian(training10, testing10);
    (void)accuracy10; //result is not reported anywhere yet
}
//Reads the input file line by line and hands every line to setData(), which
//stores the complete rows.
//  file - an already opened input stream; it is consumed until getline fails.
void readInputFile(std::ifstream &file){
    std::string line;
    while(std::getline(file, line)){
        //Value-initialise the scratch record so that passing it by value does
        //not copy indeterminate integers.
        Person person = Person();
        setData(person, line);
    }
}
//Writes the command-line usage message to std::cout, followed by a newline
//and a flush.
void usageString(){
    static const char usage[] = "Usage: myProgram.exe <input_file>";
    std::cout << usage;
    std::cout << std::endl;
}
//Entry point. Expects the path of the data file as the first command-line
//argument, loads it and runs the stratified sampling.
//Returns EXIT_FAILURE when the argument is missing or the file cannot be
//opened, and EXIT_SUCCESS otherwise. It previously returned 1 on success,
//which callers of the program see as a failure.
int main(int argc, char** argv){
    if(argc < 2){
        usageString();
        return EXIT_FAILURE;
    }

    const char *inputfile = argv[1];
    std::ifstream input(inputfile);
    if(!input.is_open()){
        std::cerr << "Error: Data file doesn't exist" << std::endl;
        return EXIT_FAILURE;
    }

    readInputFile(input);
    stratifiedSample();
    return EXIT_SUCCESS;
}