Coursera机器学习:HomeWork 2 Q16

QUESTION 16~18

根据老师的要求，基于[-1,1]上的均匀分布随机生成20个样本，相应的输出有20%的错误率（即以20%的概率将 sign(x) 的结果取反）；将20个样本按升序排列后，它们把区间[-1,1]分为21段，其中位于相邻样本之间的有19段，θ取这19段的中点当中的一个，s取1或-1。遍历所有的θ和s，找到 $E_{in}$ 最小的（s，θ）组合，就得到了所求的函数 g。

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

// Number of training samples drawn for every experiment.
const int SAMPLE_SIZE = 20;
// Number of independent experiments whose errors are averaged in main().
const int EXPERIMENT_COUNT = 5000;

// One-dimensional decision stump h(x) = coef * sign(x - threshold).
struct Hypothesis
{
    int coef;          // s: +1 or -1
    double threshold;  // theta
};

// Sign of x. Zero is mapped to -1, so the result is always +1 or -1.
int sign(double x)
{
    return (x > 0) ? 1 : -1;
}

// Returns num with its sign inverted.
int flipSign(int num)
{
    return -num;
}

// In-sample error: the fraction of samples that hypo misclassifies.
// inputVec and outputVec are parallel vectors (sample / label).
// Returns 0.0 when there are no samples instead of dividing by zero.
double calErrInSample(const std::vector<double>& inputVec,
                      const std::vector<int>& outputVec,
                      const Hypothesis& hypo)
{
    const std::size_t count = std::min(inputVec.size(), outputVec.size());
    if (count == 0)
    {
        return 0.0;
    }

    int errCount = 0;
    for (std::size_t i = 0; i < count; ++i)
    {
        if (outputVec[i] != hypo.coef * sign(inputVec[i] - hypo.threshold))
        {
            ++errCount;
        }
    }
    return double(errCount) / double(count);
}

// Out-of-sample error, evaluated with the closed-form expression
//   E_out = 0.5 + 0.3 * s * (|theta| - 1).
// std::fabs is used so that the floating-point absolute value is always
// selected (an unqualified abs() may bind to the integer overload and
// truncate the threshold).
double calErrOutSample(const Hypothesis& hypo)
{
    return 0.5 + 0.3 * double(hypo.coef) * (std::fabs(hypo.threshold) - 1.0);
}

// Random number in [-1, 1], derived from rand().
double getRand()
{
    return 2.0 * double(std::rand()) / double(RAND_MAX) - 1.0;
}

// Appends SAMPLE_SIZE random inputs to inputVec and sorts it ascending.
void getTrainingData(std::vector<double>& inputVec)
{
    for (int i = 0; i < SAMPLE_SIZE; ++i)
    {
        inputVec.push_back(getRand());
    }
    std::sort(inputVec.begin(), inputVec.end());
}

// Appends one label per input to outputVec: sign(x), flipped when a fresh
// random draw in [0, 1] is <= 0.2 (the 20% noise).
void calOutput(const std::vector<double>& inputVec, std::vector<int>& outputVec)
{
    for (std::size_t i = 0; i < inputVec.size(); ++i)
    {
        // Draw the noise number first so the rand() sequence is consumed in
        // the same order as the sample generation expects.
        const double randNum = double(std::rand()) / double(RAND_MAX);
        int output = sign(inputVec[i]);
        if (randNum <= 0.2)
        {
            output = flipSign(output);
        }
        outputVec.push_back(output);
    }
}

// Tries every midpoint between adjacent (sorted) inputs as threshold for the
// coefficient already stored in hypo.coef.
// Returns the smallest in-sample error found and stores the matching
// threshold in bestThres. hypo.threshold is used as scratch space.
// bestThres is left untouched when no candidate has an error below 1.0.
double getMinErrIn(const std::vector<double>& inputVec,
                   const std::vector<int>& outputVec,
                   Hypothesis& hypo,
                   double& bestThres)
{
    double minErrIn = 1.0;
    for (std::size_t i = 0; i + 1 < inputVec.size(); ++i)
    {
        hypo.threshold = (inputVec[i] + inputVec[i + 1]) / 2.0;
        const double curErrIn = calErrInSample(inputVec, outputVec, hypo);
        if (curErrIn < minErrIn)
        {
            minErrIn = curErrIn;
            bestThres = hypo.threshold;
        }
    }
    return minErrIn;
}

// Decision stump algorithm: searches s in {+1, -1} and all candidate
// thresholds, writes the best (s, theta) into hypo and returns its
// in-sample error. On a tie between the two signs, s = -1 is chosen.
double decisionStump(const std::vector<double>& inputVec,
                     const std::vector<int>& outputVec,
                     Hypothesis& hypo)
{
    // Initialised so that a defined value is used even if a search finds
    // no threshold with an error below 1.0.
    double bestThresPositive = 0.0;
    double bestThresNegtive = 0.0;

    hypo.coef = 1;
    const double minErrInPositive =
        getMinErrIn(inputVec, outputVec, hypo, bestThresPositive);

    hypo.coef = -1;
    const double minErrInNegtive =
        getMinErrIn(inputVec, outputVec, hypo, bestThresNegtive);

    if (minErrInPositive < minErrInNegtive)
    {
        hypo.coef = 1;
        hypo.threshold = bestThresPositive;
        return minErrInPositive;
    }

    hypo.coef = -1;
    hypo.threshold = bestThresNegtive;
    return minErrInNegtive;
}

// Runs EXPERIMENT_COUNT experiments, prints the stump learned in each one and
// finally the average in-sample and out-of-sample errors.
int main()
{
    std::srand((unsigned)std::time(NULL));

    double errInTotal = 0.0;
    double errOutTotal = 0.0;

    for (int i = 0; i < EXPERIMENT_COUNT; ++i)
    {
        std::vector<double> inputVec;
        std::vector<int> outputVec;
        Hypothesis hypo;

        getTrainingData(inputVec);
        calOutput(inputVec, outputVec);

        errInTotal += decisionStump(inputVec, outputVec, hypo);
        errOutTotal += calErrOutSample(hypo);

        std::cout << "—————–第" << i + 1 << "次计算结束——————-\n";
        std::cout << "s = " << hypo.coef << '\n';
        std::cout << "θ= " << hypo.threshold << '\n';
    }

    std::cout << "Average E-in = " << errInTotal / EXPERIMENT_COUNT << "\n";
    std::cout << "Average E-out = " << errOutTotal / EXPERIMENT_COUNT << "\n";
    return 0;
}
// Output (输出结果):

QUESTION 19~20

这一题把16题中的 decision stump 拓展到多维，要求找出E-in最小的那一维并在测试数据上计算对应维度的E-out:

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <vector>

// Number of input features per record.
#define DEMENSION 9

// Data files read by main().
const char *file = "training.txt";
const char *file_test = "testing.txt";

// One record of the data set: DEMENSION inputs followed by a label.
struct record
{
    double input[DEMENSION];
    int output;
};

// Projection of a record onto a single dimension.
struct singleDemensionRecord
{
    double input;
    int output;
};

// One-dimensional decision stump h(x) = coef * sign(x - threshold).
struct Hypothesis
{
    int coef;          // s: +1 or -1
    double threshold;  // theta
};

// Sign of x. Zero is mapped to -1, so the result is always +1 or -1.
int sign(double x)
{
    return (x > 0) ? 1 : -1;
}

// Reads whitespace-separated records (DEMENSION inputs, then the label) from
// dataFile, appends them to data and closes the stream.
// A record is stored only when every field was extracted successfully, so
// trailing whitespace or a truncated last line never produces a bogus entry
// (testing eof() before reading would append one).
void getData(std::ifstream& dataFile, std::vector<record>& data)
{
    record curRecord;
    for (;;)
    {
        for (int i = 0; i < DEMENSION; ++i)
        {
            dataFile >> curRecord.input[i];
        }
        dataFile >> curRecord.output;
        if (!dataFile)
        {
            break;
        }
        data.push_back(curRecord);
    }
    dataFile.close();
}

// Error rate of the stump of the given dimension on singleDemensionVec.
// demension is 1-based; the stump used is hypo[demension - 1].
// Returns 0.0 for an empty set instead of dividing by zero.
double calErr(const std::vector<singleDemensionRecord>& singleDemensionVec,
              const std::vector<Hypothesis>& hypo,
              int demension)
{
    if (singleDemensionVec.empty())
    {
        return 0.0;
    }

    const Hypothesis& stump = hypo[demension - 1];
    int errCount = 0;
    for (std::size_t i = 0; i < singleDemensionVec.size(); ++i)
    {
        const singleDemensionRecord& rec = singleDemensionVec[i];
        if (rec.output != stump.coef * sign(rec.input - stump.threshold))
        {
            ++errCount;
        }
    }
    return double(errCount) / double(singleDemensionVec.size());
}

// Ordering for single-dimension records: ascending input value.
// Takes const references, as required of a std::sort comparator.
bool recCompare(const singleDemensionRecord& a, const singleDemensionRecord& b)
{
    return a.input < b.input;
}

// Appends the (input, output) pairs of the given 1-based dimension of dataSet
// to singleDemensionVec and sorts the vector by ascending input.
void getInputByDemension(const std::vector<record>& dataSet,
                         std::vector<singleDemensionRecord>& singleDemensionVec,
                         int demension)
{
    singleDemensionRecord curRec;
    for (std::size_t i = 0; i < dataSet.size(); ++i)
    {
        curRec.input = dataSet[i].input[demension - 1];
        curRec.output = dataSet[i].output;
        singleDemensionVec.push_back(curRec);
    }
    std::sort(singleDemensionVec.begin(), singleDemensionVec.end(), recCompare);
}

// Tries every midpoint between adjacent (sorted) inputs as threshold for the
// coefficient already stored in hypo[demension - 1].coef.
// Returns the smallest in-sample error found and stores the matching
// threshold in bestThres. hypo[demension - 1].threshold is used as scratch.
// bestThres is left untouched when no candidate has an error below 1.0.
double getMinErrIn(const std::vector<singleDemensionRecord>& singleDemensionVec,
                   std::vector<Hypothesis>& hypo,
                   int demension,
                   double& bestThres)
{
    double minErrIn = 1.0;
    for (std::size_t i = 0; i + 1 < singleDemensionVec.size(); ++i)
    {
        hypo[demension - 1].threshold =
            (singleDemensionVec[i].input + singleDemensionVec[i + 1].input) / 2.0;
        const double curErrIn = calErr(singleDemensionVec, hypo, demension);
        if (curErrIn < minErrIn)
        {
            minErrIn = curErrIn;
            bestThres = hypo[demension - 1].threshold;
        }
    }
    return minErrIn;
}

// Decision stump algorithm over all dimensions.
// For every dimension the best (s, theta) is stored in hypo[dem]; the
// dimension with the lowest in-sample error is then evaluated on testSet.
// Prints the chosen dimension (1-based), its E-in and its test error.
// On a tie between the two signs, s = -1 is chosen; on a tie between
// dimensions, the lowest-numbered one is kept.
void decisionStump(const std::vector<record>& trainingSet,
                   const std::vector<record>& testSet,
                   std::vector<Hypothesis>& hypo)
{
    int minErrInDem = 1;
    double minErrIn = 1.1;  // above any possible error rate, so the first dimension always replaces it

    for (int dem = 0; dem < DEMENSION; ++dem)
    {
        std::vector<singleDemensionRecord> singleDemensionVec;
        getInputByDemension(trainingSet, singleDemensionVec, dem + 1);

        // Initialised so that a defined value is used even if a search finds
        // no threshold with an error below 1.0.
        double bestThresPositive = 0.0;
        double bestThresNegtive = 0.0;

        hypo[dem].coef = 1;
        const double minErrInPositive =
            getMinErrIn(singleDemensionVec, hypo, dem + 1, bestThresPositive);

        hypo[dem].coef = -1;
        const double minErrInNegtive =
            getMinErrIn(singleDemensionVec, hypo, dem + 1, bestThresNegtive);

        double curMinErrIn;
        if (minErrInPositive < minErrInNegtive)
        {
            hypo[dem].coef = 1;
            hypo[dem].threshold = bestThresPositive;
            curMinErrIn = minErrInPositive;
        }
        else
        {
            hypo[dem].coef = -1;
            hypo[dem].threshold = bestThresNegtive;
            curMinErrIn = minErrInNegtive;
        }

        if (minErrIn > curMinErrIn)
        {
            minErrIn = curMinErrIn;
            minErrInDem = dem + 1;
        }
    }

    std::cout << "The demension with min error is : " << minErrInDem << std::endl;
    std::cout << "min E-in = " << minErrIn << std::endl;

    std::vector<singleDemensionRecord> singleDemensionTestVec;
    getInputByDemension(testSet, singleDemensionTestVec, minErrInDem);
    std::cout << "min E-out = "
              << calErr(singleDemensionTestVec, hypo, minErrInDem)
              << std::endl << std::endl;
}

// Loads the training and test files, then runs the multi-dimensional
// decision stump. Exits with status 1 when either file cannot be opened.
int main()
{
    std::vector<record> trainingSet;               // training data
    std::vector<record> testSet;                   // test data
    std::vector<Hypothesis> hypoVec(DEMENSION);    // one hypothesis per dimension

    std::ifstream dataFile(file);
    std::ifstream testDataFile(file_test);

    if (!dataFile.is_open() || !testDataFile.is_open())
    {
        std::cerr << "ERROR —> 文件打开失败" << std::endl;
        return 1;
    }

    getData(dataFile, trainingSet);
    getData(testDataFile, testSet);

    decisionStump(trainingSet, testSet, hypoVec);
    return 0;
}
// Output (输出结果):

关于Machine Learning更多讨论与交流，敬请关注本博客和新浪微博songzi_tea.

有些人注定是等待别人的，有些人是注定被人等的。

相关文章：

你感兴趣的文章：

标签云：