基于AdaBoost的二分类算法
这是从github上看到的一个代码,原作者实现的方法可以实现多分类也可实现回归,我将代码做了下修改只实现二分类问题。
基本思想就是AdaBoost算法,在这里主要介绍下如何实现弱分类器。
两个数据:
vector<vector<double>> samples;//原数据,每一行是一个样本
vector<vector<int>> sortedSampleIndices;//将每一维特征排序后的结果,每一列对应一个特征,列中元素是按该特征值升序排列的样本索引号(与代码中的成员变量名一致)
每个弱分类器参数:
featureIndex:特征索引
threshold:阈值
在当前阈值下,当样本特征值大于阈值时,输出outputLarger
如果大于阈值样本的权值和大于小于阈值样本的权值和,outputLarger=1,反之outputLarger=-1,
outputLarger:样本特征值大于阈值时的输出
outputSmaller:样本特征值小于阈值时的输出
构造的弱分类器:
1、选择一个特征
2、在该特征下选择一个阈值
3、由于该特征所包含的所有值已排序,因此从小到大(或从大到小)依次遍历元素,将其作为阈值
4、在该阈值下,正负样本权值和分别为:positiveWeightSum,negativeWeightSum
正负样本大于阈值的权值和:positiveWeightSumLarger(即true positive样本权值和),negativeWeightSumLarger(即false positive 样本权值和)
5、错误率error=[positiveWeightSumLarger*(1.0-outputLarger)
+(positiveWeightSum-positiveWeightSumLarger)*(1.0-outputSmaller)
+negativeWeightSumLarger*(1.0+outputLarger)
+(negativeWeightSum-negativeWeightSumLarger)*(1.0+outputSmaller)]/2
(当输出取±1且权值归一化时,该式即被错分样本的权值和,取值范围为[0,1])
6、根据错误率选择最优分类器,依次循环
AdaBoost.h
1 #ifndef ADABOOST_H 2 #define ADABOOST_H 3 #include<vector> 4 #include<string> 5 #include<iostream> 6 using namespace std; 7 class AdaBoost 8 { 9 public: 10 AdaBoost():sampleTotal(0),featureTotal(0){} 11 void train(int roundTotal);//trainging 12 double predict(vector<double> featureVector);//predict result with weight:alpha_m*1 or alpha_m*(-1) 13 void setTrainingSamples(string filename);//input samples 14 void writeModelToFile(string modleFilename);//output models 15 void readModelFromFile(string modleFilenmae);//read models 16 int featureTotal;//number of features 17 private: 18 class DecisionStump 19 { 20 public: 21 DecisionStump():featureIndex_(-1),error_(-1){}; 22 void set(int featureIndex,double threshold,double error,double outputLarger,double outputSmaller); 23 double evaluate(double featureValue);//give a input,the classifier give the corresponding outputs 24 double evaluate(vector<double> featureVector); 25 int featureIndex(){return featureIndex_;} 26 double threshold(){return threshold_;} 27 double error(){return error_;} 28 double outputLarger(){return outputLarger_;} 29 double outputSmaller(){return outputSmaller_;} 30 private: 31 int featureIndex_; 32 double threshold_; 33 double error_; 34 /**************************** 35 通常情况下,我们认为当特征值大于阈值时,输出outputLarger,反之输出outputSmaller. 36 在这里为了求解最优分类器,如果在当前阈值下,大于阈值的样本的权值和大于小于阈值的样本的权值和时,outputLarger=1,反之,outputSmaller=-1; 37 outputSmaller含义类似 38 ********************************/ 39 double outputLarger_;//if featureVlaue>threshold,the classifier's result=outputLarger. 
40 double outputSmaller_;//if featureValue<threshold,the classifier's result=outputSmaller 41 }; 42 void sortSampleIndices();//基于每个特征对样本排序 43 void initializeWeights();//初始化权值 44 void trainRound();//一次迭代,结果为一个弱分类器 45 void calcWeightSum();//计算所有样本权值和,正负样本大于阈值样本的权值和 46 DecisionStump learnOptimalClassifier(int featureIndex);//在固定特征下,求解最优分类器 47 void computeClassifierOutput(double weightSumLarger,double &outputLarger,double &outputSmaller);//根据正负样本权值分布,设置分类器输出 48 double computeError(double positiveWeightSumLarger,double negativeWeightSumLarger,double outputLarger,double outputSmaller);//计算错误率 49 void updateWeight(DecisionStump bestClassifier);//更新权值 50 51 int sampleTotal; 52 vector<DecisionStump> weakClassifier; 53 /********training data*******/ 54 vector<vector<double>> samples; 55 vector<bool> labels; 56 vector<double> weights; 57 /*********traing parameters*****************/ 58 vector<vector<int>> sortedSampleIndices; 59 double weightSum; 60 double weightLabelSum; 61 double positiveWeightSum; 62 double negativeWeightSum; 63 }; 64 65 #endif
AdaBoost.cpp
1 #include"stdafx.h" 2 #include"AdaBoost.h" 3 #include"readSampleFromFile.h" 4 #include<iostream> 5 #include<fstream> 6 #include<vector> 7 #include<algorithm> 8 #include<string> 9 #include<cmath> 10 using namespace std; 11 12 struct SampleElement 13 { 14 int sampleIndex; 15 double sampleValue; 16 double operator<(SampleElement &comparisionElement){return sampleValue<comparisionElement.sampleValue;} 17 }; 18 int cmp(SampleElement a,SampleElement b) 19 { 20 return a.sampleValue<b.sampleValue; 21 } 22 void AdaBoost::train(int roundTotal) 23 { 24 for(int roundIndex=0;roundIndex<roundTotal;++roundIndex) 25 { 26 trainRound(); 27 } 28 } 29 30 void AdaBoost::setTrainingSamples(const string filename) 31 { 32 readSampleDataFile(filename,samples,labels); 33 sampleTotal=samples.size(); 34 if(sampleTotal<=0) 35 { 36 cerr<<"no training samples"<<endl; 37 exit(1); 38 } 39 featureTotal=samples[0].size(); 40 initializeWeights(); 41 sortSampleIndices(); 42 weakClassifier.clear(); 43 } 44 45 double AdaBoost::predict(const vector<double>featureVector) 46 { 47 double score=0; 48 for(int classifierIndex=0;classifierIndex<weakClassifier.size();++classifierIndex) 49 { 50 score+=weakClassifier[classifierIndex].evaluate(featureVector); 51 } 52 return score; 53 } 54 void AdaBoost::readModelFromFile(string modelFilename) 55 { 56 ifstream inputModelStream(modelFilename.c_str(),ios_base::in); 57 if(inputModelStream.fail()) 58 { 59 cerr<<"error:can't open file "<<modelFilename<<endl; 60 exit(1); 61 } 62 int roundTotal; 63 inputModelStream>>roundTotal; 64 weakClassifier.resize(roundTotal); 65 for(int roundIndex=0;roundIndex<roundTotal;++roundIndex) 66 { 67 int featureIndex; 68 double threshold,outputLarger,outputSmaller,error; 69 inputModelStream>>featureIndex; 70 inputModelStream>>threshold; 71 inputModelStream>>outputLarger; 72 inputModelStream>>outputSmaller; 73 weakClassifier[roundIndex].set(featureIndex,threshold,0,outputLarger,outputSmaller); 74 75 } 76 inputModelStream.close(); 77 } 78 
void AdaBoost::writeModelToFile(string modelFilename) 79 { 80 ofstream outputModelStream(modelFilename.c_str(),ios_base::out); 81 if(outputModelStream.fail()) 82 { 83 cerr<<"error:can't open file "<<modelFilename<<endl; 84 exit(1); 85 } 86 int roundTotal=weakClassifier.size(); 87 outputModelStream<<roundTotal<<endl; 88 for(int roundIndex=0;roundIndex<roundTotal;++roundIndex) 89 { 90 outputModelStream<<weakClassifier[roundIndex].featureIndex()<<" "; 91 outputModelStream<<weakClassifier[roundIndex].threshold()<<" "; 92 outputModelStream<<weakClassifier[roundIndex].outputLarger()<<" "; 93 outputModelStream<<weakClassifier[roundIndex].outputSmaller()<<endl; 94 } 95 outputModelStream.close(); 96 } 97 double AdaBoost::DecisionStump::evaluate(double featureValue) 98 { 99 if(featureValue>threshold_) 100 return outputLarger_; 101 else 102 return outputSmaller_; 103 } 104 105 double AdaBoost::DecisionStump::evaluate(vector<double>featureVector) 106 { 107 if(featureVector[featureIndex_]>threshold_) 108 return outputLarger_; 109 else 110 return outputSmaller_; 111 } 112 void AdaBoost::DecisionStump::set(int featureIndex,double threshold,double error,double outputLarger,double outputSmaller) 113 { 114 featureIndex_=featureIndex; 115 threshold_=threshold; 116 error_=error; 117 outputLarger_=outputLarger; 118 outputSmaller_=outputSmaller; 119 } 120 void AdaBoost::sortSampleIndices() 121 { 122 sortedSampleIndices.resize(featureTotal); 123 for(int featureIndex=0;featureIndex<featureTotal;++featureIndex) 124 { 125 sortedSampleIndices[featureIndex].resize(sampleTotal); 126 vector<SampleElement> sampleElementList(sampleTotal); 127 SampleElement sampleElement; 128 for(int sampleIndex=0;sampleIndex<sampleTotal;++sampleIndex) 129 { 130 sampleElement.sampleIndex=sampleIndex; 131 sampleElement.sampleValue=samples[sampleIndex][featureIndex]; 132 sampleElementList[sampleIndex]=sampleElement; 133 } 134 sort(sampleElementList.begin(),sampleElementList.end()); 135 for(int 
sortIndex=0;sortIndex<sampleTotal;++sortIndex) 136 { 137 sortedSampleIndices[featureIndex][sortIndex]=sampleElementList[sortIndex].sampleIndex; 138 } 139 } 140 141 } 142 void AdaBoost::initializeWeights() 143 { 144 weights.resize(sampleTotal); 145 for(int sampleIndex=0;sampleIndex<sampleTotal;++sampleIndex) 146 { 147 weights[sampleIndex]=1.0/sampleTotal; 148 } 149 } 150 151 void AdaBoost::trainRound() 152 { 153 DecisionStump optimalClassifier,bestClassifier; 154 calcWeightSum(); 155 for(int featureIndex=0;featureIndex<featureTotal;++featureIndex) 156 { 157 optimalClassifier=learnOptimalClassifier(featureIndex); 158 if(featureIndex==0||optimalClassifier.error()<bestClassifier.error()) 159 bestClassifier=optimalClassifier; 160 } 161 updateWeight(bestClassifier); 162 weakClassifier.push_back(bestClassifier); 163 } 164 void AdaBoost::calcWeightSum() 165 { 166 weightSum=0; 167 weightLabelSum=0; 168 positiveWeightSum=0; 169 negativeWeightSum=0; 170 for(int sampleIndex=0;sampleIndex<sampleTotal;++sampleIndex) 171 { 172 double sampleWeight=weights[sampleIndex]; 173 weightSum+=sampleWeight; 174 if(labels[sampleIndex]) 175 { 176 weightLabelSum+=sampleWeight; 177 positiveWeightSum+=sampleWeight; 178 } 179 else 180 { 181 weightLabelSum-=sampleWeight; 182 negativeWeightSum+=sampleWeight; 183 } 184 } 185 } 186 187 AdaBoost::DecisionStump AdaBoost::learnOptimalClassifier(int featureIndex) 188 { 189 DecisionStump optimalClassifier; 190 double weightSumLarger=weightSum; 191 double weightLabelSumLarger=weightLabelSum; 192 double positiveWeightSumLarger=positiveWeightSum; 193 double negativeWeightSumLarger=negativeWeightSum; 194 195 for(int sortIndex=0;sortIndex<sampleTotal-1;++sortIndex) 196 { 197 int sampleIndex=sortedSampleIndices[featureIndex][sortIndex]; 198 double threshold=samples[sampleIndex][featureIndex]; 199 double sampleWeight=weights[sampleIndex]; 200 weightSumLarger-=sampleWeight; 201 if(labels[sampleIndex]) 202 { 203 weightLabelSumLarger-=sampleWeight; 204 
positiveWeightSumLarger-=sampleWeight; 205 } 206 else 207 { 208 weightLabelSumLarger+=sampleWeight; 209 negativeWeightSumLarger-=sampleWeight; 210 } 211 int nextSampleIndex=sortedSampleIndices[featureIndex][sortIndex+1]; 212 while(sortIndex<sampleTotal-1&&samples[sampleIndex][featureIndex]==samples[nextSampleIndex][featureIndex]) 213 { 214 ++sortIndex; 215 sampleIndex=sortedSampleIndices[featureIndex][sortIndex]; 216 sampleWeight=weights[sortIndex]; 217 weightSumLarger-=sampleWeight; 218 if(labels[sampleIndex]) 219 { 220 weightLabelSumLarger-=sampleWeight; 221 positiveWeightSumLarger-=sampleWeight; 222 } 223 else 224 { 225 weightLabelSumLarger+=sampleWeight; 226 negativeWeightSumLarger-=sampleWeight; 227 } 228 if(sortIndex<sampleTotal-1) 229 nextSampleIndex=sortedSampleIndices[featureIndex][sortIndex+1]; 230 } 231 if(sortIndex>=sampleTotal-1) 232 break; 233 double outputLarger,outputSmaller; 234 computeClassifierOutput(weightSumLarger,outputLarger,outputSmaller); 235 double error=computeError(positiveWeightSumLarger,negativeWeightSumLarger,outputLarger,outputSmaller); 236 if(optimalClassifier.error()<0||error<optimalClassifier.error()) 237 { 238 double classifierThreshold=(threshold+weights[nextSampleIndex])/2.0; 239 optimalClassifier.set(featureIndex,classifierThreshold,error,outputLarger,outputSmaller); 240 double classifierWeight=log((1-error)/error)/2.0; 241 outputLarger*=classifierWeight; 242 outputSmaller*=classifierWeight; 243 } 244 245 } 246 return optimalClassifier; 247 } 248 249 void AdaBoost::computeClassifierOutput(double weightSumLarger,double &outputLarger,double &outputSmaller) 250 { 251 if(weightSumLarger) 252 { 253 outputLarger=1; 254 outputSmaller=-1; 255 } 256 else 257 { 258 outputLarger=-1; 259 outputSmaller=1; 260 } 261 } 262 double AdaBoost::computeError(double positiveWeightSumLarger,double negativeWeightSumLarger,double outputLarger,double outputSmaller) 263 { 264 double error=0; 265 error=positiveWeightSumLarger*(1-outputLarger) 266 
+(positiveWeightSum-positiveWeightSumLarger)*(1-outputSmaller) 267 +negativeWeightSumLarger*(-1-outputLarger) 268 +(negativeWeightSum-negativeWeightSumLarger)*(-1-outputSmaller); 269 return error; 270 } 271 272 void AdaBoost::updateWeight(DecisionStump bestClassifier) 273 { 274 double error=bestClassifier.error(); 275 double weightTotal=0; 276 for(int sampleIndex=0;sampleIndex<sampleTotal;++sampleIndex) 277 { 278 279 weights[sampleIndex]*=exp(-1.0*labels[sampleIndex]*bestClassifier.evaluate(samples[sampleIndex])); 280 weightTotal+=weights[sampleIndex]; 281 } 282 for(int sampleIndex=0;sampleIndex<sampleTotal;++sampleIndex) 283 { 284 weights[sampleIndex]/=weightTotal; 285 } 286 }

浙公网安备 33010602011771号