[ML] Concept Learning
Candidate Elimination
Thanks to Sanketh Vedula. This is a good demo for understanding the candidate elimination algorithm; I have optimized it based on his work.
rika@rika-UX303UB$ ./a.out
<Input> Number of Features:6
<Input> Number of Attributes[1]:2
<Input> (1):Rainy
<Input> (2):Sunny
<Input> Number of Attributes[2]:2
<Input> (1):Cold
<Input> (2):Warm
<Input> Number of Attributes[3]:2
<Input> (1):Normal
<Input> (2):High
<Input> Number of Attributes[4]:2
<Input> (1):Weak
<Input> (2):Strong
<Input> Number of Attributes[5]:2
<Input> (1):Cool
<Input> (2):Warm
<Input> Number of Attributes[6]:2
<Input> (1):Same
<Input> (2):Change

Input training data:

Sunny Warm Normal Strong Warm Same Yes
    S 0
    < Sunny Warm Normal Strong Warm Same >
    G 0
    < ? ? ? ? ? ? >

Sunny Warm High Strong Warm Same Yes
    S 1
    < Sunny Warm ? Strong Warm Same >
    G 1
    < ? ? ? ? ? ? >

Rainy Cold High Strong Warm Change No
    S 2
    < Sunny Warm ? Strong Warm Same >
    G 2
    < Sunny ? ? ? ? ? >
    < ? Warm ? ? ? ? >
    < ? ? ? ? ? Same >

Sunny Warm High Strong Cool Change Yes
    S 3
    < Sunny Warm ? Strong ? ? >
    G 3
    < Sunny ? ? ? ? ? >
    < ? Warm ? ? ? ? >

Note: attribute values are matched case-sensitively, so the values entered at the prompts must be spelled exactly as they appear in the training data.
Start with negative sample:
Input training data:

Rainy Cold High Strong Warm Change No
    S 0
    < >
    G 0
    < Sunny ? ? ? ? ? >
    < Cloudy ? ? ? ? ? >
    < ? Warm ? ? ? ? >
    < ? ? Normal ? ? ? >
    < ? ? ? Weak ? ? >
    < ? ? ? ? Cool ? >
    < ? ? ? ? ? Same >

Sunny Warm High Strong Warm Change No    // Sunny and Warm do not co-exist
    S 1
    < >
    G 1
    < Sunny Cold ? ? ? ? >
    < Rainy Warm ? ? ? ? >
    < Cloudy ? ? ? ? ? >
    < ? ? Normal ? ? ? >
    < ? ? ? Weak ? ? >
    < ? ? ? ? Cool ? >
    < ? ? ? ? ? Same >
This is the complete code:
/*
    Candidate Elimination

    Interactive demo of the candidate elimination algorithm for concept
    learning. The program first asks for the number of features and, for each
    feature, its possible attribute values. It then reads training examples
    from standard input, one per line, as whitespace-separated attribute
    values followed by the label "Yes" or "No". After each example it prints
    the specific border S and the general border G.

    Encoding of one hypothesis entry (one int per feature):
        0        -> null value (matches nothing)
        INT_MAX  -> "?" (matches every value)
        1..k     -> the k-th attribute value entered for that feature
*/

#include <cstdio>
#include <cstdlib>
#include <cctype>
#include <vector>
#include <string>
#include <stack>
#include <queue>
#include <iterator>
#include <set>
#include <map>
#include <iostream>
#include <sstream>
#include <deque>
#include <cmath>
#include <memory.h>
#include <algorithm>
#include <utility>
#include <climits>

typedef long double ld;
typedef long long ll;
#define all(c) c.begin(),c.end()

using namespace std;

// True when hypothesis `hyp` accepts `example` on the first `numberFeatures`
// entries, i.e. every entry of `hyp` is either "?" or equal to the example's.
static bool covers(const vector<int>& hyp, const vector<int>& example, int numberFeatures)
{
    for(int i=0; i<numberFeatures; i++)
    {
        if(hyp[i]!=INT_MAX && hyp[i]!=example[i])
            return false;
    }
    return true;
}

// True when `candidate` is at least as general as the specific hypothesis
// `spec`. A null entry (0) in `spec` is compatible with anything.
static bool admitsSpecific(const vector<int>& candidate, const vector<int>& spec, int numberFeatures)
{
    for(int k=0; k<numberFeatures; k++)
    {
        if(candidate[k]!=INT_MAX && candidate[k]!=spec[k] && spec[k]!=0)
            return false;
    }
    return true;
}

// Translates the tokens of one training line into their integer codes.
// `tokens` must hold one value per feature plus the label; `instance[i]` maps
// the names of column i to codes. On failure returns false and describes the
// reason in `problem`; `encoded` must then be ignored.
static bool encodeExample(const vector<string>& tokens,
                          const vector< map<string, int> >& instance,
                          vector<int>& encoded,
                          string& problem)
{
    if(tokens.size()!=instance.size())
    {
        ostringstream msg;
        msg << "expected " << instance.size() << " fields but found " << tokens.size();
        problem = msg.str();
        return false;
    }

    for(size_t col=0; col<tokens.size(); col++)
    {
        // find() instead of operator[]: an unknown name must not be silently
        // inserted with code 0, which would be mistaken for the null value.
        map<string, int>::const_iterator hit = instance[col].find(tokens[col]);
        if(hit==instance[col].end())
        {
            ostringstream msg;
            msg << "unknown value \"" << tokens[col] << "\" in field " << col+1;
            problem = msg.str();
            return false;
        }
        encoded[col] = hit->second;
    }
    return true;
}

// Positive example: drop every member of G that rejects it, then minimally
// generalize the single hypothesis in S so that it accepts the example.
static void applyPositive(const vector<int>& example, int numberFeatures,
                          set< vector<int> >& specific, set< vector<int> >& generic)
{
    set< vector<int> > kept;
    for(set< vector<int> >::const_iterator it=generic.begin(); it!=generic.end(); ++it)
    {
        if(covers(*it, example, numberFeatures))
            kept.insert(*it);
    }
    generic.swap(kept);

    vector<int> s = *(specific.begin());
    specific.erase(specific.begin());
    for(int i=0; i<numberFeatures; i++)
    {
        if(s[i]==0)
            s[i] = example[i];      // first value seen for this feature
        else if(s[i]!=example[i])
            s[i] = INT_MAX;         // conflicting values -> "?"
    }
    specific.insert(s);
}

// Negative example: every member of G that accepts it is replaced by its
// minimal specializations that reject it and are still at least as general
// as S; afterwards any hypothesis that is more specific than another member
// is removed from G.
static void applyNegative(const vector<int>& example, int numberFeatures,
                          const vector<int>& numberAttributes,
                          const set< vector<int> >& specific, set< vector<int> >& generic)
{
    const vector<int> s = *(specific.begin());
    set< vector<int> > candidates;

    for(set< vector<int> >::const_iterator it=generic.begin(); it!=generic.end(); ++it)
    {
        const vector<int>& g = *it;
        if(!covers(g, example, numberFeatures))
        {
            candidates.insert(g);   // already rejects the example
            continue;
        }

        for(int i=0; i<numberFeatures; i++)
        {
            if(g[i]!=INT_MAX)
                continue;

            vector<int> specialized = g;
            for(int j=1; j<=numberAttributes[i]; j++)   // values are 1-based
            {
                if(j==example[i])
                    continue;
                specialized[i] = j;
                if(admitsSpecific(specialized, s, numberFeatures))
                    candidates.insert(specialized);
            }
        }
    }

    // Collect every candidate that is covered by a different candidate.
    set< vector<int> > dominated;
    for(set< vector<int> >::const_iterator it=candidates.begin(); it!=candidates.end(); ++it)
    {
        for(set< vector<int> >::const_iterator jt=candidates.begin(); jt!=candidates.end(); ++jt)
        {
            if(it==jt)
                continue;
            if(covers(*it, *jt, numberFeatures))
                dominated.insert(*jt);
        }
    }

    generic.clear();
    for(set< vector<int> >::const_iterator it=candidates.begin(); it!=candidates.end(); ++it)
    {
        if(dominated.find(*it)==dominated.end())
            generic.insert(*it);
    }
}

// Prints one hypothesis as "\t< v1 v2 ... >". "?" stands for INT_MAX; a null
// entry has no name and therefore prints nothing.
static void printHypothesis(const vector<int>& hyp, int numberFeatures,
                            const vector< map<string, int> >& instance)
{
    cout<<"\t< ";
    for(int i=0; i<numberFeatures; i++)
    {
        if(hyp[i]==INT_MAX)
        {
            cout<<"?"<<" ";
            continue;
        }
        for(map<string, int>::const_iterator jt=instance[i].begin(); jt!=instance[i].end(); ++jt)
        {
            if(jt->second==hyp[i])
                cout<<jt->first<<" ";
        }
    }
    cout<<">\n";
}

int main()
{
    int numberFeatures = 0;     //number of features in the dataset

    cout << "<Input> Number of Features:";
    if(!(cin >> numberFeatures) || numberFeatures<=0)
    {
        cerr << "Invalid number of features.\n";
        return 1;
    }

    vector <int> numberAttributes(numberFeatures);   //number of attributes for each feature.
    // One name->code map per feature plus one for the label column. A vector
    // is used because a runtime-sized array of maps is not standard C++.
    vector< map<string, int> > instance(numberFeatures+1);

    for(int i=0; i<numberFeatures; i++)
    {
        cout <<"<Input> Number of Attributes[" << i+1 << "]:";
        if(!(cin >> numberAttributes[i]) || numberAttributes[i]<=0)
        {
            cerr << "Invalid number of attributes.\n";
            return 1;
        }

        for(int j=1; j<=numberAttributes[i]; j++)
        {
            string name;
            cout <<"<Input> (" << j << "):";
            if(!(cin >> name))
            {
                cerr << "Missing attribute value.\n";
                return 1;
            }
            instance[i][name] = j;  //map attribute name with number
        }
    }

    instance[numberFeatures]["Yes"] = 1;
    instance[numberFeatures]["No"] = 0;

    //sets used to represent the generic and specific boundaries.
    set < vector<int> > specific, generic;
    specific.insert(vector<int>(numberFeatures, 0));        // most specific hypothesis
    generic.insert(vector<int>(numberFeatures, INT_MAX));   // most general hypothesis

    vector <int> currdata(numberFeatures+1);

    cout << endl << "Input training data:" << endl << endl;
    int loop = 0;
    string data;
    while( getline(cin, data) )
    {
        // Split on any whitespace so that repeated or trailing blanks and a
        // trailing '\r' never end up inside a token.
        istringstream fields(data);
        vector<string> tokens;
        string token;
        while(fields >> token)
            tokens.push_back(token);

        if(tokens.empty())
            continue;

        string problem;
        if(!encodeExample(tokens, instance, currdata, problem))
        {
            cerr << "Skipping example: " << problem << "\n";
            continue;
        }

        if( currdata[numberFeatures]==1 )   //if positive example
            applyPositive(currdata, numberFeatures, specific, generic);
        else                                //if negative example
            applyNegative(currdata, numberFeatures, numberAttributes, specific, generic);

        /********************************************
            Printing Specific and General borders
        ********************************************/
        if(specific.empty() || generic.empty())
        {
            cout << "Inconsistent data..\n";
        }
        else
        {
            cout<<"\n\tS " << loop << endl;
            printHypothesis(*(specific.begin()), numberFeatures, instance);

            cout<<"\n\tG " << loop << endl;
            for(set< vector<int> >::const_iterator it=generic.begin(); it!=generic.end(); ++it)
                printHypothesis(*it, numberFeatures, instance);
        }

        cout << endl;
        loop++;
    }

    return 0;
}
The following walkthrough is from 血糯米Otomii; it will help you understand the algorithm better.
样本集:
把S集合初始化为H中极大特殊假设:
把G集合初始化为H中极大一般假设:
首先加载第一条和第二条样本:
这个过程是特殊向一般的转变,这个过程非常地类似FIND-S算法
接着我们处理第三条样本:
让我们回到数据
我们会发现，Sky、AirTemp和Forecast和以前的数据不一致，我们可以怀疑是这三个数据导致最后结果的变化。
所以,我们就针对这3个数据进行一次特殊化:
接着,我们输入第四条样本:
在处理第四条样本的时候,我们先对于S集合进行一般化:
然后,为了让G集合覆盖S集合,我们需要剔除,过程为
在处理完了这四个样本后,我们就可以获取所有的假设:
当前为6个假设，当我们可以获取到更多的训练集的时候，我们可以划出更小的假设空间（版本空间）。
当我们使用这6个假设对测试集进行测试的时候，可以用这6个假设同时对测试样本进行检测：每个假设都有自己的权重，如果最后的结果超过80%，那么就测试通过。
本算法弊端:
- 对噪点兼容性非常差
- 当我们Sky属性有10+个的时候,往往这个属性必然被一般化,所以我们需要对数据进行预处理

浙公网安备 33010602011771号