[ML] Concept Learning

Candidate Elimination

 


Thanks to Sanketh Vedula. This is a good demo for understanding the candidate elimination algorithm; I have optimized it based on his work.

rika@rika-UX303UB$ ./a.out 
<Input> Number of Features:6
<Input> Number of Attributes[0]:2
<Input>     (1):rainy
<Input>     (2):sunny
<Input> Number of Attributes[1]:2
<Input>     (1):cold
<Input>     (2):warm
<Input> Number of Attributes[2]:2
<Input>     (1):normal
<Input>     (2):high
<Input> Number of Attributes[3]:2
<Input>     (1):weak
<Input>     (2):strong
<Input> Number of Attributes[4]:2
<Input>     (1):cool
<Input>     (2):warm 
<Input> Number of Attributes[5]:2
<Input>     (1):same
<Input>     (2):change

Input training data:

Sunny Warm Normal Strong Warm Same Yes

    S 0
    < Sunny Warm Normal Strong Warm Same >

    G 0
    < ? ? ? ? ? ? >

Sunny Warm High Strong Warm Same Yes

    S 1
    < Sunny Warm ? Strong Warm Same >

    G 1
    < ? ? ? ? ? ? >

Rainy Cold High Strong Warm Change No

    S 2
    < Sunny Warm ? Strong Warm Same >

    G 2
    < Sunny ? ? ? ? ? >
    < ? Warm ? ? ? ? >
    < ? ? ? ? ? Same >

Sunny Warm High Strong Cool Change Yes

    S 3
    < Sunny Warm ? Strong ? ? >

    G 3
    < Sunny ? ? ? ? ? >
    < ? Warm ? ? ? ? >

Starting with a negative sample:

Input training data:

Rainy Cold High Strong Warm Change No

    S 0
    < >

    G 0
    < Sunny ? ? ? ? ? >
    < Cloudy ? ? ? ? ? >
    < ? Warm ? ? ? ? >
    < ? ? Normal ? ? ? >
    < ? ? ? Weak ? ? >
    < ? ? ? ? Cool ? >
    < ? ? ? ? ? Same >

Sunny Warm High Strong Warm Change No   // Sunny and Warm do not co-exist

    S 1
    < >

    G 1
    < Sunny Cold ? ? ? ? >
    < Rainy Warm ? ? ? ? >
    < Cloudy ? ? ? ? ? >
    < ? ? Normal ? ? ? >
    < ? ? ? Weak ? ? >
    < ? ? ? ? Cool ? >
    < ? ? ? ? ? Same >

 

This is the complete code:

  1 /*
  2   Candidate Elimination
  3 */
  4 
  5 #include <cstdio>
  6 #include <cstdlib>
  7 #include <cctype>
  8 #include <vector>
  9 #include <string>
 10 #include <stack>
 11 #include <queue>
 12 #include <iterator>
 13 #include <set>
 14 #include <map>
 15 #include <iostream>
 16 #include <sstream>
 17 #include <deque>
 18 #include <cmath>
 19 #include <memory.h>
 20 #include <algorithm>
 21 #include <utility>
 22 #include <climits>
 23 
 24 typedef long double ld;
 25 typedef long long ll;
 26 #define all(c) c.begin(),c.end()
 27 
 28 using namespace std;
 29 
 30 int main()
 31 {
 32     int numberFeatures; //number of features in the dataset
 33     string data;
 34     bool consistent;
 35 
 36     cout << "<Input> Number of Features:";
 37     cin >> numberFeatures; //input the number of features
 38 
 39     vector <int> numberAttributes(numberFeatures); //number of attributes for each feature.
 40     map <string, int> instance[numberFeatures+1];
 41 
 42     for(int i=0; i<numberFeatures; i++)
 43     {
 44         cout <<"<Input> Number of Attributes[" << i+1 << "]:";
 45         cin >> numberAttributes[i];
 46 
 47         for(int j=1; j<=numberAttributes[i]; j++)
 48         {
 49             string temp;
 50             cout <<"<Input>     (" << j << "):";
 51             cin>> temp;
 52             instance[i][temp] = j; //map attribute name with number
 53         }
 54     }
 55 
 56     instance[numberFeatures]["Yes"] = 1;
 57     instance[numberFeatures]["No"] = 0;
 58 
 59     /*
 60        for any feature if attribute=0 -> null value;
 61        attribute = INT_MAX -> all
 62      */
 63 
 64     vector <int> currdata(numberFeatures+1), tmpData(numberFeatures);
 65 
 66     //set used to represent the generic and specific boundaries.
 67     set < vector<int> > specific, generic;
 68 
 69     //initialization
 70     for(int i=0; i<numberFeatures; i++)
 71     {
 72         tmpData[i]=0;
 73     }
 74     specific.insert(tmpData);
 75 
 76     for(int i=0; i<numberFeatures; i++)
 77     {
 78         tmpData[i]=INT_MAX;
 79     }
 80     generic.insert(tmpData);
 81 
 82 
 83     cout << endl << "Input training data:" << endl << endl;
 84     int loop = 0;
 85     while( getline(cin, data) )
 86     { 
 87 
 88         if(data.size()==0)
 89         {
 90             continue;
 91         }
 92 
 93         string temp;
 94         int st = 0, count = 0;
 95         int len = data.size();
 96 
 97         for( int i=0; i<len; i++ )
 98         {
 99             if(data[i]==' '|| i==len-1)
100             {
101                 if(i==len-1)
102                     temp = data.substr(st, i-st+1);
103                 else
104                     temp = data.substr(st, i-st);
105 
106                 currdata[count] = instance[count][temp];
107                 count++;
108                 st = i+1;
109             }
110         }
111 
112         vector<int> m, n, p;
113 
114         if( currdata[count-1]==1 ) //if positive example
115         {
116             //remove inconsistent hypotheses from generic border
117             set < vector<int> > tempg, temps;
118             tempg = generic;
119             for(set<vector<int> >::iterator it= generic.begin(); it!=generic.end(); it++)
120             {
121                 m = *it;
122                 int er =0;
123                 for(int i=0; i<numberFeatures; i++)
124                 {
125                     if(m[i]!=currdata[i] && m[i]!=INT_MAX )
126                     {
127                         er = 1;
128                         break;
129                     }
130                 }
131                 if(er==1)
132                     tempg.erase(m);
133             }
134             generic = tempg;
135 
136             m = *(specific.begin());
137             n = *(specific.begin());
138             specific.erase(n);
139 
140             for(int i=0; i<numberFeatures; i++)
141             {
142                 if(m[i]==0)
143                 {
144                     m[i] = currdata[i];
145                 }
146                 else if(m[i]!=currdata[i])
147                 {
148                     m[i]=INT_MAX;
149                 }
150             }
151 
152             specific.insert(m);
153         }
154         else //if negative example
155         {
156             /*
157                if example is inconsistent with spec border, then it is noise
158              */
159 
160             set < vector<int> > tempg, temps;
161             n = *(specific.begin());
162 
163             for(set< vector<int> >::iterator it= generic.begin(); it!=generic.end(); it++)
164             {
165                 m = *it;
166                 //cout << "Checking if the example is consistent with the present hypothesis\n";
167 
168                 int er=0;
169                 //check if given example is consistent with the present hypothesis
170                 for(int i=0; i<numberFeatures; i++)
171                 {
172                     if( m[i]!=INT_MAX && m[i]!=currdata[i])
173                     {
174                         er=1; //curr hyp is consistent
175                         break;
176                     }
177                 }
178 
179                 if(er==1)//if hyp is consistent with the example
180                 {
181                     tempg.insert(m);
182                 }
183                 else//hyp is not consistent with the example
184                 {
185                     vector<int> temphyp;
186                     for(int i=0; i<numberFeatures; i++)
187                     {
188                         if( m[i]==INT_MAX )
189                         {
190                             temphyp = m;
191                             for(int j=1; j<=numberAttributes[i]; j++)//values are 1-based
192                             {
193                                 if(j==currdata[i])
194                                     continue;
195                                 else
196                                 {
197                                     temphyp[i] = j;
198 
199                                     //check if temphyp is more general than specifc hyp.
200                                     consistent = true;
201                                     for(int k=0; k<numberFeatures; k++)
202                                     {
203                                         if(temphyp[k]!=INT_MAX && temphyp[k]!=n[k] && n[k]!=0)
204                                         {
205                                             consistent = false;
206                                             break;
207                                         }
208                                     }
209                                     if(consistent)
210                                         tempg.insert(temphyp); // new hypo is consistent
211                                 }
212 
213                             }
214                         }
215                     }
216 
217                 }
218 
219             }
220             //cout << "Exited from the for loop\n";
221             generic.clear();
222             bool mGen;
223             set<vector<int> > tempgg;
224 
225             //remove from generic any hyp that is more specific than another hyp in generic
226             for(set< vector<int> >::iterator it= tempg.begin(); it!=tempg.end(); it++)
227             {
228                 m = *it;
229 
230                 for( set< vector<int> >::iterator jt= tempg.begin(); jt!=tempg.end(); jt++ )
231                 {
232                     if(it==jt)
233                         continue;
234 
235                     p = *jt;
236                     consistent = true;
237                     for(int k=0; k<numberFeatures; k++)
238                     {
239                         if(m[k]!=INT_MAX && m[k]!=p[k])
240                         {
241                             consistent = false;
242                             break;
243                         }
244                     }
245                     if(consistent)
246                         tempgg.insert(p);
247                 }
248             }
249 
250             //cout << "Compared hypothesis in generic space \n";
251 
252             // generic = set_difference(tempg, tempgg );
253             for( set< vector<int> >::iterator it= tempg.begin(); it!=tempg.end(); it++ )
254             {
255                 m = *it;
256                 //cout << "Last for loop .., \n";
257                 if(tempgg.find(m)==tempgg.end())
258                 {
259                     generic.insert(m);
260                     //cout << "Entered if in teh for loop\n";
261                 }
262             }
263         }
264 
265 
266 
267         /********************************************
268            Printing Specific and General borders
269          ********************************************/
270         vector<int> abc;
271 
272         if(specific.empty() || generic.empty())
273         {
274             cout << "Inconsistent data..\n";
275         }
276         else
277         {
278             cout<<"\n\tS " << loop << endl;
279             abc = *(specific.begin());
280             cout<<"\t< ";
281             for(int i=0; i<numberFeatures; i++)
282             {
283                 if( abc[i]==INT_MAX )
284                     cout<<"?"<<" ";
285                 else
286                 {
287                     for(map<string,int>::iterator jt = instance[i].begin(); jt!=instance[i].end();jt++)
288                     {
289                         if((*jt).second == abc[i])
290                             cout<<(*jt).first<<" ";
291                     }
292                 }
293             }
294             cout<<">\n";
295 
296 
297             cout<<"\n\tG " << loop << endl;
298             for(set< vector<int> >::iterator it= generic.begin(); it!=generic.end(); it++)
299             {
300                 abc = *it;
301                 cout<<"\t< ";
302                 for(int i=0; i< numberFeatures; i++)
303                 {
304                     if( abc[i]==INT_MAX )
305                         cout<<"?"<<" ";
306                     else
307                     {
308                         for(map<string,int>::iterator jt = instance[i].begin(); jt!=instance[i].end();jt++)
309                         {
310                             //cout << abc[i] << endl;
311                             if((*jt).second == abc[i])
312                                 cout<<(*jt).first<<" ";
313                         }
314                     }
315                 }
316                 cout<<">\n";
317             }
318 
319         }
320 
321         cout << endl;
322         loop++;
323     }
324 
325     return 0;
326 }
View Code

 

The following explanation is from 血糯米Otomii and will help you understand the algorithm better.


样本集:

 

把S集合初始化为H中极大特殊假设:

把G集合初始化为H中极大一般假设:

 

首先加载第一条和第二条样本:

这个过程是特殊向一般的转变,这个过程非常地类似FIND-S算法

接着我们处理第三条样本:

让我们回到数据

我们会发现,Sky,AirTemp和Forecast和以前的数据不一致,我们可以怀疑是这三个数据导致最后结果的变化。

所以,我们就针对这3个数据进行一次特殊化:

接着,我们输入第四条样本:

 

 

 

在处理第四条样本的时候,我们先对于S集合进行一般化:

 

 

然后,为了让G集合覆盖S集合,我们需要剔除,过程为

 

在处理完了这四个样本后,我们就可以获取所有的假设:

 

当前为6个假设,当我们可以获取到更多的训练集的时候,我们可以划出更小的设计空间。

当我使用这6个假设对测试集进行测试的时候,我们可以使用这6个假设同时对测试样本进行检测,每个假设都有自己的权重,如果最后的结果超过80%,那么就测试通过。

 

本算法弊端

  1. 对噪点兼容性非常差
  2. 当我们Sky属性有10+个的时候,往往这个属性必然被一般化,所以我们需要对数据进行预处理
posted @ 2016-06-08 09:26  郝壹贰叁  阅读(316)  评论(0)    收藏  举报