数据挖掘算法实现

学习了数据挖掘这门课,但对其中的算法只是略知一二,并没有动手实现过,所以打算把每个算法都实现一遍。

1、决策树之ID3

下表记录了在不同气候条件下是否去打球的情况,要求根据该表用程序输出决策树。

Day Outlook Temperature Humidity Wind PlayTennis
1 Sunny Hot High Weak no
2 Sunny Hot High Strong no
3 Overcast Hot High Weak yes
4 Rainy Mild High Weak yes
5 Rainy Cool Normal Weak yes
6 Rainy Cool Normal Strong no
7 Overcast Cool Normal Strong yes
8 Sunny Mild High Weak no
9 Sunny Cool Normal Weak yes
10 Rainy Mild Normal Weak yes
11 Sunny Mild Normal Strong yes
12 Overcast Mild High Strong yes
13 Overcast Hot Normal Weak yes
14 Rainy Mild High Strong no
end

下面是ID3的部分程序,还没有写完,慢慢再补。

  1 #include <iostream>
  2 #include <string>
  3 #include <cstring>
  4 #include <vector>
  5 #include <list>
  6 #include <map>
  7 #include <algorithm>
  8 #include <cstdlib>
  9 #include <cstdio>
 10 #include <cmath>
 11 
 12 using namespace std;
 13 
 14 class Node
 15 {
 16 public:
 17     vector<int> next;
 18     string attr;
 19     string ans;
 20     //Node() next(), attr(""), ans(""){}
 21 };
 22 
 23 const string yes = "yes";
 24 const string no = "no";
 25 const int attribute_name_size = 6;
 26 vector< vector<string> > data; //day weather temperature humidity wind play_or_not
 27 Node node[1000];
 28 int cnt_of_node = 0;
 29 
 30 void input()
 31 {
 32     string str;
 33     vector<string> tmp;
 34     while (cin >> str && str != "end")
 35     {
 36         tmp.push_back(str);
 37         for (int i = 0; i < attribute_name_size-1; ++i)
 38         {
 39             cin >> str;
 40             tmp.push_back(str);            
 41         }
 42         data.push_back(tmp);
 43         tmp.clear();
 44     }
 45 }
 46 
 47 double calcEntropy(vector<vector<string> >& vec, string element)
 48 {
 49     double ans = 0;    
 50     map<string, int> mp;
 51     if (vec.size() <= 0) return -1;
 52     for (int j = 0; j < vec[0].size(); ++j)
 53     {
 54         if (vec[0][j] == element)
 55             for (int i = 1; i < vec.size(); ++i)
 56                 mp[vec[i][j]]++;            
 57     }
 58     double cnt = vec.size()-1;
 59     for (map<string, int>::iterator it = mp.begin(); it != mp.end(); ++it)
 60     {
 61         double p = (it->second)/cnt;
 62         ans -= p*log2(p);
 63     }
 64     return ans;
 65 }
 66 
 67 double calcInfo(vector<vector<string> >& vec, int idx)
 68 {
 69     double ans = 0;
 70     if (vec.size() <= 1) return -1;
 71     map<string, map<string, int> > mp;
 72     int len = vec[0].size();
 73     int size = vec.size()-1;
 74 
 75     for (int j = 1; j < vec.size(); ++j)
 76         mp[vec[j][idx]][vec[j][len-1]]++;
 77     for (map<string, map<string, int> >::iterator it = mp.begin(); it != mp.end(); ++it)
 78     {
 79         int ys = 0, nt = 0;
 80         for (map<string, int>::iterator itr = (it->second).begin(); itr != (it->second).end(); ++itr)
 81         {
 82             if (itr->first == yes) ys += itr->second;
 83             if (itr->first == no) nt += itr->second;            
 84         }
 85         ans = -(ys+nt)/size*(-ys/(ys+nt)*log2(ys/(ys+nt)) - nt/(ys+nt)*log2(nt/(ys+nt)));
 86     }
 87     return ans;
 88 }
 89 
 90 int findBestAttribute(vector<vector<string> >& tmp)
 91 {
 92     if (tmp.size() <= 1) return -1;
 93     int len = tmp[0].size();
 94     string result = tmp[0][len-1];
 95     vector<double> v;
 96     double info_result = calcEntropy(tmp, result);
 97     for (int i = 0; i < len; ++i)
 98         v.push_back(calcInfo(tmp, i));
 99     double max_info_gain = 0;
100     int idx = 0;
101     for (int i = 0; i < v.size(); ++i)
102     {
103         if (info_result-v[i] > max_info_gain)
104             max_info_gain = info_result-v[idx=i];
105     }
106     return idx;
107 }
108 
109 void work(vector< vector<string> >& source, int now_node_num)
110 {
111     int idx = 0;
112     idx = findBestAttribute(source);
113     vector<int> vis(source.size(), 0);
114     vector<string> attribute_tmp;
115     for (int i = 0; i < source[0].size(); ++i)
116         if (i != idx) attribute_tmp.push_back(source[0][i]);
117     int len = source[0].size();
118     for (int i = 1; i < source.size(); ++i)
119     {
120         if (vis[i]) continue;
121         map<string, int> mp;
122         for (int j = i; j < source.size(); ++j)
123         {
124             if (source[j][idx] == source[i][idx])
125             {
126                 mp[source[j][len-1]]++;
127                 vis[j] = 1;
128             }
129         }
130         node[now_node_num].next.push_back(++cnt_of_node);
131         node[cnt_of_node].attr = source[i][idx];
132         if (mp.size() == 1)
133         {
134             node[cnt_of_node].ans = source[i][len-1];    
135             node[cnt_of_node].next.clear();
136         }
137         else
138         {
139             vector<vector<string> > vs;
140             for (int j = 0; j < source.size(); ++j)
141             {
142                 vector<string> tmp;
143                 for (int k = 0; k < source[0].size(); ++k)
144                 {
145                     if (k == idx) continue;
146                     tmp.push_back(source[j][k]);    
147                 }
148                 vs.push_back(tmp);
149             }
150             work(vs, cnt_of_node);
151         }
152     }
153 }
154 
155 void outputSourceData()
156 {
157     for (int i = 0; i < data.size(); ++i)
158     {
159         for (int j = 0; j < data[i].size(); ++j)
160             cout << data[i][j] << '\t';
161         cout << endl;
162     }
163 }
164 
// Program entry point. Still a placeholder: it does not call any of the
// functions above and simply reports success.
int main()
{
    return EXIT_SUCCESS;
}
ID3

 

posted on 2015-10-16 14:34  JustForCS  阅读(490)  评论(0)  编辑  收藏  举报

导航