1 package apriori;
2
3 import java.io.BufferedReader;
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.FileOutputStream;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10 import java.io.OutputStream;
11 import java.io.OutputStreamWriter;
12 import java.util.ArrayList;
13 import java.util.Arrays;
14 import java.util.Collections;
15 import java.util.HashMap;
16 import java.util.HashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Set;
20
21 import org.junit.Test;
22
23 /**
24 * @author code
25 */
26 //simulation Apriori
27 public class Ariori
28 {
29 private File file; //transaction 数据库文件
30 public static final int min_support = 2; //设置最小支持度
31 private Map<String,List<String>> map = new HashMap<String,List<String>>(); //存储 原始数据库数据
32 private Map<Integer,ArrayList<ArrayList<String>>> frequentSet =
33 new HashMap<Integer,ArrayList<ArrayList<String>>>(); //存储 频繁项集 1->L[1] 2->L[2]
34
35 ArrayList<ArrayList<String>> cadidate = new ArrayList<ArrayList<String>>();//存放候选项集[[1,2],[3,4],[3,2]]
36 //存放支持度计数
37 //[1,2,3]:5
38 //[2,3,4]:5
39 Map<ArrayList<String>,Integer> countMap = new HashMap<ArrayList<String>,Integer>();
40
41 public void setFile(String path)
42 {
43 this.file = new File(path);
44 }
45 public File getFile()
46 {
47 return this.file;
48 }
49
50 //将文件的内容存储到map中
51 public void getAllItems() throws Exception
52 {
53 //遍历文件每一行
54 InputStream ins = new FileInputStream(file);
55 InputStreamReader insr = new InputStreamReader(ins);
56 BufferedReader br = new BufferedReader(insr);
57 String content = null;
58 String[] contents = null;
59 while(null != (content = br.readLine()))
60 {
61 contents = content.substring(content.indexOf(":")+1).split(",");
62 //将t1:1,2,3 加入到map中
63 map.put(content.substring(0,content.indexOf(":")), Arrays.asList(contents));
64 }
65 br.close();
66 }
67
68 //查找一项集 [[1],[2],[3],[4]]
69 public void find_frequent_1_items()
70 {
71 HashSet<String> set = new HashSet<String>();
72 Set<String> keys = this.map.keySet();
73 for(String key:keys)
74 {
75 List<String> list = this.map.get(key);
76 for(String s:list)
77 {
78 set.add(s);
79 }
80 }
81
82 //this.cadidate=[[1],[2],[3],[4],[5]]
83 for(String s:set)
84 {
85 ArrayList<String> list = new ArrayList<String>();
86 list.add(s);
87 this.cadidate.add(list);
88 }
89
90 this.removeLessThanSupport();
91 this.frequentSet.put(1, new ArrayList<ArrayList<String>>(this.cadidate));
92 this.cadidate.clear();
93 }
94
95 //第二步 对当前的候选集中 进行支持度的修改
96 public void removeLessThanSupport()
97 {
98 //countMap{[1]:0 [2]:0 [3]:0 [4]:0 [5]:0}
99 //遍历数据库 文件map
100 Set<String> keys = this.map.keySet();
101 for(String key:keys)
102 {
103 List<String> list = this.map.get(key);
104 //this.cadidate=[[1],[2],[3],[4]]
105 for(ArrayList<String> ca_list:this.cadidate)
106 {
107 //[1,2,3] 包含 [2,3]
108 if(list.containsAll(ca_list))
109 {
110 if(this.countMap.get(ca_list) == null)
111 {
112 this.countMap.put(ca_list, 1);
113 }
114 else
115 {
116 this.countMap.put(ca_list, this.countMap.get(ca_list)+1);
117 }
118 }
119 }
120 }
121 //如果countMap没有被计数的,说明 这一对值计数为0 countMap.size() < this.cadidate.size()
122 if(countMap.size() < this.cadidate.size())
123 {
124 HashSet<ArrayList<String>> set = new HashSet<ArrayList<String>>(this.cadidate);
125 set.removeAll(this.countMap.keySet());
126
127 for(ArrayList<String> list:set)
128 {
129 this.countMap.put(list, 0);
130 }
131 }
132
133 //比较最小支持度
134 for(List<String> list:this.countMap.keySet())
135 {
136 if(this.countMap.get(list) < min_support)
137 {
138 this.cadidate.remove(list);
139 }
140 }
141
142 //比较完成之后 将其写到文件中
143 if(!this.cadidate.isEmpty())
144 {
145 int k = this.cadidate.get(0).size();
146 File file = new File("L" + k + ".txt");
147 //内存 -> 文件
148 try
149 {
150 OutputStream ous = new FileOutputStream(file);
151 OutputStreamWriter osw = new OutputStreamWriter(ous);
152 BufferedWriter bw = new BufferedWriter(osw);
153
154 for(ArrayList<String> list:this.cadidate)
155 {
156 bw.write(list + ":" + this.countMap.get(list) + "\n");
157 }
158 this.countMap.clear();
159 bw.close();
160 }
161 catch (Exception e)
162 {
163 e.printStackTrace();
164 }
165 }
166
167
168 }
169 //处理过程
170 public void main()
171 {
172 this.find_frequent_1_items();
173 for(int i = 2;!this.frequentSet.get(i-1).isEmpty();i++)
174 {
175 this.apriori_gen(i-1);
176 }
177 }
178 //候选集[k] = L[k-1] 连接 L[k-1] index:k-1
179 public void apriori_gen(int index)
180 {
181 //[[1],[2],[3],[4],[5]]
182 ArrayList<ArrayList<String>> list_index = this.frequentSet.get(index);
183 ArrayList<ArrayList<String>> newList = new ArrayList<ArrayList<String>>();
184 boolean flag = true;//标 志前index 位是否相同
185 for(int k = 0;k < list_index.size();k++)
186 {
187 ArrayList<String> list1 = list_index.get(k);
188 for(int j = k+1;j < list_index.size();j++)
189 {
190 ArrayList<String> list2 = list_index.get(j);
191 int i;
192 for(i = 0;i < index-1;i++)
193 {
194 if(!list1.get(i).equals(list2.get(i)))
195 {
196 flag = false;
197 }
198 }
199
200 if(i == 0)
201 {
202 ArrayList<String> subList = new ArrayList<String>();
203 if(!list1.get(i).equals(list2.get(i)))
204 {
205 subList.add(list1.get(i));
206 subList.add(list2.get(i));
207 //对每一个产生的subList进行分析
208 //[1,2,3] k-1 子集 受否在L[k-1]中
209 Collections.sort(subList);
210 if(!this.has_infrequent_subset(subList, list_index))
211 {
212 newList.add(subList);
213 }
214 }
215 }
216 //前k-2位全部相同
217 if(flag && i!= 0)
218 {
219 //[3,4] [3,5] index=2 list=[3]
220 List<String> list = list1.subList(0, index-1);
221 ArrayList<String> subList = new ArrayList<String>();
222 subList.addAll(list);
223 subList.add(list1.get(index-1));
224 subList.add(list2.get(index-1));
225 Collections.sort(subList);
226 if(!this.has_infrequent_subset(subList, list_index))
227 {
228 newList.add(subList);
229 }
230 }
231
232 flag = true;
233 }
234 flag = true;
235 }
236 this.cadidate = newList;
237 this.removeLessThanSupport();
238 //对当前的候选键 min_support进行比较修改
239 this.frequentSet.put(index+1, new ArrayList<ArrayList<String>>(this.cadidate));
240 this.cadidate.clear();
241 }
242
243 //对于新产生的[]元素L[K] 是否在L[k-1]中
244 //list 是L[k-1] 频繁集
245 //[1,3,5] [1,3],[1,5],[3,5]
246 public boolean has_infrequent_subset(ArrayList<String> subList,ArrayList<ArrayList<String>> list)
247 {
248 int k = subList.size();
249 //查找subList的k-1子集
250 //[1,2,3,4] [1,2,3] [1,2,4] [1,3,4]
251 for(int i = 0;i < k;i++)
252 {
253 ArrayList<String> k_subList = new ArrayList<String>(subList);
254 k_subList.remove(i);
255 if(!list.contains(k_subList))
256 {
257 return true;
258 }
259 }
260 return false;
261 }
262 @Test
263 public void test() throws Exception
264 {
265 this.setFile("transaction.txt");
266 this.getAllItems();
267 this.main();
268 }
269
270 }