使用Java8最新API进行词频统计

  今天是我在博客园的第一天,就写一段 Java 8 的代码练练手吧。这是一个统计高频词汇的小程序:遍历指定文件夹中的所有文件,分别统计每个文件中出现频率最高的 10 个词汇,统计时会排除一些无用词。

 1 package test.wuni;
 2 
 3 import java.io.BufferedReader;
 4 import java.io.File;
 5 import java.io.FileInputStream;
 6 import java.io.IOException;
 7 import java.io.InputStreamReader;
 8 import java.util.Comparator;
 9 import java.util.LinkedHashMap;
10 import java.util.Map.Entry;
11 
/**
 * Prints the ten most frequent words of every regular file found directly
 * inside a directory.
 *
 * <p>Each file is decoded as GB2312 and every line is split on single spaces.
 * Tokens of length one or less and tokens listed in the stop-word table are
 * not counted.
 *
 * @author papa_oy
 * 2014/6/25
 */
public class TZC {
    /** Charset name used to decode every input file. */
    private static final String INPUT_CHARSET = "gb2312";

    /** Number of top-ranked words printed for each file. */
    private static final int TOP_N = 10;

    /** Running number printed in each report header; shared by all instances. */
    private static int i = 1;

    private final String inPath;
    private final File inFile;

    /** Stop words: tokens equal to one of these are never counted. */
    private final String[] nouse_words = {"我们","微博","浏览器","什么","活跃","可以"};

    /**
     * @param inpath path of the directory whose files will be analysed
     */
    public TZC(String inpath) {
        this.inPath = inpath;
        this.inFile = new File(inpath);
    }

    /**
     * Counts the words of each regular file in the directory and prints, per
     * file, a header line followed by up to ten {@code word: count} lines in
     * descending count order. Words with equal counts keep the order in which
     * they were first seen in the file.
     *
     * @throws IOException if the path is not a readable directory or a file
     *                     cannot be read
     */
    public void getGP() throws IOException {
        File[] children = inFile.listFiles();
        if (children == null) {
            // listFiles() yields null when the path is missing, is not a
            // directory, or cannot be listed; fail with a clear message
            // instead of a NullPointerException in the loop below.
            throw new IOException("Not a readable directory: " + inPath);
        }

        for (File child : children) {
            if (!child.isFile()) {
                // Sub-directories cannot be opened as streams; skip them.
                continue;
            }

            LinkedHashMap<String, Integer> counts = countWords(child);

            System.out.println("高频词汇 TOP10:" + i++);
            counts.entrySet()
                    .stream()
                    // Highest count first; avoids the overflow-prone int subtraction.
                    .sorted(Entry.comparingByValue(Comparator.reverseOrder()))
                    .limit(TOP_N)
                    .forEach(entry -> System.out.println(
                            entry.getKey() + ": " + entry.getValue()));
        }
    }

    /**
     * Reads one file and tallies its usable words in first-seen order.
     *
     * @param file the file to read
     * @return word counts keyed by word, in insertion order
     * @throws IOException if the file cannot be opened or read
     */
    private LinkedHashMap<String, Integer> countWords(File file) throws IOException {
        LinkedHashMap<String, Integer> counts = new LinkedHashMap<>();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), INPUT_CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() already strips the line terminator.
                for (String word : line.split(" ")) {
                    if (word.length() > 1 && ifUse(word)) {
                        counts.merge(word, 1, Math::addExact);
                    }
                }
            }
        }
        return counts;
    }

    /**
     * @param word candidate token
     * @return {@code false} if the token is a stop word, {@code true} otherwise
     */
    private boolean ifUse(String word) {
        for (String stopWord : nouse_words) {
            if (word.equals(stopWord)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Runs the report on a fixed sample directory.
     *
     * @param args unused
     * @throws IOException if the sample directory or one of its files cannot be read
     */
    public static void main(String[] args) throws IOException {
        String inpath = "C:\\Users\\papa_oy\\Desktop\\xxxx";
        TZC tzc = new TZC(inpath);
        tzc.getGP();
    }
}

 

posted @ 2014-06-25 19:20  danny0405  Views(545)  Comments(0)    收藏  举报