使用Java8最新API进行词频统计
今天是我在博客园的第一天,就写一段 Java 8 的代码练练手吧。这是一个统计高频词汇的小程序:遍历指定文件夹中的所有文件,分别统计每个文件中出现频率最高的 10 个词,并排除一些无用词。
1 package test.wuni; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.FileInputStream; 6 import java.io.IOException; 7 import java.io.InputStreamReader; 8 import java.util.Comparator; 9 import java.util.LinkedHashMap; 10 import java.util.Map.Entry; 11 12 /** 13 * @author papa_oy 14 * 2014/6/25 15 */ 16 public class TZC { 17 private static int i = 1; 18 private String inPath; 19 private File inFile; 20 21 22 private String[] nouse_words = {"我们","微博","浏览器","什么","活跃","可以"}; 23 24 25 public TZC(String inpath) { 26 this.inPath = inpath; 27 this.inFile = new File(inpath); 28 } 29 30 public void getGP() throws IOException { 31 32 String[] fileNameList = inFile.list(); 33 for (String name : fileNameList) { 34 LinkedHashMap<String, Integer> lhm = new LinkedHashMap<>(); 35 String AbsoluteInPath = this.inPath + "\\" + name; 36 BufferedReader buffer = new BufferedReader(new InputStreamReader( 37 new FileInputStream(AbsoluteInPath), "gb2312")); 38 39 String line = ""; 40 while ((line = buffer.readLine()) != null) { 41 String xString = line.replace("\n", ""); 42 String[] words = xString.split(" "); 43 44 for (String word : words) { 45 if (word.length() > 1 && ifUse(word)) 46 lhm.merge(word, 1, Math::addExact); 47 } 48 } 49 50 buffer.close(); 51 System.out.println("高频词汇 TOP10:" + i++); 52 lhm.entrySet() 53 .stream() 54 .sorted(new Comparator<Entry<String, Integer>>() { 55 56 @Override 57 public int compare(Entry<String, Integer> o1, 58 Entry<String, Integer> o2) { 59 return o2.getValue() - o1.getValue();// go from the highest 60 } 61 }) 62 .limit(10) 63 // top 10 words 64 .forEach( 65 entry -> System.out.println(entry.getKey() + ": " 66 + entry.getValue())); 67 } 68 } 69 70 private boolean ifUse(String word){ 71 for(String s : nouse_words){ 72 if(word.equals(s)) 73 return false; 74 } 75 76 return true; 77 } 78 79 public static void main(String[] args) throws IOException { 80 String inpath = "C:\\Users\\papa_oy\\Desktop\\xxxx"; 81 TZC tzc = new 
TZC(inpath); 82 tzc.getGP(); 83 } 84 }

浙公网安备 33010602011771号