java_对7个文件中的所有单词进行词频统计，要求去除停用词、去除单词首尾的标点符号，并按词频大小降序排列后写到文件WordCount.txt中

版本1(探索版)

package experiment6.exp4;
/*
对7个文件Lincoln, Abraham - The Writings of Abraham Lincoln Volume 1.txt     ~    Lincoln, Abraham - The Writings of Abraham Lincoln Volume 7.txt中的
所有单词进行词频统计，
要求去除停用词、
去除 单词首尾 的标点符号，
并按词频大小按 降序排列
写到文件WordCount.txt中。
可用HashMap实现单词词频记录。
写文件可用语句：
import java.io.PrintWriter;
PrintWriter pw = new PrintWriter("data/wordcount.txt");
pw.write();

*/

import experiment5.exp4.Tuple;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

/**
 * Version 1 (exploratory): counts word frequencies over the seven
 * "Writings of Abraham Lincoln" volume files and prints the result to
 * standard output, most frequent word first.
 *
 * <p>Tokens are split on whitespace, punctuation is stripped from both ends of
 * each token, and tokens found in the stop-word list are ignored. Counting is
 * case-sensitive; only the stop-word check ignores case.
 */
public class WordFrequencyStatistics {
    /** Number of the first volume file to read. */
    private static final int FIRST_VOLUME = 1;
    /** Number of the last volume file to read. */
    private static final int LAST_VOLUME = 7;

    /**
     * Program entry point. Reads the stop words and the volume files from a
     * hard-coded directory and prints one {@code Tuple} per distinct word.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        /* The volume files share a common prefix and suffix, so they can be opened in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        String filenamesPos = ".txt";

        Set<String> setStopWords = readStopWords(new File(filesDirectory + "\\stopwords" + filenamesPos));

        /* TreeMap keeps the words alphabetical; the stable sort below therefore
         * lists words with equal counts in alphabetical order. */
        Map<String, Integer> map = new TreeMap<>();
        for (int no = FIRST_VOLUME; no <= LAST_VOLUME; no++) {
            countWords(new File(filenamesPre + no + filenamesPos), setStopWords, map);
        }

        System.out.println("observation");
        for (Tuple x : sortByFrequency(map)) {
            System.out.println(x);
        }
    }//endMain

    /**
     * Loads the stop-word list, one word per line.
     *
     * <p>Every line is read, including the last one, and an empty file yields
     * an empty set. Words are trimmed and lower-cased so that they match the
     * lower-cased lookup done while counting.
     *
     * @param file the stop-word file
     * @return the stop words; empty if the file cannot be opened
     */
    private static Set<String> readStopWords(File file) {
        Set<String> stopWords = new HashSet<>();
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                String word = scanner.nextLine().trim();
                if (!word.isEmpty()) {
                    stopWords.add(word.toLowerCase());
                }
            }
        } catch (FileNotFoundException e) {
            /* Best effort: carry on without stop words rather than abort. */
            System.err.println("Cannot read stop words from " + file + ": " + e.getMessage());
        }
        return stopWords;
    }

    /**
     * Adds the words of one file to the running counts.
     *
     * @param file      the text file to scan
     * @param stopWords lower-case words that must not be counted
     * @param counts    word-to-count map that is updated in place
     */
    private static void countWords(File file, Set<String> stopWords, Map<String, Integer> counts) {
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNext()) {
                String word = stripEdgePunctuation(scanner.next());
                /* A token made only of punctuation (for example "--") is not a word. */
                if (word.isEmpty() || stopWords.contains(word.toLowerCase())) {
                    continue;
                }
                Integer seen = counts.get(word);
                counts.put(word, seen == null ? 1 : seen + 1);
            }
        } catch (FileNotFoundException e) {
            /* A missing volume is reported and skipped; the others are still counted. */
            System.err.println("Cannot read " + file + ": " + e.getMessage());
        }
    }

    /**
     * Removes every leading and trailing character that is neither a letter
     * nor a digit, so {@code "Hello,"} (with the quotes) becomes {@code Hello}.
     * Characters inside the word, such as the apostrophe in {@code don't},
     * are kept.
     *
     * @param token a whitespace-delimited token
     * @return the token without edge punctuation; empty if nothing remains
     */
    private static String stripEdgePunctuation(String token) {
        int start = 0;
        int end = token.length();
        while (start < end && !isWordCharacter(token.charAt(start))) {
            start++;
        }
        while (end > start && !isWordCharacter(token.charAt(end - 1))) {
            end--;
        }
        return token.substring(start, end);
    }

    /** Returns whether {@code c} may stand at the edge of a word (letter or digit). */
    private static boolean isWordCharacter(char c) {
        return Character.isAlphabetic(c) || Character.isDigit(c);
    }

    /**
     * Converts the counts into a list ordered by descending frequency.
     *
     * @param counts word-to-count map
     * @return one {@code Tuple} per word, highest count first
     */
    private static List<Tuple> sortByFrequency(Map<String, Integer> counts) {
        List<Tuple> listTuples = new ArrayList<>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            listTuples.add(new Tuple(entry.getKey(), entry.getValue()));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
        return listTuples;
    }
}

package experiment6.exp4;

/**
 * A word paired with the number of times it was counted.
 */
public class Tuple {
    /** The word. */
    String string;
    /** How many times the word occurred. */
    int num;

    /**
     * Creates a pair from a word and its count.
     *
     * @param word  the word
     * @param count the number of occurrences
     */
    public Tuple(String word, int count) {
        string = word;
        num = count;
    }

    /** @return the word */
    public String getKey() {
        return this.string;
    }

    /** @return the number of occurrences */
    public int getValue() {
        return this.num;
    }

    /** Formats the pair as the word, a tab, and the count. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(getKey());
        text.append("\t");
        text.append(getValue());
        return text.toString();
    }
}

版本2

package experiment6.exp4;
/*
对7个文件Lincoln, Abraham - The Writings of Abraham Lincoln Volume 1.txt     ~    Lincoln, Abraham - The Writings of Abraham Lincoln Volume 7.txt中的
所有单词进行词频统计，
要求去除停用词、
去除 单词首尾 的标点符号，
并按词频大小按 降序排列
写到文件WordCount.txt中。
可用HashMap实现单词词频记录。
写文件可用语句：
import java.io.PrintWriter;
PrintWriter pw = new PrintWriter("data/wordcount.txt");
pw.write();

*/

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

/**
 * Version 2: counts word frequencies over the seven "Writings of Abraham
 * Lincoln" volume files and writes the result, most frequent word first, to
 * {@code data/wordcount.txt} below the data directory.
 *
 * <p>Each line is split with a {@link StringTokenizer}, so punctuation listed
 * in {@link #DELIMITERS} never becomes part of a word. Counting is
 * case-sensitive; only the stop-word check ignores case.
 */
public class WordFrequencyStatistics {
    /** Characters that separate words on a line. */
    private static final String DELIMITERS = "  #*-,.!:;\"$()[]\\&?";
    /** Number of the first volume file to read. */
    private static final int FIRST_VOLUME = 1;
    /** Number of the last volume file to read. */
    private static final int LAST_VOLUME = 7;

    /**
     * Program entry point. Reads the stop words and the volume files from a
     * hard-coded directory and writes one {@code Tuple} per distinct word.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        /* The volume files share a common prefix and suffix, so they can be opened in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        String filenamesPos = ".txt";

        Set<String> setStopWords = readStopWords(new File(filesDirectory + "\\stopwords" + filenamesPos));

        /* TreeMap keeps the words alphabetical; the stable sort below therefore
         * lists words with equal counts in alphabetical order. */
        Map<String, Integer> map = new TreeMap<>();
        for (int no = FIRST_VOLUME; no <= LAST_VOLUME; no++) {
            countWords(new File(filenamesPre + no + filenamesPos), setStopWords, map);
        }

        System.out.println("observation");
        writeReport(filesDirectory + "/data/wordcount.txt", sortByFrequency(map));
    }//endMain

    /**
     * Loads the stop-word list, one word per line.
     *
     * <p>Every line is read, including the last one, and an empty file yields
     * an empty set. Words are trimmed and lower-cased so that they match the
     * lower-cased lookup done while counting.
     *
     * @param file the stop-word file
     * @return the stop words; empty if the file cannot be opened
     */
    private static Set<String> readStopWords(File file) {
        Set<String> stopWords = new HashSet<>();
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                String word = scanner.nextLine().trim();
                if (!word.isEmpty()) {
                    stopWords.add(word.toLowerCase());
                }
            }
        } catch (FileNotFoundException e) {
            /* Best effort: carry on without stop words rather than abort. */
            System.err.println("Cannot read stop words from " + file + ": " + e.getMessage());
        }
        return stopWords;
    }

    /**
     * Adds the words of one file to the running counts, line by line.
     *
     * @param file      the text file to scan
     * @param stopWords lower-case words that must not be counted
     * @param counts    word-to-count map that is updated in place
     */
    private static void countWords(File file, Set<String> stopWords, Map<String, Integer> counts) {
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                StringTokenizer tokenizer = new StringTokenizer(scanner.nextLine(), DELIMITERS);
                while (tokenizer.hasMoreTokens()) {
                    String str = tokenizer.nextToken();
                    /* Stop words are stored in lower case, so lower-casing the
                     * candidate makes the check case-insensitive. */
                    if (!stopWords.contains(str.toLowerCase())) {
                        Integer seen = counts.get(str);
                        counts.put(str, seen == null ? 1 : seen + 1);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            /* A missing volume is reported and skipped; the others are still counted. */
            System.err.println("Cannot read " + file + ": " + e.getMessage());
        }
    }

    /**
     * Converts the counts into a list ordered by descending frequency.
     *
     * @param counts word-to-count map
     * @return one {@code Tuple} per word, highest count first
     */
    private static List<Tuple> sortByFrequency(Map<String, Integer> counts) {
        List<Tuple> listTuples = new ArrayList<>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            listTuples.add(new Tuple(entry.getKey(), entry.getValue()));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
        return listTuples;
    }

    /**
     * Writes the string form of every tuple to the given file.
     *
     * <p>The writer is closed by try-with-resources; {@link PrintWriter}
     * buffers its output, so without closing it the report may never reach
     * the file.
     *
     * @param path   the output file
     * @param tuples the tuples to write, in output order
     */
    private static void writeReport(String path, List<Tuple> tuples) {
        try (PrintWriter pw = new PrintWriter(path)) {
            for (Tuple x : tuples) {
                pw.write(x.toString());
            }
            /* PrintWriter swallows I/O errors; checkError() flushes and reports them. */
            if (pw.checkError()) {
                System.err.println("Error while writing " + path);
            }
        } catch (FileNotFoundException e) {
            System.err.println("Cannot open " + path + " for writing: " + e.getMessage());
        }
    }
}

package experiment6.exp4;

/**
 * A word paired with the number of times it was counted. The string form is
 * a complete output line, terminated by a newline.
 */
public class Tuple {
    /** The word. */
    String string;
    /** How many times the word occurred. */
    int num;

    /**
     * Creates a pair from a word and its count.
     *
     * @param word  the word
     * @param count the number of occurrences
     */
    public Tuple(String word, int count) {
        string = word;
        num = count;
    }

    /** @return the word */
    public String getKey() {
        return this.string;
    }

    /** @return the number of occurrences */
    public int getValue() {
        return this.num;
    }

    /** Formats the pair as the word, a tab, the count, and a newline. */
    @Override
    public String toString() {
        String word = getKey();
        int count = getValue();
        return word + "\t" + count + "\n";
    }
}

版本3(主要演示排除停用词的另一种过滤方式)

package experiment6.exp4;


import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

/**
 * Version 3: counts word frequencies over the seven "Writings of Abraham
 * Lincoln" volume files and prints the result to standard output, most
 * frequent word first. This version discusses two ways of filtering stop
 * words (see {@link #isStopWord}).
 *
 * <p>Each line is split with a {@link StringTokenizer}, so punctuation listed
 * in {@link #DELIMITERS} never becomes part of a word. Counting is
 * case-sensitive; only the stop-word check ignores case.
 */
public class WordFrequencyStatistics {
    /** Characters that separate words on a line. */
    private static final String DELIMITERS = "  #*-,.!:;\"$()[]\\&?";
    /** Number of the first volume file to read. */
    private static final int FIRST_VOLUME = 1;
    /** Number of the last volume file to read. */
    private static final int LAST_VOLUME = 7;

    /**
     * Program entry point. Reads the stop words and the volume files from a
     * hard-coded directory and prints one {@code Tuple} per distinct word.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        /* The volume files share a common prefix and suffix, so they can be opened in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        String filenamesPos = ".txt";

        Set<String> stopWordsSet = readStopWords(new File(filesDirectory + "\\stopwords" + filenamesPos));

        /* TreeMap keeps the words alphabetical; the stable sort below therefore
         * lists words with equal counts in alphabetical order. */
        Map<String, Integer> map = new TreeMap<>();
        for (int no = FIRST_VOLUME; no <= LAST_VOLUME; no++) {
            countWords(new File(filenamesPre + no + filenamesPos), stopWordsSet, map);
        }

        System.out.println("observation");
        /* Print the result. */
        for (Tuple x : sortByFrequency(map)) {
            System.out.println(x);
        }
    }//endMain

    /**
     * Loads the stop-word list, one word per line.
     *
     * <p>Every line is read, including the last one, and an empty file yields
     * an empty set. Words are trimmed and lower-cased so that they match the
     * lower-cased lookup in {@link #isStopWord}.
     *
     * @param file the stop-word file
     * @return the stop words; empty if the file cannot be opened
     */
    private static Set<String> readStopWords(File file) {
        Set<String> stopWords = new HashSet<>();
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                String word = scanner.nextLine().trim();
                if (!word.isEmpty()) {
                    stopWords.add(word.toLowerCase());
                }
            }
        } catch (FileNotFoundException e) {
            /* Best effort: carry on without stop words rather than abort. */
            System.err.println("Cannot read stop words from " + file + ": " + e.getMessage());
        }
        return stopWords;
    }

    /**
     * Tells whether a word is a stop word, ignoring case.
     *
     * <p>Method 1 (used here): lower-case the word and look it up in the hash
     * set, which costs one hash lookup per word. Method 2 (not recommended):
     * walk the whole set and compare each entry with
     * {@code compareToIgnoreCase}; it gives the same answer but performs one
     * comparison per stop word for every word in the text.
     *
     * @param word      the candidate word
     * @param stopWords lower-case stop words
     * @return {@code true} if the word must be ignored
     */
    private static boolean isStopWord(String word, Set<String> stopWords) {
        return stopWords.contains(word.toLowerCase());
    }

    /**
     * Adds the words of one file to the running counts, line by line.
     *
     * @param file      the text file to scan
     * @param stopWords lower-case words that must not be counted
     * @param counts    word-to-count map that is updated in place
     */
    private static void countWords(File file, Set<String> stopWords, Map<String, Integer> counts) {
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                /* The tokenizer holds all words of the current line. */
                StringTokenizer tokenizer = new StringTokenizer(scanner.nextLine(), DELIMITERS);
                while (tokenizer.hasMoreTokens()) {
                    String str = tokenizer.nextToken();
                    if (!isStopWord(str, stopWords)) {
                        Integer seen = counts.get(str);
                        counts.put(str, seen == null ? 1 : seen + 1);
                    }
                }//endWhile
            }
        } catch (FileNotFoundException e) {
            /* A missing volume is reported and skipped; the others are still counted. */
            System.err.println("Cannot read " + file + ": " + e.getMessage());
        }
    }

    /**
     * Converts the counts into a list ordered by descending frequency.
     *
     * @param counts word-to-count map
     * @return one {@code Tuple} per word, highest count first
     */
    private static List<Tuple> sortByFrequency(Map<String, Integer> counts) {
        List<Tuple> listTuples = new ArrayList<>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            listTuples.add(new Tuple(entry.getKey(), entry.getValue()));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
        return listTuples;
    }
}

版本4(利用StringBuffer来处理待写入字符串内容)

package experiment6.exp4;
/*
对7个文件Lincoln, Abraham - The Writings of Abraham Lincoln Volume 1.txt     ~    Lincoln, Abraham - The Writings of Abraham Lincoln Volume 7.txt中的
所有单词进行词频统计，
要求去除停用词、
去除 单词首尾 的标点符号，
并按词频大小按 降序排列
写到文件WordCount.txt中。
可用HashMap实现单词词频记录。
写文件可用语句：
import java.io.PrintWriter;
PrintWriter pw = new PrintWriter("data/wordcount.txt");
pw.write();

*/

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

/**
 * Version 4: counts word frequencies over the seven "Writings of Abraham
 * Lincoln" volume files, assembles the whole report in a {@link StringBuffer}
 * and writes it to a file in a single call, most frequent word first.
 *
 * <p>Each line is split with a {@link StringTokenizer}, so punctuation listed
 * in {@link #DELIMITERS} never becomes part of a word. Counting is
 * case-sensitive; only the stop-word check ignores case.
 */
public class WordFrequencyStatistics {
    /** Characters that separate words on a line. */
    private static final String DELIMITERS = "  #*-,.!:;\"$()[]\\&?";
    /** Number of the first volume file to read. */
    private static final int FIRST_VOLUME = 1;
    /** Number of the last volume file to read. */
    private static final int LAST_VOLUME = 7;

    /**
     * Program entry point. Reads the stop words and the volume files from a
     * hard-coded directory and writes one {@code Tuple} per distinct word.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        /* The volume files share a common prefix and suffix, so they can be opened in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        String filenamesPos = ".txt";

        Set<String> stopWordsSet = readStopWords(new File(filesDirectory + "\\stopwords" + filenamesPos));

        /* TreeMap keeps the words alphabetical; the stable sort below therefore
         * lists words with equal counts in alphabetical order. */
        Map<String, Integer> map = new TreeMap<>();
        for (int no = FIRST_VOLUME; no <= LAST_VOLUME; no++) {
            /* A fresh File object is created on every iteration, so each pass
             * refers to a different volume even though the variable is reused. */
            File file = new File(filenamesPre + no + filenamesPos);
            countWords(file, stopWordsSet, map);
        }

        /* Write the result to a file. */
        writeReport("C://users//xuchaoxin//desktop//wordcount.txt", sortByFrequency(map));
    }//endMain

    /**
     * Loads the stop-word list, one word per line.
     *
     * <p>Every line is read, including the last one, and an empty file yields
     * an empty set. Words are trimmed and lower-cased so that they match the
     * lower-cased lookup done while counting.
     *
     * @param file the stop-word file
     * @return the stop words; empty if the file cannot be opened
     */
    private static Set<String> readStopWords(File file) {
        Set<String> stopWords = new HashSet<>();
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                String word = scanner.nextLine().trim();
                if (!word.isEmpty()) {
                    stopWords.add(word.toLowerCase());
                }
            }
        } catch (FileNotFoundException e) {
            /* Best effort: carry on without stop words rather than abort. */
            System.err.println("Cannot read stop words from " + file + ": " + e.getMessage());
        }
        return stopWords;
    }

    /**
     * Adds the words of one file to the running counts, line by line.
     *
     * @param file      the text file to scan
     * @param stopWords lower-case words that must not be counted
     * @param counts    word-to-count map that is updated in place
     */
    private static void countWords(File file, Set<String> stopWords, Map<String, Integer> counts) {
        try (Scanner scanner = new Scanner(file)) {
            while (scanner.hasNextLine()) {
                /* The tokenizer holds all words of the current line. */
                StringTokenizer tokenizer = new StringTokenizer(scanner.nextLine(), DELIMITERS);
                while (tokenizer.hasMoreTokens()) {
                    String str = tokenizer.nextToken();
                    /* Stop words are stored in lower case, so lower-casing the
                     * candidate makes the check case-insensitive. A linear scan
                     * with compareToIgnoreCase would also work but is far slower. */
                    if (!stopWords.contains(str.toLowerCase())) {
                        Integer seen = counts.get(str);
                        counts.put(str, seen == null ? 1 : seen + 1);
                    }
                }//endWhile
            }
        } catch (FileNotFoundException e) {
            /* A missing volume is reported and skipped; the others are still counted. */
            System.err.println("Cannot read " + file + ": " + e.getMessage());
        }
    }

    /**
     * Converts the counts into a list ordered by descending frequency.
     *
     * @param counts word-to-count map
     * @return one {@code Tuple} per word, highest count first
     */
    private static List<Tuple> sortByFrequency(Map<String, Integer> counts) {
        List<Tuple> listTuples = new ArrayList<>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            listTuples.add(new Tuple(entry.getKey(), entry.getValue()));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
        return listTuples;
    }

    /**
     * Concatenates the string form of every tuple and writes the result to
     * the given file in one call.
     *
     * <p>The writer is closed by try-with-resources; {@link PrintWriter}
     * buffers its output, so without closing it the report may never reach
     * the file.
     *
     * @param path   the output file
     * @param tuples the tuples to write, in output order
     */
    private static void writeReport(String path, List<Tuple> tuples) {
        /* Build the full text first, then hand it to the writer once. */
        StringBuffer sb = new StringBuffer();
        for (Tuple x : tuples) {
            sb.append(x.toString());
        }
        try (PrintWriter pw = new PrintWriter(path)) {
            pw.write(sb.toString());
            /* PrintWriter swallows I/O errors; checkError() flushes and reports them. */
            if (pw.checkError()) {
                System.err.println("Error while writing " + path);
            }
        } catch (FileNotFoundException e) {
            System.err.println("Cannot open " + path + " for writing: " + e.getMessage());
        }
    }
}

posted @ 2023-12-05 18:30 xuchaoxin1375 阅读(22) 评论(0) 收藏举报来源

刷新页面返回顶部

xuchaoxin1375

java_对7个文件中的单词进行词频统计所有单词进行词频统计， 要求去除停用词、 去除 单词首尾 的标点符号， 并按词频大小按 降序排列 写到文件WordCount.txt中

公告

java_对7个文件中的单词进行词频统计所有单词进行词频统计，要求去除停用词、去除单词首尾的标点符号，并按词频大小按降序排列写到文件WordCount.txt中