倒排索引的 JAVA 简单实现

 

   下面是倒排索引的一个简单 Java 实现。它只是一个很粗糙的玩具程序,目的仅在于演示倒排索引的基本原理:

/**
 * A toy inverted index over plain-text files.
 *
 * <p>Every file is split into keywords; for each keyword the index remembers the list of
 * files it occurs in and the total number of occurrences across all indexed files.
 *
 * <p>Only {@link #createIndexByKeyword(String, String)} is synchronized; {@link #search(String)}
 * and {@link #toString()} read the maps without locking, so do not query the index while
 * another thread is still building it.
 */
public class IntertedIndex {

    /** Folder indexed by {@link #main(String[])} when no folder is passed on the command line. */
    private static final String DEFAULT_FOLDER = "C:\\Users\\Administrator\\Desktop\\logs";

    /** Tokens longer than this many characters are not indexed. */
    private static final int MAX_KEYWORD_LENGTH = 100;

    /**
     * Characters that separate keywords. The original list contained {@code "\rn"}, which made
     * the letter {@code n} a separator and chopped words such as "python" into "pytho";
     * it is {@code "\r\n"} here.
     */
    private static final String DELIMITERS =
            " ,:\" \r\n()'-'.,!?,:“”‘’?-!。," + System.lineSeparator();

    // Inverted index: keyword -> files containing it, in first-seen order.
    private final Map<String, List<String>> indexMap;
    // Keyword -> total number of occurrences over all indexed files.
    private final Map<String, Integer> keywordNums;

    IntertedIndex() {
        this.indexMap = new HashMap<String, List<String>>();
        this.keywordNums = new HashMap<String, Integer>();
    }

    /**
     * Indexes a folder and then answers keyword queries read from standard input until
     * the input is exhausted.
     *
     * @param args optional; {@code args[0]} overrides the default folder to index
     * @throws Exception if the folder cannot be listed or a file cannot be read
     */
    public static void main(String[] args) throws Exception {
        String folder = args.length > 0 ? args[0] : DEFAULT_FOLDER;
        IntertedIndex intertedIndex = new IntertedIndex();
        intertedIndex.createIndexByFolder(folder);
        java.util.Scanner s = new java.util.Scanner(System.in);
        // Scanner.next() never returns null; it throws at end of input, so test hasNext().
        while (s.hasNext()) {
            String inString = s.next();
            System.out.println("查询关键字 :" + inString);
            try {
                intertedIndex.search(inString);
            } catch (Exception e) {
                // An unknown keyword must not end the interactive session.
                e.printStackTrace();
            }
        }
    }

    /**
     * Prints the occurrence count of {@code keyword} and every file that contains it.
     *
     * @param keyword the keyword to look up
     * @throws RuntimeException if the keyword is not in the index
     */
    public void search(String keyword) {
        List<String> fileList = this.indexMap.get(keyword);
        if (fileList == null) {
            throw new RuntimeException(" keyword : " + keyword + " is not in map ! ");
        }
        System.out.println(">>>>>> " + keyword + " ( " + this.keywordNums.get(keyword) + " ) ");
        for (String file : fileList) {
            System.out.println("       >>> " + file);
        }
    }

    /**
     * Indexes every regular file directly inside {@code folderName}. Sub-directories are
     * skipped (not descended into).
     *
     * @param folderName path of the folder to index
     * @throws IllegalArgumentException if the path is not a listable directory
     * @throws Exception if a file cannot be read
     */
    public void createIndexByFolder(String folderName) throws Exception {
        File[] files = new File(folderName).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing path, a non-directory, or an I/O error.
            throw new IllegalArgumentException("cannot list folder : " + folderName);
        }
        for (File file : files) {
            if (!file.isFile()) {
                continue; // opening a directory as a stream would abort the whole run
            }
            String filePath = file.getAbsolutePath();
            System.out.println("deal file : " + filePath);
            this.createIndexByFile(filePath);
        }
    }

    /**
     * Reads {@code fileName} line by line as GBK text, tokenizes each line and indexes
     * every token of at most {@value #MAX_KEYWORD_LENGTH} characters.
     *
     * @param fileName path of the file to index
     * @throws Exception if the file cannot be opened or read
     */
    public void createIndexByFile(String fileName) throws Exception {
        // Resolve the charset before opening the file so a failure here cannot leak the stream.
        Charset charset = Charset.forName("GBK");
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charset))) {
            String row;
            while ((row = reader.readLine()) != null) {
                // Tokenize; tokens never contain a space because space is a delimiter.
                StringTokenizer tokens = new StringTokenizer(row, DELIMITERS);
                while (tokens.hasMoreTokens()) {
                    String keyword = tokens.nextToken();
                    if (keyword.length() > MAX_KEYWORD_LENGTH) {
                        continue;
                    }
                    this.createIndexByKeyword(keyword, fileName);
                }
            }
        }
    }

    /**
     * Records one occurrence of {@code keyword} in {@code fileName}: the file is added to the
     * keyword's file list if it is not there yet, and the keyword's count is incremented.
     *
     * @param keyword  the keyword that was seen
     * @param fileName the file it was seen in
     */
    public synchronized void createIndexByKeyword(String keyword, String fileName) {
        System.out.println("deal keyword : " + keyword);
        List<String> fileNameList =
                indexMap.computeIfAbsent(keyword, k -> new ArrayList<String>());
        if (!fileNameList.contains(fileName)) {
            fileNameList.add(fileName);
        }
        keywordNums.merge(keyword, 1, Integer::sum);
    }

    /**
     * Renders the whole index: one header line per keyword with its count, followed by one
     * line per file showing only the file's base name.
     */
    @Override
    public String toString() {
        String eol = System.lineSeparator();
        StringBuilder res = new StringBuilder();
        for (Map.Entry<String, List<String>> entry : this.indexMap.entrySet()) {
            String keyword = entry.getKey();
            res.append(">>>>>> ").append(keyword)
                    .append(" (").append(this.keywordNums.get(keyword)).append(" ) ")
                    .append(" <<<<<<").append(eol);
            for (String filePath : entry.getValue()) {
                res.append("       >>> ").append(baseName(filePath)).append(eol);
            }
        }
        return res.toString();
    }

    /**
     * Returns the last path segment of {@code path}, accepting both '/' and '\' as
     * separators so Windows paths are shortened too. A path made only of separators is
     * returned unchanged.
     */
    private static String baseName(String path) {
        int end = path.length();
        while (end > 0 && isSeparator(path.charAt(end - 1))) {
            end--;
        }
        if (end == 0) {
            return path;
        }
        int start = end;
        while (start > 0 && !isSeparator(path.charAt(start - 1))) {
            start--;
        }
        return path.substring(start, end);
    }

    /** Tells whether {@code c} is a Unix or Windows path separator. */
    private static boolean isSeparator(char c) {
        return c == '/' || c == '\\';
    }
}

 

posted @ 2023-01-09 13:50  牛有肉  阅读(520)  评论(0)  编辑  收藏  举报