词法分析器1.0版

词法分析器1.0版

  • 对Java源码进行词法分析,仅供参考
package org.experience;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

/**
 * Java程序语言的单词符号的种别:
 * 关键字:1
 * 操作符:2
 * 分界符:3
 * 标识符:4
 * 常数:5
 * 无识别:6
 */
@SuppressWarnings("all")
public class WordAnalysis_Java {

    private HashSet<String> keyWord = readKeyWord();
    private HashSet<String> operator = operator();
    private HashSet<String> delimiter = delimiter();

    public WordAnalysis_Java() throws Exception {
    }


    public static void main(String[] args) throws Exception {
        File file = new File("D:\\compileDoc\\WordAnalysis.java");
        WordAnalysis_Java a = new WordAnalysis_Java();
        List<String> src = a.readSrc(file);
        List<String> word = a.processSrc(src);
        word = a.processSrc2(word);
//        for (String s : word) {
//            System.out.println(s);
//        }
        a.analysis(word);
//        for (String string : src) {
//            System.out.println(string);
//        }
//        HashSet<String> keyWord = a.readKeyWord();
//        for (String s : keyWord) {
//            System.out.println(s);
//        }

    }

    /**
     * 词法分析
     * @param words
     */
    public void analysis(List<String> words) {
        for (String word : words) {
            if (keyWord.contains(word)) {
                System.out.println("关键字:\t" + word);
            } else if (isDigit2(word)) {
                System.out.println("数字:\t" + word);
            } else if (operator.contains(word)) {
                System.out.println("运算符:\t" + word);
            } else if (delimiter.contains(word)) {
                System.out.println("分界符:\t" + word);

            } else if (isIdentifier(word)){
                System.out.println("标识符:\t" + word);
            }else {
                System.out.println("其他:\t" + word);
            }
        }
        
    }
    //判断当前词是否为数字
    public boolean isDigit2(String str){
        return str.matches("[0-9]+");
    }
    //判断当前词是否为标识符
    public boolean isIdentifier(String str){
        str = str.toLowerCase();
        return str.matches("[a-z]+");
    }

    /**
     * 读取源程序
     *
     * @param file 源文件
     * @return
     * @throws Exception
     */
    public List<String> readSrc(File file) throws Exception {
        ArrayList<String> strings = new ArrayList<>();
        BufferedReader br = new BufferedReader(new FileReader(file));
        String line = "";
        while ((line = br.readLine()) != null) {
            line = line.replace("  ", " ");
            line = line.replace("  ", " ");
            line = line.replace("  ", " ");
            line = line.replace("  ", " ");
            String[] s = line.split(" ");
            for (String s1 : s) {
                strings.add(s1);
            }
        }
        br.close();
        return strings;
    }

    /**
     * 
     * @param src
     * @return
     * @throws Exception
     */
    public List<String> processSrc2(List<String> src) throws Exception {
        ArrayList<String> single = new ArrayList<>();
        StringBuffer buffer = new StringBuffer();
        for (String s : src) {
            buffer.append(s + " ");
        }
        String s = buffer.toString();
        s = s.replace(".", " . ");
        s = s.replace(";", " ; ");
        s = s.replace("\"", " \" ");
        s = s.replace("[", " [ ");
        s = s.replace("]", " ] ");
        s = s.replace("{", " { ");
        s = s.replace("}", " } ");
        s = s.replace(")", " ) ");
        s = s.replace("(", " ( ");
        s = s.replace("=", " = ");
        s = s.replace("<", " < ");
        s = s.replace(">", " > ");
        s = s.replace("++", " ++ ");
        s = s.replace("+", " + ");
        s = s.replace("-", " - ");
        s = s.replace("*", " * ");
        s = s.replace("/", " / ");
        s = s.replace("%", " % ");
        s = s.replace("!", " ! ");
        s = s.replace(":", " : ");
        s = s.replace("\'", " \' ");
        s = s.replace("\\t", " \\t ");
        s = s.replace("=  =", " == ");
        s = s.replace("+  =", " += ");
        s = s.replace("+  +", " += ");
        s = s.replace(">  =", " >= ");
        s = s.replace("<  =", " <= ");
        s = s.replace("-  -", " -- ");
        s = s.replace("&&", " && ");
        s = s.replace("||", " || ");

        s = s.replace("  ", " ");
        s = s.replace("  ", " ");
        
//        System.out.println(s);
        String[] s1 = s.split(" ");
        for (String s2 : s1) {
            single.add(s2);
        }
        return single;
    }

    /**
     * 处理源程序
     * 删除注释和空白行
     *
     * @param src
     * @return input=input.replaceAll("(?m)(?:\n|\r\n)(?:^\s*$)", "");
     * input=input.replaceAll("(?m)(?:^\s*$(?:\n|\r\n))", "");
     */
    public List<String> processSrc(List<String> src) throws Exception {
        ArrayList<String> words = new ArrayList<>();
        ArrayList<String> words2 = new ArrayList<>();
        //粗略去除空行和当行注释
        for (String s : src) {
            if (!s.equals("")) {
//                words.add(s);
                s = s.replaceAll("^(//)(.*?)$", "");
                words.add(s);
            }
        }
        StringBuffer buffer = new StringBuffer();
        for (String word : words) {
            if (!word.equals("")) {
                buffer.append(word + "\t");
//                word = word.replaceAll("^(/*)(.*?)(/)$","");
//                words2.add(word);
            }
        }
//        System.out.println(buffer);
        String s = buffer.toString();
//        System.out.println(s);
        s = s.replaceAll("/\\*{1,2}[\\s\\S]*?\\*/", "");
        s = s.replaceAll("//[\\s\\S]*?\\n\n", "");
        s = s.replaceAll("^(//)(.*?)$", "");
        String[] split = s.split("\t");
        for (String s1 : split) {
//            System.out.println(s1);
            if (!s1.equals("")) {
//                s1 = s1.replaceAll("//[\\s\\S]*?\\n\n","");
                s1 = s1.replaceAll("(//)(.*?)$", "");
                words2.add(s1);
            }
        }
        return words2;
    }


    /**
     * 判断一个字符是否是数字
     *
     * @param letter
     * @return
     */
    public boolean isDigit(char letter) {
        if (letter >= '0' && letter <= '9')
            return true;
        else {
            return false;
        }
    }

    /**
     * 判断一个字符是否是字母
     *
     * @param letter
     * @return
     */
    public boolean isLetter(char letter) {
        if ((letter >= 'a' && letter <= 'z') || (letter >= 'A' && letter <= 'Z'))
            return true;
        else {
            return false;
        }
    }

    /**
     * 判断一个词是否是关键字
     *
     * @param word    当前词
     * @param keyWord 关键字表
     * @return
     */
    public boolean isKeyWord(String word, HashSet<String> keyWord) {
        if (keyWord.contains(word)) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * 建立关键字表
     *
     * @throws Exception
     */
    public static HashSet<String> readKeyWord() throws Exception {
//        ArrayList<String> keyWord = new ArrayList<>();
        HashSet<String> keyWord = new HashSet<>();
        BufferedReader br = new BufferedReader(new FileReader("D:\\compileDoc\\keyWord.txt"));
        String line = "";
        while ((line = br.readLine()) != null) {
            line = line.replace("\t", " ");
//            System.out.println(line);
            String[] s = line.split(" ");
            for (String s1 : s) {
                keyWord.add(s1);
            }
        }
//        System.out.println(keyWord.size());
        br.close();
        return keyWord;
    }

    /**
     * 分界符
     *
     * @return
     */
    public static HashSet<String> delimiter() {
        HashSet<String> hashSet = new HashSet<>();
        hashSet.add(";");
        hashSet.add(".");
        hashSet.add("{");
        hashSet.add("}");
        hashSet.add(")");
        hashSet.add("(");
        hashSet.add("[");
        hashSet.add("]");
        hashSet.add("\"");
        hashSet.add("\'");

        return hashSet;
    }

    /**
     * @return
     */
    public static HashSet<String> operator() {
        HashSet<String> hashSet = new HashSet<>();
        hashSet.add("+");
        hashSet.add("-");
        hashSet.add("*");
        hashSet.add("/");
        hashSet.add("%");
        hashSet.add("++");
        hashSet.add("--");
        hashSet.add("==");
        hashSet.add("!=");
        hashSet.add(">");
        hashSet.add("<");
        hashSet.add(">=");
        hashSet.add("<=");
        hashSet.add("&");
        hashSet.add("&&");
        hashSet.add("|");
        hashSet.add("^");
        hashSet.add("~");
        hashSet.add("<<");
        hashSet.add(">>");
        hashSet.add(">>>");
        hashSet.add("||");
        hashSet.add("!");
        hashSet.add("+=");
        hashSet.add("-=");
        hashSet.add("%=");
        hashSet.add("/=");
        hashSet.add("*=");
        hashSet.add("=");
        hashSet.add("<<=");
        hashSet.add(">>=");
        hashSet.add("&=");
        hashSet.add("^=");
        hashSet.add("|=");
        hashSet.add("?:");
        hashSet.add("instanceof");

        return hashSet;
    }

}

posted @ 2021-10-09 23:12  mx_info  阅读(41)  评论(0编辑  收藏  举报