一个简单的最大正向匹配（Maximum Matching，MM）中文分词算法的实现

转载自：http://blog.csdn.net/wzb56/article/details/7914954#

1.构建词典内存树的TrieNode节点类：

package cn.wzb.segmenter.mm.bean;

import java.util.HashMap;

/**
 * One node of the in-memory dictionary trie.
 *
 * <p>Each node stands for a single character of a dictionary word and links
 * to the nodes of the characters that may follow it.
 */
public class TrieNode {
    /**
     * Links to the next characters: maps a character to the node that
     * represents it when it follows this node's character in a word.
     */
    public HashMap<Character, TrieNode> childs = new HashMap<Character, TrieNode>();

    /** {@code true} when this node's character is the last character of a word. */
    public boolean bound;

    /** The character held by this node; stays {@code '\0'} for a node built without a key. */
    public char key;

    /** Creates a node with the default key {@code '\0'}. */
    public TrieNode() {
        this('\0');
    }

    /**
     * Creates a node for the given character.
     *
     * @param key the character this node represents
     */
    public TrieNode(char key) {
        this.key = key;
    }
}

2. 最大正向匹配（Maximum Matching）算法的实现类：MMSegmenter类，核心方法为segment()：

package cn.wzb.segmenter.mm;

import java.io.IOException;

import cn.wzb.segmenter.AbstractSegmenter;
import cn.wzb.segmenter.mm.bean.TrieDictionary;
import cn.wzb.segmenter.mm.bean.TrieNode;

public class MMSegmenter extends AbstractSegmenter {
    /** Dictionary trie shared by every segmenter instance; assigned once when the class is initialized. */
    public static TrieDictionary dict;

    static {
        // Load the dictionary. The path string is handed to TrieDictionary.getInstance as-is.
        dict = TrieDictionary.getInstance("/cn/wzb/dictionary/word_dic_utf8.txt");
    }

    /**
     * Creates the segmenter and passes a fixed descriptive label to the
     * {@code AbstractSegmenter} constructor.
     *
     * <p>NOTE(review): how the superclass uses this label is not visible here;
     * presumably it is a display name for logging -- confirm against
     * {@code AbstractSegmenter}.
     */
    public MMSegmenter() {
        super("一个简单的最大的正向匹配器：MMSegmenter");
    }

    /**
     * Segments a sentence with forward maximum matching against the dictionary trie.
     *
     * <p>The dictionary is a trie of {@link TrieNode}s. Every node holds one
     * character, a map from the next character to its node, and a
     * {@code bound} flag telling whether a dictionary word ends at that node.
     *
     * <p>The sentence is scanned from left to right. Each step consumes one
     * piece of the sentence and then writes the token mark {@code '|'}:
     * <ul>
     *   <li>a separator contributes no text, so only the mark is written;</li>
     *   <li>a run of consecutive characters that are neither separators nor
     *       Chinese becomes one token;</li>
     *   <li>a Chinese character that does not start any dictionary path
     *       becomes a one-character token;</li>
     *   <li>a Chinese character that starts a dictionary path yields the
     *       longest dictionary word beginning there. If the text only follows
     *       a word's prefix, the token is cut back to the last position where
     *       a word ended, or to the single starting character if none did.</li>
     * </ul>
     *
     * @param sentence the text to segment; must not be {@code null}
     * @return the tokens in order, each followed by {@code '|'}
     */
    public String segment(String sentence) {
        final StringBuilder out = new StringBuilder();
        final TrieNode root = dict.getRoot();
        final int length = sentence.length();

        int pos = 0;
        while (pos < length) {
            final char ch = sentence.charAt(pos);

            if (CharacterType.isCharSeperator(ch)) {
                // Separators add no text; the mark appended below still records the step.
            } else if (CharacterType.isCharOther(ch)) {
                // Collect the whole run of non-Chinese, non-separator characters.
                int runEnd = pos + 1;
                while (runEnd < length && CharacterType.isCharOther(sentence.charAt(runEnd))) {
                    runEnd++;
                }
                out.append(sentence, pos, runEnd);
                pos = runEnd - 1; // leave pos on the last consumed character
            } else if (CharacterType.isCharChinese(ch)) {
                // The starting character alone is always an acceptable token.
                int wordEnd = pos;
                TrieNode node = root.childs.get(Character.valueOf(ch));
                if (node != null) {
                    // Walk the trie as far as the text allows, remembering the
                    // last index at which a complete dictionary word ended.
                    for (int look = pos + 1; look < length; look++) {
                        node = node.childs.get(Character.valueOf(sentence.charAt(look)));
                        if (node == null) {
                            break;
                        }
                        if (node.bound) {
                            wordEnd = look;
                        }
                    }
                }
                out.append(sentence, pos, wordEnd + 1);
                pos = wordEnd; // characters matched past wordEnd are rescanned
            }

            out.append('|'); // token mark
            pos++;
        }

        return out.toString();
    }

    /**
     * Alternative forward maximum matching segmenter that keeps separators in
     * the output.
     *
     * <p>At each position the current character is first looked up among the
     * children of the dictionary root:
     * <ul>
     *   <li>not found and it is a separator or a Chinese character: the
     *       character alone becomes a token;</li>
     *   <li>not found and it is any other character: the run of consecutive
     *       "other" characters becomes one token;</li>
     *   <li>found: the trie is followed as far as the text allows and the
     *       token is the longest dictionary word starting here, falling back
     *       to the single starting character when the text only matches a
     *       prefix of a word.</li>
     * </ul>
     * Every token is followed by the mark {@code '|'}.
     *
     * <p>Fix: the separator and Chinese checks used to be two independent
     * {@code if} statements, so a separator was appended and then fell into
     * the "other characters" branch, where it was appended a second time and
     * merged with any following non-Chinese run. The three cases are now
     * mutually exclusive.
     *
     * @param sentence the text to segment; must not be {@code null}
     * @param verison  unused; it only selects this overload (the parameter
     *                 name is kept as-is for compatibility)
     * @return the tokens in order, each followed by {@code '|'}
     */
    public String segment(String sentence, String verison) {
        StringBuilder segBuffer = new StringBuilder();

        int length = sentence.length();
        TrieNode root = dict.getRoot();

        for (int i = 0; i < length; i++) {
            char c = sentence.charAt(i);

            TrieNode p = root;
            TrieNode pChild = p.childs.get(Character.valueOf(c));

            if (pChild == null) {
                // The character does not start any dictionary entry.
                if (CharacterType.isCharSeperator(c) || CharacterType.isCharChinese(c)) {
                    // Separators and unknown Chinese characters are one-character tokens.
                    segBuffer.append(c);
                } else {
                    // Run of characters that are neither separators nor Chinese.
                    do {
                        segBuffer.append(c);
                        if (++i == length) {
                            break;
                        }
                        c = sentence.charAt(i);
                    } while (CharacterType.isCharOther(c));
                    if (i != length) {
                        --i; // step back onto the last character of the run
                    }
                }
            } else {
                // Follow the trie; segBoundIdx is the index of the last
                // character that may end the token (the first character
                // always qualifies, so a bare prefix degrades to one char).
                int segBoundIdx = i;
                while (pChild != null) {
                    if (p == root || pChild.bound) {
                        segBoundIdx = i;
                    }
                    segBuffer.append(c);
                    if (++i == length) {
                        break;
                    }
                    c = sentence.charAt(i);
                    p = pChild;
                    pChild = p.childs.get(Character.valueOf(c));
                }
                // i - 1 is the last appended index; drop what was appended
                // beyond the last word boundary.
                int overshoot = (i - 1) - segBoundIdx;
                if (overshoot > 0) {
                    segBuffer.delete(segBuffer.length() - overshoot, segBuffer.length());
                }
                // Resume right after the boundary (the loop increment adds one).
                i = segBoundIdx;
            }
            segBuffer.append('|'); // token mark
        }

        return segBuffer.toString();
    }

    /**
     * Demo entry point: prints the segmentation of a fixed set of sample
     * sentences, first with {@link #segment(String)} and then with
     * {@link #segment(String, String)}.
     *
     * @param args ignored
     * @throws IOException declared by the original signature; nothing in this
     *                     method throws it directly
     */
    public static void main(String[] args) throws IOException {
        MMSegmenter segmenter = new MMSegmenter();

        // Sentences fed to both variants.
        String[] sharedSamples = {
            "中华人民共和国是一个伟大的国家hello world",
            "小红是个爱学习的好学生!!!!!",
            "中华民de hello world!人民共",
            "中华人民共",
            "中华人民共和国家",
            "爱国",
            "爱我Love你",
        };

        for (String sample : sharedSamples) {
            System.out.println(segmenter.segment(sample));
        }
        // This sample writes the year with full-width digits.
        System.out.println(segmenter.segment("京华时报２００８年1月23日报道昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。"));

        System.out.println("another version: ");
        for (String sample : sharedSamples) {
            System.out.println(segmenter.segment(sample, " "));
        }
        // Same news sentence, but with ASCII digits in the year.
        System.out.println(segmenter.segment("京华时报2008年1月23日报道昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。", ""));
    }
}

3. 字符类型判断辅助类：CharacterType类：

package cn.wzb.segmenter.mm;

/**
 * Character classification helpers used by the segmenter.
 *
 * <p>Every character falls into exactly one of three classes: separator,
 * Chinese, or other.
 */
class CharacterType {
    /**
     * All characters treated as separators: full-width and ASCII punctuation,
     * quotation marks, brackets, and the whitespace characters newline,
     * carriage return, tab and space.
     */
    private static final String SEPARATORS =
            "\u3002\uFF01\uFF1F\uFF1A\uFF1B\u3001\uFF0C\uFF08\uFF09\u300A\u300B\u3010\u3011{}\u201C\u201D\u2018\u2019!?:;,()<>[]{}\"'\n\r\t ";

    /** First code point of the CJK Unified Ideographs block (U+4E00). */
    private static final char CJK_FIRST = '\u4E00';

    /**
     * Last code point of the CJK Unified Ideographs block (U+9FFF). The
     * previous upper bound, U+9FBF, left the ideographs in U+9FC0..U+9FFF
     * classified as "other".
     */
    private static final char CJK_LAST = '\u9FFF';

    /**
     * Tells whether the character is a separator.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is one of the separator characters
     */
    public static boolean isCharSeperator(char c) {
        return SEPARATORS.indexOf(c) >= 0;
    }

    /**
     * Tells whether the character lies in the CJK Unified Ideographs block.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is between U+4E00 and U+9FFF inclusive
     */
    public static boolean isCharChinese(char c) {
        return c >= CJK_FIRST && c <= CJK_LAST;
    }

    /**
     * Tells whether the character is neither a separator nor Chinese.
     *
     * @param c the character to classify
     * @return {@code true} if both other classifications reject {@code c}
     */
    public static boolean isCharOther(char c) {
        return !isCharSeperator(c) && !isCharChinese(c);
    }
}

4. 该算法使用的词典文件：

[java] view plain copy

希望
中华
人民
共和国
中华人民共和国
一个
伟大
国家
西安
北京
家庭
家里
爱国者
我Love你
学习
好学生
学生
爱学
爱学习

5.分词测试结果：

[java] view plain copy

dictionary loading OK!
[oooggooo]一个简单的最大的正向匹配器：MMSegmenter segmenter on
中华人民共和国|是|一个|伟大|的|国家|hello|world|
小|红|是|个|爱学习|的|好学生||||||
中华|民|de|hello|world|人民|共|
中华|人民|共|
中华人民共和国|家|
爱|国|
爱|我Love你|
another version:
中华人民共和国|是|一个|伟大|的|国家|hello|world|
小|红|是|个|爱学习|的|好学生|!|!|!|!|!|
中华|民|de|hello|world|人民|共|
中华|人民|共|
中华人民共和国|家|
爱|国|
爱|我Love你|

posted on 2012-09-30 00:39 刺猬的温驯阅读(3953) 评论(0) 收藏举报

刷新页面返回顶部

君子博学而日参省乎己则知明而行无过矣

公告

君子博学而日参省乎己 则知明而行无过矣

公告

君子博学而日参省乎己则知明而行无过矣