最近参与了一个小项目，需要编写一个针对英文单词的词干提取（stemming）算法。

1. 最常见的词干提取算法是 English (Porter2) stemming algorithm：http://snowball.tartarus.org/algorithms/english/stemmer.html

// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

 /**
  * This class was automatically generated by a Snowball to Java compiler 
  * It implements the stemming algorithm defined by a snowball script.
  */

public class englishStemmer extends org.tartarus.snowball.SnowballStemmer {

// Serialization version id for this class.
private static final long serialVersionUID = 1L;

        // Shared instance handed to every Among entry below as its last constructor
        // argument. All entries pass "" as the method name, so it is presumably
        // never invoked through them (confirm against org.tartarus.snowball.Among).
        private final static englishStemmer methodObject = new englishStemmer ();

                // Word beginnings looked up by r_mark_regions via find_among(a_0, 3).
                // When one matches, r_mark_regions skips its first vowel/non-vowel scan
                // and records I_p1 at the cursor position left by the match.
                private final static Among a_0[] = {
                    new Among ( "arsen", -1, -1, "", methodObject ),
                    new Among ( "commun", -1, -1, "", methodObject ),
                    new Among ( "gener", -1, -1, "", methodObject )
                };

                // Apostrophe suffixes looked up by r_Step_1a via find_among_b(a_1, 3).
                // Every entry carries 1 as its third argument, which lines up with
                // "case 1" (slice_del) in r_Step_1a; presumably that argument is the
                // value find_among_b returns (confirm against Among / SnowballProgram).
                private final static Among a_1[] = {
                    new Among ( "'", -1, 1, "", methodObject ),
                    new Among ( "'s'", 0, 1, "", methodObject ),
                    new Among ( "'s", -1, 1, "", methodObject )
                };

                // Plural-style suffixes looked up by r_Step_1a via find_among_b(a_2, 6).
                // The third argument lines up with the switch in r_Step_1a:
                //   1 -> replaced by "ss"
                //   2 -> replaced by "i" or "ie"
                //   3 -> deleted, after a backward vowel scan succeeds
                //  -1 -> matches no case, so the word is left as is
                // The second argument looks like the index of a shorter entry in this
                // table that the entry ends with (-1 for none); confirm against Among.
                private final static Among a_2[] = {
                    new Among ( "ied", -1, 2, "", methodObject ),
                    new Among ( "s", -1, 3, "", methodObject ),
                    new Among ( "ies", 1, 2, "", methodObject ),
                    new Among ( "sses", 1, 1, "", methodObject ),
                    new Among ( "ss", 1, -1, "", methodObject ),
                    new Among ( "us", 1, -1, "", methodObject )
                };

                // Endings tested by r_Step_1b (find_among_b(a_3, 13)) after it has
                // deleted an ed/ing-type suffix. The third argument lines up with the
                // inner switch in r_Step_1b:
                //   1 -> an "e" is inserted at the cursor
                //   2 -> one character before the cursor is deleted
                //   3 -> (the empty-string entry) an "e" is inserted, but only when the
                //        cursor equals I_p1 and r_shortv() succeeds
                private final static Among a_3[] = {
                    new Among ( "", -1, 3, "", methodObject ),
                    new Among ( "bb", 0, 2, "", methodObject ),
                    new Among ( "dd", 0, 2, "", methodObject ),
                    new Among ( "ff", 0, 2, "", methodObject ),
                    new Among ( "gg", 0, 2, "", methodObject ),
                    new Among ( "bl", 0, 1, "", methodObject ),
                    new Among ( "mm", 0, 2, "", methodObject ),
                    new Among ( "nn", 0, 2, "", methodObject ),
                    new Among ( "pp", 0, 2, "", methodObject ),
                    new Among ( "rr", 0, 2, "", methodObject ),
                    new Among ( "at", 0, 1, "", methodObject ),
                    new Among ( "tt", 0, 2, "", methodObject ),
                    new Among ( "iz", 0, 1, "", methodObject )
                };

                // Suffixes looked up first by r_Step_1b via find_among_b(a_4, 6).
                // The third argument lines up with the outer switch in r_Step_1b:
                //   1 -> replaced by "ee" when r_R1() holds
                //   2 -> deleted when a vowel is found scanning backwards, followed by
                //        the a_3 ending test
                private final static Among a_4[] = {
                    new Among ( "ed", -1, 2, "", methodObject ),
                    new Among ( "eed", 0, 1, "", methodObject ),
                    new Among ( "ing", -1, 2, "", methodObject ),
                    new Among ( "edly", -1, 2, "", methodObject ),
                    new Among ( "eedly", 3, 1, "", methodObject ),
                    new Among ( "ingly", -1, 2, "", methodObject )
                };

                // Suffixes looked up by r_Step_2 via find_among_b(a_5, 24). The third
                // argument lines up with the case labels of the switch in r_Step_2,
                // each of which rewrites the matched suffix with a fixed literal
                // (e.g. 1 -> "tion", 2 -> "ence", 3 -> "ance", 6 -> "ize", 7 -> "ate").
                // NOTE(review): the handling of values 15 and 16 lies beyond the part
                // of r_Step_2 reviewed here.
                private final static Among a_5[] = {
                    new Among ( "anci", -1, 3, "", methodObject ),
                    new Among ( "enci", -1, 2, "", methodObject ),
                    new Among ( "ogi", -1, 13, "", methodObject ),
                    new Among ( "li", -1, 16, "", methodObject ),
                    new Among ( "bli", 3, 12, "", methodObject ),
                    new Among ( "abli", 4, 4, "", methodObject ),
                    new Among ( "alli", 3, 8, "", methodObject ),
                    new Among ( "fulli", 3, 14, "", methodObject ),
                    new Among ( "lessli", 3, 15, "", methodObject ),
                    new Among ( "ousli", 3, 10, "", methodObject ),
                    new Among ( "entli", 3, 5, "", methodObject ),
                    new Among ( "aliti", -1, 8, "", methodObject ),
                    new Among ( "biliti", -1, 12, "", methodObject ),
                    new Among ( "iviti", -1, 11, "", methodObject ),
                    new Among ( "tional", -1, 1, "", methodObject ),
                    new Among ( "ational", 14, 7, "", methodObject ),
                    new Among ( "alism", -1, 8, "", methodObject ),
                    new Among ( "ation", -1, 7, "", methodObject ),
                    new Among ( "ization", 17, 6, "", methodObject ),
                    new Among ( "izer", -1, 6, "", methodObject ),
                    new Among ( "ator", -1, 7, "", methodObject ),
                    new Among ( "iveness", -1, 11, "", methodObject ),
                    new Among ( "fulness", -1, 9, "", methodObject ),
                    new Among ( "ousness", -1, 10, "", methodObject )
                };

                // Suffix table with result codes 1..6.
                // NOTE(review): not referenced in the portion of the file reviewed
                // here; presumably consumed by a later step (likely Step 3 of the
                // Porter2 algorithm) - confirm against the rest of the class.
                private final static Among a_6[] = {
                    new Among ( "icate", -1, 4, "", methodObject ),
                    new Among ( "ative", -1, 6, "", methodObject ),
                    new Among ( "alize", -1, 3, "", methodObject ),
                    new Among ( "iciti", -1, 4, "", methodObject ),
                    new Among ( "ical", -1, 4, "", methodObject ),
                    new Among ( "tional", -1, 1, "", methodObject ),
                    new Among ( "ational", 5, 2, "", methodObject ),
                    new Among ( "ful", -1, 5, "", methodObject ),
                    new Among ( "ness", -1, 5, "", methodObject )
                };

                // Suffix table in which every entry has result code 1 except "ion" (2).
                // NOTE(review): not referenced in the portion of the file reviewed
                // here; presumably consumed by a later step (likely Step 4 of the
                // Porter2 algorithm) - confirm against the rest of the class.
                private final static Among a_7[] = {
                    new Among ( "ic", -1, 1, "", methodObject ),
                    new Among ( "ance", -1, 1, "", methodObject ),
                    new Among ( "ence", -1, 1, "", methodObject ),
                    new Among ( "able", -1, 1, "", methodObject ),
                    new Among ( "ible", -1, 1, "", methodObject ),
                    new Among ( "ate", -1, 1, "", methodObject ),
                    new Among ( "ive", -1, 1, "", methodObject ),
                    new Among ( "ize", -1, 1, "", methodObject ),
                    new Among ( "iti", -1, 1, "", methodObject ),
                    new Among ( "al", -1, 1, "", methodObject ),
                    new Among ( "ism", -1, 1, "", methodObject ),
                    new Among ( "ion", -1, 2, "", methodObject ),
                    new Among ( "er", -1, 1, "", methodObject ),
                    new Among ( "ous", -1, 1, "", methodObject ),
                    new Among ( "ant", -1, 1, "", methodObject ),
                    new Among ( "ent", -1, 1, "", methodObject ),
                    new Among ( "ment", 15, 1, "", methodObject ),
                    new Among ( "ement", 16, 1, "", methodObject )
                };

                // Two single-letter endings with result codes 1 ("e") and 2 ("l").
                // NOTE(review): not referenced in the portion of the file reviewed
                // here; presumably consumed by a later step - confirm.
                private final static Among a_8[] = {
                    new Among ( "e", -1, 1, "", methodObject ),
                    new Among ( "l", -1, 2, "", methodObject )
                };

                // Fixed word list; every entry has result code -1.
                // NOTE(review): not referenced in the portion of the file reviewed
                // here; it looks like an exception list of words to leave unchanged
                // - confirm against the rest of the class.
                private final static Among a_9[] = {
                    new Among ( "succeed", -1, -1, "", methodObject ),
                    new Among ( "proceed", -1, -1, "", methodObject ),
                    new Among ( "exceed", -1, -1, "", methodObject ),
                    new Among ( "canning", -1, -1, "", methodObject ),
                    new Among ( "inning", -1, -1, "", methodObject ),
                    new Among ( "earring", -1, -1, "", methodObject ),
                    new Among ( "herring", -1, -1, "", methodObject ),
                    new Among ( "outing", -1, -1, "", methodObject )
                };

                // Fixed word list with result codes 1..11, or -1 for several entries.
                // NOTE(review): not referenced in the portion of the file reviewed
                // here; it looks like an exception list in which each positive code
                // selects a hard-coded stem and -1 leaves the word unchanged
                // - confirm against the rest of the class.
                private final static Among a_10[] = {
                    new Among ( "andes", -1, -1, "", methodObject ),
                    new Among ( "atlas", -1, -1, "", methodObject ),
                    new Among ( "bias", -1, -1, "", methodObject ),
                    new Among ( "cosmos", -1, -1, "", methodObject ),
                    new Among ( "dying", -1, 3, "", methodObject ),
                    new Among ( "early", -1, 9, "", methodObject ),
                    new Among ( "gently", -1, 7, "", methodObject ),
                    new Among ( "howe", -1, -1, "", methodObject ),
                    new Among ( "idly", -1, 6, "", methodObject ),
                    new Among ( "lying", -1, 4, "", methodObject ),
                    new Among ( "news", -1, -1, "", methodObject ),
                    new Among ( "only", -1, 10, "", methodObject ),
                    new Among ( "singly", -1, 11, "", methodObject ),
                    new Among ( "skies", -1, 2, "", methodObject ),
                    new Among ( "skis", -1, 1, "", methodObject ),
                    new Among ( "sky", -1, -1, "", methodObject ),
                    new Among ( "tying", -1, 5, "", methodObject ),
                    new Among ( "ugly", -1, 8, "", methodObject )
                };

                // Character group passed to in_grouping/out_grouping(_b) together with
                // the bounds 97..121 ('a'..'y'). Presumably a bitmap over that range,
                // which would decode to a, e, i, o, u, y - confirm against the
                // grouping helpers in the superclass.
                private static final char g_v[] = {17, 65, 16, 1 };

                // Character group passed to out_grouping_b in r_shortv with the bounds
                // 89..121 ('Y'..'y'). Under the same bitmap reading it would decode to
                // Y, a, e, i, o, u, w, x, y - confirm.
                private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };

                // Character group not referenced in the portion of the file reviewed
                // here. NOTE(review): going by its name, probably the letters that may
                // precede a removable "li" suffix - confirm at its call site.
                private static final char g_valid_LI[] = {55, 141, 2 };

        // Cleared at the start of r_prelude and set whenever r_prelude rewrites a
        // "y" as "Y".
        private boolean B_Y_found;
        // Region mark: r_mark_regions initialises it to limit and then to the cursor
        // position after the second vowel/non-vowel scan; r_R2 compares it to cursor.
        private int I_p2;
        // Region mark: r_mark_regions initialises it to limit and then to the cursor
        // position after the first scan (or after an a_0 match); r_R1 compares it
        // to cursor.
        private int I_p1;

                /**
                 * Copies this stemmer's own state (the two region marks and the
                 * Y-marker flag) from {@code other}, then delegates to the superclass
                 * {@code copy_from} for the remaining state.
                 *
                 * @param other the stemmer whose state is copied into this one
                 */
                private void copy_from(englishStemmer other) {
                    this.I_p1 = other.I_p1;
                    this.I_p2 = other.I_p2;
                    this.B_Y_found = other.B_Y_found;
                    super.copy_from(other);
                }

                /**
                 * Prelude pass: clears {@code B_Y_found}, then works from the cursor
                 * position held on entry.
                 * <ol>
                 *   <li>If {@code eq_s(1, "'")} succeeds there, the bracketed slice is
                 *       deleted.</li>
                 *   <li>If {@code eq_s(1, "y")} succeeds there, the bracketed slice is
                 *       replaced by "Y" and {@code B_Y_found} is set.</li>
                 *   <li>Repeatedly scans forward for a position where
                 *       {@code in_grouping(g_v, 97, 121)} succeeds and is immediately
                 *       followed by "y"; each such "y" is replaced by "Y" and
                 *       {@code B_Y_found} is set.</li>
                 * </ol>
                 * The cursor is restored to its entry value before returning.
                 *
                 * @return always {@code true}
                 */
                private boolean r_prelude() {
                    B_Y_found = false;

                    // Every sub-step starts from, and restores to, this position.
                    final int start = cursor;

                    // Optional leading apostrophe.
                    bra = cursor;
                    if (eq_s(1, "'")) {
                        ket = cursor;
                        slice_del();
                    }
                    cursor = start;

                    // Optional leading "y".
                    bra = cursor;
                    if (eq_s(1, "y")) {
                        ket = cursor;
                        slice_from("Y");
                        B_Y_found = true;
                    }
                    cursor = start;

                    // Rewrite every "y" that directly follows a g_v character.
                    scan:
                    while (true) {
                        final int passStart = cursor;
                        while (true) {
                            final int probe = cursor;
                            boolean vowelThenY = false;
                            if (in_grouping(g_v, 97, 121)) {
                                bra = cursor;
                                if (eq_s(1, "y")) {
                                    ket = cursor;
                                    vowelThenY = true;
                                }
                            }
                            // Hit or miss, the probe itself never consumes input.
                            cursor = probe;
                            if (vowelThenY) {
                                break;
                            }
                            if (cursor >= limit) {
                                // Nothing further to rewrite in this pass.
                                cursor = passStart;
                                break scan;
                            }
                            cursor++;
                        }
                        slice_from("Y");
                        B_Y_found = true;
                    }

                    cursor = start;
                    return true;
                }

                /**
                 * Computes the region marks {@code I_p1} and {@code I_p2}.
                 * <p>
                 * Both start at {@code limit}. Unless {@code find_among(a_0, 3)}
                 * matches at the cursor, the cursor is advanced until
                 * {@code in_grouping(g_v, ...)} succeeds and then until
                 * {@code out_grouping(g_v, ...)} succeeds. The resulting position
                 * becomes {@code I_p1}. The same two scans are run once more to
                 * obtain {@code I_p2}. If any scan reaches {@code limit} first, the
                 * marks not yet assigned keep the value {@code limit}.
                 * <p>
                 * The cursor is restored to its entry value before returning.
                 *
                 * @return always {@code true}
                 */
                private boolean r_mark_regions() {
                    I_p1 = limit;
                    I_p2 = limit;

                    final int start = cursor;
                    regions:
                    {
                        if (find_among(a_0, 3) == 0) {
                            // No special prefix: locate the first g_v / non-g_v pair.
                            cursor = start;
                            while (!in_grouping(g_v, 97, 121)) {
                                if (cursor >= limit) {
                                    break regions;
                                }
                                cursor++;
                            }
                            while (!out_grouping(g_v, 97, 121)) {
                                if (cursor >= limit) {
                                    break regions;
                                }
                                cursor++;
                            }
                        }
                        I_p1 = cursor;

                        // Second g_v / non-g_v pair, counted from I_p1.
                        while (!in_grouping(g_v, 97, 121)) {
                            if (cursor >= limit) {
                                break regions;
                            }
                            cursor++;
                        }
                        while (!out_grouping(g_v, 97, 121)) {
                            if (cursor >= limit) {
                                break regions;
                            }
                            cursor++;
                        }
                        I_p2 = cursor;
                    }
                    cursor = start;
                    return true;
                }

                /**
                 * Backward test with two alternatives, tried in order from the same
                 * starting cursor:
                 * <ol>
                 *   <li>{@code out_grouping_b(g_v_WXY)}, then {@code in_grouping_b(g_v)},
                 *       then {@code out_grouping_b(g_v)} all succeed; or</li>
                 *   <li>{@code out_grouping_b(g_v)}, then {@code in_grouping_b(g_v)}
                 *       succeed and the cursor has reached {@code limit_backward}.</li>
                 * </ol>
                 * The grouping helpers may move the cursor; this method does not
                 * restore it on exit.
                 *
                 * @return {@code true} if either alternative matches
                 */
                private boolean r_shortv() {
                    final int entryOffset = limit - cursor;

                    // '&&' short-circuits, so the helpers run in the original order.
                    if (out_grouping_b(g_v_WXY, 89, 121)
                            && in_grouping_b(g_v, 97, 121)
                            && out_grouping_b(g_v, 97, 121)) {
                        return true;
                    }

                    // Second alternative starts again from the entry position.
                    cursor = limit - entryOffset;
                    return out_grouping_b(g_v, 97, 121)
                            && in_grouping_b(g_v, 97, 121)
                            && cursor <= limit_backward;
                }

                /**
                 * Region test against the {@code I_p1} mark.
                 *
                 * @return {@code true} when the cursor is at or beyond {@code I_p1}
                 */
                private boolean r_R1() {
                    return cursor >= I_p1;
                }

                /**
                 * Region test against the {@code I_p2} mark.
                 *
                 * @return {@code true} when the cursor is at or beyond {@code I_p2}
                 */
                private boolean r_R2() {
                    return cursor >= I_p2;
                }

                /**
                 * Step 1a, run backwards from the cursor.
                 * <p>
                 * First, optionally: if {@code find_among_b(a_1, 3)} matches, the
                 * matched slice is deleted; otherwise the cursor is put back.
                 * <p>
                 * Then {@code find_among_b(a_2, 6)} is consulted and its result
                 * decides the rewrite:
                 * <ul>
                 *   <li>1: the slice becomes "ss";</li>
                 *   <li>2: the slice becomes "i" if the cursor can be moved back two
                 *       positions within the limits, otherwise "ie";</li>
                 *   <li>3: after stepping back one character and scanning backwards
                 *       until {@code in_grouping_b(g_v, ...)} succeeds, the slice is
                 *       deleted; the step fails if the backward limit is hit;</li>
                 *   <li>any other non-zero value: nothing is changed.</li>
                 * </ul>
                 *
                 * @return {@code false} if the second lookup finds nothing or the
                 *         backward scan for result 3 runs out of input, else
                 *         {@code true}
                 */
                private boolean r_Step_1a() {
                    // Optional part: strip a suffix listed in a_1.
                    final int entryOffset = limit - cursor;
                    ket = cursor;
                    int match = find_among_b(a_1, 3);
                    if (match == 0) {
                        cursor = limit - entryOffset;
                    } else {
                        bra = cursor;
                        if (match == 1) {
                            slice_del();
                        }
                    }

                    // Mandatory part: suffixes listed in a_2.
                    ket = cursor;
                    match = find_among_b(a_2, 6);
                    if (match == 0) {
                        return false;
                    }
                    bra = cursor;

                    if (match == 1) {
                        slice_from("ss");
                    } else if (match == 2) {
                        final int beforeHop = limit - cursor;
                        final int hopped = cursor - 2;
                        if (hopped >= limit_backward && hopped <= limit) {
                            cursor = hopped;
                            slice_from("i");
                        } else {
                            cursor = limit - beforeHop;
                            slice_from("ie");
                        }
                    } else if (match == 3) {
                        if (cursor <= limit_backward) {
                            return false;
                        }
                        cursor--;
                        while (!in_grouping_b(g_v, 97, 121)) {
                            if (cursor <= limit_backward) {
                                return false;
                            }
                            cursor--;
                        }
                        slice_del();
                    }
                    return true;
                }

                /**
                 * Step 1b, run backwards from the cursor.
                 * <p>
                 * {@code find_among_b(a_4, 6)} selects the suffix kind:
                 * <ul>
                 *   <li>1: requires {@code r_R1()}, then the slice becomes "ee";</li>
                 *   <li>2: requires that a backward scan finds a position where
                 *       {@code in_grouping_b(g_v, ...)} succeeds, then the slice is
                 *       deleted and the remaining ending is checked against
                 *       {@code a_3}:
                 *     <ul>
                 *       <li>1: "e" is inserted at the cursor;</li>
                 *       <li>2: the character before the cursor is deleted;</li>
                 *       <li>3: "e" is inserted, but only if the cursor equals
                 *           {@code I_p1} and {@code r_shortv()} succeeds.</li>
                 *     </ul>
                 *   </li>
                 * </ul>
                 *
                 * @return {@code false} when a lookup finds nothing or one of the
                 *         conditions above fails, else {@code true}
                 */
                private boolean r_Step_1b() {
                    ket = cursor;
                    final int suffixKind = find_among_b(a_4, 6);
                    if (suffixKind == 0) {
                        return false;
                    }
                    bra = cursor;

                    if (suffixKind == 1) {
                        if (!r_R1()) {
                            return false;
                        }
                        slice_from("ee");
                        return true;
                    }
                    if (suffixKind != 2) {
                        return true;
                    }

                    // The stem must contain a g_v character before the suffix.
                    final int beforeVowelScan = limit - cursor;
                    while (!in_grouping_b(g_v, 97, 121)) {
                        if (cursor <= limit_backward) {
                            return false;
                        }
                        cursor--;
                    }
                    cursor = limit - beforeVowelScan;
                    slice_del();

                    // Look at what the stem now ends with, without consuming it.
                    final int beforeEndingTest = limit - cursor;
                    final int endingKind = find_among_b(a_3, 13);
                    if (endingKind == 0) {
                        return false;
                    }
                    cursor = limit - beforeEndingTest;

                    switch (endingKind) {
                        case 1:
                            break;
                        case 2:
                            ket = cursor;
                            if (cursor <= limit_backward) {
                                return false;
                            }
                            cursor--;
                            bra = cursor;
                            slice_del();
                            return true;
                        case 3: {
                            if (cursor != I_p1) {
                                return false;
                            }
                            final int beforeShortTest = limit - cursor;
                            if (!r_shortv()) {
                                return false;
                            }
                            cursor = limit - beforeShortTest;
                            break;
                        }
                        default:
                            return true;
                    }

                    // Results 1 and 3 both finish by inserting "e" at the cursor and
                    // leaving the cursor where it was.
                    final int insertAt = cursor;
                    insert(cursor, cursor, "e");
                    cursor = insertAt;
                    return true;
                }

                /**
                 * Step 1c, run backwards from the cursor: replaces a final "y" or "Y"
                 * by "i".
                 * <p>
                 * The replacement happens only when, before that letter,
                 * {@code out_grouping_b(g_v, ...)} succeeds and the cursor has not
                 * thereby reached {@code limit_backward}.
                 *
                 * @return {@code true} if the replacement was made, else {@code false}
                 */
                private boolean r_Step_1c() {
                    ket = cursor;

                    // Accept either "y" or "Y", trying the lower-case form first.
                    final int entryOffset = limit - cursor;
                    if (!eq_s_b(1, "y")) {
                        cursor = limit - entryOffset;
                        if (!eq_s_b(1, "Y")) {
                            return false;
                        }
                    }
                    bra = cursor;

                    if (!out_grouping_b(g_v, 97, 121)) {
                        return false;
                    }
                    // The character just tested must not be the first one.
                    if (cursor <= limit_backward) {
                        return false;
                    }

                    slice_from("i");
                    return true;
                }

                /**
                 * Step 2 of the stemmer (Snowball script lines 99-122).
                 *
                 * Marks ket at the cursor, looks up table a_5 with find_among_b, marks
                 * bra, requires r_R1() to succeed, and then either replaces the marked
                 * slice with a fixed string or deletes it, depending on the result code
                 * returned by the table lookup.
                 *
                 * @return false when the lookup returns 0, when r_R1() fails, or when the
                 *         extra guard attached to result 13 or 16 fails; true otherwise
                 */
                private boolean r_Step_2() {
                    ket = cursor;
                    final int matched = find_among_b(a_5, 24);
                    if (matched == 0) {
                        return false;
                    }
                    bra = cursor;
                    if (!r_R1()) {
                        return false;
                    }

                    // Pick the replacement text; the two guarded results are handled inline.
                    final String replacement;
                    switch (matched) {
                        case 1:  replacement = "tion"; break;
                        case 2:  replacement = "ence"; break;
                        case 3:  replacement = "ance"; break;
                        case 4:  replacement = "able"; break;
                        case 5:  replacement = "ent";  break;
                        case 6:  replacement = "ize";  break;
                        case 7:  replacement = "ate";  break;
                        case 8:  replacement = "al";   break;
                        case 9:  replacement = "ful";  break;
                        case 10: replacement = "ous";  break;
                        case 11: replacement = "ive";  break;
                        case 12: replacement = "ble";  break;
                        case 13:
                            // Only rewritten when the preceding character is "l".
                            if (!eq_s_b(1, "l")) {
                                return false;
                            }
                            replacement = "og";
                            break;
                        case 14: replacement = "ful";  break;
                        case 15: replacement = "less"; break;
                        case 16:
                            // Deleted only when preceded by a character of g_valid_LI.
                            if (!in_grouping_b(g_valid_LI, 99, 116)) {
                                return false;
                            }
                            slice_del();
                            return true;
                        default:
                            // Any other non-zero result leaves the word untouched.
                            return true;
                    }
                    slice_from(replacement);
                    return true;
                }

                /**
                 * Step 3 of the stemmer (Snowball script lines 126-136).
                 *
                 * Marks ket, looks up table a_6 with find_among_b, marks bra, requires
                 * r_R1() to succeed, then replaces or deletes the marked slice according
                 * to the result code of the lookup.
                 *
                 * @return false when the lookup returns 0, when r_R1() fails, or when
                 *         result 6 is matched and r_R2() fails; true otherwise
                 */
                private boolean r_Step_3() {
                    ket = cursor;
                    final int matched = find_among_b(a_6, 9);
                    if (matched == 0) {
                        return false;
                    }
                    bra = cursor;
                    if (!r_R1()) {
                        return false;
                    }

                    if (matched == 1) {
                        slice_from("tion");
                    } else if (matched == 2) {
                        slice_from("ate");
                    } else if (matched == 3) {
                        slice_from("al");
                    } else if (matched == 4) {
                        slice_from("ic");
                    } else if (matched == 5) {
                        slice_del();
                    } else if (matched == 6) {
                        // This result is removed only when r_R2() also succeeds.
                        if (!r_R2()) {
                            return false;
                        }
                        slice_del();
                    }
                    // Any other non-zero result leaves the word untouched.
                    return true;
                }

                /**
                 * Step 4 of the stemmer (Snowball script lines 140-145).
                 *
                 * Marks ket, looks up table a_7 with find_among_b, marks bra, requires
                 * r_R2() to succeed, then deletes the marked slice. For result 2 the
                 * deletion happens only if the preceding character is "s" or "t".
                 *
                 * @return false when the lookup returns 0, when r_R2() fails, or when
                 *         result 2 is matched and neither "s" nor "t" precedes;
                 *         true otherwise
                 */
                private boolean r_Step_4() {
                    ket = cursor;
                    final int matched = find_among_b(a_7, 18);
                    if (matched == 0) {
                        return false;
                    }
                    bra = cursor;
                    if (!r_R2()) {
                        return false;
                    }

                    if (matched == 1) {
                        slice_del();
                    } else if (matched == 2) {
                        // Remember the position as a distance from the end so it can be
                        // restored before trying the second alternative.
                        final int fromEnd = limit - cursor;
                        if (!eq_s_b(1, "s")) {
                            cursor = limit - fromEnd;
                            if (!eq_s_b(1, "t")) {
                                return false;
                            }
                        }
                        slice_del();
                    }
                    return true;
                }

                /**
                 * Step 5 of the stemmer (Snowball script lines 149-152).
                 *
                 * Marks ket, looks up table a_8 with find_among_b, marks bra, and deletes
                 * the marked slice when the condition attached to the result holds:
                 * result 1 needs r_R2(), or r_R1() together with a failing r_shortv();
                 * result 2 needs r_R2() and a preceding "l".
                 *
                 * @return false when the lookup returns 0 or the condition for the
                 *         matched result does not hold; true otherwise
                 */
                private boolean r_Step_5() {
                    ket = cursor;
                    final int matched = find_among_b(a_8, 2);
                    if (matched == 0) {
                        return false;
                    }
                    bra = cursor;

                    if (matched == 1) {
                        final int fromEnd = limit - cursor;
                        if (!r_R2()) {
                            // Second alternative: r_R1() must hold and r_shortv() must not.
                            cursor = limit - fromEnd;
                            if (!r_R1()) {
                                return false;
                            }
                            final int beforeShortv = limit - cursor;
                            if (r_shortv()) {
                                return false;
                            }
                            cursor = limit - beforeShortv;
                        }
                        slice_del();
                    } else if (matched == 2) {
                        if (!r_R2()) {
                            return false;
                        }
                        if (!eq_s_b(1, "l")) {
                            return false;
                        }
                        slice_del();
                    }
                    return true;
                }

                /**
                 * Second exception check (Snowball script lines 156-158).
                 *
                 * Marks ket, looks up table a_9 with find_among_b, marks bra, and then
                 * checks that the cursor has not stopped short of limit_backward.
                 *
                 * @return true only when the lookup returns non-zero and the cursor is
                 *         not greater than limit_backward afterwards
                 */
                private boolean r_exception2() {
                    ket = cursor;
                    final boolean matched = find_among_b(a_9, 8) != 0;
                    if (!matched) {
                        return false;
                    }
                    bra = cursor;
                    // "atlimit" test of the script.
                    return cursor <= limit_backward;
                }

                /**
                 * First exception check (Snowball script lines 168-187).
                 *
                 * Marks bra, looks up table a_10 with find_among (forward direction),
                 * marks ket, and requires the cursor to have reached limit. Results 1-11
                 * replace the marked slice with a fixed string; any other non-zero
                 * result leaves the text unchanged.
                 *
                 * @return false when the lookup returns 0 or the cursor is still below
                 *         limit; true otherwise
                 */
                private boolean r_exception1() {
                    bra = cursor;
                    final int matched = find_among(a_10, 18);
                    if (matched == 0) {
                        return false;
                    }
                    ket = cursor;
                    // "atlimit" test of the script.
                    if (cursor < limit) {
                        return false;
                    }

                    final String replacement;
                    switch (matched) {
                        case 1:  replacement = "ski";   break;
                        case 2:  replacement = "sky";   break;
                        case 3:  replacement = "die";   break;
                        case 4:  replacement = "lie";   break;
                        case 5:  replacement = "tie";   break;
                        case 6:  replacement = "idl";   break;
                        case 7:  replacement = "gentl"; break;
                        case 8:  replacement = "ugli";  break;
                        case 9:  replacement = "earli"; break;
                        case 10: replacement = "onli";  break;
                        case 11: replacement = "singl"; break;
                        default: replacement = null;    break;
                    }
                    if (replacement != null) {
                        slice_from(replacement);
                    }
                    return true;
                }

                /**
                 * Postlude (Snowball script line 203).
                 *
                 * Does nothing unless B_Y_found is set. Otherwise repeatedly scans
                 * forward from the cursor for "Y", marks it with bra/ket and replaces it
                 * with "y", until a scan reaches limit without a match; the cursor is
                 * then reset to where that last scan began.
                 *
                 * @return false when B_Y_found is not set, true otherwise
                 */
                private boolean r_postlude() {
                    if (!B_Y_found) {
                        return false;
                    }

                    while (true) {
                        final int scanStart = cursor;
                        boolean found = false;

                        // Advance one position at a time until "Y" matches or limit is hit.
                        while (true) {
                            final int here = cursor;
                            bra = cursor;
                            if (eq_s(1, "Y")) {
                                ket = cursor;
                                cursor = here;
                                found = true;
                                break;
                            }
                            cursor = here;
                            if (cursor >= limit) {
                                break;
                            }
                            cursor++;
                        }

                        if (!found) {
                            cursor = scanStart;
                            break;
                        }
                        slice_from("y");
                    }
                    return true;
                }

                /**
                 * Runs the whole stemming pipeline (Snowball script lines 205-227).
                 *
                 * Order of work: r_exception1(); a check that the cursor can hop three
                 * positions forward without passing limit; r_prelude() and
                 * r_mark_regions() in forward mode; then, in backward mode,
                 * r_Step_1a(), r_exception2() and, if that fails, r_Step_1b() through
                 * r_Step_5(); finally r_postlude() in forward mode again. The result of
                 * each individual step is ignored; only the cursor is restored after it.
                 *
                 * @return always true
                 */
                public boolean stem() {
                    final int start = cursor;

                    // A successful exception1 ends processing immediately.
                    if (r_exception1()) {
                        return true;
                    }
                    cursor = start;

                    // "not hop 3": when three positions cannot be hopped inside the
                    // limit, stop here with the cursor back at the start.
                    final int hopTarget = cursor + 3;
                    if (hopTarget < 0 || hopTarget > limit) {
                        return true;
                    }

                    // Forward passes; each one starts from, and is rewound to, the start.
                    r_prelude();
                    cursor = start;
                    r_mark_regions();
                    cursor = start;

                    // Switch to backward mode: cursor goes to the end of the text.
                    limit_backward = cursor;
                    cursor = limit;

                    // Positions are remembered as distances from limit and restored
                    // against the current value of limit after each step.
                    int fromEnd = limit - cursor;
                    r_Step_1a();
                    cursor = limit - fromEnd;

                    fromEnd = limit - cursor;
                    if (!r_exception2()) {
                        cursor = limit - fromEnd;

                        fromEnd = limit - cursor;
                        r_Step_1b();
                        cursor = limit - fromEnd;

                        fromEnd = limit - cursor;
                        r_Step_1c();
                        cursor = limit - fromEnd;

                        fromEnd = limit - cursor;
                        r_Step_2();
                        cursor = limit - fromEnd;

                        fromEnd = limit - cursor;
                        r_Step_3();
                        cursor = limit - fromEnd;

                        fromEnd = limit - cursor;
                        r_Step_4();
                        cursor = limit - fromEnd;

                        fromEnd = limit - cursor;
                        r_Step_5();
                        cursor = limit - fromEnd;
                    }

                    // Back to forward mode for the postlude.
                    cursor = limit_backward;
                    final int postludeStart = cursor;
                    r_postlude();
                    cursor = postludeStart;

                    return true;
                }

        /**
         * Treats every {@code englishStemmer} instance as equal to every other.
         *
         * @param o the object to compare with; may be {@code null}
         * @return {@code true} if {@code o} is an {@code englishStemmer},
         *         {@code false} otherwise (including for {@code null})
         */
        @Override
        public boolean equals( Object o ) {
            return o instanceof englishStemmer;
        }

        /**
         * Returns the same hash for every instance, derived from the class name.
         *
         * @return the hash code of this class's fully qualified name
         */
        @Override
        public int hashCode() {
            return englishStemmer.class.getName().hashCode();
        }



}
porter2 stemming algorithm

 然而,Porter stemming 仅仅是一种基于后缀的词干提取技术:它只定义了一些基本的后缀规则,能处理 "books"->"book" 这类规则变化;但对于 "bought"->"buy"、"brought"->"bring" 等不规则形式,它无法识别。

2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)

后来发现了上面这个 NLP 处理工具包,其中的 EngLemmatiser 类就是用来做词干提取(stem)的类,能提取出单词的词干。

它首先定义一些基本的后缀规则(只有十几条),然后定义一个独立于这些规则的异常词库(master/slave 的形式),这样就能基本实现单词词干的正确提取,解决了 Porter stemming 存在的问题。

// Directory name passed to the lemmatiser (presumably where its dictionary files live — verify).
String dictionaryDir = "lemmatiser";
// NOTE(review): the meaning of the two boolean flags is not shown here — check the Dragon toolkit docs.
EngLemmatiser engLemmatiser = new EngLemmatiser(dictionaryDir, false, true);

// Lemmatise an irregular verb form and print the result.
String inflected = "brought";
System.out.println(engLemmatiser.lemmatize(inflected));
View Code

然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。

3. Stanford CoreNLP

后来发现了斯坦福大学的一个 NLP 工具,其中提取词干的技术是:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。它能很好地解决词干提取问题,准确率很高。虽然它对地名、人名等专有名词识别不出来,但已经满足了基本需求。

    // Stem a single word with the Morphology class and print the result.
    Morphology morphology = new Morphology();
    String input = "magnificus";
    System.out.println(morphology.stem(input));
View Code

 

posted on 2014-07-10 13:29  kenneth shu  阅读(373)  评论(0)  编辑  收藏  举报