拼写检查算法 Golang 版

最近看了阮一峰的一篇介绍使用贝叶斯推断方法做拼写检查的文章,该文章的易懂程度输于 Google 技术总监写的原文及其优秀的译文。

这说明了啥?越是大师级的人写的文章往往越易懂。所以关于贝叶斯方法我就不解释了,只贴代码。

 

我使用golang对照实现了一遍:

一是为了弄懂其算法细节

二是为了不忘记前段时间看的 golang 语法

就像几年前在学校时候对着C版的数据结构书用C#去实现一样。

 

package main


import (
    "fmt"
    "io/ioutil"
    "regexp"
    "strings"
)

var (
    // NWORDS is the word-frequency table consulted by known, known_edits2
    // and correct. main fills it with the result of train, so each key is a
    // word and each value is the number of times that word was seen.
    NWORDS map[string]int
)

const (
    // alphabet lists the letters edit1 tries when generating replacement
    // and insertion candidates.
    alphabet = "abcdefghijklmnopqrstuvwxyz"
)

// wordPattern matches maximal runs of lowercase ASCII letters. It is compiled
// once at package initialisation rather than on every call to words.
var wordPattern = regexp.MustCompile("[a-z]+")

// words lower-cases text and returns every maximal run of the letters a-z in
// it, in order of appearance. Lower-casing first keeps capitalised words
// intact ("The" yields "the" instead of the fragment "he"). The result is nil
// when text contains no letters.
func words(text string) []string {
    return wordPattern.FindAllString(strings.ToLower(text), -1)
}

// train counts how often each string occurs in features and returns the
// counts keyed by string. The returned map is never nil, even when features
// is empty.
func train(features []string) map[string]int {
    counts := make(map[string]int)
    for _, feature := range features {
        // A missing key reads as 0, so no existence check is needed.
        counts[feature]++
    }
    return counts
}

func edit1(word string) []string {
    type tuple struct{ a, b string }
    var splits []tuple
    for i := 0; i < len(word)+1; i++ {
        splits = append(splits, tuple{word[:i], word[i:]})
    }

    var deletes []string
    for _, t := range splits {
        if len(t.b) > 0 {
            deletes = append(deletes, t.a+t.b[1:])
        }
    }

    var transposes []string
    for _, t := range splits {
        if len(t.b) > 1 {
            transposes = append(transposes, t.a+string(t.b[1])+string(t.b[0])+t.b[2:])
        }
    }

    var replaces []string
    for _, c := range alphabet {
        for _, t := range splits {
            if len(t.b) > 0 {
                replaces = append(replaces, t.a+string(c)+t.b[1:])
            }
        }
    }

    var inserts []string
    for _, c := range alphabet {
        for _, t := range splits {
            inserts = append(inserts, t.a+string(c)+t.b)
        }
    }

    //concat this slice 
    deletes = append(deletes, transposes...)
    deletes = append(deletes, replaces...)
    deletes = append(deletes, inserts...)

    return set(deletes)
}

func known_edits2(word string) []string {
    var arr []string
    for _, e1 := range edit1(word) {
        for _, e2 := range edit1(e1) {
            if _, ok := NWORDS[e2]; ok {
                arr = append(arr, e2)
            }
        }
    }
    return set(arr)
}

func known(words []string) []string {
    var knows []string
    for _, value := range words {
        if _, ok := NWORDS[value]; ok {
            knows = append(knows, value)
        }
    }
    return knows
}

// appendIfMissing returns slice unchanged when it already contains i, and
// slice with i appended otherwise.
func appendIfMissing(slice []string, i string) []string {
    present := false
    for idx := 0; idx < len(slice) && !present; idx++ {
        present = slice[idx] == i
    }
    if present {
        return slice
    }
    return append(slice, i)
}

// set returns the distinct elements of arr in order of first occurrence.
// It returns nil when arr is empty. Membership is tracked in a map so the
// whole pass is linear in len(arr) instead of rescanning the result for
// every element.
func set(arr []string) []string {
    if len(arr) == 0 {
        return nil
    }
    seen := make(map[string]struct{}, len(arr))
    unique := make([]string, 0, len(arr))
    for _, ele := range arr {
        if _, dup := seen[ele]; dup {
            continue
        }
        seen[ele] = struct{}{}
        unique = append(unique, ele)
    }
    return unique
}

func correct(word string) string {
    candidates := known([]string{word})
    if len(candidates) <= 0 {
        candidates = known(edit1(word))
        if len(candidates) <= 0 {
            candidates = known(known_edits2(word))
        }
    }
    return max(candidates, NWORDS)
}

// max returns the element of arr with the highest positive count in dict.
// Ties go to the element that appears first. If no element has a positive
// count, the first element is returned. An empty arr yields "" rather than
// an out-of-range panic.
func max(arr []string, dict map[string]int) string {
    if len(arr) == 0 {
        return ""
    }
    best := arr[0]
    bestCount := 0
    for _, candidate := range arr {
        // A missing key reads as 0 and therefore never beats bestCount.
        if count := dict[candidate]; count > bestCount {
            bestCount = count
            best = candidate
        }
    }
    return best
}

func main() {
    buf, _ := ioutil.ReadFile("big.txt")
    NWORDS = train(words(string(buf)))
    word := "beford"
    fmt.Println("input:", word, "correct word:", correct(word))
}

python 版本只有30行左右,golang 在各种集合操作上和 python 相比差了许多。

python 里用 set(arr) 即可将列表里重复的元素删除,用 for in 简洁地构造列表实在很 cool。

posted on 2012-10-30 17:09 Haozes 阅读(...) 评论(...) 编辑 收藏

公告

统计

  • 随笔 - 103
  • 文章 - 2
  • 评论 - 192