盘古分词的核心实现(一)初始化装载词库
要理解盘古分词的核心实现，只需要跟踪下面几个核心函数即可。
1,PanGu.Segment.Init(string fileName);
/// <summary>
/// One-time initialization of the segmenter. Under <c>_LockObj</c> it builds the
/// infinitive-verb table, constructs a settings loader, loads the dictionaries and
/// creates the wildcard helper. Calls made after a completed initialization return
/// immediately.
/// </summary>
/// <param name="fileName">
/// Argument handed to the <c>SettingLoader</c> constructor; when <c>null</c>, the
/// parameterless constructor is used instead.
/// </param>
public static void Init(string fileName)
{
    lock (_LockObj)
    {
        if (_Inited)
        {
            return;
        }

        InitInfinitiveVerbTable();

        // The loader instance is never read afterwards; it is created only for
        // whatever its constructor does (presumably loading the settings — confirm).
        Setting.SettingLoader loader = fileName == null
            ? new PanGu.Setting.SettingLoader()
            : new PanGu.Setting.SettingLoader(fileName);

        LoadDictionary();

        // NOTE(review): the flag is raised before the wildcard step, exactly as in the
        // original. This ordering may matter if wildcard loading re-enters Init on the
        // same thread — confirm before moving it.
        _Inited = true;

        _Wildcard = new PanGu.Dict.Wildcard(
            Setting.PanGuSettings.Config.MatchOptions,
            Setting.PanGuSettings.Config.Parameters);

        string dictionaryDir = Setting.PanGuSettings.Config.GetDictionaryPath();

        // Wildcard data is only loaded when wildcard output is switched on.
        if (Setting.PanGuSettings.Config.MatchOptions.WildcardOutput)
        {
            _Wildcard.Load(dictionaryDir);
        }
    }
}
2, LoadDictionary();
/// <summary>
/// Creates and populates the static dictionary objects used by the segmenter:
/// the word dictionary ("Dict.Dct"), the Chinese-name helper, the stop-word list
/// ("Stopword.txt"), the synonym table (loaded only when synonym output is enabled)
/// and the dictionary loader.
/// </summary>
private static void LoadDictionary()
{
    // Main word dictionary.
    _WordDictionary = new PanGu.Dict.WordDictionary();
    string dictionaryDir = Setting.PanGuSettings.Config.GetDictionaryPath();

    // File names are appended directly, so the directory string is expected to
    // already end with a path separator (assumption — confirm in GetDictionaryPath).
    _WordDictionary.Load(dictionaryDir + "Dict.Dct");

    // Chinese-name data, attached to the word dictionary once loaded.
    _ChsName = new PanGu.Dict.ChsName();
    _ChsName.LoadChsName(Setting.PanGuSettings.Config.GetDictionaryPath());
    _WordDictionary.ChineseName = _ChsName;

    // Stop words.
    _StopWord = new PanGu.Dict.StopWord();
    _StopWord.LoadStopwordsDict(dictionaryDir + "Stopword.txt");

    // The synonym object is always created, but only filled when the option is on.
    _Synonym = new PanGu.Dict.Synonym();
    if (Setting.PanGuSettings.Config.MatchOptions.SynonymOutput)
    {
        _Synonym.Load(dictionaryDir);
    }

    _DictLoader = new PanGu.Dict.DictionaryLoader(
        Setting.PanGuSettings.Config.GetDictionaryPath());
}
3,装载词库核心函数
/// <summary>
/// Loads a binary dictionary file. The file starts with a 32-byte header holding a
/// zero-terminated UTF-8 version string, followed by records of the form
/// [int32 length][UTF-8 word][int32 POS][double frequency], where <c>length</c>
/// counts the word bytes plus the trailing 12 bytes.
/// </summary>
/// <param name="fileName">Path of the binary dictionary file to read.</param>
/// <param name="verNumStr">
/// Receives the result of matching the header text against "Pan Gu Segment V(.+)".
/// </param>
/// <returns>A <c>WordDictionaryFile</c> whose <c>Dicts</c> list holds one entry per record.</returns>
/// <exception cref="EndOfStreamException">The file ends in the middle of a record.</exception>
/// <exception cref="InvalidDataException">A record declares a length too small to hold POS and frequency.</exception>
private WordDictionaryFile LoadFromBinFile(String fileName, out string verNumStr)
{
    const int HeaderSize = 32;

    // Every record ends with an int (POS) followed by a double (frequency).
    const int TrailerSize = sizeof(int) + sizeof(double);

    WordDictionaryFile dictFile = new WordDictionaryFile();
    dictFile.Dicts = new List<WordAttribute>();

    // "using" guarantees the file handle is released even when parsing throws;
    // the original only closed the stream on the success path.
    using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
    using (BinaryReader reader = new BinaryReader(fs))
    {
        // ReadBytes keeps reading until the requested count or end of file, so a
        // short header simply yields a shorter version string (same outcome as the
        // zero-padded buffer in the original).
        byte[] version = reader.ReadBytes(HeaderSize);
        String ver = Encoding.UTF8.GetString(version, 0, version.Length);

        int zeroPosition = ver.IndexOf('\0');
        if (zeroPosition >= 0)
        {
            ver = ver.Substring(0, zeroPosition);
        }

        verNumStr = Framework.Regex.GetMatch(ver, "Pan Gu Segment V(.+)", true);

        while (fs.Position < fs.Length)
        {
            byte[] lengthBytes = reader.ReadBytes(sizeof(int));
            if (lengthBytes.Length < sizeof(int))
            {
                throw new EndOfStreamException(
                    "Dictionary file '" + fileName + "' is truncated inside a record length.");
            }

            int length = BitConverter.ToInt32(lengthBytes, 0);
            if (length < TrailerSize)
            {
                // Covers negative and too-small lengths, which would otherwise
                // produce negative offsets below.
                throw new InvalidDataException(
                    "Dictionary file '" + fileName + "' contains an invalid record length: " + length + ".");
            }

            byte[] buf = reader.ReadBytes(length);
            if (buf.Length < length)
            {
                throw new EndOfStreamException(
                    "Dictionary file '" + fileName + "' is truncated inside a record.");
            }

            int wordByteCount = length - TrailerSize;
            string word = Encoding.UTF8.GetString(buf, 0, wordByteCount);
            POS pos = (POS)BitConverter.ToInt32(buf, wordByteCount);
            double frequency = BitConverter.ToDouble(buf, length - sizeof(double));

            WordAttribute dict = new WordAttribute(word, pos, frequency);

            // Kept from the original: registers the word in the intern pool
            // (the returned reference is not used).
            string.Intern(dict.Word);

            dictFile.Dicts.Add(dict);
        }
    }

    return dictFile;
}
/// <summary>
/// Loads a dictionary from a UTF-8 text file in which each entry is a line of the
/// form <c>word|0xPOS|frequency</c>. Lines that do not split into exactly three
/// '|'-separated fields are skipped.
/// </summary>
/// <param name="fileName">Path of the text dictionary file to read.</param>
/// <returns>A <c>WordDictionaryFile</c> whose <c>Dicts</c> list holds one entry per accepted line.</returns>
private WordDictionaryFile LoadFromTextFile(String fileName)
{
    WordDictionaryFile result = new WordDictionaryFile();
    result.Dicts = new List<WordAttribute>();

    using (StreamReader reader = new StreamReader(fileName, Encoding.UTF8))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] fields = line.Split('|');
            if (fields.Length != 3)
            {
                continue;
            }

            string word = fields[0].Trim();

            // The first two characters of the POS field are dropped before hex
            // parsing (presumably a "0x" prefix — the code does not check it).
            string posHex = fields[1].Substring(2);
            POS pos = (POS)int.Parse(posHex, System.Globalization.NumberStyles.HexNumber);

            // NOTE(review): parsed with the current culture; confirm this matches
            // how the file is written before switching to the invariant culture.
            double frequency = double.Parse(fields[2]);

            result.Dicts.Add(new WordAttribute(word, pos, frequency));
        }
    }

    return result;
}
浙公网安备 33010602011771号