盘古分词的核心实现(一)初始化装载词库

要理解盘古分词的核心实现，只需要跟踪下面几个核心函数即可。

 

1,PanGu.Segment.Init();

        /// <summary>
        /// Performs the one-time setup of the segmenter under <c>_LockObj</c>: builds the
        /// infinitive verb table, runs the settings loader, loads the dictionaries and
        /// creates the wildcard table. Does nothing when <c>_Inited</c> is already set.
        /// </summary>
        /// <param name="fileName">
        /// Settings file handed to the settings loader. When <c>null</c>, the loader's
        /// parameterless constructor is used instead.
        /// </param>
        public static void Init(string fileName)
        {
            lock (_LockObj)
            {
                if (!_Inited)
                {
                    InitInfinitiveVerbTable();

                    // The loader object itself is never read afterwards; it is constructed only
                    // for what its constructor does (presumably filling PanGuSettings.Config — TODO confirm).
                    Setting.SettingLoader settingLoader = (fileName == null)
                        ? new PanGu.Setting.SettingLoader()
                        : new PanGu.Setting.SettingLoader(fileName);

                    LoadDictionary();

                    // NOTE(review): the flag is raised before the wildcard table is built. Keep this
                    // order unless it is confirmed that wildcard loading never re-enters Init.
                    _Inited = true;

                    _Wildcard = new PanGu.Dict.Wildcard(
                        Setting.PanGuSettings.Config.MatchOptions,
                        Setting.PanGuSettings.Config.Parameters);

                    string dictionaryDir = Setting.PanGuSettings.Config.GetDictionaryPath();

                    // Wildcard data is only read from disk when wildcard output is enabled.
                    if (Setting.PanGuSettings.Config.MatchOptions.WildcardOutput)
                    {
                        _Wildcard.Load(dictionaryDir);
                    }
                }
            }
        }

2, LoadDictionary();

        /// <summary>
        /// Creates the dictionary objects held in the static fields (word dictionary, Chinese
        /// names, stop words, synonyms, dictionary loader) and loads them from the configured
        /// dictionary directory.
        /// </summary>
        private static void LoadDictionary()
        {
            // Main word dictionary.
            _WordDictionary = new PanGu.Dict.WordDictionary();
            string dictionaryDir = Setting.PanGuSettings.Config.GetDictionaryPath();
            string mainDictionaryFile = dictionaryDir + "Dict.Dct";
            _WordDictionary.Load(mainDictionaryFile);

            // Chinese name data, attached to the word dictionary once loaded.
            _ChsName = new PanGu.Dict.ChsName();
            _ChsName.LoadChsName(Setting.PanGuSettings.Config.GetDictionaryPath());
            _WordDictionary.ChineseName = _ChsName;

            // Stop words.
            _StopWord = new PanGu.Dict.StopWord();
            string stopwordFile = dictionaryDir + "Stopword.txt";
            _StopWord.LoadStopwordsDict(stopwordFile);

            // The synonym object always exists; its data is loaded only when synonym output is on.
            _Synonym = new PanGu.Dict.Synonym();
            if (Setting.PanGuSettings.Config.MatchOptions.SynonymOutput)
            {
                _Synonym.Load(dictionaryDir);
            }

            _DictLoader = new PanGu.Dict.DictionaryLoader(
                Setting.PanGuSettings.Config.GetDictionaryPath());
        }

3,装载词库核心函数

        /// <summary>
        /// Reads a binary dictionary file: a 32-byte version header followed by records, each
        /// made of a 4-byte length and that many bytes holding the UTF-8 word, a 4-byte POS
        /// value and an 8-byte frequency.
        /// </summary>
        /// <param name="fileName">Path of the binary dictionary file; opened read-only.</param>
        /// <param name="verNumStr">
        /// Receives the result of matching the header text against "Pan Gu Segment V(.+)".
        /// </param>
        /// <returns>
        /// A <c>WordDictionaryFile</c> whose <c>Dicts</c> list holds one <c>WordAttribute</c>
        /// per record, in file order.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// A record declares a length too small to hold the POS and frequency fields.
        /// </exception>
        /// <exception cref="EndOfStreamException">
        /// The file ends in the middle of a record.
        /// </exception>
        private WordDictionaryFile LoadFromBinFile(String fileName, out string verNumStr)
        {
            const int HeaderSize = 32;
            // Every record ends with the POS (int) and the frequency (double).
            const int RecordTrailerSize = sizeof(int) + sizeof(double);

            WordDictionaryFile dictFile = new WordDictionaryFile();
            dictFile.Dicts = new List<WordAttribute>();

            // using guarantees the file handle is released even when a record is malformed.
            using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            using (BinaryReader reader = new BinaryReader(fs))
            {
                // A file shorter than the header simply yields a shorter version string.
                byte[] version = reader.ReadBytes(HeaderSize);

                String ver = Encoding.UTF8.GetString(version, 0, version.Length);

                // The header is zero padded; keep only the text before the first NUL.
                int zeroPosition = ver.IndexOf('\0');
                if (zeroPosition >= 0)
                {
                    ver = ver.Substring(0, zeroPosition);
                }

                verNumStr = Framework.Regex.GetMatch(ver, "Pan Gu Segment V(.+)", true);

                while (fs.Position < fs.Length)
                {
                    byte[] lengthBytes = reader.ReadBytes(sizeof(int));
                    if (lengthBytes.Length < sizeof(int))
                    {
                        throw new EndOfStreamException(
                            "Dictionary file '" + fileName + "' ends inside a record length field.");
                    }

                    int length = BitConverter.ToInt32(lengthBytes, 0);
                    if (length < RecordTrailerSize)
                    {
                        throw new InvalidDataException(
                            "Dictionary file '" + fileName + "' contains an invalid record length: " + length);
                    }

                    // Reject records running past the end of the file before allocating for them.
                    if (length > fs.Length - fs.Position)
                    {
                        throw new EndOfStreamException(
                            "Dictionary file '" + fileName + "' ends inside a record.");
                    }

                    byte[] buf = reader.ReadBytes(length);
                    if (buf.Length < length)
                    {
                        throw new EndOfStreamException(
                            "Dictionary file '" + fileName + "' ends inside a record.");
                    }

                    int wordByteCount = length - RecordTrailerSize;

                    // string.Intern returns the pooled instance; its result must be the one that
                    // gets stored, otherwise interning has no effect on the dictionary entry.
                    string word = string.Intern(Encoding.UTF8.GetString(buf, 0, wordByteCount));
                    POS pos = (POS)BitConverter.ToInt32(buf, wordByteCount);
                    double frequency = BitConverter.ToDouble(buf, length - sizeof(double));

                    dictFile.Dicts.Add(new WordAttribute(word, pos, frequency));
                }
            }

            return dictFile;
        }
        /// <summary>
        /// Reads a UTF-8 text dictionary in which each entry is a line of three '|'-separated
        /// fields: the word, the POS as a hexadecimal number after a two-character prefix,
        /// and the frequency. Lines that do not split into exactly three fields are skipped.
        /// </summary>
        /// <param name="fileName">Path of the text dictionary file.</param>
        /// <returns>
        /// A <c>WordDictionaryFile</c> whose <c>Dicts</c> list holds one <c>WordAttribute</c>
        /// per accepted line, in file order.
        /// </returns>
        private WordDictionaryFile LoadFromTextFile(String fileName)
        {
            WordDictionaryFile result = new WordDictionaryFile();
            result.Dicts = new List<WordAttribute>();

            using (StreamReader reader = new StreamReader(fileName, Encoding.UTF8))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] fields = line.Split('|');
                    if (fields.Length != 3)
                    {
                        continue;
                    }

                    string word = fields[0].Trim();

                    // Drop the first two characters (presumably a "0x" prefix — TODO confirm)
                    // and parse the remainder as hexadecimal.
                    string posHex = fields[1].Substring(2);
                    POS pos = (POS)int.Parse(posHex, System.Globalization.NumberStyles.HexNumber);

                    // NOTE(review): parsed with the current culture; confirm it matches how the
                    // file is written before switching to an explicit culture.
                    double frequency = double.Parse(fields[2]);

                    result.Dicts.Add(new WordAttribute(word, pos, frequency));
                }
            }

            return result;
        }

 

posted @ 2016-01-20 06:22  RulesOS  阅读(872)  评论(0)    收藏  举报