汉字(包括多音字)转换拼音和简码的处理

将汉字转换成拼音时,由于汉字存在多音字,转换拼音一直是让人很头疼的一个问题。我想到了一个解决这个问题的办法:先把每个字的所有读音都取出来,然后进行拼音重组,这样总有一组拼音是正确的;之后再借助全文搜索技术,就能找到想要的记录了。本人不才,自己写了个方法,贴出来跟大家分享一下,希望多批评指正!!

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.International.Converters.PinYinConverter;

namespace ChineseConvertPinyin
{
    public class ChineseConvertPinyin
    {  
       /// 判断给定的字符串是否是全部有效的汉字
       /// </summary>
       /// <param name="chinesechar">给定的字符串</param>
       /// <returns>true给定的字符串全部是汉字,false给定的字符串中包含其他字符</returns>
        private bool IsValidChar(string chinesechar)
        {
            char[] validchar = chinesechar.ToCharArray();
            for (int i = 0; i < validchar.Length; i++)
            {
                if (!ChineseChar.IsValidChar(validchar[i]))
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// 获得字符串全拼(包含多音字功能)
        /// </summary>
        /// <param name="chinesechar">转换的字符串</param>
        /// <returns>0:给定的字符串中包含其他字符 ,其他:拼音(格式 全拼/全拼/全拼@简码,简码,简码)</returns>
        public string GetPinyinList(string chinesechar)
        {
            //判断给定字符串是否有非法的汉字
            if (!IsValidChar(chinesechar))
            {               
                return "0";//存在非法汉字
            }

            //获取每个汉字拼音列表
            List<System.Collections.ObjectModel.ReadOnlyCollection<string>> list = ToPinYin(chinesechar);

            //定义输出列表,用于存在最终拼写组合的结果
            List<string> listout = new List<string>();

            //重组拼音
            for (int i = 0; i < list.Count; i++)
            {
                if (listout.Count == 0) //如果listout==0则需要将list的第一条记录中的值付给listout
                {
                    for (int k = 0; k < list[i].Count; k++)//遍历第一个字的所有读音
                    {
                        //判断第k个读音是否位null,是null 则说明已经到尾部了,没有读音了
                        if (((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[k] == null) { break; }
                        if (k != 0)//比较前后读音是否是音调不同 例如 hang2 heng2 这样就只记录一次
                        {
                            if (k == list[i].Count - 1) { break; }//最后一个记录以及在第list[i].Count-1  次完成判断了
                            string s1 = System.Text.RegularExpressions.Regex.Replace(((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[k - 1], @"[0-9]+", "");
                            string s2 = System.Text.RegularExpressions.Regex.Replace(((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[k], @"[0-9]+", "");
                            if (s1 == s2)//把数字去掉,只比字母 相等则证明是音调不一样
                            {
                                continue;
                            }
                            else
                            {
                                //声母或韵母不同
                                listout.Add(((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[k]);
                            }
                        }
                        else
                        {
                            //k=0 是第一次不用比较
                            listout.Add(((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[k]);
                        }
                    }
                    continue;
                }
                //中间变量 存放临时重组拼音结果
                List<string> temp = new List<string>();
                //遍历第i个字的所有读音
                for (int p = 0; p < list[i].Count; p++)
                {
                    if (p == 0)//读到当前字的第一个读音 (由于是第一个不存在字母相同声调不同的问题)
                    {
                        for (int j = 0; j < listout.Count; j++)//遍历 定义输出列表,用于存放最终拼写组合的结果,
                        {
                            temp.Add(listout[j] + list[i][p]);//将当前字的读音与listout[j]进行拼接
                        }
                    }
                    else //需要判断声调的问题
                    {
                        //判断当前字的位置第p字音是否位null ,为null 则证明已经读到当前字的最后一个音节
                        if (((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[p] == null) { break; }

                        if (p == list[i].Count - 1) { break; }//最后一个记录以及在第list[i].Count-1  次完成判断了
                        string t1 = System.Text.RegularExpressions.Regex.Replace(((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[p - 1], @"[0-9]+", "");
                        string t2 = System.Text.RegularExpressions.Regex.Replace(((System.Collections.ObjectModel.ReadOnlyCollection<string>)list[i])[p], @"[0-9]+", "");
                        if (t1 == t2)//音相同
                        {
                            continue;
                        }
                        else
                        {
                            //音不同
                            for (int j = 0; j < listout.Count; j++)
                            {
                                temp.Add(listout[j] + list[i][p]);
                            }
                        }
                    }
                }
                listout = temp;//将本次重组的结果复制给 定义输出列表,用于存放最终拼写组合的结果,
            }
            return DealResult(listout);
        }

        /// <summary>
        /// 处理最后结果(去掉声调数字、和简码)
        /// </summary>
        /// <param name="listout"></param>
        /// <returns></returns>
        private static string DealResult(List<string> listout)
        {
            string results = "";
            string[] temparrar = null;//保存每组拼音以数字分组
            string tempjm = "";//保存每组拼音的首字母
            List<string> jm = new List<string>();//保存最后输出首字母列表
            for (int i = 0; i < listout.Count; i++)//定义输出列表,用于存在最终拼写组合的结果
            {
                tempjm = "";
                temparrar = System.Text.RegularExpressions.Regex.Split(listout[i], @"[0-9]+");
                for (int k = 0; k < temparrar.Length; k++)//循环分组获得首字母
                {
                    if (temparrar[k] != "")
                    {
                        tempjm += temparrar[k].Substring(0, 1);
                    }
                }
                if (!jm.Contains(tempjm))
                {
                    jm.Add(tempjm);
                }
                string temp = System.Text.RegularExpressions.Regex.Replace(listout[i], @"[0-9]+", "");//去掉拼音组的数字
                results += temp + "/";
            }
            if (results.LastIndexOf('/') >= 0)
            {
                results = results.Remove(results.Length - 1, 1);
            }
            return results + "@" + string.Join(",", jm.ToArray());
        }
      

        //注意多音字  
        /// <summary>
        /// 指定汉字查找其所有的音节
        /// </summary>
        /// <param name="str">一个汉字</param>
        /// <returns>拼音列表</returns>
        private List<System.Collections.ObjectModel.ReadOnlyCollection<string>> ToPinYin(string str)
        {
            List<System.Collections.ObjectModel.ReadOnlyCollection<string>> list = null;
            string pinyin = string.Empty;
            list = new List<System.Collections.ObjectModel.ReadOnlyCollection<string>>();
            foreach (var item in str.ToCharArray())
            {
                if (ChineseChar.IsValidChar(item))//是汉字  
                {

                    ChineseChar chars = new ChineseChar(item);
                    list.Add(chars.Pinyins);
                    pinyin += string.Format("{0} ", chars.Pinyins[0]);
                    //chars.IsPolyphone属性标识是不是多音字,                  

                    //chars.PinyinCount//拼音的个数           
                }
            }
            return list;
        }
    }
}

posted @ 2012-02-10 11:05  刘颖  阅读(1151)  评论(0)  编辑  收藏