WP谈开发(5)Gb2312编码

由于 Windows Phone 不支持 GB2312 编码,导致了很多问题。

那么首先,我们需要判断一段文字采用的是什么编码。其实每种编码的文字都有自己的特点,只要了解这些特点,就能判断出它是什么编码。

Encod
public class Encod
    {
        /// <summary>
        /// Number of bytes sampled from the start of a stream by
        /// <see cref="GetEncoding(Stream, Encoding)"/>.
        /// </summary>
        private const int SampleSize = 100;

        /// <summary>
        /// Maximum number of continuation bytes (10xxxxxx) skipped at the start of a
        /// sample before UTF-8 validation begins.
        /// </summary>
        private const int MaxLeadingContinuationBytes = 5;

        /// <summary>
        /// Detects the encoding of a text stream, using a GB2312 encoding as the fallback.
        /// </summary>
        /// <param name="stream">The text stream to inspect.</param>
        /// <returns>
        /// The detected encoding, or a GB2312 encoding when nothing else is detected.
        /// </returns>
        public static Encoding GetEncoding(Stream stream)
        {
            Encoding EGb = new HtmlAgilityPack.Gb2312Encoding();
            return GetEncoding(stream, EGb);
        }

        /// <summary>
        /// Determines the encoding of a stream by reading it from its current position
        /// to the end and inspecting the bytes.
        /// </summary>
        /// <remarks>
        /// The stream is closed when this method returns, because the
        /// <see cref="BinaryReader"/> used for reading owns it.
        /// </remarks>
        /// <param name="fs">The stream to inspect.</param>
        /// <returns>
        /// <see cref="Encoding.UTF8"/> when the bytes form valid UTF-8 or start with the
        /// UTF-8 byte order mark; <see cref="Encoding.BigEndianUnicode"/> or
        /// <see cref="Encoding.Unicode"/> when they start with the matching UTF-16 byte
        /// order mark; otherwise a GB2312 encoding.
        /// </returns>
        public static System.Text.Encoding GetType(Stream fs)
        {
            Encoding reVal = new GB2312.GB2312Encoding();

            byte[] ss;
            using (BinaryReader r = new BinaryReader(fs, reVal))
            {
                // A length that does not fit in an int reads nothing, which is what the
                // previous TryParse-based conversion did as well.
                long length = fs.Length;
                int count = length <= int.MaxValue ? (int)length : 0;
                ss = r.ReadBytes(count);
            }

            // Byte order marks are matched on their real length (3 bytes for UTF-8,
            // 2 bytes for UTF-16), so inputs shorter than 3 bytes are handled safely and
            // UTF-16 detection does not depend on the first character of the text.
            if (IsUTF8Bytes(ss) || StartsWith(ss, 0xEF, 0xBB, 0xBF))
            {
                reVal = Encoding.UTF8;
            }
            else if (StartsWith(ss, 0xFE, 0xFF))
            {
                reVal = Encoding.BigEndianUnicode;
            }
            else if (StartsWith(ss, 0xFF, 0xFE))
            {
                reVal = Encoding.Unicode;
            }
            return reVal;
        }

        /// <summary>
        /// Detects the encoding of a seekable text stream from its first bytes.
        /// </summary>
        /// <remarks>
        /// The stream position is restored to its value on entry before returning.
        /// </remarks>
        /// <param name="stream">The text stream to inspect.</param>
        /// <param name="EGb">
        /// The encoding returned when no other encoding is detected, when
        /// <paramref name="stream"/> is null or not seekable, or when it holds fewer
        /// than two bytes.
        /// </param>
        /// <returns>The detected encoding, or <paramref name="EGb"/>.</returns>
        public static Encoding GetEncoding(Stream stream, Encoding EGb)
        {
            Encoding targetEncoding = EGb;
            if (stream == null || !stream.CanSeek || stream.Length < 2)
            {
                return targetEncoding;
            }

            // Remember where the caller left the stream; Seek returns the NEW position,
            // so the original position has to be read before seeking.
            long origPos = stream.Position;
            try
            {
                stream.Seek(0, SeekOrigin.Begin);

                // Read may return fewer bytes than requested, so keep reading until the
                // sample is full or the stream ends. Unfilled bytes stay zero, which
                // validates as ASCII.
                byte[] sample = new byte[SampleSize];
                int filled = 0;
                while (filled < sample.Length)
                {
                    int read = stream.Read(sample, filled, sample.Length - filled);
                    if (read <= 0)
                    {
                        break;
                    }
                    filled += read;
                }

                if (StartsWith(sample, 0xFE, 0xFF))
                {
                    targetEncoding = Encoding.BigEndianUnicode;
                }

                // NOTE(review): the third-byte exclusion is carried over unchanged from
                // the earlier implementation; its rationale is not evident from the code.
                if (StartsWith(sample, 0xFF, 0xFE) && sample[2] != 0xFF)
                {
                    targetEncoding = Encoding.Unicode;
                }

                // Valid UTF-8 (with or without the EF BB BF byte order mark) wins.
                if (LooksLikeUtf8Sample(sample))
                {
                    targetEncoding = Encoding.UTF8;
                }
            }
            finally
            {
                stream.Seek(origPos, SeekOrigin.Begin);
            }
            return targetEncoding;
        }

        /// <summary>
        /// Checks whether a complete byte array is well-formed UTF-8.
        /// </summary>
        /// <param name="data">The bytes to validate.</param>
        /// <returns>
        /// True when every byte fits the UTF-8 lead/continuation pattern. Empty and
        /// pure-ASCII input returns true, and so does input that ends in the middle of
        /// a multi-byte sequence.
        /// </returns>
        private static bool IsUTF8Bytes(byte[] data)
        {
            return IsUtf8Sequence(data, 0);
        }

        /// <summary>
        /// Checks whether a sample taken from a stream looks like UTF-8.
        /// </summary>
        /// <param name="abytes">The sampled bytes.</param>
        /// <returns>
        /// False when the sample has fewer than three bytes or contains a byte that
        /// breaks the UTF-8 pattern; otherwise true.
        /// </returns>
        private static bool LooksLikeUtf8Sample(byte[] abytes)
        {
            if (abytes.Length < 3)
            {
                return false;
            }

            // Skip a few leading continuation bytes (10xxxxxx), bounded by the array.
            int index = 0;
            while (index < MaxLeadingContinuationBytes
                   && index < abytes.Length
                   && (abytes[index] & 0xC0) == 0x80)
            {
                index++;
            }

            return IsUtf8Sequence(abytes, index);
        }

        /// <summary>
        /// Validates the UTF-8 lead/continuation structure of <paramref name="data"/>
        /// starting at <paramref name="start"/>.
        /// </summary>
        /// <param name="data">The bytes to validate.</param>
        /// <param name="start">Index of the first byte to examine.</param>
        /// <returns>
        /// False as soon as a byte breaks the pattern; true otherwise. A sequence cut
        /// off by the end of the array is accepted. Lead bytes announcing sequences of
        /// up to six bytes are accepted.
        /// </returns>
        private static bool IsUtf8Sequence(byte[] data, int start)
        {
            // Continuation bytes still expected for the character being examined.
            int pending = 0;
            for (int i = start; i < data.Length; i++)
            {
                byte current = data[i];
                if (pending > 0)
                {
                    // Inside a multi-byte character: the byte must be 10xxxxxx.
                    if ((current & 0xC0) != 0x80)
                    {
                        return false;
                    }
                    pending--;
                    continue;
                }

                if (current < 0x80)
                {
                    continue; // 0xxxxxxx: single-byte character
                }

                // The number of leading 1 bits of a lead byte is the sequence length.
                // One leading 1 is a stray continuation byte; more than six is invalid.
                int sequenceLength = CountLeadingOnes(current);
                if (sequenceLength == 1 || sequenceLength > 6)
                {
                    return false;
                }
                pending = sequenceLength - 1;
            }
            return true;
        }

        /// <summary>
        /// Counts the consecutive 1 bits of <paramref name="value"/>, starting at the
        /// most significant bit.
        /// </summary>
        private static int CountLeadingOnes(byte value)
        {
            int count = 0;
            for (int mask = 0x80; mask != 0 && (value & mask) != 0; mask >>= 1)
            {
                count++;
            }
            return count;
        }

        /// <summary>
        /// Tests whether <paramref name="data"/> begins with the given bytes.
        /// </summary>
        /// <returns>
        /// False when <paramref name="data"/> is shorter than the prefix or differs from it.
        /// </returns>
        private static bool StartsWith(byte[] data, params byte[] prefix)
        {
            if (data.Length < prefix.Length)
            {
                return false;
            }
            for (int i = 0; i < prefix.Length; i++)
            {
                if (data[i] != prefix[i])
                {
                    return false;
                }
            }
            return true;
        }
    }

这个类用来判断一个文本文件流的编码。不过它还依赖一样东西:HtmlAgilityPack,这是网上的一个 dll,可以用来读取 GB2312 编码。我有一次用它时碰到了问题,总是报"超出索引范围"的错误,所以实际读取时我用的是另一个实现,详情请见:http://www.cnblogs.com/xuesong/archive/2011/12/15/2288754.html

posted @ 2012-06-16 11:14  农村山沟沟出来的大学生  阅读(525)  评论(0)  编辑  收藏  举报