nasdaqhe's blog

被生活强jian着
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

判断中文是否UTF8编码

Posted on 2010-12-23 11:16  nasdaqhe  阅读(1068)  评论(0)  编辑  收藏  举报
代码
        #region 判断Url参数是否UTF8编码
        
/// <summary>
/// Guesses whether the percent-encoded bytes in a URL (or URL parameter) are UTF-8.
/// </summary>
/// <param name="url">The URL or query-string fragment to inspect; may be null or empty.</param>
/// <returns>
/// The result of <c>IsTextUTF8</c> applied to the bytes that <c>GetUrlCodingToBytes</c>
/// extracts from <paramref name="url"/>; <c>false</c> when <paramref name="url"/> is
/// null or empty.
/// </returns>
public static bool IsUTF8(string url)
{
    // A null URL used to surface as a NullReferenceException from the helper;
    // there is nothing to classify, so answer "not UTF-8" instead.
    if (string.IsNullOrEmpty(url))
    {
        return false;
    }

    byte[] buf = GetUrlCodingToBytes(url);
    return IsTextUTF8(buf);
}
        
/// <summary>
/// Heuristically decides whether a byte sequence looks like UTF-8 encoded Chinese text.
/// </summary>
/// <remarks>
/// The check is intentionally narrower than full UTF-8 validation:
/// every non-ASCII byte must belong to a 3-byte sequence (lead byte <c>1110xxxx</c>
/// followed by two continuation bytes <c>10xxxxxx</c>); 2-byte and 4-byte sequences
/// are rejected. ASCII bytes are allowed between sequences, but a buffer that
/// contains no non-ASCII byte at all (including an empty buffer) yields <c>false</c>.
/// The input array is never modified.
/// </remarks>
/// <param name="buf">The raw bytes to inspect; may be null.</param>
/// <returns><c>true</c> if the bytes match the pattern described above; otherwise <c>false</c>.</returns>
private static bool IsTextUTF8(byte[] buf)
{
    if (buf == null)
    {
        return false;
    }

    bool sawMultiByte = false;      // at least one 3-byte sequence was started
    int pendingContinuations = 0;   // continuation bytes still owed by the current sequence

    for (int i = 0; i < buf.Length; i++)
    {
        byte current = buf[i];

        if (pendingContinuations > 0)
        {
            // Inside a sequence: the byte must look like 10xxxxxx.
            if ((current & 0xC0) != 0x80)
            {
                return false;
            }

            pendingContinuations--;
        }
        else if ((current & 0x80) != 0)
        {
            // Start of a sequence: accept only the 3-byte lead pattern 1110xxxx.
            // (Equivalent to "exactly three leading one-bits", without shifting
            // the caller's data in place.)
            if ((current & 0xF0) != 0xE0)
            {
                return false;
            }

            sawMultiByte = true;
            pendingContinuations = 2;
        }
    }

    // A truncated trailing sequence or a pure-ASCII buffer is not reported as UTF-8.
    return pendingContinuations == 0 && sawMultiByte;
}
        
/// <summary>
/// Collects the bytes written as percent escapes (<c>%XX</c>) in a URL.
/// </summary>
/// <remarks>
/// Only well-formed escapes (a <c>%</c> followed by two hexadecimal digits) contribute
/// a byte. A <c>%</c> that is not followed by two hex digits, or that is cut off by the
/// end of the string, is skipped rather than raising a <c>FormatException</c> or
/// shifting the pairing of later digits. Characters outside escapes are ignored.
/// </remarks>
/// <param name="url">The URL or query-string fragment to scan; may be null or empty.</param>
/// <returns>
/// The decoded bytes in the order their escapes appear; an empty array when there are none.
/// </returns>
private static byte[] GetUrlCodingToBytes(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return new byte[0];
    }

    // Every escape occupies three characters, so Length / 3 bounds the byte count.
    byte[] decoded = new byte[url.Length / 3];
    int count = 0;

    int i = url.IndexOf('%');
    while (i >= 0 && i + 2 < url.Length)
    {
        char high = url[i + 1];
        char low = url[i + 2];

        if (System.Uri.IsHexDigit(high) && System.Uri.IsHexDigit(low))
        {
            decoded[count] = (byte)((System.Uri.FromHex(high) << 4) | System.Uri.FromHex(low));
            count++;
            i += 3;
        }
        else
        {
            // Not a real escape: step past this '%' only, so a following
            // "%XX" (as in "%%41") is still recognised.
            i++;
        }

        i = url.IndexOf('%', i);
    }

    if (count == decoded.Length)
    {
        return decoded;
    }

    byte[] result = new byte[count];
    System.Array.Copy(decoded, result, count);
    return result;
}
        
#endregion 判断Url参数是否UTF8编码

UTF-8编码规则参考

http://blog.csdn.net/sandyen/archive/2006/08/23/1108168.aspx

 上面的代码最初是在网络上找到的,但原版在大部分情况下都不能正确识别。

 后来根据"对于中文,UTF-8 一定编码成 3 个字节"这一原则修改了一下,现在大部分情况下都能正确识别。