最近做OWA开发,遇到很多俄文邮件,这些邮件经我们程序处理后全部显示为乱码,分析接收的数据类似下面这种类型: Elena%3CBR%3E%3CB%3ESubject%3A%3C%2FB%3E+%D0%94%D0%BE%D0%B3%D0%BE%D0%B2%D0%BE%D1%80+%0A%D1%86%D0%B5%D1%81%D1%81%D0%B8%D0%B8%3CBR%3E%3C%2FFONT%3E%3CBR%3E%3C%2FDIV%3E%0A%3CDIV%3E%3C%2FDIV%3E%0A%3CDIV+class%3DSection1%3E%0A%3CP+class   按道理说,Exchange服务器本身能够处理多语言,但是把这些数据传给Exchange后却显示乱码,没办法只能自己想办法。

 

    看上边的那段数据,都是%号,跟URL的编码很类似,试一下就知道了,用.net C#验证一下,很简单一句话搞定

string resultStr = System.Web.HttpUtility.UrlDecode(textbox1.Text, System.Text.Encoding.UTF8);(如果不是UTF-8编码,可以修改上述的编码方式继续试)

解析的结果是这样:Elena<BR><B>Subject:</B> Договор цессии<BR></FONT><BR></DIV>
<DIV></DIV><DIV class="Section1"><P class

知道了编码方式就好办了,但是我们的程序是C++的,不是基于.net的,C++解析这段数据可不是一句代码能解决的。实际上这段数据包含了两种编码方式: 1.URL编码 2.UTF-8编码,单纯用一种解码方式得到的还是乱码,另外我们需要明确的是不管上边这段编码包含的是中文,英文,俄文还是日文什么的,只要把它转换成Unicode就能被识别,而不用再考虑其他字符集的问题,UTF8是Unicode的一种实现方式,按规则转换即可,以下C++ 代码来自网上,感谢大虾们,不用自己写了,呵呵。

首先:Decode URL编码:

//From Codeguru.

// Maps one ASCII hexadecimal digit (0-9, a-f, A-F) to its numeric value.
// Returns -1 for every other byte.
static int UriDecodeHexValue(unsigned char c)
{
   if (c >= '0' && c <= '9') return c - '0';
   if (c >= 'a' && c <= 'f') return c - 'a' + 10;
   if (c >= 'A' && c <= 'F') return c - 'A' + 10;
   return -1;
}

// Decodes percent-encoded ("%XX") bytes in sSrc and returns the raw byte string.
//
// - Every '%' followed by two hexadecimal digits is replaced by the byte they
//   encode; the result may therefore contain arbitrary bytes, including '\0'.
// - A '%' that is not followed by two hexadecimal digits (malformed escape or
//   truncated input) is copied through unchanged.
// - All other bytes are copied verbatim. In particular '+' is NOT turned into
//   a space; callers handling form-encoded data must do that themselves.
//
// The output is still in whatever character encoding the escaped bytes used
// (e.g. UTF-8); this function performs no character-set conversion.
std::string UriDecode(const std::string & sSrc)
{
   const std::string::size_type len = sSrc.length();

   // Decoding never grows the text, so one reservation is enough.
   std::string sResult;
   sResult.reserve(len);

   std::string::size_type i = 0;
   while (i < len)
   {
      // An escape needs the '%' plus two more bytes inside the input.
      if (sSrc[i] == '%' && len - i > 2)
      {
         const int hi = UriDecodeHexValue(static_cast<unsigned char>(sSrc[i + 1]));
         const int lo = UriDecodeHexValue(static_cast<unsigned char>(sSrc[i + 2]));
         if (hi >= 0 && lo >= 0)
         {
            sResult += static_cast<char>((hi << 4) | lo);
            i += 3;
            continue;
         }
      }

      // Ordinary byte, or a '%' that does not start a valid escape.
      sResult += sSrc[i++];
   }

   return sResult;
}

 

然后,解码UTF8:

 

// Appends one Unicode code point to out. When wchar_t is 16 bits wide,
// code points above U+FFFF are written as a UTF-16 surrogate pair.
static void Utf2UniAppendCodePoint(std::wstring &out, unsigned long cp)
{
    if (sizeof(wchar_t) == 2 && cp > 0xFFFFul)
    {
        cp -= 0x10000ul;
        out += static_cast<wchar_t>(0xD800ul + (cp >> 10));
        out += static_cast<wchar_t>(0xDC00ul + (cp & 0x3FFul));
    }
    else
    {
        out += static_cast<wchar_t>(cp);
    }
}

// Decodes a NUL-terminated UTF-8 byte string into a wide string.
//
// @param src  UTF-8 input; may be NULL.
// @param t    Receives the decoded text (previous contents are replaced).
//             Left untouched when src is NULL.
// @return     A copy of the decoded text, or an empty string when src is NULL.
//
// Sequences of 1 to 4 bytes are decoded. Any byte that cannot start a valid
// sequence -- a stray continuation byte, a lead byte whose continuation bytes
// are missing or malformed, an overlong encoding, an encoded surrogate, a
// value above U+10FFFF, or an obsolete 5/6-byte lead -- is replaced by U+FFFD
// and decoding resumes at the next byte. The input is never read past its
// terminating NUL.
std::wstring UTF2Uni(const char* src, std::wstring &t)
{
    if (src == NULL)
    {
        return L"";
    }

    const size_t len = strlen(src);

    // Each input byte yields at most one output unit, so len is an upper bound.
    std::wstring decoded;
    decoded.reserve(len);

    size_t s = 0;
    while (s < len)
    {
        const unsigned char lead = static_cast<unsigned char>(src[s]);

        unsigned long cp = 0;       // code point being assembled
        unsigned long minimum = 0;  // smallest value this length may encode
        size_t extra = 0;           // number of continuation bytes expected
        bool ok = true;

        if (lead < 0x80)                 // 0xxx-xxxx
        {
            cp = lead;
        }
        else if ((lead & 0xE0) == 0xC0)  // 110x-xxxx 10xx-xxxx
        {
            cp = lead & 0x1F;
            extra = 1;
            minimum = 0x80ul;
        }
        else if ((lead & 0xF0) == 0xE0)  // 1110-xxxx 10xx-xxxx 10xx-xxxx
        {
            cp = lead & 0x0F;
            extra = 2;
            minimum = 0x800ul;
        }
        else if ((lead & 0xF8) == 0xF0)  // 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
        {
            cp = lead & 0x07;
            extra = 3;
            minimum = 0x10000ul;
        }
        else
        {
            // Continuation byte without a lead, or a 0xF8..0xFF byte.
            ok = false;
        }

        // Every continuation byte must lie inside the input and look like 10xx-xxxx.
        if (ok && s + extra >= len)
        {
            ok = false;
        }
        for (size_t k = 1; ok && k <= extra; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(src[s + k]);
            if ((cont & 0xC0) != 0x80)
            {
                ok = false;
            }
            else
            {
                cp = (cp << 6) | (cont & 0x3F);
            }
        }

        // Reject overlong forms, encoded surrogates and out-of-range values.
        if (ok && (cp < minimum || cp > 0x10FFFFul || (cp >= 0xD800ul && cp <= 0xDFFFul)))
        {
            ok = false;
        }

        if (!ok)
        {
            decoded += static_cast<wchar_t>(0xFFFD);
            s += 1;
            continue;
        }

        Utf2UniAppendCodePoint(decoded, cp);
        s += extra + 1;
    }

    t = decoded;
    return t;
}

最后,因为Unicode是宽字符,我们把它转成标准字符串:

// Converts a wide string to a multibyte string using the C runtime's
// wcstombs() under the "chs" locale.
//
// @param ws  Wide text to convert; conversion stops at the first L'\0'.
// @return    The multibyte text. If a character cannot be represented in the
//            active locale, whatever was converted before the failure is
//            returned (possibly an empty string).
//
// The function temporarily switches the program-wide locale with
// setlocale(LC_ALL, "chs") and restores the previous locale before returning,
// including when an exception is thrown. NOTE(review): "chs" looks like a
// Microsoft CRT locale name; on runtimes that do not know it the setlocale
// call fails and the conversion runs under the locale already in effect --
// confirm the intended target encoding. Because the locale is process-wide
// state, concurrent callers may interfere with each other.
std::string ws2s(const std::wstring& ws)
{
    // Restores the saved locale when the enclosing scope is left.
    struct LocaleRestorer
    {
        std::string saved;
        explicit LocaleRestorer(const char* current) : saved(current ? current : "C") {}
        ~LocaleRestorer() { setlocale(LC_ALL, saved.c_str()); }
    };

    const LocaleRestorer restoreOnExit(setlocale(LC_ALL, NULL));
    setlocale(LC_ALL, "chs");

    // MB_CUR_MAX is the widest character of the locale now in effect, so this
    // capacity always has room for the converted text plus its terminator.
    const size_t capacity = ws.size() * MB_CUR_MAX + 1;
    std::string buffer(capacity, '\0');

    const size_t written = wcstombs(&buffer[0], ws.c_str(), capacity);
    if (written == static_cast<size_t>(-1))
    {
        // Unconvertible character: keep the prefix that was produced. The
        // buffer was zero-filled, so it is terminated wherever writing stopped.
        return std::string(buffer.c_str());
    }

    buffer.resize(written);
    return buffer;
}

posted on 2010-02-28 10:34  掘井及泉  阅读(8650)  评论(0)    收藏  举报