最近做OWA开发,遇到很多俄文邮件,这些邮件经我们程序处理后全部显示为乱码,分析接收的数据类似下面这种类型: Elena%3CBR%3E%3CB%3ESubject%3A%3C%2FB%3E+%D0%94%D0%BE%D0%B3%D0%BE%D0%B2%D0%BE%D1%80+%0A%D1%86%D0%B5%D1%81%D1%81%D0%B8%D0%B8%3CBR%3E%3C%2FFONT%3E%3CBR%3E%3C%2FDIV%3E%0A%3CDIV%3E%3C%2FDIV%3E%0A%3CDIV+class%3DSection1%3E%0A%3CP+class 按道理说,Exchange服务器本身能够处理多语言,但是把这些数据传给Exchange后却显示乱码,没办法只能自己想办法。
看上边的那段数据,都是%号,跟URL的编码很类似,试一下就知道了,用.net C#验证一下,很简单一句话搞定
string resultStr = System.Web.HttpUtility.UrlDecode(textbox1.Text, System.Text.Encoding.UTF8);(如果不是UTF-8编码,可以修改上述的编码方式继续试)
解析的结果是这样:Elena<BR><B>Subject:</B> Договор цессии<BR></FONT><BR></DIV>
<DIV></DIV><DIV class="Section1"><P class
知道了编码方式就好办了,但是我们的程序是C++的,不是基于.net的,C++解析这段数据可不是一句代码能解决的。实际上这段数据包含了两层编码:1. URL编码(百分号编码);2. UTF-8编码,只解其中一层得到的还是乱码。另外我们需要明确的是,不管上边这段数据包含的是中文、英文、俄文还是日文什么的,只要把它转换成Unicode就能被识别,而不用再考虑其他字符集的问题。UTF-8是Unicode的一种实现方式,按规则转换即可。以下C++代码来自网上,感谢大虾们,不用自己写了,呵呵。
首先:Decode URL编码:
// From Codeguru.

// Returns the numeric value (0-15) of a hexadecimal digit, or -1 when `c`
// is not one of 0-9, a-f, A-F.
static int UriHexDigitValue(unsigned char c)
{
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

// Decodes the percent-escapes ("%XX") contained in sSrc.
//
// Every '%' that is followed by two hexadecimal digits is replaced by the
// single byte those digits encode; all other bytes, including a '%' that is
// not followed by two hex digits and the '+' character, are copied through
// unchanged. The result is a raw byte string: if the escaped data was UTF-8
// it still has to be decoded as UTF-8 afterwards.
//
// @param sSrc  percent-encoded input; may be empty and may contain '\0'.
// @return      the decoded bytes.
std::string UriDecode(const std::string & sSrc)
{
    const std::string::size_type length = sSrc.length();

    std::string sResult;
    sResult.reserve(length); // decoding never makes the text longer

    std::string::size_type pos = 0;
    while (pos < length)
    {
        // An escape needs two more characters after the '%'.
        if (sSrc[pos] == '%' && length - pos > 2)
        {
            // The digit values are kept in int so that the -1 "invalid"
            // marker is reliable even where plain char is unsigned.
            const int high = UriHexDigitValue(static_cast<unsigned char>(sSrc[pos + 1]));
            const int low = UriHexDigitValue(static_cast<unsigned char>(sSrc[pos + 2]));
            if (high >= 0 && low >= 0)
            {
                sResult += static_cast<char>((high << 4) | low);
                pos += 3;
                continue;
            }
        }
        sResult += sSrc[pos++];
    }
    return sResult;
}
然后,解码UTF8:
// Appends one Unicode code point to `out`. When wchar_t is only 16 bits wide
// (e.g. on Windows) code points above U+FFFF are written as a UTF-16
// surrogate pair; otherwise the code point is stored in a single wchar_t.
static void Utf8AppendCodePoint(std::wstring &out, unsigned long codePoint)
{
    if (sizeof(wchar_t) == 2 && codePoint > 0xFFFFUL)
    {
        const unsigned long offset = codePoint - 0x10000UL;
        out.push_back(static_cast<wchar_t>(0xD800UL + (offset >> 10)));
        out.push_back(static_cast<wchar_t>(0xDC00UL + (offset & 0x3FFUL)));
    }
    else
    {
        out.push_back(static_cast<wchar_t>(codePoint));
    }
}

// Decodes a NUL-terminated UTF-8 byte string into a wide string.
//
// Well-formed 1- to 4-byte sequences are converted to their code points.
// Anything malformed -- a stray continuation byte, an invalid lead byte, a
// truncated sequence, an overlong encoding, an encoded surrogate or a value
// above U+10FFFF -- is replaced by U+FFFD (REPLACEMENT CHARACTER), and
// decoding resumes with the next unread byte. No byte at or beyond the
// terminating NUL is ever examined.
//
// @param src  UTF-8 input, or NULL.
// @param t    receives the decoded text; left untouched when src is NULL.
// @return     a copy of the decoded text, or an empty string when src is NULL.
std::wstring UTF2Uni(const char* src, std::wstring &t)
{
    if (src == NULL)
    {
        return L"";
    }

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(src);
    const std::size_t length = strlen(src);

    std::wstring decoded;
    decoded.reserve(length); // never more output units than input bytes

    std::size_t pos = 0;
    while (pos < length)
    {
        const unsigned char lead = bytes[pos];

        if (lead < 0x80) // 0xxx-xxxx: plain ASCII
        {
            decoded.push_back(static_cast<wchar_t>(lead));
            ++pos;
            continue;
        }

        unsigned long codePoint = 0;
        unsigned long smallest = 0; // smallest value this length may encode
        std::size_t expected = 0;   // number of continuation bytes required
        if ((lead & 0xE0) == 0xC0)      // 110x-xxxx 10xx-xxxx
        {
            codePoint = lead & 0x1F;
            expected = 1;
            smallest = 0x80UL;
        }
        else if ((lead & 0xF0) == 0xE0) // 1110-xxxx 10xx-xxxx 10xx-xxxx
        {
            codePoint = lead & 0x0F;
            expected = 2;
            smallest = 0x800UL;
        }
        else if ((lead & 0xF8) == 0xF0) // 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
        {
            codePoint = lead & 0x07;
            expected = 3;
            smallest = 0x10000UL;
        }
        else
        {
            // Stray continuation byte (10xx-xxxx) or a lead byte that UTF-8
            // does not define (1111-1xxx): skip just this one byte.
            Utf8AppendCodePoint(decoded, 0xFFFDUL);
            ++pos;
            continue;
        }

        // Consume continuation bytes, stopping at the first byte that is not
        // one so that it gets decoded on its own in the next iteration.
        std::size_t consumed = 0;
        while (consumed < expected
               && pos + 1 + consumed < length
               && (bytes[pos + 1 + consumed] & 0xC0) == 0x80)
        {
            codePoint = (codePoint << 6) | (bytes[pos + 1 + consumed] & 0x3FUL);
            ++consumed;
        }

        const bool wellFormed = consumed == expected
            && codePoint >= smallest
            && codePoint <= 0x10FFFFUL
            && !(codePoint >= 0xD800UL && codePoint <= 0xDFFFUL);
        Utf8AppendCodePoint(decoded, wellFormed ? codePoint : 0xFFFDUL);
        pos += 1 + consumed;
    }

    t = decoded;
    return t;
}
最后,因为解码得到的Unicode字符串是宽字符串(wstring),我们再把它转成多字节的标准字符串(string):
// Converts a wide string to a multibyte std::string using the C locale
// named by localeName.
//
// The process-wide locale is switched to localeName for the duration of the
// call and restored afterwards, including when an exception is thrown. If
// the requested locale cannot be selected, the conversion is carried out in
// whatever locale is currently active. Characters that have no
// representation in the target encoding are replaced by '?'. Conversion
// stops at the first L'\0' contained in ws.
//
// Note: setlocale() changes global state, so this function must not run
// concurrently with other locale-dependent code.
//
// @param ws          wide string to convert.
// @param localeName  locale to convert with; defaults to "chs" (the Windows
//                    name for Simplified Chinese). NULL keeps the current
//                    locale.
// @return            the multibyte representation of ws.
std::string ws2s(const std::wstring& ws, const char* localeName = "chs")
{
    // Restores the locale that was active on entry when it goes out of scope.
    struct LocaleRestorer
    {
        std::string name;
        bool armed;

        // The name is copied because a later setlocale() call may invalidate
        // the pointer returned by the query.
        explicit LocaleRestorer(const char* current)
            : name(current != NULL ? current : ""), armed(current != NULL)
        {
        }

        ~LocaleRestorer()
        {
            if (armed)
            {
                setlocale(LC_ALL, name.c_str());
            }
        }
    };

    const LocaleRestorer restorer(setlocale(LC_ALL, NULL));
    if (localeName != NULL)
    {
        setlocale(LC_ALL, localeName);
    }

    std::string result;
    result.reserve(ws.size());

    // wctomb() writes at most MB_CUR_MAX bytes for one wide character.
    std::string scratch(static_cast<std::size_t>(MB_CUR_MAX), '\0');
    (void)wctomb(NULL, L'\0'); // start from the initial shift state

    for (const wchar_t* cursor = ws.c_str(); *cursor != L'\0'; ++cursor)
    {
        const int produced = wctomb(&scratch[0], *cursor);
        if (produced < 0)
        {
            // Not representable in this locale: substitute and reset the
            // conversion state, which is unspecified after a failure.
            (void)wctomb(NULL, L'\0');
            result += '?';
        }
        else
        {
            result.append(scratch.data(), static_cast<std::size_t>(produced));
        }
    }
    return result;
}