关于 UTF-8 和 Unicode 的编码原理,可以参考这篇文章:
http://hi.baidu.com/dustin_xiao/blog/item/2ab75b24c27ca32ed507426f.html
下面是实现代码,只实现了中文和英文字符的转换部分:
/// Returns the length, in bytes, of the UTF-8 sequence introduced by firstCh.
///
/// Only the sequence lengths needed for the Basic Multilingual Plane are
/// recognised:
///   0xxxxxxx -> 1 byte  (ASCII, including control characters such as '\t')
///   110xxxxx -> 2 bytes
///   1110xxxx -> 3 bytes (covers CJK ideographs)
///
/// @param firstCh  First byte of a UTF-8 sequence.
/// @return 1, 2 or 3 for a recognised lead byte; 0 for anything else
///         (continuation bytes 10xxxxxx, 4-byte leads 11110xxx, invalid bytes).
int GetUtf8ByteNumForWord(char firstCh)
{
    // char may be signed, so classify the raw bit pattern instead of the value.
    const unsigned char lead = static_cast<unsigned char>(firstCh);

    if ((lead & 0x80) == 0x00)
    {
        return 1;
    }
    // Mask/value pairs follow the UTF-8 lead-byte layout. The 2-byte pattern is
    // 0xC0 (110xxxxx); comparing against 0x0C would never match.
    if ((lead & 0xE0) == 0xC0)
    {
        return 2;
    }
    // Test four bits here so that 4-byte leads (0xF0..) are not reported as 3.
    if ((lead & 0xF0) == 0xE0)
    {
        return 3;
    }
    return 0;
}
28 void Utf8ToUnicode(const char* utf8, int len, wchar_t *unicode)
29 {
30 int i = 0;
31 int j = 0;
32 char* temp=(char*)unicode;
33 //循环解析
34 while (i < len)
35 {
36 int nByteNum=GetUtf8ByteNumForWord(utf8[i]);
37 if (nByteNum==0)
38 {
39 return;
40 }
41 switch(nByteNum)
42 {
43 case 1:
44 temp[j] = utf8[i];
45 temp[j+1]=0;
46 break;
47 case 2:
48 temp[j] = utf8[i];
49 temp[j + 1] = utf8[i + 1];
50 break;
51 case 3:
52 //这里就开始进行UTF8->Unicode
53 temp[j + 1] = ((utf8[i] & 0x0F) << 4) | ((utf8[i + 1] >> 2) & 0x0F);
54 temp[j] = ((utf8[i + 1] & 0x03) << 6) + (utf8[i + 2] & 0x3F);
55 break;
56 default:
57 break;
58 }
59 j+=2;
60 i+=nByteNum;
61 }
62 temp[j]=0;
63 temp[j+1]=0;
64 }
65
// NOTE(review): this is a second copy of the listing body; the line carrying
// the function signature is missing here, so `firstCh` is not declared in
// this fragment.
// The body classifies a byte by its top three bits and yields 0, 1, 2 or 3.
2 {
3 int nRet=0;
4 __asm
5 {
// Load the byte zero-extended and keep only its top three bits.
6 movzx ecx,byte ptr[firstCh]
7 and ecx,0xE0
// All three top bits clear (bytes 0x00-0x1F): leave nRet at 0.
8 jz done
// Bit 7 set: go to the multi-byte checks; otherwise the result is 1.
9 test ecx,0x80
10 jnz lbm
11 mov nRet,1
12 jmp done
13 lbm:
// Top bits 111: result 3.
14 cmp cl,0xE0
15 jz lb3
// NOTE(review): after the 0xE0 mask, cl can only be 0x80, 0xA0, 0xC0 or 0xE0
// at this point, so this comparison with 0x0C never succeeds and lb2 is
// unreachable; 0xC0 was presumably intended.
16 cmp cl,0x0C
17 jz lb2
18 jmp done
19 lb3:
20 mov nRet,3
21 jmp done
22 lb2:
23 mov nRet,2
24 done:
25 }
26 return nRet;
27 }
28 void Utf8ToUnicode(const char* utf8, int len, wchar_t *unicode)
29 {
30 int i = 0;
31 int j = 0;
32 char* temp=(char*)unicode;
33 //循环解析
34 while (i < len)
35 {
36 int nByteNum=GetUtf8ByteNumForWord(utf8[i]);
37 if (nByteNum==0)
38 {
39 return;
40 }
41 switch(nByteNum)
42 {
43 case 1:
44 temp[j] = utf8[i];
45 temp[j+1]=0;
46 break;
47 case 2:
48 temp[j] = utf8[i];
49 temp[j + 1] = utf8[i + 1];
50 break;
51 case 3:
52 //这里就开始进行UTF8->Unicode
53 temp[j + 1] = ((utf8[i] & 0x0F) << 4) | ((utf8[i + 1] >> 2) & 0x0F);
54 temp[j] = ((utf8[i + 1] & 0x03) << 6) + (utf8[i + 2] & 0x3F);
55 break;
56 default:
57 break;
58 }
59 j+=2;
60 i+=nByteNum;
61 }
62 temp[j]=0;
63 temp[j+1]=0;
64 }
65
测试代码如下:
1 std::ifstream fin("debug\\Test.txt");
2 const unsigned int L_MAX_LINE=1024;
3 char utf8[L_MAX_LINE];
4 wchar_t unicode[L_MAX_LINE];
5 while(fin.getline(utf8,L_MAX_LINE))
6 {
7 Utf8ToUnicode(utf8,strlen(utf8),unicode);
8 MessageBoxW(0,unicode,0,0);
9 }
10 fin.close();
// NOTE(review): second copy of the test listing; the line declaring `fin`
// is missing from this fragment.
// Size, in elements, of both the input line buffer and the output buffer.
2 const unsigned int L_MAX_LINE=1024;
3 char utf8[L_MAX_LINE];
4 wchar_t unicode[L_MAX_LINE];
// Read one line per iteration; the loop ends when getline reports failure.
5 while(fin.getline(utf8,L_MAX_LINE))
6 {
// Convert the line just read and display the converted text.
7 Utf8ToUnicode(utf8,strlen(utf8),unicode);
8 MessageBoxW(0,unicode,0,0);
9 }
10 fin.close();
用一篇文章进行测试,结果如下: