UTF8 Unicode 区别联系,UTF8 字符串Trim函数
最近项目中要将文档标题写入oracle数据库,编码是UTF8的,老报 字符串未正确结束这个错。 ORA-01756:括号内的字符串没有正确结束。
客户端,服务器的NLS_LANG都正确设置为了UTF8.
将标题按照自己的顺序输出,我一个一个字符编码去查,发现标题后面多了一个'E5' 字节。这个是一个三字节UTF8编码的前缀,后面没有后续字节了,ORACLE当然报错。。
在公布解决方法前,复习下UTF8编码:
UTF8和Ascii 是兼容编码,UTF8和Unicode是一一对应的。
一字节的为: 0*******
二字节的为: 110***** 10******
三字节的为: 1110**** 10****** 10******
依次类推 4 5 6字节。其中*为有效字符编码
到这不禁有个疑问:为什么会有 6个字节的utf8? unicode 不是两个字节么?这样表示的unicode不是超过16个bit了?
原来unicode有4字节编码的方式。只是比较少见。(不见得,我测试了20000篇文档,其中就有出现)
原问题的解决方法:
1.将数据库字符集改为 AL32UTF8。原因是 AL32UTF8支持更高版本的unicode.容错性强一点
2.写一个utf8_trim函数,将字符串中的非utf8编码去掉,替换成空格。然后入库。
我采用了第二种解决方法,写了一个utf8_trim,没有用循环,而用了大量丑陋的if else
主要考虑,一是比较简单,直观。还有就是效率高一点。因为,大部分的utf8是3字节,这样效率会高一点。
代码我初步测试过了,欢迎报bug
贴代码如下:
/* Return nonzero when `byte` has the UTF-8 continuation form 10xxxxxx. */
static int utf8_is_continuation(unsigned char byte)
{
    return (byte & 0xC0) == 0x80;
}

/*
 * Return the total sequence length (2..6) announced by a lead byte, or 0 when
 * `lead` is not a multi-byte lead byte (i.e. it is a continuation byte, 0xFE
 * or 0xFF). The legacy 5- and 6-byte forms are still recognised so that input
 * accepted before this rewrite keeps being accepted.
 */
static size_t utf8_sequence_length(unsigned char lead)
{
    if ((lead & 0xE0) == 0xC0) { return 2; } /* 110xxxxx */
    if ((lead & 0xF0) == 0xE0) { return 3; } /* 1110xxxx */
    if ((lead & 0xF8) == 0xF0) { return 4; } /* 11110xxx */
    if ((lead & 0xFC) == 0xF8) { return 5; } /* 111110xx */
    if ((lead & 0xFE) == 0xFC) { return 6; } /* 1111110x */
    return 0;
}

/*
 * trim_utf8 - overwrite structurally malformed UTF-8 bytes with spaces.
 *
 * Scans the NUL-terminated string `str` in place. The string length never
 * changes; only offending bytes are replaced by ' '.
 *
 *   - Bytes below 0x80 are kept.
 *   - A lead byte followed by the full number of continuation bytes it
 *     announces is kept as-is (2- to 6-byte forms).
 *   - A lead byte whose sequence is cut short (by a non-continuation byte or
 *     by the end of the string) is blanked together with the continuation
 *     bytes that did follow it; scanning resumes at the first byte that was
 *     not consumed, so valid text after the damage is preserved.
 *   - A stray continuation byte, 0xFE or 0xFF is blanked on its own.
 *
 * Only the byte structure is checked: overlong encodings, surrogates and code
 * points above U+10FFFF are not rejected.
 *
 * str: string to sanitise; NULL is accepted and ignored.
 */
void trim_utf8( char* str)
{
    if (str == NULL) {
        return;
    }

    size_t length = strlen(str);
    size_t i = 0;

    while (i < length) {
        /* Work on unsigned bytes: plain char may be signed or unsigned. */
        unsigned char lead = (unsigned char)str[i];

        if (lead < 0x80) {
            i++;
            continue;
        }

        size_t need = utf8_sequence_length(lead);
        if (need == 0) {
            /* Continuation byte without a lead, or 0xFE / 0xFF. */
            str[i] = ' ';
            i++;
            continue;
        }

        /* Count the lead plus the continuation bytes actually present. */
        size_t have = 1;
        while (have < need && i + have < length &&
               utf8_is_continuation((unsigned char)str[i + have])) {
            have++;
        }

        if (have < need) {
            /* Malformed or truncated: blank exactly the bytes consumed. */
            for (size_t k = 0; k < have; k++) {
                str[i + k] = ' ';
            }
        }
        i += have;
    }
}
if(str == NULL)
return ;
int length = strlen(str);
int i = 0;
while(i < length)
{
if(str[i]>0)
{
i++;
continue;
}
if(((unsigned char) str[i]&0xE0) == 192)//110 2byte
{
if ((i+1)< length)
{
if(((unsigned char)(str[i+1]&0xC0)) != 128)//10
{str[i] = ' '; i = i +1;}//invalid
else
{i= i +2;}
}
else//invalid
{
str[i]=' ';
}
continue;
}
if(((unsigned char) str[i]&0xC0) == 128)//10
{
str[i] = ' ';
i++;
continue;
}
if(((unsigned char)(str[i]&0xF0)) == 224)//1110 3byte
{
if ((i+2)< length)
{
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 )//10
{
str[i] = ' ';
if (((unsigned char)(str[i+1]&0xC0)) != 128)
{i = i+1;}
else
{str[i+1] = ' ';i = i +2;}
}
else
{
i= i +3;
}
}
else
{
for ( ;i <length;i++)
str[i]=' ';
}
continue;
}
if(((unsigned char)(str[i]&0xF8)) == 240)//11110 4byte
{
if ((i+3)< length)
{
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 || ((unsigned char)(str[i+3]&0xC0)) != 128)//10
{
int j = 0;
for ( j = i+1;j<= i+3;j++)
{
if(((unsigned char)(str[j]&0xC0)) == 128)
str[j-1] = ' ';
else
break;
}
i = j;
}
else
{
i= i +4;
}
}
else
{
for ( ;i <length;i++)
str[i]=' ';
}
continue;
}
if(((unsigned char)(str[i]&0xFC)) == 248)//111110 5byte
{
if ((i+4)< length)
{
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 || ((unsigned char)(str[i+3]&0xC0)) != 128|| ((unsigned char)(str[i+4]&0xC0)) != 128)//10
{
int j = 0;
for ( j = i+1;j<= i+4;j++)
{
if(((unsigned char)(str[j]&0xC0)) == 128)
str[j-1] = ' ';
else
break;
}
i = j;
}
else
{
i= i +5;
}
}
else
{
for ( ;i <length;i++)
str[i]=' ';
}
continue;
}
if(((unsigned char)(str[i]&0xFE)) == 252)//1111110 6byte
{
if ((i+5)< length)
{
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 || ((unsigned char)(str[i+3]&0xC0)) != 128|| ((unsigned char)(str[i+4]&0xC0)) != 128|| ((unsigned char)(str[i+5]&0xC0)) != 128)//10
{
int j = 0;
for ( j = i+1;j<= i+5;j++)
{
if(((unsigned char)(str[j]&0xC0)) == 128)
str[j-1] = ' ';
else
break;
}
i = j;
}
else
{
i= i +6;
}
}
else
{
for ( ;i <length;i++)
str[i]=' ';
}
continue;
}
str[i]=' ';
i++;
}
}

浙公网安备 33010602011771号