Clucene C++编码转换

在做 CLucene 与 Lucene 生成的索引（Index）文件相互兼容时，遇到了编码转换问题。对于非英文内容，两者之间的兼容都可能存在这样的问题。经过跟踪 CLucene 程序，发现它内部使用 Unicode 编码方式存储，因此要先把字符串或文件内容转换成 Unicode 编码，然后再进行其它处理。

转换的具体代码如下(Linux与vc6.0测试通过)：

#ifndef _UNIX
/*
 * Map a code-page name to a numeric Windows code-page identifier.
 *
 * code_page: a decimal identifier such as "936" or "65001".
 * Returns the parsed identifier, or 936 (GBK) when code_page is NULL,
 * empty, not purely decimal, zero, or larger than 65535.
 */
static inline int codepage(const char* code_page)
{
    const int fallback = 936; //"GBK"
    const char* p = code_page;
    long value = 0;

    if (!p || !*p)
        return fallback;

    for (; *p; ++p) {
        if (*p < '0' || *p > '9')
            return fallback;
        value = value * 10 + (*p - '0');
        /* bail out early so the accumulator can never overflow */
        if (value > 65535)
            return fallback;
    }
    return value > 0 ? (int)value : fallback;
}
#endif

/*
 * Convert a multi-byte string in encoding `code_page` to wide characters.
 *
 * in      : source bytes.
 * in_len  : number of source bytes, or a negative value when `in` is
 *           NUL-terminated (the terminator is then converted as well).
 * out     : destination buffer, or NULL for a size query.
 * out_max : capacity of `out` in wchar_t units; 0 requests a size query.
 *
 * Returns the number of wchar_t written, or -- for a size query -- the number
 * that would be written. Returns 0 on failure (unknown encoding, invalid
 * input, destination too small). The _UNIX branch mirrors the calling
 * convention of the Win32 call used in the other branch so that callers
 * behave the same on both platforms.
 */
static inline int mb2wc(const char* code_page,/*in*/const char* in,int in_len,
/*out*/wchar_t* out,int out_max)
{
#ifdef _UNIX
    if (!code_page || !in)
        return 0;

    /* iconv needs an explicit byte count; a negative in_len means
       "NUL-terminated", terminator included */
    size_t in_left = 0;
    if (in_len < 0) {
        while (in[in_left] != '\0')
            ++in_left;
        ++in_left;
    } else {
        in_left = (size_t)in_len;
    }

    iconv_t env = iconv_open("WCHAR_T", code_page);
    if (env == (iconv_t)-1)
        return 0;

    /* iconv takes char**; it advances the pointer but does not modify the input */
    char* src = (char*)in;
    size_t written = 0; /* bytes produced so far */
    int ok = 1;

    if (out == NULL || out_max <= 0) {
        /* size query: convert through a scratch buffer and only count */
        wchar_t scratch[64];
        while (in_left > 0) {
            char* dst = (char*)scratch;
            size_t dst_left = sizeof(scratch);
            size_t rc = iconv(env, &src, &in_left, &dst, &dst_left);
            size_t produced = sizeof(scratch) - dst_left;
            written += produced;
            /* A full scratch buffer always yields output, so an error with
               no output at all is a real conversion failure. */
            if (rc == (size_t)-1 && produced == 0) {
                ok = 0;
                break;
            }
        }
    } else {
        char* dst = (char*)out;
        /* out_max counts wchar_t, iconv counts bytes */
        size_t dst_cap = (size_t)out_max * sizeof(wchar_t);
        size_t dst_left = dst_cap;
        if (in_left > 0 &&
            iconv(env, &src, &in_left, &dst, &dst_left) == (size_t)-1)
            ok = 0;
        written = dst_cap - dst_left;
    }

    iconv_close(env);
    return ok ? (int)(written / sizeof(wchar_t)) : 0;
#else
    return ::MultiByteToWideChar(codepage(code_page),0,in,in_len,out,out_max);
#endif
}

/*
 * Convert a wide-character string to multi-byte encoding `code_page`.
 *
 * in      : source wide characters.
 * in_len  : number of source wchar_t, or a negative value when `in` is
 *           NUL-terminated (the terminator is then converted as well).
 * out     : destination buffer, or NULL for a size query.
 * out_max : capacity of `out` in bytes; 0 requests a size query.
 *
 * Returns the number of bytes written, or -- for a size query -- the number
 * that would be written. Returns 0 on failure (unknown encoding,
 * unconvertible input, destination too small). The _UNIX branch mirrors the
 * calling convention of the Win32 call used in the other branch.
 */
static inline int wc2mb(const char* code_page,/*in*/const wchar_t* in,int in_len,
/*out*/char* out,int out_max)
{
#ifdef _UNIX
    if (!code_page || !in)
        return 0;

    /* a negative in_len means "NUL-terminated", terminator included */
    size_t in_chars = 0;
    if (in_len < 0) {
        while (in[in_chars] != L'\0')
            ++in_chars;
        ++in_chars;
    } else {
        in_chars = (size_t)in_len;
    }
    /* in_len counts wchar_t, iconv counts bytes */
    size_t in_left = in_chars * sizeof(wchar_t);

    iconv_t env = iconv_open(code_page, "WCHAR_T");
    if (env == (iconv_t)-1)
        return 0;

    /* iconv takes char**; it advances the pointer but does not modify the input */
    char* src = (char*)in;
    size_t written = 0; /* bytes produced so far */
    int ok = 1;

    if (out == NULL || out_max <= 0) {
        /* size query: convert through a scratch buffer and only count */
        char scratch[256];
        while (in_left > 0) {
            char* dst = scratch;
            size_t dst_left = sizeof(scratch);
            size_t rc = iconv(env, &src, &in_left, &dst, &dst_left);
            size_t produced = sizeof(scratch) - dst_left;
            written += produced;
            /* A full scratch buffer always yields output, so an error with
               no output at all is a real conversion failure. */
            if (rc == (size_t)-1 && produced == 0) {
                ok = 0;
                break;
            }
        }
    } else {
        char* dst = out;
        size_t dst_cap = (size_t)out_max;
        size_t dst_left = dst_cap;
        if (in_left > 0 &&
            iconv(env, &src, &in_left, &dst, &dst_left) == (size_t)-1)
            ok = 0;
        written = dst_cap - dst_left;
    }

    iconv_close(env);
    return ok ? (int)written : 0;
#else
    /* honour in_len instead of always assuming a NUL-terminated source */
    return ::WideCharToMultiByte(codepage(code_page),0,in,in_len,out,out_max, NULL, NULL);
#endif
}

void str_to_UnicodeChar(const char* strIn,TCHAR* &strOut){
if(!strIn)
return;

int i= mb2wc("936",(char*)strIn, -1, NULL, 0);
strOut = (TCHAR*)malloc(sizeof(TCHAR)*i);
mb2wc("936",(char*)strIn, -1, strOut, i);
}
/*
 * Convert a NUL-terminated wide-character string to a newly allocated
 * code page 936 string.
 *
 * strIn  : source string; when NULL the function returns without touching
 *          strOut.
 * strOut : receives a new[]-allocated, NUL-terminated buffer that the caller
 *          must release with delete[]. When the conversion fails the buffer
 *          holds an empty string.
 */
void UnicodeChar_to_str(const TCHAR* strIn,char* &strOut){
    if(!strIn)
        return;

    /* first pass: ask how many bytes are needed (terminator included) */
    int needed = wc2mb("936", strIn, -1, NULL, 0);
    if (needed < 0)
        needed = 0;

    strOut = new char[needed + 1];

    /* second pass: the actual conversion */
    int written = 0;
    if (needed > 0)
        written = wc2mb("936", strIn, -1, strOut, needed);

    /* on failure do not expose uninitialised bytes to the caller */
    if (written <= 0 || written > needed) {
        strOut[0] = 0;
        return;
    }
    strOut[written] = 0;
}

/*
 * Narrow a NUL-terminated TCHAR string to char by copying each element
 * and truncating it to a single char. No encoding conversion is performed.
 *
 * strIn  : source string; when NULL the function returns without touching
 *          strOut.
 * strOut : receives a new[]-allocated, NUL-terminated buffer that the caller
 *          must release with delete[]. The buffer is at least 1024 bytes and
 *          grows as needed so that long inputs no longer overflow it.
 */
void tchar_to_str(const TCHAR* strIn ,char* &strOut){
    if(!strIn)
        return ;

    size_t len = 0;
    while (strIn[len])
        ++len;

    /* keep the historical 1024-byte minimum, but never allocate less than needed */
    size_t capacity = len + 1;
    if (capacity < 1024)
        capacity = 1024;
    strOut = new char[capacity];

    size_t k;
    for (k = 0; k < len; ++k)
        strOut[k] = (char)strIn[k];
    strOut[len] = '\0';
}

posted on 2008-06-05 21:33 cy163 阅读(1072) 评论(0) 收藏举报

刷新页面返回顶部

Clucene C++编码转换

导航

公告