utf8_to_utf16

 

Accepted answer (17 votes)

Here's some code. It is only lightly tested, and there are probably a few improvements that could be made. Call this function to convert a UTF-8 string to a UTF-16 wstring. If it decides that the input string is not valid UTF-8 it throws an exception; otherwise it returns the equivalent UTF-16 wstring.

/**
 * Converts a UTF-8 encoded byte string into a UTF-16 encoded wide string.
 *
 * Every code point above U+FFFF is written as a surrogate pair, regardless of
 * how wide wchar_t is on the target platform.
 *
 * @param utf8  The bytes to decode.
 * @return      The decoded text as UTF-16 code units.
 * @throws std::logic_error ("not a UTF-8 string") when the input contains a
 *         stray continuation byte, an invalid lead byte, a truncated
 *         sequence, an overlong encoding, an encoded surrogate
 *         (U+D800..U+DFFF) or a value above U+10FFFF.
 */
std::wstring utf8_to_utf16(const std::string& utf8)
{
    std::wstring utf16;
    // A UTF-8 sequence never decodes to more UTF-16 units than it has bytes.
    utf16.reserve(utf8.size());

    size_t i = 0;
    while (i < utf8.size())
    {
        unsigned long uni;       // code point being assembled
        unsigned long smallest;  // lowest value this sequence length may encode
        size_t todo;             // continuation bytes still expected

        const unsigned char lead = static_cast<unsigned char>(utf8[i++]);
        if (lead <= 0x7F)
        {
            uni = lead;
            smallest = 0;
            todo = 0;
        }
        else if (lead <= 0xBF)
        {
            // A continuation byte cannot start a sequence.
            throw std::logic_error("not a UTF-8 string");
        }
        else if (lead <= 0xDF)
        {
            uni = lead & 0x1F;
            smallest = 0x80;
            todo = 1;
        }
        else if (lead <= 0xEF)
        {
            uni = lead & 0x0F;
            smallest = 0x800;
            todo = 2;
        }
        else if (lead <= 0xF7)
        {
            uni = lead & 0x07;
            smallest = 0x10000;
            todo = 3;
        }
        else
        {
            throw std::logic_error("not a UTF-8 string");
        }

        for (size_t j = 0; j < todo; ++j)
        {
            if (i == utf8.size())
                throw std::logic_error("not a UTF-8 string");
            const unsigned char next = static_cast<unsigned char>(utf8[i++]);
            if (next < 0x80 || next > 0xBF)
                throw std::logic_error("not a UTF-8 string");
            uni = (uni << 6) + (next & 0x3F);
        }

        // Overlong forms (e.g. C0 80 for U+0000) are not valid UTF-8; accepting
        // them lets the same character slip past byte-level filters.
        if (uni < smallest)
            throw std::logic_error("not a UTF-8 string");
        if (uni >= 0xD800 && uni <= 0xDFFF)
            throw std::logic_error("not a UTF-8 string");
        if (uni > 0x10FFFF)
            throw std::logic_error("not a UTF-8 string");

        if (uni <= 0xFFFF)
        {
            utf16 += static_cast<wchar_t>(uni);
        }
        else
        {
            // Split the 20 bits above U+10000 into a high/low surrogate pair.
            uni -= 0x10000;
            utf16 += static_cast<wchar_t>((uni >> 10) + 0xD800);
            utf16 += static_cast<wchar_t>((uni & 0x3FF) + 0xDC00);
        }
    }
    return utf16;
}
share | improve this answer

 

http://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring

 

 

#pragma once
#include <string>

// Selects the string type behind the `tstring` token: once this block has been
// processed, std::tstring expands to std::wstring when _UNICODE is defined and
// to std::string otherwise.
// Compilation is stopped if a `tstring` macro already exists, so that an
// earlier definition is never silently replaced.
#ifdef tstring
#error "\"tstring\" Macro has been defined."
#else
#ifdef _UNICODE
#define tstring wstring
#else
#define tstring string
#endif
#endif

class EncodingConverter
{
public:
    static int AnsiStrToWideStr(std::string& strSrc, std::wstring& strDest)
    {
        int nLen = strSrc.length() + 1;
        int nRet = 0;

        nLen *=  sizeof(wchar_t);

        wchar_t* pszW = new wchar_t[nLen];
        memset(pszW, 0, nLen);

        nRet = MultiByteToWideChar(CP_ACP, 0, strSrc.c_str(), -1, pszW, nLen); 

        strDest = pszW;
        delete[] pszW;

        return nRet;
    };

    static int WideStrToAnsiStr(std::wstring& strSrc, std::string& strDest)
    {
        int nLen = strSrc.length() + 1;
        int nRet = 0;

        nLen *= sizeof(wchar_t);

        char* pszA = new char[nLen];
        memset(pszA, 0, nLen);


        nRet = WideCharToMultiByte(CP_ACP, 0, strSrc.c_str(), -1, pszA, nLen, NULL, NULL); 

        strDest = pszA;
        delete[] pszA;

        return nRet;
    };

    static int AnsiStrToTStr(std::string& strSrc, std::tstring& strDest)
    {
        int nRet = 0;

#ifdef _UNICODE
        nRet = AnsiStrToWideStr(strSrc, strDest);
#else
        strDest = strSrc;
        nRet = strDest.length();
#endif

        return nRet;
    };

    static int TStrToAnsiStr(std::tstring& strSrc, std::string& strDest)
    {
        int nRet = 0;

#ifdef _UNICODE
        nRet = WideStrToAnsiStr(strSrc, strDest);
#else
        strDest = strSrc;
        nRet = strDest.length();
#endif

        return nRet;
    };

    static int WideStrToTStr(std::wstring& strSrc, std::tstring& strDest)
    {
        int nRet = 0;

#ifdef _UNICODE
        strDest = strSrc;
        nRet = strDest.length();
#else
        nRet = WideStrToAnsiStr(strSrc, strDest);
#endif

        return nRet;
    };

    static int TStrToWideStr(std::tstring& strSrc, std::wstring& strDest)
    {
        int nRet = 0;

#ifdef _UNICODE
        strDest = strSrc;
        nRet = strDest.length();
#else
        nRet = AnsiStrToWideStr(strSrc, strDest);
#endif

        return nRet;
    };

    static std::string ToAnsiString(const wchar_t* lpStr)
    {
        std::wstring wide_string = lpStr;
        std::string ansi_string;

        WideStrToAnsiStr(wide_string, ansi_string);
        return ansi_string;
    };

    static std::string ToAnsiString(const char* lpStr)
    {
        return std::string(lpStr);
    };

    static std::wstring ToWideString(const wchar_t* lpStr)
    {
        return std::wstring(lpStr);
    };

    static std::wstring ToWideString(const char* lpStr)
    {
        std::string ansi_string = lpStr;
        std::wstring wide_string;

        AnsiStrToWideStr(ansi_string, wide_string);
        return wide_string;
    };

    static std::tstring ToTString(const char* lpStr)
    {
#ifdef _UNICODE
        return ToWideString(lpStr);
#else
        return ToAnsiString(lpStr);
#endif
    };

    static std::tstring ToTString(const wchar_t* lpStr)
    {
#ifdef _UNICODE
        return ToWideString(lpStr);
#else
        return ToAnsiString(lpStr);
#endif
    };

    static int WideStrToUtf8Str(std::wstring& strSrc, std::string& strDest)
    {
        int nRet = 0;
        int nLen = 0;

        nLen = WideCharToMultiByte(CP_UTF8, 0, strSrc.c_str(), -1, NULL, 0, NULL, NULL);

        char * lpUtf8Str = new char[nLen+1];
        memset(lpUtf8Str, 0, nLen);
        nRet = WideCharToMultiByte(CP_UTF8, 0, strSrc.c_str(), -1, lpUtf8Str, nLen, NULL, NULL);
        strDest = lpUtf8Str;
        delete[] lpUtf8Str;

        return nRet;
    };

    static int AnsiStrToUtf8Str(std::string& strSrc, std::string& strDest)
    {
        int nRet = 0;
        std::wstring wide_string;

        nRet = AnsiStrToWideStr(strSrc, wide_string);
        nRet = WideStrToUtf8Str(wide_string, strDest);

        return nRet;
    };

    static int Utf8StrToWideStr(const std::string& strSrc, std::wstring& strDest)
    {
        int nRet = 0;
        int nLen = 0;

        nLen = MultiByteToWideChar(CP_UTF8, 0, strSrc.c_str(), -1, NULL, 0);

        wchar_t* lpWideStr = new wchar_t[nLen];
        memset(lpWideStr, 0, nLen*sizeof(lpWideStr[0]));
        nRet = MultiByteToWideChar(CP_UTF8, 0, strSrc.c_str(), -1, lpWideStr, nLen);
        strDest = lpWideStr;
        delete[] lpWideStr;

        return nRet;
    };

    static int Utf8StrToAnsiStr(const std::string& strSrc, std::string& strDest)
    {
        int nRet = 0;
        std::wstring wide_string;

        nRet = Utf8StrToWideStr(strSrc, wide_string);
        nRet = WideStrToAnsiStr(wide_string, strDest);

        return nRet;
    };    

    static int Utf8StrToTStr(const std::string& strSrc, std::tstring& strDest)
    {
#ifdef UNICODE
        return Utf8StrToWideStr(strSrc, strDest);
#else
        return Utf8StrToAnsiStr(strSrc, strDest);
#endif
    };    

    static std::string ToUtf8String(const std::string& str)
    {
        std::string ansi_string = str;
        std::string utf8_string;

        AnsiStrToUtf8Str(ansi_string, utf8_string);
        return utf8_string;
    };

    static std::string ToUtf8String(const std::wstring& str)
    {
        std::wstring wide_string = str;
        std::string utf8_string;

        WideStrToUtf8Str(wide_string, utf8_string);
        return utf8_string;
    };
};

https://github.com/yaocoder/utility/blob/master/src/common/EncodingConverter.h

 

posted @ 2016-09-12 23:57  findumars  Views(1607)  Comments(0)  Edit  收藏  举报