# Converting between Unicode code points and UTF-8 (unicode与utf8互转)
#ref: http://www.ietf.org/rfc/rfc3629.txt
#   Char. number range  |        UTF-8 octet sequence
#      (hexadecimal)    |              (binary)
#   --------------------+---------------------------------------------
#   0000 0000-0000 007F | 0xxxxxxx
#   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
#   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
#   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

def chr_encode(code):
    """Encode one Unicode code point as a list of UTF-8 byte values.

    The sequence length follows the RFC 3629 table: 1 byte below 0x80,
    2 bytes below 0x800, 3 bytes below 0x10000 and 4 bytes up to 0x10FFFF.

    Args:
        code: Integer code point in the range 0..0x10FFFF.

    Returns:
        A list of 1 to 4 integers, each in the range 0..255.

    Raises:
        ValueError: If ``code`` is negative or greater than 0x10FFFF.

    Note:
        Surrogate code points (0xD800..0xDFFF) are not rejected; they are
        encoded as ordinary 3-byte sequences.
    """
    # Reject out-of-range values up front instead of silently falling off
    # the end of the function (returning None) or emitting bogus bytes.
    if code < 0 or code > 0x10ffff:
        raise ValueError(
            "code point out of range 0..0x10FFFF: %r" % (code,))
    if code < 0x80:
        # 0xxxxxxx
        return [code]
    if code < 0x800:
        # 110xxxxx 10xxxxxx
        return [0xc0 | (code >> 6),
                0x80 | (code & 0x3f)]
    if code < 0x10000:
        # 1110xxxx 10xxxxxx 10xxxxxx
        return [0xe0 | (code >> 12),
                0x80 | ((code >> 6) & 0x3f),
                0x80 | (code & 0x3f)]
    # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return [0xf0 | (code >> 18),
            0x80 | ((code >> 12) & 0x3f),
            0x80 | ((code >> 6) & 0x3f),
            0x80 | (code & 0x3f)]

def utf8_encode(unicode_array):
    """Encode an iterable of code points into a flat list of UTF-8 bytes.

    Each element of ``unicode_array`` is passed to ``chr_encode`` and the
    resulting byte lists are concatenated in order.

    Args:
        unicode_array: Iterable of integer code points.

    Returns:
        A list of integer byte values.
    """
    encoded = []
    for code_point in unicode_array:
        encoded.extend(chr_encode(code_point))
    return encoded

# Compute the required sequence length from the lead byte.
def utf8_need_len(code):
    """Return the byte length of the UTF-8 sequence whose first byte is ``code``.

    Args:
        code: Integer value of the first (lead) byte of a sequence.

    Returns:
        1, 2, 3 or 4.

    Raises:
        ValueError: If ``code`` is a continuation byte (0x80..0xBF) or is
            0xF8 or greater, i.e. it cannot start a UTF-8 sequence.
    """
    if code < 0x80:  # 0xxxxxxx
        return 1
    if code < 0xc0:  # 10xxxxxx is a continuation byte, never a lead byte
        raise ValueError("unexpected continuation byte: 0x%02x" % code)
    # The upper bounds are exclusive, so each must be one past the largest
    # lead byte of its class (0xDF, 0xEF and 0xF7 respectively).
    if code < 0xe0:  # 110xxxxx: 5 + 6 payload bits
        return 2
    if code < 0xf0:  # 1110xxxx: 4 + 6 + 6 payload bits
        return 3
    if code < 0xf8:  # 11110xxx: 3 + 6 + 6 + 6 payload bits
        return 4
    raise ValueError("invalid UTF-8 lead byte: 0x%02x" % code)

def chr_decode(array, index=0):
    """Decode the single UTF-8 sequence starting at ``array[index]``.

    Args:
        array: Indexable sequence of integer byte values (for example a
            ``bytes`` object or a list of ints).
        index: Position of the lead byte of the sequence to decode.

    Returns:
        The decoded code point as an integer.

    Raises:
        ValueError: If ``array[index]`` cannot start a UTF-8 sequence.
        IndexError: If the sequence runs past the end of ``array``.

    Note:
        Only the low 6 bits of each following byte are used; the bytes are
        not checked for the ``10xxxxxx`` continuation pattern.
    """
    lead = array[index]
    # Share the lead-byte classification with utf8_need_len so the decoder
    # and the length calculation can never disagree.
    need = utf8_need_len(lead)
    if need == 1:
        return lead
    # Payload bits of the lead byte: 0x1f, 0x0f, 0x07 for 2, 3, 4 bytes.
    code = lead & (0xff >> (need + 1))
    for offset in range(1, need):
        code = (code << 6) | (array[index + offset] & 0x3f)
    return code

def utf8_decode(utf8_array):
    """Decode a sequence of UTF-8 byte values into a list of code points.

    The input is walked one sequence at a time: ``utf8_need_len`` gives the
    length of the sequence at the current position and ``chr_decode``
    produces its code point.

    Args:
        utf8_array: Sized, indexable sequence of integer byte values.

    Returns:
        A list of integer code points. If the last sequence is truncated
        (fewer bytes remain than its lead byte requires), decoding stops
        there and the incomplete tail is dropped without an error.
    """
    code_points = []
    size = len(utf8_array)
    pos = 0
    while pos < size:
        step = utf8_need_len(utf8_array[pos])
        if pos + step > size:
            # Incomplete trailing sequence: stop without decoding it.
            break
        code_points.append(chr_decode(utf8_array, pos))
        pos += step
    return code_points

def test_utf8():
    """Check utf8_encode and utf8_decode against Python's built-in codec.

    Every sample is first encoded with ``utf8_encode`` and compared with
    ``str.encode()``; afterwards every sample's encoded bytes are decoded
    with ``utf8_decode`` and compared with the original string.
    """
    samples = ["你是谁", "1234434", "QQQasasd", "遗传很长的字符串啊实打实放到地方地方地方"]

    # Encoding direction: code points -> bytes.
    for sample in samples:
        code_points = list(map(ord, sample))
        assert bytes(utf8_encode(code_points)) == sample.encode()

    # Decoding direction: bytes -> code points -> text.
    for sample in samples:
        decoded = utf8_decode(sample.encode())
        rebuilt = ''.join(map(chr, decoded))
        assert rebuilt == sample

# Run the self-test at module level, i.e. whenever this file is executed or imported.
test_utf8()

  

# posted on 2017-11-02 09:14  reredoc  阅读(143)  评论(0)    收藏  举报