python3读取windows下的utf-8等编码格式的文本文件
在windows下,创建一个文本,默认的编码是: ANSI, 用python3的open()方法读取这样的文本很方便
直接用:
# The with-statement closes the file even if read() raises.
with open("sample.txt", "r") as f:
    text = f.read()
就能直接读到。但是如果存为 utf-8等其他的编码就会出现读取错误。
windows下的utf-8编码格式的文本有3个字节的BOM头, 在python3中对应codecs.BOM_UTF8。
整理了用python3读取windows下4种编码格式的文本的module作为学习。
读取模块:WinTxtReader.py
#!/usr/bin/env python3
#encoding=utf-8
"""Read text files saved on Windows in one of four encodings.

Windows text files may be stored as ANSI, UTF-8, "Unicode" or
"Unicode big endian". This module provides one reader per format plus
readTxt, which picks a decoder automatically.
"""
import codecs

# Module metadata.
name = "WinTxtReader"
version = "1.0"
author = "vily"
email = "vily313@126.com"
def readUTF8(f_url):
    """Read a UTF-8 text file that starts with the UTF-8 byte order mark.

    Args:
        f_url: Path of the file to read.

    Returns:
        The decoded text with the BOM removed. An empty string is returned
        when the file cannot be read, does not start with
        ``codecs.BOM_UTF8``, or is not valid UTF-8.
    """
    try:
        # Binary mode is required: the BOM check and the decode below both
        # work on raw bytes, not on already-decoded text.
        with open(f_url, "rb") as f:
            raw = f.read()
    except IOError as err:
        print("In function readUTF8, ", err)
        return ""
    if not raw.startswith(codecs.BOM_UTF8):
        return ""
    try:
        return raw[len(codecs.BOM_UTF8):].decode("utf-8")
    except UnicodeDecodeError as err:
        print("In function readUTF8, ", err)
        return ""
def readANSI(f_url):
    """Read a text file using the platform's default text encoding.

    Args:
        f_url: Path of the file to read.

    Returns:
        The file content as text, or an empty string when the file cannot
        be opened or cannot be decoded.
    """
    try:
        # No encoding is passed, so open() falls back to the platform
        # default; the with-statement guarantees the handle is closed.
        with open(f_url, "r") as f:
            return f.read()
    except IOError as err:
        print("In function readANSI, ", err)
        return ""
    except UnicodeDecodeError as err:
        print("In function readANSI, ", err)
        return ""
def readUNICODE(f_url):
    """Read a UTF-16 ("Unicode") text file.

    Args:
        f_url: Path of the file to read.

    Returns:
        The decoded text, or an empty string when the file cannot be read
        or is not valid UTF-16.
    """
    try:
        # Read raw bytes; decoding happens explicitly below so that a
        # decode failure is reported instead of escaping from read().
        with open(f_url, "rb") as f:
            raw = f.read()
    except IOError as err:
        print("In function readUNICODE, ", err)
        return ""
    try:
        # The "utf-16" codec consumes a leading BOM and uses it to pick
        # little- or big-endian byte order.
        return raw.decode("utf-16")
    except UnicodeDecodeError as err:
        print("In function readUNICODE, ", err)
        return ""
def readUBE(f_url):
    """Read a "Unicode big endian" text file.

    Delegates to readUNICODE and returns its result unchanged.
    """
    content = readUNICODE(f_url)
    return content
# 自动选择读取这四种编码格式的
def readTxt(f_url):
    """Read a text file, choosing the decoder automatically.

    Detection order:
      1. UTF-8 byte order mark -> UTF-8.
      2. UTF-16 byte order mark (either byte order) -> UTF-16.
      3. Otherwise try GB2312, then GBK, then UTF-16 without a BOM.

    Args:
        f_url: Path of the file to read.

    Returns:
        The decoded text, or an empty string when the file cannot be read
        or none of the candidate encodings can decode it.
    """
    try:
        with open(f_url, "rb") as f:
            raw = f.read()
    except IOError as err:
        print("In function readTxt, ", err)
        return ""

    if raw.startswith(codecs.BOM_UTF8):
        raw = raw[len(codecs.BOM_UTF8):]
        candidates = ("utf-8",)
    elif raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        # The "utf-16" codec strips the BOM and honours its byte order.
        candidates = ("utf-16",)
    else:
        # GBK is tried after GB2312 so that text using characters outside
        # GB2312 is not misread as BOM-less UTF-16.
        candidates = ("gb2312", "gbk", "utf-16")

    failure = None
    for encoding in candidates:
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError as err:
            failure = err
    print("In function readTxt, ", failure)
    return ""
#encoding=utf-8
"""Read text files saved on Windows in one of four encodings.

Windows text files may be stored as ANSI, UTF-8, "Unicode" or
"Unicode big endian". This module provides one reader per format plus
readTxt, which picks a decoder automatically.
"""
import codecs

# Module metadata.
name = "WinTxtReader"
version = "1.0"
author = "vily"
email = "vily313@126.com"
def readUTF8(f_url):
    """Read a UTF-8 text file that starts with the UTF-8 byte order mark.

    Args:
        f_url: Path of the file to read.

    Returns:
        The decoded text with the BOM removed. An empty string is returned
        when the file cannot be read, does not start with
        ``codecs.BOM_UTF8``, or is not valid UTF-8.
    """
    try:
        # Binary mode is required: the BOM check and the decode below both
        # work on raw bytes, not on already-decoded text.
        with open(f_url, "rb") as f:
            raw = f.read()
    except IOError as err:
        print("In function readUTF8, ", err)
        return ""
    if not raw.startswith(codecs.BOM_UTF8):
        return ""
    try:
        return raw[len(codecs.BOM_UTF8):].decode("utf-8")
    except UnicodeDecodeError as err:
        print("In function readUTF8, ", err)
        return ""
def readANSI(f_url):
    """Read a text file using the platform's default text encoding.

    Args:
        f_url: Path of the file to read.

    Returns:
        The file content as text, or an empty string when the file cannot
        be opened or cannot be decoded.
    """
    try:
        # No encoding is passed, so open() falls back to the platform
        # default; the with-statement guarantees the handle is closed.
        with open(f_url, "r") as f:
            return f.read()
    except IOError as err:
        print("In function readANSI, ", err)
        return ""
    except UnicodeDecodeError as err:
        print("In function readANSI, ", err)
        return ""
def readUNICODE(f_url):
    """Read a UTF-16 ("Unicode") text file.

    Args:
        f_url: Path of the file to read.

    Returns:
        The decoded text, or an empty string when the file cannot be read
        or is not valid UTF-16.
    """
    try:
        # Read raw bytes; decoding happens explicitly below so that a
        # decode failure is reported instead of escaping from read().
        with open(f_url, "rb") as f:
            raw = f.read()
    except IOError as err:
        print("In function readUNICODE, ", err)
        return ""
    try:
        # The "utf-16" codec consumes a leading BOM and uses it to pick
        # little- or big-endian byte order.
        return raw.decode("utf-16")
    except UnicodeDecodeError as err:
        print("In function readUNICODE, ", err)
        return ""
def readUBE(f_url):
    """Read a "Unicode big endian" text file.

    Delegates to readUNICODE and returns its result unchanged.
    """
    content = readUNICODE(f_url)
    return content
# 自动选择读取这四种编码格式的
def readTxt(f_url):
    """Read a text file, choosing the decoder automatically.

    Detection order:
      1. UTF-8 byte order mark -> UTF-8.
      2. UTF-16 byte order mark (either byte order) -> UTF-16.
      3. Otherwise try GB2312, then GBK, then UTF-16 without a BOM.

    Args:
        f_url: Path of the file to read.

    Returns:
        The decoded text, or an empty string when the file cannot be read
        or none of the candidate encodings can decode it.
    """
    try:
        with open(f_url, "rb") as f:
            raw = f.read()
    except IOError as err:
        print("In function readTxt, ", err)
        return ""

    if raw.startswith(codecs.BOM_UTF8):
        raw = raw[len(codecs.BOM_UTF8):]
        candidates = ("utf-8",)
    elif raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        # The "utf-16" codec strips the BOM and honours its byte order.
        candidates = ("utf-16",)
    else:
        # GBK is tried after GB2312 so that text using characters outside
        # GB2312 is not misread as BOM-less UTF-16.
        candidates = ("gb2312", "gbk", "utf-16")

    failure = None
    for encoding in candidates:
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError as err:
            failure = err
    print("In function readTxt, ", failure)
    return ""
测试代码:
#!/usr/bin/env python3
# Test script: read one sample file per encoding and print each result.
import WinTxtReader

# (label printed before the text, sample file, reader to use)
samples = (
    ("utf-8 txt:", "hello_utf8.txt", WinTxtReader.readUTF8),
    ("ansi txt:", "hello_ansi.txt", WinTxtReader.readANSI),
    ("unicode txt:", "hello_unicode.txt", WinTxtReader.readUNICODE),
    ("unicode big endian txt:", "hello_ube.txt", WinTxtReader.readUBE),
)
for label, sample_path, reader in samples:
    print(label, reader(sample_path))

print("----------------------")
# Finally, let readTxt auto-detect the encoding of the big-endian sample.
ube_path = samples[-1][1]
print("read ", ube_path, ":", WinTxtReader.readTxt(ube_path))
import WinTxtReader

# Test script: read one sample file per encoding and print each result.
# Each entry is (label printed before the text, sample file, reader to use).
samples = (
    ("utf-8 txt:", "hello_utf8.txt", WinTxtReader.readUTF8),
    ("ansi txt:", "hello_ansi.txt", WinTxtReader.readANSI),
    ("unicode txt:", "hello_unicode.txt", WinTxtReader.readUNICODE),
    ("unicode big endian txt:", "hello_ube.txt", WinTxtReader.readUBE),
)
for label, sample_path, reader in samples:
    print(label, reader(sample_path))

print("----------------------")
# Finally, let readTxt auto-detect the encoding of the big-endian sample.
ube_path = samples[-1][1]
print("read ", ube_path, ":", WinTxtReader.readTxt(ube_path))

浙公网安备 33010602011771号