1.中文语料常常遇到编码问题,将任意字符集文件转为utf-8编码
import codecs
import os
import re

import chardet
import numpy as np
from django.utils.encoding import smart_text
4
def check_file_charset(file):
    """Guess the character encoding of a file.

    Args:
        file: Path of the file to inspect.

    Returns:
        Whatever ``chardet.detect`` returns for the file's raw bytes; callers
        in this file read its ``'encoding'`` entry.
    """
    # The whole file is read in binary mode so that no decoding happens
    # before detection.
    with open(file, 'rb') as stream:
        raw_bytes = stream.read()
    return chardet.detect(raw_bytes)
8
def Convert_file_character(File_path):
    """Re-encode a text file to UTF-8 in place.

    The source encoding is guessed with ``check_file_charset``. Files that
    are already reported as UTF-8 are left untouched, and files whose
    encoding could not be determined are skipped instead of being rewritten.

    Args:
        File_path: Path of the file to convert. On success the file is
            overwritten with its UTF-8 encoded content.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    f_type = check_file_charset(File_path)
    # Use .get so that a missing/empty detection result is reported instead
    # of raising before the validity check below.
    encoding = f_type.get('encoding') if f_type else None
    print (File_path,"字符集为:",encoding)

    if not encoding:
        # Detection failed (e.g. empty or binary file): do not touch the file.
        print("无法识别字符集,跳过转换")
        return

    # Compare case-insensitively; the detector's spelling of the name is not
    # something this code should depend on.
    if encoding.lower() == 'utf-8':
        print("字符集为 utf-8,不需要进行转换")
        return

    try:
        # errors='ignore' is kept from the original behaviour: bytes that
        # cannot be decoded are dropped rather than aborting the conversion.
        with codecs.open(File_path, 'rb', encoding, errors='ignore') as f:
            # A codecs reader opened with an encoding already yields str,
            # so no extra text coercion is needed.
            content = f.read()
        with codecs.open(File_path, 'wb', 'utf-8') as f:
            f.write(content)
    except (OSError, LookupError, UnicodeError) as ERR:
        # OSError: file access; LookupError: codec name unknown to Python;
        # UnicodeError: encode/decode failure.
        print("字符集转换失败")
        print (ERR)
    else:
        print ("字符集转换成功")
24
# Convert every regular file directly inside the corpus directory to UTF-8.
corpus_path = './unlabel'
raw_train_files = [
    os.path.join(corpus_path, file_name)
    for file_name in os.listdir(corpus_path)
]
for raw_train_file in raw_train_files:
    # os.listdir also returns sub-directories; opening one as a file would
    # raise and abort the whole run, so only regular files are converted.
    if os.path.isfile(raw_train_file):
        Convert_file_character(raw_train_file)
参考:https://blog.csdn.net/qq_35751770/article/details/103664496
2.将unlabel文件夹中的所有.txt文件合并,每个文件之间空一行
先调用上面的代码转换编码
def combine(corpus_path, outpath):
    """Concatenate all files of a directory into one UTF-8 text file.

    Files are processed in sorted name order so the result is reproducible,
    and each file's content is followed by one empty line. Input files are
    read as UTF-8, so they must have been converted beforehand.

    Args:
        corpus_path: Directory whose regular files are merged.
        outpath: Destination file. It is opened in append mode, so existing
            content is kept and the merged text is added after it.

    Returns:
        None.
    """
    raw_train_files = [
        os.path.join(corpus_path, file_name)
        for file_name in sorted(os.listdir(corpus_path))
    ]

    # The context manager guarantees the output is flushed and closed even
    # if one of the input files cannot be read.
    with open(outpath, 'a', encoding='utf-8') as output:
        for raw_train_file in raw_train_files:
            # Sub-directories cannot be opened as text files; skip them.
            if not os.path.isfile(raw_train_file):
                continue

            f_type = check_file_charset(raw_train_file)  # diagnostic only
            print (raw_train_file,"字符集为:",f_type['encoding'])

            # Read-only access is enough; the inputs are never modified.
            with open(raw_train_file, 'r', encoding='utf-8') as f:
                context = f.readlines()

            output.writelines(context)
            # Terminate an unfinished last line first, otherwise the
            # separator below would only end that line and no empty line
            # would appear between two files.
            if context and not context[-1].endswith('\n'):
                output.write('\n')
            output.write('\n')
15
# Merge everything under ./unlabel into a single corpus file.
combine(corpus_path='./unlabel', outpath='all_unlabel.txt')
3.随机抽取.txt文件中的60%,20%,5%
def part(filename, outpath, ratio):
    """Write a random sample of the lines of a text file to another file.

    Args:
        filename: UTF-8 text file to sample from.
        outpath: Destination file; it is overwritten.
        ratio: Fraction of the lines to keep. The resulting line count is
            ``int(number_of_lines * ratio)``, limited to the range
            ``[0, number_of_lines]``.

    Returns:
        None. The number of sampled lines is printed.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        context = f.readlines()

    length = len(context)
    random_order = list(range(length))
    np.random.shuffle(random_order)

    batch_size = max(0, min(length, int(length * ratio)))
    print(batch_size)

    with open(outpath, 'w', encoding='utf-8') as output:
        # Pick lines through the shuffled indices; slicing ``context``
        # directly would always return the first lines of the file.
        for index in random_order[:batch_size]:
            line = context[index]
            # The last line of the input may lack a newline; add one so it
            # does not merge with the line written after it.
            output.write(line if line.endswith('\n') else line + '\n')
15
# Draw three samples of different sizes from the same training corpus.
ratio1, ratio2, ratio3 = 0.6, 0.2, 0.05
_sample_jobs = (
    ('training/law_train1.txt', ratio1),
    ('training/law_train2.txt', ratio2),
    ('training/law_train3.txt', ratio3),
)
for _target, _share in _sample_jobs:
    part('training/law_train.txt', _target, _share)
4.将已经分好词的文件去掉空格(正则),恢复成文件原来的样子
def deal_data(filename, outpath):
    """Remove all whitespace inside each line of a word-segmented file.

    Line breaks are preserved, so the output has the same number of lines as
    the input, with the separators between words removed.

    Args:
        filename: UTF-8 text file whose words are separated by whitespace.
        outpath: Destination file; it is overwritten.

    Returns:
        None.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        context = f.readlines()

    with open(outpath, 'w', encoding='utf-8') as output:
        for data in context:  # data is one line of the input
            # \s+ also matches the line terminator, so remember whether the
            # line had one and put it back after stripping.
            line_end = '\n' if data.endswith('\n') else ''
            output.write(re.sub(r'\s+', '', data) + line_end)
9
10
# Strip the segmentation whitespace from each evaluation set.
_evaluation_jobs = (
    ('evaluate/law/Law_contract_test.txt', 'evaluate/gold/Law_contract_test.txt'),
    ('evaluate/law/Law_marriage_test.txt', 'evaluate/gold/Law_marriage_test.txt'),
    ('evaluate/law/Law_mixed_test.txt', 'evaluate/gold/Law_mixed_test.txt'),
)
for _segmented, _restored in _evaluation_jobs:
    deal_data(_segmented, _restored)
5.读取excel文件转换成.json文件
1 #coding=utf-8
2 import xlrd #对excel文件内容读取
3 import xlwt #对excel文件内容写入
4 import json
5 """
6 打开excel文件 处理成json文件 {text:,label:}
7 data.xls变成train.json、val.json、test.json
8 """
9
def deal_data(filename,outpath):
    """Export the "train", "test" and "val" sheets of a workbook as JSON lines.

    For every sheet one file ``outpath + <sheet name> + ".json"`` is written,
    holding one JSON object per sheet row with the keys ``filepath``
    (column 0), ``text`` (column 1, stripped) and ``label`` (column 2, split
    on whitespace).

    Args:
        filename: Path of the Excel workbook, opened with
            ``xlrd.open_workbook``.
        outpath: String prefix for the output files. It is concatenated
            as-is, so a directory must end with a path separator.

    Returns:
        None.
    """
    wb = xlrd.open_workbook(filename)
    data_file = ["train", "test", "val"]

    for excel_name in data_file:
        output_file = outpath + excel_name + ".json"
        # Look the sheet up before creating the file so that a missing sheet
        # does not leave an empty output behind.
        excel = wb.sheet_by_name(excel_name)

        # The context manager closes each output even if a row fails.
        with open(output_file, "w", encoding="utf-8") as output:
            for i in range(excel.nrows):
                data_dic = {
                    "filepath": excel.cell_value(i, 0),
                    "text": excel.cell_value(i, 1).strip(),
                    "label": tuple(excel.cell_value(i, 2).split()),
                }
                output.write(json.dumps(data_dic) + "\n")
28
# Convert the workbook into train/test/val JSON files under the corpus directory.
deal_data(filename="data01.xls", outpath="corpus/class/origin_corpus/")

浙公网安备 33010602011771号