import re
import jieba.analyse
import codecs
import pandas as pd
def word_replace(xianbingshi, hospital1, output_path=None):
    """Mask hospital names in a text file with the ``[hospital]`` placeholder.

    The vocabulary is read from ``hospital1`` (UTF-8, one term per line),
    stripped, de-duplicated and ordered longest-first so that a long name is
    masked before any shorter name it contains. ``xianbingshi`` is then read
    line by line; each line is masked, stripped, echoed to stdout and
    collected, and the collected lines are written to ``output_path``.
    The input file itself is not modified.

    Args:
        xianbingshi: Path of the UTF-8 source text file.
        hospital1: Path of the UTF-8 hospital vocabulary, one term per line.
        output_path: Where to write the masked text. Defaults to the
            historical hard-coded ``xianbingshi_write_sub.txt`` location so
            existing two-argument callers behave as before.
    """
    if output_path is None:
        output_path = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\code\xianbingshi_write_sub.txt'

    with codecs.open(hospital1, 'r', 'utf8') as f:
        stripped = [raw.strip() for raw in f]
    # dict.fromkeys de-duplicates in linear time and keeps first-seen order.
    # Blank lines must be dropped: str.replace('', x) inserts x between every
    # character, which would corrupt the whole text.
    hospital = list(dict.fromkeys(term for term in stripped if term))
    # Longest first; the sort is stable, so equal-length terms keep file order.
    hospital.sort(key=len, reverse=True)

    data = []
    with codecs.open(xianbingshi, 'r', 'utf8') as f:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for line in f:
            for ho in hospital:
                # replace() is already a no-op when the term is absent.
                line = line.replace(ho, '[hospital]')
            line = line.strip()
            data.append(line)
            print(line)

    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write(''.join(line + '\n' for line in data))
def word_replace3(xianbingshi2, operation1):
    """Replace every operation term found in a text file with ``[operation]``.

    The vocabulary is read from ``operation1`` (UTF-8, one term per line),
    stripped, de-duplicated and ordered longest-first so that a long term is
    masked before any shorter term it contains. ``xianbingshi2`` is read line
    by line; each line is masked, stripped and echoed to stdout, and the file
    is then overwritten in place with the masked lines.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        operation1: Path of the UTF-8 vocabulary file, one term per line.
    """
    with codecs.open(operation1, 'r', 'utf8') as f:
        stripped = [raw.strip() for raw in f]
    # Blank vocabulary lines are skipped: '' is "in" every string and
    # str.replace('', x) would insert x between every character.
    # dict.fromkeys de-duplicates in linear time, keeping first-seen order.
    operation = list(dict.fromkeys(term for term in stripped if term))
    # Longest first; the sort is stable, so equal-length terms keep file order.
    operation.sort(key=len, reverse=True)

    data = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as f:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for line in f:
            for op in operation:
                line = line.replace(op, '[operation]')
            line = line.strip()
            data.append(line)
            print(line)

    # The input was read completely above, so overwriting it here is safe.
    with codecs.open(xianbingshi2, 'w', 'utf8') as f:
        f.write(''.join(line + '\n' for line in data))
def word_replace1(xianbingshi2, disease1):
    """Mask multi-character disease terms in a text file with ``[disease]``.

    Terms come from ``disease1`` (UTF-8, one per line, stripped and
    de-duplicated) and are applied longest-first. Terms of length 0 or 1 are
    never applied. Every line of ``xianbingshi2`` is masked, stripped and
    printed, and the file is then overwritten in place with the result.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        disease1: Path of the UTF-8 disease vocabulary, one term per line.
    """
    # A dict used as an ordered set: keeps the first occurrence of each term.
    unique_terms = {}
    with codecs.open(disease1, 'r', 'utf8') as vocab_file:
        for raw in vocab_file:
            unique_terms.setdefault(raw.strip(), None)

    # Longest first (stable for ties); only terms longer than one character
    # are ever substituted.
    ranked = sorted(unique_terms, key=len, reverse=True)
    usable_terms = [term for term in ranked if len(term) > 1]

    masked_rows = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as text_file:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for row in text_file:
            for term in usable_terms:
                row = row.replace(term, '[disease]')
            row = row.strip()
            print(row)
            masked_rows.append(row)

    with codecs.open(xianbingshi2, 'w', 'utf8') as out_file:
        out_file.write(''.join(row + '\n' for row in masked_rows))
def word_replace2(xianbingshi2, symptom1):
    """Mask multi-character symptom terms in a text file with ``[symptom]``.

    Terms come from ``symptom1`` (UTF-8, one per line, stripped and
    de-duplicated) and are applied longest-first. Terms of length 0 or 1 are
    never applied. Every line of ``xianbingshi2`` is masked, stripped and
    printed, and the file is then overwritten in place with the result.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        symptom1: Path of the UTF-8 symptom vocabulary, one term per line.
    """
    with codecs.open(symptom1, 'r', 'utf8') as vocab_file:
        # dict.fromkeys keeps only the first occurrence of each stripped term.
        distinct = dict.fromkeys(entry.strip() for entry in vocab_file)

    # Sort longest first (ties keep file order), then drop the terms that the
    # length check would reject anyway.
    by_length = sorted(distinct, key=len, reverse=True)
    candidates = [term for term in by_length if len(term) > 1]

    def mask(text_line):
        # Apply every candidate in order, then trim surrounding whitespace.
        for term in candidates:
            text_line = text_line.replace(term, '[symptom]')
        return text_line.strip()

    results = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as text_file:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for text_line in text_file:
            cleaned = mask(text_line)
            results.append(cleaned)
            print(cleaned)

    with codecs.open(xianbingshi2, 'w', 'utf8') as out_file:
        for cleaned in results:
            out_file.write(cleaned + '\n')
def word_replace4(xianbingshi2, test1):
    """Replace every examination/test term in a text file with ``[test]``.

    The vocabulary is read from ``test1`` (UTF-8, one term per line),
    stripped, de-duplicated and ordered longest-first so that a long term is
    masked before any shorter term it contains. ``xianbingshi2`` is read line
    by line; each line is masked, stripped and echoed to stdout, and the file
    is then overwritten in place with the masked lines.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        test1: Path of the UTF-8 vocabulary file, one term per line.
    """
    with codecs.open(test1, 'r', 'utf8') as f:
        stripped = [raw.strip() for raw in f]
    # Blank vocabulary lines are skipped: '' is "in" every string and
    # str.replace('', x) would insert x between every character.
    # dict.fromkeys de-duplicates in linear time, keeping first-seen order.
    test = list(dict.fromkeys(term for term in stripped if term))
    # Longest first; the sort is stable, so equal-length terms keep file order.
    test.sort(key=len, reverse=True)

    data = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as f:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for line in f:
            for te in test:
                line = line.replace(te, '[test]')
            line = line.strip()
            data.append(line)
            print(line)

    # The input was read completely above, so overwriting it here is safe.
    with codecs.open(xianbingshi2, 'w', 'utf8') as f:
        f.write(''.join(line + '\n' for line in data))
def word_replace5(xianbingshi2, time1):
    """Replace every time expression in a text file with ``[time]``.

    The vocabulary is read from ``time1`` (UTF-8, one term per line),
    stripped, de-duplicated and ordered longest-first so that a long term is
    masked before any shorter term it contains. ``xianbingshi2`` is read line
    by line; each line is masked, stripped and echoed to stdout, and the file
    is then overwritten in place with the masked lines.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        time1: Path of the UTF-8 vocabulary file, one term per line.
    """
    with codecs.open(time1, 'r', 'utf8') as f:
        stripped = [raw.strip() for raw in f]
    # Blank vocabulary lines are skipped: '' is "in" every string and
    # str.replace('', x) would insert x between every character.
    # dict.fromkeys de-duplicates in linear time, keeping first-seen order.
    time_terms = list(dict.fromkeys(term for term in stripped if term))
    # Longest first; the sort is stable, so equal-length terms keep file order.
    time_terms.sort(key=len, reverse=True)

    data = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as f:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for line in f:
            for t in time_terms:
                line = line.replace(t, '[time]')
            line = line.strip()
            data.append(line)
            print(line)

    # The input was read completely above, so overwriting it here is safe.
    with codecs.open(xianbingshi2, 'w', 'utf8') as f:
        f.write(''.join(line + '\n' for line in data))
def word_replace6(xianbingshi2, organ1):
    """Mask multi-character body-part terms in a text file with ``[organ]``.

    Terms come from ``organ1`` (UTF-8, one per line, stripped and
    de-duplicated) and are applied longest-first. Terms of length 0 or 1 are
    never applied. Every line of ``xianbingshi2`` is masked, stripped and
    printed, and the file is then overwritten in place with the result.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        organ1: Path of the UTF-8 body-part vocabulary, one term per line.
    """
    # Collect distinct stripped terms in first-seen order.
    ordered_terms = []
    already_seen = set()
    with codecs.open(organ1, 'r', 'utf8') as vocab_file:
        for raw in vocab_file:
            term = raw.strip()
            if term in already_seen:
                continue
            already_seen.add(term)
            ordered_terms.append(term)

    # Longest first (stable for ties); single-character and empty terms are
    # excluded because they are never substituted by this pass.
    replaceable = [
        term
        for term in sorted(ordered_terms, key=len, reverse=True)
        if len(term) > 1
    ]

    output_rows = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as text_file:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for row in text_file:
            for term in replaceable:
                row = row.replace(term, '[organ]')
            trimmed = row.strip()
            output_rows.append(trimmed)
            print(trimmed)

    with codecs.open(xianbingshi2, 'w', 'utf8') as out_file:
        out_file.write(''.join(trimmed + '\n' for trimmed in output_rows))
def word_replace7(xianbingshi2, symptom1):
    """Mask single-character symptom terms in a text file with ``[symptom]``.

    Only vocabulary entries from ``symptom1`` whose stripped length is exactly
    one character are used; each accepted entry is printed once as it is
    loaded. Every line of ``xianbingshi2`` is then masked, stripped and
    printed, and the file is overwritten in place with the result.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        symptom1: Path of the UTF-8 symptom vocabulary, one term per line.
    """
    single_chars = []
    with codecs.open(symptom1, 'r', 'utf8') as vocab_file:
        for raw in vocab_file:
            candidate = raw.strip()
            # Guard clause: skip anything that is not a new one-character term.
            if len(candidate) != 1 or candidate in single_chars:
                continue
            single_chars.append(candidate)
            print(candidate)

    # Every kept term has length 1, so this stable sort leaves the order
    # untouched; it is retained to mirror the other passes.
    single_chars = sorted(single_chars, key=len, reverse=True)

    masked_rows = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as text_file:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for row in text_file:
            for char in single_chars:
                row = row.replace(char, '[symptom]')
            row = row.strip()
            print(row)
            masked_rows.append(row)

    with codecs.open(xianbingshi2, 'w', 'utf8') as out_file:
        out_file.write(''.join(row + '\n' for row in masked_rows))
def word_replace8(xianbingshi2, disease1):
    """Mask single-character disease terms in a text file with ``[disease]``.

    Only vocabulary entries from ``disease1`` whose stripped length is exactly
    one character are used. Every line of ``xianbingshi2`` is masked,
    stripped and echoed to stdout, and the file is then overwritten in place
    with the masked lines.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        disease1: Path of the UTF-8 disease vocabulary, one term per line.
    """
    with codecs.open(disease1, 'r', 'utf8') as f:
        stripped = [raw.strip() for raw in f]
    # The filter must test the term's length. The previous check compared the
    # string itself with the integer 1, which is never true, so no term was
    # ever loaded and the pass masked nothing.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    disease = list(dict.fromkeys(term for term in stripped if len(term) == 1))

    data = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as f:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for line in f:
            for di in disease:
                line = line.replace(di, '[disease]')
            line = line.strip()
            data.append(line)
            print(line)

    # The input was read completely above, so overwriting it here is safe.
    with codecs.open(xianbingshi2, 'w', 'utf8') as f:
        f.write(''.join(line + '\n' for line in data))
def word_replace9(xianbingshi2, organ1):
    """Mask single-character body-part terms in a text file with ``[organ]``.

    Only vocabulary entries from ``organ1`` whose stripped length is exactly
    one character are used. Every line of ``xianbingshi2`` is masked,
    stripped and echoed to stdout, and the file is then overwritten in place
    with the masked lines.

    Args:
        xianbingshi2: Path of the UTF-8 text file to mask; rewritten in place.
        organ1: Path of the UTF-8 body-part vocabulary, one term per line.
    """
    with codecs.open(organ1, 'r', 'utf8') as f:
        stripped = [raw.strip() for raw in f]
    # The filter must test the term's length. The previous check compared the
    # string itself with the integer 1, which is never true, so no term was
    # ever loaded and the pass masked nothing.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    organ = list(dict.fromkeys(term for term in stripped if len(term) == 1))

    data = []
    with codecs.open(xianbingshi2, 'r', 'utf8') as f:
        # Masking priority across the passes (per the original note):
        # hospital, operation, test, symptom, disease, body part, time.
        for line in f:
            for o in organ:
                line = line.replace(o, '[organ]')
            line = line.strip()
            data.append(line)
            print(line)

    # The input was read completely above, so overwriting it here is safe.
    with codecs.open(xianbingshi2, 'w', 'utf8') as f:
        f.write(''.join(line + '\n' for line in data))
if __name__ == '__main__':
    # Vocabulary files, one term per line.
    hospital1 = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\TXT\hospital_0903.txt'
    operation1 = r"C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\TXT\operation_0903.txt"
    disease1 =r'C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\TXT\disease_0903.txt'
    symptom1 = r"C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\code\症状.txt"
    test1 = r"C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\TXT\test_0903.txt"
    organ1 = r"C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\TXT\organ_0903.txt"
    # Only needed by the time pass, which is currently disabled below.
    time1 = r"C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\time1.txt"

    # Raw input text, and the working copy that the later passes rewrite.
    xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\code\xianbingshi_write.txt'
    xianbingshi2 =r'C:\Users\Administrator.SC-201812211013\PycharmProjects\词表工作代码\yiwoqu\code\xianbingshi_write_sub.txt'

    # The hospital pass reads the raw text; by default it writes its result to
    # the same location that xianbingshi2 names.
    word_replace(xianbingshi, hospital1)

    # Remaining passes run in this order, each rewriting xianbingshi2 in place.
    for masking_pass, vocabulary in (
        (word_replace3, operation1),
        (word_replace1, disease1),
        (word_replace2, symptom1),
        (word_replace4, test1),
        # (word_replace5, time1),  # time pass is switched off
        (word_replace6, organ1),
        (word_replace7, symptom1),
        (word_replace8, disease1),
        (word_replace9, organ1),
    ):
        masking_pass(xianbingshi2, vocabulary)