AI学习 - 诊断结论信息抽取 - 学习路径
时间安排建议
第1周:数据准备
- 转换Label Studio数据格式
- 实现数据增强,扩展到50+训练样本
- 划分训练/验证集(80/20)
第2-3周:模型训练
- 选择并测试不同预训练模型
- 微调BERT,达到基础效果
- 错误分析,针对性补充数据
第4周:优化部署
- 模型压缩与加速
- 构建推理管道
- 开发Web界面和API
关键里程碑
- ✅ 第1周末:完成数据准备与增强,跑通训练流程
- ✅ 第2周末:成功训练出第一个NER模型(F1>0.7)
- ✅ 第3周末:通过数据迭代达到F1>0.85,完成规则+AI混合解析系统
- ✅ 第4周末:可用的Web应用和API
从10份数据开始的BERT微调全攻略
当前状态分析
- ✅ 已有10份Label Studio标注数据
- ⏳ 需要扩展到有效训练集(建议50-100份起步)
- 🎯 目标:构建心电图报告NER系统
阶段一:数据准备与增强(1-2周)
第1步:数据格式转换(关键!)
Label Studio导出的数据需要转换成BERT训练格式。
# label_studio_to_bert.py
import json
from transformers import AutoTokenizer
def convert_label_studio_to_bert(label_studio_json, tokenizer, label2id):
    """Convert Label Studio span annotations into BERT token-classification samples.

    Args:
        label_studio_json: list of Label Studio export items; each carries the
            raw text under ``data.text`` and character-level spans under
            ``annotations[0].result``.
        tokenizer: a HuggingFace *fast* tokenizer (offset mapping required).
        label2id: mapping from BIO label strings (e.g. "B-HR") to integer ids;
            must contain the "O" label.

    Returns:
        list of dicts with ``input_ids``, ``attention_mask`` and ``labels``
        (token-level label ids in the BIO scheme).
    """
    bert_samples = []
    for item in label_studio_json:
        text = item["data"]["text"]
        annotations = item["annotations"][0]["result"]
        # Tokenize with character offsets so Label Studio's char-level spans
        # can be projected onto tokens exactly (the original left this
        # alignment unimplemented, producing all-"O" labels).
        encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
        offsets = encoding["offset_mapping"]
        labels = ["O"] * len(offsets)
        for ann in annotations:
            if "value" not in ann:
                continue
            start = ann["value"]["start"]
            end = ann["value"]["end"]
            entity = ann["value"]["labels"][0]
            first = True  # first overlapping token gets the B- tag
            for i, (tok_start, tok_end) in enumerate(offsets):
                # Skip tokens that do not overlap the annotated span.
                if tok_end <= start or tok_start >= end:
                    continue
                labels[i] = ("B-" if first else "I-") + entity
                first = False
        # Unknown labels fall back to "O" rather than raising.
        label_ids = [label2id.get(l, label2id["O"]) for l in labels]
        bert_samples.append({
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
            "labels": label_ids
        })
    return bert_samples
第2步:数据增强(解决数据稀缺)
用10份数据直接训练会过拟合,必须增强:
# data_augmentation.py
import random
from typing import List
def augment_ecg_report(text: str, annotations: List) -> List[tuple]:
    """Augment one ECG report for NER training (small-data setting).

    Args:
        text: the raw report text.
        annotations: character-level entity spans as (start, end, label) tuples.

    Returns:
        list of (augmented_text, augmented_annotations) pairs. Spans are
        re-offset to stay aligned with the rewritten text; spans overlapping
        an edited region are dropped rather than guessed.

    The original version defined the strategies but never applied them and
    always returned an empty list.
    """
    import re  # local import: this snippet's header only imports random/typing

    augmented = []

    # Strategy 1: domain-specific synonym substitution.
    # NOTE(review): "房室性" (atrioventricular) may not be a true synonym of
    # "房性" (atrial) -- confirm with a clinician before relying on it.
    medical_synonyms = {
        "心率": ["心律", "心跳频率"],
        "次/分": ["bpm", "次每分钟"],
        "早搏": ["期前收缩", "额外收缩"],
        "房性": ["心房性", "房室性"]
    }

    def shift_spans(spans, edit_start, edit_end, shift):
        """Re-offset spans around one edit; drop spans overlapping the edit."""
        result = []
        for start, end, label in spans:
            if end <= edit_start:
                result.append((start, end, label))
            elif start >= edit_end:
                result.append((start + shift, end + shift, label))
            # overlapping spans are dropped -- safer than guessing boundaries
        return result

    for term, alternatives in medical_synonyms.items():
        pos = text.find(term)
        if pos == -1:
            continue
        replacement = random.choice(alternatives)
        new_text = text[:pos] + replacement + text[pos + len(term):]
        new_anns = shift_spans(annotations, pos, pos + len(term),
                               len(replacement) - len(term))
        augmented.append((new_text, new_anns))

    # Strategy 2: numeric variation (kept within a medically plausible range).
    def vary_number(number_str, variation=0.1):
        num = int(number_str)
        varied = num + random.randint(-int(num * variation), int(num * variation))
        return str(max(1, varied))  # keep the value positive

    digit_runs = list(re.finditer(r"\d+", text))
    if digit_runs:
        pieces, edits, cursor = [], [], 0
        for m in digit_runs:
            pieces.append(text[cursor:m.start()])
            varied = vary_number(m.group())
            pieces.append(varied)
            edits.append((m.start(), m.end(), len(varied) - len(m.group())))
            cursor = m.end()
        pieces.append(text[cursor:])
        new_anns = []
        for start, end, label in annotations:
            # Drop spans that a varied number straddles; otherwise apply the
            # cumulative shift of all edits finishing before each boundary.
            if any(ms < start < me or ms < end < me for ms, me, _ in edits):
                continue
            ds = sum(sh for ms, me, sh in edits if me <= start)
            de = sum(sh for ms, me, sh in edits if me <= end)
            new_anns.append((start + ds, end + de, label))
        augmented.append(("".join(pieces), new_anns))

    # Strategy 3: sentence-template rewrapping (pure prefix insertion, so
    # every span shifts by the same amount).
    templates = [
        "监测显示{content}",
        "心电图报告:{content}",
        "检查结果:{content}"
    ]
    for template in templates:
        prefix_len = template.index("{content}")
        new_anns = [(s + prefix_len, e + prefix_len, l)
                    for s, e, l in annotations]
        augmented.append((template.format(content=text), new_anns))

    return augmented
第3步:主动学习标注
# active_learning.py
"""
用已有模型预测未标注数据,选择最难样本标注
"""
def select_samples_for_annotation(unlabeled_texts, model, tokenizer, n=10):
    """Pick the *n* unlabeled texts the model is least certain about.

    Uncertainty is the mean per-token entropy of the predicted label
    distribution; higher entropy means the sample is more informative to
    annotate next (classic uncertainty-sampling active learning).

    Args:
        unlabeled_texts: iterable of raw strings.
        model: token-classification model returning ``.logits``.
        tokenizer: tokenizer supporting ``return_tensors="pt"``.
        n: number of texts to return.

    Returns:
        the *n* most-uncertain texts, most uncertain first.
    """
    import torch  # was missing from the original snippet; torch.softmax below

    uncertainties = []
    for text in unlabeled_texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():  # inference only -- no gradients needed
            outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=-1)
        # Mean token-level entropy of the predictive distribution.
        entropy = -torch.sum(probabilities * torch.log(probabilities), dim=-1).mean()
        uncertainties.append((text, entropy.item()))
    # Most uncertain first.
    uncertainties.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in uncertainties[:n]]
阶段二:BERT模型选择与微调(2-3周)
第1步:选择预训练模型
根据你的数据特点选择:
# 选项1:通用中文BERT(起步最快)
model_name = "bert-base-chinese"
# 选项2:医学领域BERT(效果更好)
model_name = "UCSD-AI4H/MedicalBERT" # 英文医学
# 或使用中文医学BERT:
model_name = "GanjinZero/UMLSBert_zh" # 中文医学知识增强
# 选项3:轻量级模型(部署友好)
model_name = "hfl/chinese-roberta-wwm-ext" # 中文RoBERTa
第2步:环境搭建与数据加载
# train_ner.py
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer,
DataCollatorForTokenClassification
)
from datasets import Dataset as HFDataset
import numpy as np
# 1. 定义数据集类
class ECGDataset(Dataset):
    """Token-classification dataset over pre-tokenized encodings and labels.

    ``encodings`` is a dict of parallel per-sample lists (e.g. input_ids,
    attention_mask); ``labels`` is the matching list of label-id sequences.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {name: torch.tensor(values[idx])
                  for name, values in self.encodings.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# 2. 标签体系(根据你的标注定义)
label_list = [
"O", # 其他
"B-HR", "I-HR", # 心率
"B-VALUE", "I-VALUE", # 数值
"B-TIME", "I-TIME", # 时间
"B-EVENT", "I-EVENT", # 事件
"B-DIAG", "I-DIAG", # 诊断
]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
第3步:训练配置与微调
# 3. 加载模型和分词器
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
model_name,
num_labels=len(label_list),
id2label=id2label,
label2id=label2id
)
# 4. 准备数据
def prepare_data(texts, all_labels, tok=None, lab2id=None):
    """Tokenize *texts* and align word-level BIO labels to sub-tokens.

    Args:
        texts: list of raw strings (not pre-split into words).
        all_labels: per-text list of word-level BIO label strings.
        tok: tokenizer to use; defaults to the module-level ``tokenizer``
            (backward compatible with the original snippet's hidden global).
        lab2id: label -> id mapping; defaults to module-level ``label2id``.

    Returns:
        (encodings, labels): the BatchEncoding plus per-token label-id lists
        with -100 on special tokens and on non-initial sub-tokens of
        B-labelled words (ignored by the loss).
    """
    if tok is None:
        tok = tokenizer
    if lab2id is None:
        lab2id = label2id
    encodings = tok(
        texts,
        truncation=True,
        padding=True,
        max_length=256,
        is_split_into_words=False  # raw text, not pre-tokenized words
    )
    # Label alignment (the critical step!)
    labels = []
    for i, label in enumerate(all_labels):
        word_ids = encodings.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # [CLS]/[SEP]/padding -- masked out of the loss
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word carries the word's label.
                label_ids.append(lab2id[label[word_idx]])
            else:
                # Continuation sub-token: keep I- labels, mask B- labels.
                label_ids.append(lab2id[label[word_idx]] if label[word_idx].startswith("I") else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    return encodings, labels
# 5. 训练参数配置
training_args = TrainingArguments(
output_dir="./ecg-ner-model",
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
num_train_epochs=10, # 小数据需要更多epoch
weight_decay=0.01,
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="f1",
logging_dir="./logs",
logging_steps=10,
report_to="none", # 关闭wandb等
)
# 6. 定义评估指标
def compute_metrics(p):
    """Seqeval-based NER metrics for the HuggingFace Trainer.

    Args:
        p: (predictions, labels) -- logits of shape (batch, seq, num_labels)
           and gold label ids with -100 marking positions to ignore.

    Returns:
        dict with macro-averaged precision / recall / f1.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Strip padding / special tokens (marked -100) before scoring.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # seqeval is a third-party dependency; imported lazily like the original.
    from seqeval.metrics import classification_report
    report = classification_report(true_labels, true_predictions, output_dict=True)
    # BUGFIX: seqeval (like sklearn) names this key "f1-score", not "f1" --
    # the original raised KeyError here.
    return {
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"],
    }
# 7. 创建Trainer并训练
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=DataCollatorForTokenClassification(tokenizer),
compute_metrics=compute_metrics,
)
trainer.train()
第4步:解决小数据训练的关键技巧
# 技巧1:分层采样确保每个标签都有样本
def stratified_split(texts, labels, test_size=0.2, seed=42):
    """Split (texts, labels) so every entity label appears in both splits.

    Samples are ordered by the set of non-"O" labels they contain (random
    tie-break within a group) and a systematic 1-in-k sample is drawn across
    that ordering, which keeps the test split's label mix close to the full
    data's. The original left this function unimplemented.

    Args:
        texts: list of raw texts.
        labels: parallel list of per-text label sequences.
        test_size: desired test fraction (0 < test_size < 1).
        seed: RNG seed for a reproducible split.

    Returns:
        (train_texts, test_texts, train_labels, test_labels)
    """
    import random
    if not texts:
        return [], [], [], []
    if test_size <= 0:
        return list(texts), [], list(labels), []
    rng = random.Random(seed)  # deterministic for reproducibility
    order = sorted(
        range(len(texts)),
        key=lambda i: (sorted({l for l in labels[i] if l != "O"}), rng.random()),
    )
    step = max(2, int(round(1.0 / test_size)))
    test_idx = set(order[::step])
    train_texts = [texts[i] for i in range(len(texts)) if i not in test_idx]
    test_texts = [texts[i] for i in range(len(texts)) if i in test_idx]
    train_labels = [labels[i] for i in range(len(texts)) if i not in test_idx]
    test_labels = [labels[i] for i in range(len(texts)) if i in test_idx]
    return train_texts, test_texts, train_labels, test_labels
# 技巧2:学习率预热
training_args = TrainingArguments(
warmup_steps=100, # 前100步慢慢增加学习率
# ...
)
# 技巧3:梯度累积(模拟更大batch size)
training_args = TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=2, # 相当于batch_size=8
# ...
)
# 技巧4:早停防止过拟合
from transformers import EarlyStoppingCallback
trainer = Trainer(
# ...
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
阶段三:模型评估与迭代(1-2周)
第1步:错误分析
# error_analysis.py
def analyze_errors(model, tokenizer, test_samples, label_map=None):
    """Categorize model mistakes on (text, true_labels) test pairs.

    Args:
        model: token-classification model returning ``.logits``.
        tokenizer: tokenizer supporting ``return_tensors="pt"``.
        test_samples: iterable of (text, true_labels); true_labels must be
            aligned with the tokens that remain after stripping [CLS]/[SEP].
        label_map: id -> label-string mapping; defaults to the module-level
            ``id2label`` from the training setup (backward compatible).

    Returns:
        dict of false positives / false negatives / boundary errors plus a
        (gold, predicted) -> count confusion table.
    """
    import torch  # the original snippet used torch without importing it

    if label_map is None:
        label_map = id2label
    errors = {
        "false_positives": [],   # predicted an entity where gold says O
        "false_negatives": [],   # predicted O where gold has an entity
        "boundary_errors": [],   # right entity type, wrong B/I boundary
        "label_confusion": {}    # (gold, predicted) -> count
    }
    for text, true_labels in test_samples:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)[0]
        # Drop the [CLS] and [SEP] predictions before comparing.
        pred_labels = [label_map[p.item()] for p in predictions[1:-1]]
        for pos, (pred, gold) in enumerate(zip(pred_labels, true_labels)):
            if pred == gold:
                continue
            if gold == "O":
                errors["false_positives"].append((text, pos, pred))
            elif pred == "O":
                errors["false_negatives"].append((text, pos, gold))
            else:
                key = (gold, pred)
                errors["label_confusion"][key] = errors["label_confusion"].get(key, 0) + 1
                if gold[2:] == pred[2:]:
                    # e.g. B-HR vs I-HR: same entity type, shifted boundary
                    errors["boundary_errors"].append((text, pos, gold, pred))
    return errors
# Generate the error report
errors = analyze_errors(model, tokenizer, test_data)
# BUGFIX: `errors` is a plain dict, so it has no .most_common(); rank the
# label-confusion counts explicitly instead.
top_errors = sorted(errors["label_confusion"].items(), key=lambda kv: kv[1], reverse=True)
print("最常见的错误类型:", top_errors[:5])
第2步:针对性数据补充
根据错误分析结果,在Label Studio中:
- 标注更多模型易错的样本类型
- 修正原有标注中的错误
- 增加边界情况的样本
第3步:模型压缩与优化
# 模型量化(减小模型大小,加速推理)
from transformers import AutoModelForTokenClassification
import torch
# 动态量化
model = AutoModelForTokenClassification.from_pretrained("./ecg-ner-model")
quantized_model = torch.quantization.quantize_dynamic(
model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_model.save_pretrained("./ecg-ner-model-quantized")
# 使用ONNX加速
from transformers.convert_graph_to_onnx import convert
convert(
framework="pt",
model="./ecg-ner-model",
output="./ecg-ner-model.onnx",
opset=12
)
阶段四:部署与应用(1周)
第1步:创建推理管道
# inference_pipeline.py
from transformers import pipeline
class ECGReportParser:
    """Hybrid ECG-report parser: NER model first, rule system as fallback."""

    def __init__(self, model_path="./ecg-ner-model"):
        self.ner_pipeline = pipeline(
            "ner",
            model=model_path,
            tokenizer=model_path,
            aggregation_strategy="simple"  # merge sub-word pieces into entities
        )
        self.rule_parser = RuleParser()  # project rule-based parser

    def parse(self, text):
        """Parse one report into a structured dict (NER + rule fallback)."""
        # 1. NER extraction
        ner_results = self.ner_pipeline(text)
        # 2. Structure the raw entities
        structured = self.structure_results(ner_results)
        # 3. Rule-based fallback for anything the model missed
        rule_results = self.rule_parser.parse(text)
        structured = self.merge_results(structured, rule_results)
        return structured

    def structure_results(self, ner_results):
        """Group pipeline entities into a structured result dict."""
        result = {
            "heart_rates": [],
            "events": [],
            "diagnoses": []
        }
        for entity in ner_results:
            if entity["entity_group"] == "HR":
                result["heart_rates"].append({
                    "value": entity["word"],
                    "confidence": entity["score"]
                })
            # TODO: handle EVENT / DIAG entity groups the same way
        return result

    def merge_results(self, structured, rule_results):
        """Fill gaps in the NER output with rule-parser results.

        NER results win; a rule result is only used for a key that the NER
        pass left missing or empty. (BUGFIX: parse() called this method but
        the original snippet never defined it -> AttributeError at runtime.)
        NOTE(review): assumes the rule parser returns a dict with the same
        key layout as structure_results() -- confirm against RuleParser.
        """
        merged = dict(structured)
        if not isinstance(rule_results, dict):
            return merged
        for key, value in rule_results.items():
            if not merged.get(key):
                merged[key] = value
        return merged
第2步:Web界面
# app.py -- Streamlit front end for the ECG report parser.
import streamlit as st
import pandas as pd
import json
from inference_pipeline import ECGReportParser
st.set_page_config(page_title="ECG报告解析", layout="wide")
@st.cache_resource
def load_parser():
    # Cached so the model loads once per server process, not on every rerun.
    return ECGReportParser()
parser = load_parser()
st.title("🫀 心电图报告智能解析系统")
st.markdown("上传心电图报告文本,自动提取关键信息")
# Input option 1: text box
text_input = st.text_area("直接粘贴报告文本", height=200)
# Input option 2: file upload
uploaded_file = st.file_uploader("或上传文件", type=["txt", "docx"])
if text_input or uploaded_file:
    if uploaded_file:
        # NOTE(review): .docx is a binary zip format, so decode("utf-8")
        # will fail for it; only .txt uploads work as written -- confirm.
        text_input = uploaded_file.read().decode("utf-8")
    with st.spinner("解析中..."):
        result = parser.parse(text_input)
    # Show original text and structured output side by side.
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("📋 原始文本")
        # Truncate long reports to 500 chars for display.
        st.text(text_input[:500] + "..." if len(text_input) > 500 else text_input)
    with col2:
        st.subheader("📊 结构化结果")
        # Metric card for the average heart rate, if present.
        if result.get("heart_rates"):
            # NOTE(review): structure_results() never sets a "context" key,
            # so this lookup currently never matches -- verify upstream.
            avg_hr = next((hr for hr in result["heart_rates"] if "平均" in hr.get("context", "")), None)
            if avg_hr:
                st.metric("平均心率", f"{avg_hr['value']} 次/分")
        # Full structured payload.
        st.json(result)
        # Download the result as pretty-printed JSON (keep CJK unescaped).
        st.download_button(
            label="下载JSON结果",
            data=json.dumps(result, indent=2, ensure_ascii=False),
            file_name="ecg_analysis.json",
            mime="application/json"
        )
第3步:API服务
# api.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from inference_pipeline import ECGReportParser
import uvicorn
app = FastAPI(title="ECG报告解析API")
parser = ECGReportParser()
class ECGRequest(BaseModel):
    """Request body for the /parse endpoint."""
    text: str  # raw ECG report text to parse
    format: str = "json" # output format: "json" or "csv"
@app.post("/parse")
async def parse_ecg(request: ECGRequest):
    """Parse an ECG report.

    Returns {"success": True, "data": <structured result>}; any parser
    failure is surfaced as HTTP 500 with the exception text as detail.
    """
    try:
        result = parser.parse(request.text)
        if request.format == "csv":
            # TODO: CSV conversion not implemented -- response is JSON
            # regardless of the requested format.
            pass
        return {"success": True, "data": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
遇到问题的解决方案
问题1:数据太少,模型过拟合
# 解决方案:使用更多正则化
training_args = TrainingArguments(
learning_rate=1e-5, # 更小的学习率
weight_decay=0.1, # 更强的权重衰减
num_train_epochs=20, # 更多epoch配合早停
# 添加dropout
)
model.config.hidden_dropout_prob = 0.3 # 增加dropout
model.config.attention_probs_dropout_prob = 0.3
问题2:标签不平衡
# 解决方案:自定义损失函数
from torch import nn
import torch
class WeightedCrossEntropyLoss(nn.Module):
    """Cross-entropy scaled by a per-class weight, to counter label imbalance.

    Each element's unreduced CE loss is multiplied by the weight of its
    target class, then the mean is taken.
    """

    def __init__(self, class_weights):
        super().__init__()
        self.weights = torch.tensor(class_weights)

    def forward(self, inputs, targets):
        # Unreduced per-element loss, then scale by each target's class weight.
        per_item = nn.functional.cross_entropy(inputs, targets, reduction='none')
        scaled = per_item * self.weights[targets]
        return scaled.mean()
# Use the weighted loss in training. BUGFIX: `Trainer` has no `compute_loss`
# constructor argument (the original call raised TypeError) -- the supported
# way is to subclass Trainer and override the compute_loss method.
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Replace the default loss with the class-weighted cross-entropy."""
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Flatten to (tokens, num_labels) vs (tokens,) and mask the -100
        # positions, which must not be indexed into the class-weight table.
        flat_labels = labels.view(-1)
        active = flat_labels != -100
        loss = WeightedCrossEntropyLoss(class_weights)(
            logits.view(-1, logits.size(-1))[active], flat_labels[active]
        )
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    # ...
)
问题3:部署内存不足
# 解决方案:使用更小的模型
# 1. DistilBERT(参数减少40%)
model_name = "hfl/chinese-distilbert-base"
# 2. 模型剪枝
from transformers import AutoModelForTokenClassification
import torch.nn.utils.prune as prune
model = AutoModelForTokenClassification.from_pretrained(...)
# 剪枝20%的权重
for name, module in model.named_modules():
if isinstance(module, torch.nn.Linear):
prune.l1_unstructured(module, name='weight', amount=0.2)
下一步行动清单
1. 立即行动:
   # 1. 导出Label Studio数据
   # 2. 运行转换脚本
   python label_studio_to_bert.py --input exported.json --output train_data.json
   # 3. 开始第一次训练
   python train_ner.py --data train_data.json --epochs 5
2. 评估基线:用10份数据训练一个基础模型,看效果
3. 数据迭代:用主动学习选择下一批要标注的样本
4. 逐步完善:每周增加20-30份标注,重新训练模型
记住:BERT微调是一个迭代过程。先用小数据跑通整个流程,再逐步提升数据质量和数量。你现在已经完成了最困难的数据标注起步阶段,接下来就是技术实现问题了!
本文来自博客园,作者:VipSoft 转载请注明原文链接:https://www.cnblogs.com/vipsoft/p/19493648
浙公网安备 33010602011771号