NLP Transformers: Building and Training a Multi-Label, Multi-Class Classification Model
A worked example of multi-label, multi-class classification in NLP.
Because this is medical data, the biggest problem it raises is class imbalance. Four ways to deal with it:
- Resampling: take the original dataset X and split it into training and test sets. Using the training set as an example, if 1000 samples are needed and the training set has 5 classes, draw randomly from each class 200 times so the training data is balanced. Not well suited to multi-label data (a minimal sketch of this and the next option follows the list).
- Re-weighting: adjust the loss weights during training so that minority classes receive larger weights. Also not well suited to multi-label data.
- Focal Loss: down-weights easy-to-classify samples and up-weights hard ones, which mitigates class imbalance and helps the model pay more attention to minority-class samples.
- Kappa Loss: a loss function based on Cohen's Kappa coefficient. It can also handle class imbalance and encourages the model to classify minority classes correctly.
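As mentioned in the first two items, resampling and class re-weighting are straightforward in the single-label case. Below is a minimal, hedged sketch of both, assuming integer class-id labels; the names labels, sample_weights, and class_weights are illustrative and not part of the original code.
# Sketch of options 1 and 2 for a *single-label* setting (they do not carry over
# directly to multi-label, as noted above). `labels` is an assumed 1-D array of
# integer class ids, used only for illustration.
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import WeightedRandomSampler

labels = np.random.randint(0, 5, size=1000)   # dummy labels for 5 classes
class_counts = np.bincount(labels)            # number of samples per class

# Option 1: resampling -- draw each sample with probability inversely proportional
# to its class frequency, so every class is seen about equally often per epoch.
sample_weights = 1.0 / class_counts[labels]
sampler = WeightedRandomSampler(torch.as_tensor(sample_weights, dtype=torch.double),
                                num_samples=len(labels), replacement=True)
# pass sampler=sampler to DataLoader instead of shuffle=True

# Option 2: re-weighting -- give minority classes a larger weight in the loss.
class_weights = torch.tensor(class_counts.sum() / class_counts, dtype=torch.float)
criterion = nn.CrossEntropyLoss(weight=class_weights)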
Python code:
import torch
import torch.nn as nn
import torch.optim as optim

class FocalLoss(nn.Module):
    """Focal loss: down-weights easy samples via the (1 - pt) ** gamma factor."""
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # Per-sample cross entropy, then re-weighted by how "easy" each sample is.
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        return focal_loss

class KappaLoss(nn.Module):
    """Loss based on Cohen's Kappa: 1 minus the mean kappa over label dimensions."""
    def __init__(self):
        super(KappaLoss, self).__init__()

    def forward(self, y_pred, y_true, epsilon=1e-6):
        observed_agreement = torch.sum(y_pred * y_true, dim=0)
        expected_agreement = torch.sum(y_pred, dim=0) * torch.sum(y_true, dim=0)
        total_samples = y_pred.size(0)
        chance_agreement = torch.sum((expected_agreement / (total_samples ** 2)), dim=0)
        kappa = (observed_agreement - chance_agreement) / (expected_agreement - chance_agreement + epsilon)
        kappa_loss = 1 - torch.mean(kappa)
        return kappa_loss
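A quick, hedged usage check of the two losses on dummy tensors; the shapes and the sigmoid call are assumptions for illustration, and passing float multi-hot targets to FocalLoss requires a PyTorch version whose CrossEntropyLoss accepts probability targets (1.10 or newer).
# Dummy usage check -- shapes and values are illustrative only.
logits = torch.randn(8, 12)                       # batch of 8, 12 label dimensions
targets = torch.randint(0, 2, (8, 12)).float()    # multi-hot targets

focal = FocalLoss(alpha=1, gamma=2)
print(focal(logits, targets))                     # scalar focal loss

kappa = KappaLoss()
print(kappa(torch.sigmoid(logits), targets))      # assumed to take probabilities, not raw logits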
Other reference code (complete training script):
import os, time
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoModelForSequenceClassification, AdamW, BertTokenizer
from keras.utils.np_utils import to_categorical  # in newer Keras/TensorFlow: from tensorflow.keras.utils import to_categorical
def pretreatment(comments):
    """Strip Chinese/ASCII punctuation and all whitespace from each text."""
    result_comments = []
    punctuation = '。,?!:%&~()、;“”&n|\\,.?!:%&~();""'
    for comment in comments:
        comment = ''.join([c for c in comment if c not in punctuation])
        comment = ''.join(comment.split())  # also drops \xa0 and other whitespace
        result_comments.append(comment)
    return result_comments
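# Quick illustrative check (assumed example, not from the original post):
#   pretreatment(["你好,世界! Hello, world."])  ->  ['你好世界Helloworld']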
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        return focal_loss
# Training hyperparameters
epochs = 100
learning_rate = 1e-5
save_path = './results/'
save_epoch = 20  # save model parameters every save_epoch epochs

# Load the pretrained model
model_name = "./premodel/bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
num_labels = 12  # dimensionality of the multi-label targets after one-hot encoding
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# Load the data
clean_data = pd.read_csv('your_own_data.csv')  # replace with the path to your own dataset
datax = clean_data['txt']
datay = clean_data.values[:, 1:]
datax = np.array(pretreatment(datax))
datay = to_categorical(datay)
datay = datay.reshape(datay.shape[0], datay.shape[1] * datay.shape[2])
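# Illustrative note (assumed data layout, not from the original post): with, say,
# 6 integer label columns each taking values {0, 1}, to_categorical turns datay of
# shape (n, 6) into (n, 6, 2), and the reshape flattens it to (n, 12), which is
# where num_labels = 12 comes from.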
# 60/40 train/test split; keep the texts as a flat list of strings for the tokenizer
pr = int(0.6 * datax.shape[0])
x_train = datax[:pr].tolist()
y_train = datay[:pr]
x_test = datax[pr:].tolist()
y_test = datay[pr:]
# Build the datasets
x_train = pretreatment(x_train)
train_encoded_inputs = tokenizer.batch_encode_plus(x_train, padding=True, truncation=True, return_tensors="pt")
train_dataset = TensorDataset(train_encoded_inputs["input_ids"], train_encoded_inputs["attention_mask"], torch.tensor(y_train, dtype=torch.float))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
x_test = pretreatment(x_test)
test_encoded_inputs = tokenizer.batch_encode_plus(x_test, padding=True, truncation=True, return_tensors="pt")
test_dataset = TensorDataset(test_encoded_inputs["input_ids"], test_encoded_inputs["attention_mask"], torch.tensor(y_test, dtype=torch.float))
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = FocalLoss(alpha=1, gamma=2)
# criterion = KappaLoss()
# criterion = nn.CrossEntropyLoss()  # the commonly used cross-entropy loss this replaces
# Start training
model_history = {'epoch': [], 'train_loss': [], 'test_loss': []}
current_time_seconds = time.time()
current_time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(current_time_seconds))
os.makedirs(save_path + current_time_str, exist_ok=True)
print(save_path + current_time_str, " created.")
writer = SummaryWriter('logs')
all_time = 0
for epoch in range(epochs):
    model.train()
    train_loss = 0
    t1 = time.time()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
        }
        # labels = batch[2].to(device)
        labels = batch[2]
        outputs = model(**inputs)
        logits = outputs.logits
        # compute the imbalance-aware loss on the logits instead of the model's built-in loss
        train_step_loss = criterion(logits, labels)
        train_loss += train_step_loss.item()
        train_step_loss.backward()
        optimizer.step()
    model.eval()
    test_loss = 0
    with torch.no_grad():
        # evaluation only: no gradient or optimizer updates
        for batch in test_loader:
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[2]
            }
            # labels = batch[2].to(device)
            labels = batch[2]
            outputs = model(**inputs)
            logits = outputs.logits
            test_step_loss = criterion(logits, labels)
            test_loss += test_step_loss.item()
    train_avg_loss = train_loss / len(train_loader)
    test_avg_loss = test_loss / len(test_loader)
    model_history['epoch'].append(epoch + 1)
    model_history['train_loss'].append(train_avg_loss)
    model_history['test_loss'].append(test_avg_loss)
    t2 = time.time()
    all_time = all_time + t2 - t1
    # Print the training log
    print(f"Epoch {epoch+1}/{epochs}, Train-Loss: {train_avg_loss:.4f}, Test-Loss: {test_avg_loss:.4f}, epoch time: {t2-t1:.4f}s, total time: {all_time:.4f}s")
    # Log to TensorBoard
    writer.add_scalar('TrainLoss', train_avg_loss, epoch + 1)
    writer.add_scalar('TestLoss', test_avg_loss, epoch + 1)
    # Save the model every save_epoch epochs
    if (epoch + 1) % save_epoch == 0:
        epoch_dir = save_path + current_time_str + '/' + str(epoch + 1)
        os.makedirs(epoch_dir, exist_ok=True)
        model.save_pretrained(epoch_dir)
        tokenizer.save_pretrained(epoch_dir)
        print("Epoch-" + str(epoch + 1) + " model and tokenizer saved.")
writer.close()
# tensorboard --logdir=logs --host=0.0.0.0 --port=6006
with open(save_path + current_time_str + '/history.json', 'w') as json_file:
    json.dump(model_history, json_file)
print("Training history saved to history.json.")
model.save_pretrained(save_path + current_time_str)
tokenizer.save_pretrained(save_path + current_time_str)
print("Model and tokenizer saved.")
