
NLP-transformers: Building and Training a Multi-Label Multi-Class Classification Model

A usage example of multi-label, multi-class classification in NLP.

Because this is medical data, the biggest issue involved is class imbalance. The following approaches can deal with it:

  • Resampling: let the original dataset be X and split it into training and test sets. Taking the training set as an example, if 1,000 training samples are needed and the training set has 5 classes, randomly draw 200 samples from each class. The training data is then balanced, but this does not suit multi-label data (see the sketch after this list).

  • Class weighting: adjust the weight assigned to each class during training so that classes with fewer samples get larger weights. This likewise does not suit multi-label data (also shown in the sketch after this list).

  • Focal Loss: tackles class imbalance by down-weighting easily classified samples and up-weighting hard ones, which helps the model pay more attention to minority-class samples.

  • Kappa Loss: a loss function based on Cohen's Kappa coefficient. It can be used to handle class imbalance and encourages accurate classification of the minority classes.
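
For reference, below is a minimal sketch of the first two approaches on a plain single-label task (which is exactly why they do not carry over directly to multi-label data). The helper names balanced_resample and inverse_frequency_weights and the toy 80/20 dataset are purely illustrative and not part of the original code.

import numpy as np
import torch
import torch.nn as nn

def balanced_resample(texts, labels, per_class=200, seed=42):
    # Draw `per_class` samples from every class so one epoch sees a balanced set.
    rng = np.random.default_rng(seed)
    texts, labels = np.asarray(texts), np.asarray(labels)
    idx = []
    for c in np.unique(labels):
        class_idx = np.where(labels == c)[0]
        # sample with replacement in case a class has fewer than `per_class` samples
        idx.extend(rng.choice(class_idx, size=per_class, replace=True))
    idx = rng.permutation(np.array(idx))
    return texts[idx], labels[idx]

def inverse_frequency_weights(labels, num_classes):
    # Inverse-frequency class weights: rarer classes get larger weights in the loss.
    counts = np.bincount(labels, minlength=num_classes).astype(np.float32)
    weights = counts.sum() / (num_classes * np.maximum(counts, 1.0))
    return torch.tensor(weights)

# Toy imbalanced data: 80 samples of class 0, 20 samples of class 1.
texts = np.array([f"sample {i}" for i in range(100)])
labels = np.concatenate([np.zeros(80, dtype=int), np.ones(20, dtype=int)])

bal_x, bal_y = balanced_resample(texts, labels, per_class=50)      # balanced resampling
weighted_ce = nn.CrossEntropyLoss(weight=inverse_frequency_weights(labels, num_classes=2))  # class weighting

In a multi-label dataset one sample can belong to several classes at once, so per-class resampling and per-class weights no longer have a clean interpretation; that is why the losses below are used instead.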

Python code

import torch
import torch.nn as nn
import torch.optim as optim

# Focal Loss: down-weights easy samples via the (1 - pt) ** gamma factor so training
# focuses on hard / minority-class samples.
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha          # global scaling factor
        self.gamma = gamma          # focusing parameter: larger gamma focuses more on hard samples
        self.reduction = reduction  # 'mean', 'sum', or anything else for no reduction

    def forward(self, inputs, targets):
        # Per-sample cross-entropy; with float (e.g. multi-hot) targets this relies on
        # PyTorch >= 1.10, where CrossEntropyLoss accepts probability-style targets.
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)    # model's estimated probability for the target
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        return focal_loss


# Kappa Loss: 1 - (a Cohen's-Kappa-style agreement score), so lower loss means higher
# agreement. y_pred is expected to hold per-label probabilities (e.g. after a sigmoid)
# and y_true the corresponding multi-hot targets.
class KappaLoss(nn.Module):
    def __init__(self):
        super(KappaLoss, self).__init__()

    def forward(self, y_pred, y_true, epsilon=1e-6):
        # Per-label agreement between predictions and targets.
        observed_agreement = torch.sum(y_pred * y_true, dim=0)
        # Per-label product of the prediction and target marginal sums.
        expected_agreement = torch.sum(y_pred, dim=0) * torch.sum(y_true, dim=0)

        total_samples = y_pred.size(0)
        # Chance agreement, normalised by the squared sample count and summed over labels.
        chance_agreement = torch.sum((expected_agreement / (total_samples ** 2)), dim=0)

        # Kappa-style ratio per label; epsilon guards against division by zero.
        kappa = (observed_agreement - chance_agreement) / (expected_agreement - chance_agreement + epsilon)
        kappa_loss = 1 - torch.mean(kappa)

        return kappa_loss
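
A quick sanity check of the two losses on random tensors might look like the sketch below. The batch size, label count, and random multi-hot targets are made up for illustration; passing float targets to FocalLoss relies on PyTorch 1.10 or newer (where CrossEntropyLoss accepts probability-style targets), and KappaLoss is fed sigmoid probabilities rather than raw logits.

# Quick sanity check of the two losses on random tensors.
batch_size, num_labels = 8, 12
logits = torch.randn(batch_size, num_labels)                      # raw model outputs
targets = torch.randint(0, 2, (batch_size, num_labels)).float()   # multi-hot label vectors

focal = FocalLoss(alpha=1, gamma=2)
print("focal loss:", focal(logits, targets).item())

# KappaLoss measures agreement between predictions and targets, so feed it
# probabilities (e.g. sigmoid of the logits) rather than raw logits.
kappa = KappaLoss()
print("kappa loss:", kappa(torch.sigmoid(logits), targets).item())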

Other reference code

import os,sys,time
import re
import json, jsonlines
import numpy as np
import pandas as pd 
from pprint import pprint

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import OneHotEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertTokenizer
# Note: transformers.AdamW is deprecated in recent versions; torch.optim.AdamW can be used instead.
from keras.utils.np_utils import to_categorical
# Note: in newer Keras releases to_categorical has moved to keras.utils.


# Strip punctuation and all whitespace (including non-breaking spaces) from each text.
def pretreatment(comments):
    result_comments = []
    punctuation = '。,?!:%&~()、;“”&n|\\,.?!:%&~();""'
    for comment in comments:
        comment = ''.join([c for c in comment if c not in punctuation])
        comment = ''.join(comment.split())   # also removes \xa0 and other whitespace
        result_comments.append(comment)
    return result_comments


# Same FocalLoss as defined above, repeated here so this reference script is self-contained.
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        
    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        
        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        return focal_loss
    
# Training hyperparameters
epochs = 100
learning_rate = 1e-5
save_path = './results/'
save_epoch = 20   # save the model weights every `save_epoch` epochs

# Load the pretrained model and tokenizer
model_name = "./premodel/bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
num_labels = 12   # dimensionality of the multi-label targets after one-hot encoding
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


# Load the data
clean_data = pd.read_csv('your_own_data')   # replace with the path to your own dataset

datax = clean_data['txt']                   # text column
datay = clean_data.values[:, 1:]            # remaining columns hold the integer labels
datax = np.array(pretreatment(datax))
datay = to_categorical(datay)               # one-hot encode each label column
datay = datay.reshape(datay.shape[0], datay.shape[1]*datay.shape[2])   # flatten to one multi-hot vector per sample

# 60/40 train/test split
pr = int(0.6*datax.shape[0])
x_train = datax[:pr].reshape(datax[:pr].shape[0],1)
y_train = datay[:pr].reshape(datax[:pr].shape[0],num_labels)
x_test = datax[pr:].reshape(datax[pr:].shape[0],1)
y_test = datay[pr:].reshape(datax[pr:].shape[0],num_labels)

# Build the datasets and data loaders
x_train = pretreatment(x_train)
train_encoded_inputs = tokenizer.batch_encode_plus(x_train, padding=True, truncation=True, return_tensors="pt")
train_dataset = TensorDataset(train_encoded_inputs["input_ids"], train_encoded_inputs["attention_mask"], torch.tensor(y_train.tolist()))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

x_test = pretreatment(x_test)
test_encoded_inputs = tokenizer.batch_encode_plus(x_test, padding=True, truncation=True, return_tensors="pt")
test_dataset = TensorDataset(test_encoded_inputs["input_ids"], test_encoded_inputs["attention_mask"], torch.tensor(y_test.tolist()))
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = FocalLoss(alpha=1, gamma=2)
# criterion = KappaLoss()
# criterion = nn.CrossEntropyLoss()   # the plain cross-entropy loss that FocalLoss replaces here

# Start training
model_history = {'epoch':[], 'train_loss':[], 'test_loss':[]}
current_time_seconds = time.time()
current_time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(current_time_seconds))
# os.makedirs also creates ./results/ if it does not exist yet
# (note: the ':' characters in the timestamp are not valid in Windows paths)
os.makedirs(save_path+current_time_str, exist_ok=True)
print(save_path+current_time_str, " created.")

writer = SummaryWriter('logs')
all_time = 0
for epoch in range(epochs):
    model.train()
    train_loss = 0
    t1 = time.time()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
        }
        # labels = batch[2].to(device)   # when training on GPU, move the batch tensors to the device
        labels = batch[2]
        outputs = model(**inputs)
        logits = outputs.logits

        train_step_loss = criterion(logits, labels)
        train_loss += train_step_loss.item()

        train_step_loss.backward()
        optimizer.step()

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[2]
            }
            # labels = batch[2].to(device)   # when evaluating on GPU, move the batch tensors to the device
            labels = batch[2]
            outputs = model(**inputs)
            logits = outputs.logits

            test_step_loss = criterion(logits, labels)
            test_loss += test_step_loss.item()
        
    train_avg_loss = train_loss / len(train_loader)
    test_avg_loss = test_loss / len(test_loader)

    model_history['epoch'].append(epoch+1)
    model_history['train_loss'].append(train_avg_loss)
    model_history['test_loss'].append(test_avg_loss)
    t2 = time.time()
    all_time = all_time + t2 - t1
    # Print the training log
    print(f"Epoch {epoch+1}/{epochs}, Train-Loss: {train_avg_loss:.4f}, Test-Loss: {test_avg_loss:.4f}, epoch time: {t2-t1:.4f}s, total time: {all_time:.4f}s")
    
    # Log to TensorBoard
    writer.add_scalar('TrainLoss', train_avg_loss, epoch+1)
    writer.add_scalar('TestLoss', test_avg_loss, epoch+1)
    
    # Save the model every `save_epoch` epochs
    if (epoch+1) % save_epoch == 0:
        os.makedirs(save_path+current_time_str+'/'+str(epoch+1), exist_ok=True)
        model.save_pretrained(save_path+current_time_str+'/'+str(epoch+1))
        tokenizer.save_pretrained(save_path+current_time_str+'/'+str(epoch+1))
        print("Epoch-"+str(epoch+1)+" model and tokenizer saved.")
writer.close()            
# tensorboard --logdir=logs --host=0.0.0.0 --port=6006

with open(save_path+current_time_str+'/history.json', 'w') as json_file:
    json.dump(model_history, json_file)
print("训练过程已保存到 history.json 文件中.")

model.save_pretrained(save_path+current_time_str)
tokenizer.save_pretrained(save_path+current_time_str)
print("模型和分词器已保存.")
posted @ 2024-09-19 14:26  绯色鱼