BERT-Based Classification and Grading of Database Fields

Scenario

  I recently ran into a task: classify and grade database fields by sensitivity. Classification is based on the field name and the field comment, and the grading has four levels, e.g. L1/L2/L3/L4. After collecting some data I also found a long-tail distribution: a few classes have plenty of samples while most classes have very few. To implement this I trained a classification/grading model based on MacBERT. It is essentially text classification in a hierarchical-labeling setting, and the same idea applies to library book classification or database data classification. Tree-LSTM is another commonly mentioned approach, but it is fairly academic and practical material on it is scarce, so for ease of implementation I went with MacBERT plus a cascaded model structure.

  That said, this is not my specialty; I am a Java developer 🥹. To get started I picked up《HuggingFace自然语言处理详解——基于BERT中文模型的任务实战》by 李福林, which I genuinely recommend: it is very hands-on and gets you productive quickly. With a little 🤏 help from AI, the result is as follows:

Solution

  The rough code is below. After training the models I also built a small front end with FastAPI that lets you upload a file for classification and watch the inference progress and the average confidence in real time; Python really is convenient for this. A minimal sketch of such a service appears after the training code below.

import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

MAX_LENGTH = 128
SAVE_DIR = ''
MODEL_NAME = ''
DEVICE = 'cpu'
DATA_PATH = ''
BATCH_SIZE = 16
EPOCHS = 5
BEST_F1 = []
LEARNING_RATE = 2e-6
# Normalize field names into a consistent snake_case form
def normalize_field_name(field_name: str) -> str:
    if not field_name or not isinstance(field_name, str):
        return ""
    field_name = field_name.strip()
    # split camelCase boundaries: "userId" -> "user_Id"
    field_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', field_name)
    # split acronym boundaries: "IDCard" -> "ID_Card"
    field_name = re.sub('([A-Z]+)([A-Z][a-z])', r'\1_\2', field_name)
    field_name = field_name.lower()
    # collapse repeated underscores and trim
    field_name = re.sub('_+', '_', field_name)
    field_name = field_name.strip('_ ')
    return field_name
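# Example (hypothetical field names, for illustration only):
#   normalize_field_name("userPhoneNumber")  -> "user_phone_number"
#   normalize_field_name("ID_Card__No")      -> "id_card_no"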
# Load and preprocess the annotated Excel file
def load_and_preprocess_data(path):
    df = pd.read_excel(path, sheet_name='ACLBASE')
    cols_to_check = ['*字段名', '*字段中文名']
    df = df.dropna(subset=cols_to_check, how='all')
    df['*字段名标准化'] = df['*字段名'].astype(str).apply(normalize_field_name)
    df['text'] = df['*字段名标准化'] + ":" + df['*字段中文名'].astype(str)
    df = df.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)
    df['path'] = df['数据分类*'].apply(lambda x: str(x).split('/') if pd.notna(x) and str(x).strip() != '' else [])
    # 4 hierarchy levels in total
    for i in range(1, 5):
        df[f'L{i}'] = df['path'].apply(lambda x: x[i - 1] if len(x) >= i else '非敏感')
    is_nonsensitive = (df['L1'] == '非敏感') & (df['L2'] == '非敏感') & (df['L3'] == '非敏感') & (df['L4'] == '非敏感')
    sensitive_df = df[~is_nonsensitive]
    nonsensitive_df = df[is_nonsensitive]
    # sampling strategy: cap the non-sensitive class at 1600 rows
    if len(nonsensitive_df) > 1600:
        nonsensitive_df = nonsensitive_df.sample(n=1600, random_state=42, replace=False)
    df_balanced = pd.concat([sensitive_df, nonsensitive_df], ignore_index=True)
    # drop extremely rare label paths (<= 50 samples)
    class_counts = df_balanced.groupby(['L1', 'L2', 'L3', 'L4']).size().reset_index(name='count')
    low_freq_classes = class_counts[class_counts['count'] <= 50]
    df_main = df_balanced.merge(
        low_freq_classes[['L1', 'L2', 'L3', 'L4']],
        on=['L1', 'L2', 'L3', 'L4'],
        how='left',
        indicator=True)
    df_main = df_main[df_main['_merge'] == 'left_only'].drop('_merge', axis=1)
    # upsample the remaining small classes to at least 100 samples
    final_rows = []
    for name, group in df_main.groupby(['L1', 'L2', 'L3', 'L4']):
        is_pure_nonsensitive = all(label == '非敏感' for label in name)
        count = len(group)
        if is_pure_nonsensitive:
            sampled_group = group
        elif count >= 100:
            # classes with enough data are kept as-is
            sampled_group = group.copy()
        else:
            # rare classes are upsampled with replacement to 100 rows
            sampled_group = resample(group, replace=True, n_samples=100, random_state=42)
        final_rows.append(sampled_group)

    df_upsampled = pd.concat(final_rows, ignore_index=True)
    df_upsampled.to_csv("train_balanced.csv", index=False, encoding="utf-8-sig")
    return df_upsampled[['text', 'L1', 'L2', 'L3', 'L4']].sample(frac=1, random_state=42).reset_index(drop=True)

def get_class_weights(labels):
    # sorted(set(...)) matches LabelEncoder's class order, so the weights line up with encoded label ids
    classes = np.array(sorted(set(labels)))
    labels_array = np.array(labels)
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=labels_array)
    weights = torch.tensor(class_weights, dtype=torch.float)
    return weights
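# Example with hypothetical labels (9 '非敏感' vs 1 '个人信息'):
#   get_class_weights(['非敏感'] * 9 + ['个人信息'])
#   -> tensor([5.0000, 0.5556])  # order is ['个人信息', '非敏感']; the rare class gets the larger weight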

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=MAX_LENGTH):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
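# Quick shape check (assuming `tokenizer` is an already-loaded BertTokenizer):
#   item = TextDataset(["user_name:用户姓名"], [0], tokenizer)[0]
#   item['input_ids'].shape -> torch.Size([128]), item['labels'] -> tensor(0)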
# Train one classifier for a single hierarchy level
def train_level_model(train_texts, train_labels, val_texts, val_labels, level_name, num_labels):
    model_save_path = os.path.join(SAVE_DIR, level_name)
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).to(DEVICE)
    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(train_labels)
    val_labels_encoded = label_encoder.transform(val_labels)
    train_dataset = TextDataset(train_texts, train_labels_encoded, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels_encoded, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    # class-weighted loss, only applied to the highly imbalanced L1 level
    criterion = nn.CrossEntropyLoss(weight=get_class_weights(train_labels).to(DEVICE))
    best_f1 = 0.0
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        train_preds, train_true = [], []
        for batch in tqdm(train_loader, desc=f"{level_name} training"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            if level_name == 'L1':
                loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            train_preds.extend(preds)
            train_true.extend(labels.cpu().numpy())

        train_f1 = f1_score(train_true, train_preds, average='macro', zero_division=0)
        # validation
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=-1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())

        val_f1 = f1_score(val_true, val_preds, average='macro', zero_division=0)
        val_acc = accuracy_score(val_true, val_preds)
        # keep the checkpoint with the best validation macro-F1
        if val_f1 > best_f1:
            best_f1 = val_f1
            model.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)
            np.save(os.path.join(model_save_path, 'labels.npy'), label_encoder.classes_)

    BEST_F1.append(best_f1)
    return model_save_path
# Cascaded prediction: walk down the hierarchy until a level predicts '非敏感'
def predict_hierarchy(text):
    path = []
    confidences = []
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=MAX_LENGTH).to(DEVICE)
    for level in ['L1', 'L2', 'L3', 'L4']:
        model_path = os.path.join(SAVE_DIR, level)
        if not os.path.exists(model_path):
            break
        model = BertForSequenceClassification.from_pretrained(model_path).to(DEVICE)
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)
            pred_idx = torch.argmax(probs, dim=-1).item()
            confidence = probs[0][pred_idx].item()
            labels = np.load(os.path.join(model_path, 'labels.npy'), allow_pickle=True)
            pred_label = labels[pred_idx]

        if pred_label == '非敏感':
            confidences.append(round(confidence, 4))
            break
        path.append(pred_label)
        confidences.append(round(confidence, 4))

    result_path = "/".join(path) if path else ""
    result_confidence = round(np.mean(confidences), 4) if confidences else 0.0
    return result_path, result_confidence
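# Example call (the returned path/confidence shown here are made up for illustration):
#   path, conf = predict_hierarchy("id_card_no:身份证号")
#   e.g. path -> "个人信息/身份标识/证件号码", conf -> 0.93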

def main():
    df = load_and_preprocess_data(DATA_PATH)
    levels = ['L1', 'L2', 'L3', 'L4']
    data_date = datetime.now().strftime("%Y%m%d")  # date tag, currently unused
    for level in levels:
        if level == 'L1':
            level_df = df.copy()
        else:
            # deeper levels are only trained on rows that are still sensitive at that level
            level_df = df[df[level] != '非敏感']
        if len(level_df) == 0:
            continue
        X = level_df['text'].values
        y = level_df[level].values
        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        num_labels = len(np.unique(y_train))
        train_level_model(x_train, y_train, x_val, y_val, level, num_labels)


if __name__ == '__main__':
    main()
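
  The FastAPI front end mentioned earlier is not part of the snippet above; the sketch below shows roughly what such an upload-and-classify service could look like. It is a minimal, hypothetical example: the /classify and /progress routes, the progress dict, the module name and the output column names are all my own illustration, and it assumes fastapi, python-multipart, openpyxl and pandas are installed and that normalize_field_name / predict_hierarchy can be imported from the training script.

# fastapi_app.py: hypothetical minimal service wrapping predict_hierarchy
# from train_classifier import normalize_field_name, predict_hierarchy  # hypothetical module name
from fastapi import FastAPI, UploadFile, File
import pandas as pd

app = FastAPI()
progress = {"total": 0, "done": 0, "avg_confidence": 0.0}  # polled by the front end


@app.post("/classify")
async def classify_file(file: UploadFile = File(...)):
    # expects the same column layout as the training sheet
    df = pd.read_excel(file.file)
    texts = (df['*字段名'].astype(str).apply(normalize_field_name)
             + ":" + df['*字段中文名'].astype(str))
    progress["total"], progress["done"] = len(texts), 0
    paths, confs = [], []
    for text in texts:
        path, conf = predict_hierarchy(text)
        paths.append(path)
        confs.append(conf)
        progress["done"] += 1
        progress["avg_confidence"] = round(sum(confs) / len(confs), 4)
    df['预测分类'] = paths
    df['可信度'] = confs
    return {"rows": len(df), "avg_confidence": progress["avg_confidence"]}


@app.get("/progress")
def get_progress():
    return progress

  The page just posts the file to /classify and polls /progress for live progress and average confidence; the service itself can be started with uvicorn (e.g. uvicorn fastapi_app:app).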

Issues

  Finally, validation on a held-out dataset revealed the following issues:

  1. Some classes reach very high accuracy, above 95%, while others have poor confidence. This is most likely caused by the data distribution: classes with too few training samples simply do not generalize well.

  2. The cascaded structure compounds errors. For example, if the L1 prediction has a confidence of only 0.6 and is already wrong, the downstream sub-class predictions cannot recover from it (a simple confidence-gate sketch follows below).
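
  A cheap mitigation, not in the code above, is to gate on confidence: when the returned confidence falls below a threshold, route the record to manual review instead of trusting the cascade. A minimal sketch, with the 0.75 threshold picked arbitrarily:

REVIEW_THRESHOLD = 0.75  # arbitrary cut-off; would need tuning on validation data

def predict_with_gate(text):
    # wraps predict_hierarchy and flags low-confidence results for manual review
    path, confidence = predict_hierarchy(text)
    needs_review = (path != "") and (confidence < REVIEW_THRESHOLD)
    return {"path": path, "confidence": confidence, "needs_review": needs_review}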

  Next, it would be worth trying Tree-LSTM on this task and comparing the results.

References

 李福林,《HuggingFace自然语言处理详解——基于BERT中文模型的任务实战》

https://pdf.hanspub.org/sea_2691157.pdf

https://zhuanlan.zhihu.com/p/524487313
