BERT-Based Classification and Grading of Database Field Text
Scenario
I recently got a task to classify and grade database fields by sensitivity. The classification is based on the field name and field comment, and the grading has four levels, e.g. L1/L2/L3/L4. After collecting some data I found a long-tailed distribution: a few classes have lots of samples, while most classes have very few. To implement this, I trained a classification-and-grading model with MacBERT. It is essentially text classification for a classification/grading scenario, and the same idea applies to library book classification or database data classification. Another common approach is Tree-LSTM, but that model is fairly academic and has little practical material available, so for ease of implementation I went with MacBERT plus a cascaded model structure.
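To make the four-level label structure concrete before the full script: each row's classification path (the '数据分类*' column) is split on '/' into L1-L4, and missing levels fall back to '非敏感'. Here is a minimal sketch of that mapping; the example path below is hypothetical, not a real label from the dataset.

# Minimal sketch of the path-to-levels mapping used later in load_and_preprocess_data.
# The example path '个人信息/联系方式' is hypothetical, not an actual label.
def split_path(path: str, depth: int = 4) -> list:
    parts = path.split('/') if path else []
    return [parts[i] if i < len(parts) else '非敏感' for i in range(depth)]

print(split_path('个人信息/联系方式'))  # ['个人信息', '联系方式', '非敏感', '非敏感']
print(split_path(''))                    # ['非敏感', '非敏感', '非敏感', '非敏感']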
I'm not an NLP specialist, though, I'm a Java developer 🥹. I picked up a book to get started, 《HuggingFace自然语言处理详解——基于BERT中文模型的任务实战》 by 李福林, which I genuinely recommend: it gets you up to speed quickly and is very hands-on. With that, plus a little 🤏 help from AI, the result is as follows:
Solution
The rough code is below. After the models were trained, I also used FastAPI to build a small front-end/back-end page that supports uploading a file directly for classification and shows the inference progress and average confidence in real time; Python really is convenient for this kind of thing. A sketch of such an upload service follows the training code below.
import datetime
import os
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (BertForSequenceClassification, BertTokenizer,
                          get_linear_schedule_with_warmup)

MAX_LENGTH = 128
SAVE_DIR = ''
MODEL_NAME = ''
DEVICE = 'cpu'
DATA_PATH = ''
BATCH_SIZE = 16
EPOCHS = 5
BEST_F1 = []
LEARNING_RATE = 2e-6


# Normalize field names into a consistent snake_case form
def normalize_field_name(field_name: str) -> str:
    if not field_name or not isinstance(field_name, str):
        return ""
    field_name = field_name.strip()
    # Insert underscores at camelCase boundaries, then lower-case
    field_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', field_name)
    field_name = re.sub('([A-Z]+)([A-Z][a-z])', r'\1_\2', field_name)
    field_name = field_name.lower()
    field_name = re.sub('_+', '_', field_name)
    return field_name.strip('_')


# Load the Excel export and build a balanced training set
def load_and_preprocess_data(path):
    df = pd.read_excel(path, sheet_name='ACLBASE')
    cols_to_check = ['*字段名', '*字段中文名']
    df = df.dropna(subset=cols_to_check, how='all')
    df['*字段名标准化'] = df['*字段名'].astype(str).apply(normalize_field_name)
    df['text'] = df['*字段名标准化'] + ":" + df['*字段中文名'].astype(str)
    df = df.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)
    # The label column stores the classification path as "L1/L2/L3/L4"
    df['path'] = df['数据分类*'].apply(
        lambda x: x.split('/') if pd.notna(x) and x.strip() != '' else [])
    # Four levels in total; missing levels fall back to '非敏感'
    for i in range(1, 5):
        df[f'L{i}'] = df['path'].apply(lambda x: x[i - 1] if len(x) >= i else '非敏感')

    is_nonsensitive = ((df['L1'] == '非敏感') & (df['L2'] == '非敏感')
                       & (df['L3'] == '非敏感') & (df['L4'] == '非敏感'))
    sensitive_df = df[~is_nonsensitive]
    nonsensitive_df = df[is_nonsensitive]

    # Sampling strategy: cap the non-sensitive rows at 1600
    if len(nonsensitive_df) > 1600:
        nonsensitive_df = nonsensitive_df.sample(n=1600, random_state=42, replace=False)
    df_balanced = pd.concat([sensitive_df, nonsensitive_df], ignore_index=True)

    # Drop extremely rare label paths (<= 50 samples)
    class_counts = df_balanced.groupby(['L1', 'L2', 'L3', 'L4']).size().reset_index(name='count')
    low_freq_classes = class_counts[class_counts['count'] <= 50]
    df_main = df_balanced.merge(
        low_freq_classes[['L1', 'L2', 'L3', 'L4']],
        on=['L1', 'L2', 'L3', 'L4'], how='left', indicator=True)
    df_main = df_main[df_main['_merge'] == 'left_only'].drop('_merge', axis=1)

    # Upsample the remaining minority classes to at least 100 samples each
    final_rows = []
    for name, group in df_main.groupby(['L1', 'L2', 'L3', 'L4']):
        is_pure_nonsensitive = all(label == '非敏感' for label in name)
        count = len(group)
        if is_pure_nonsensitive:
            sampled_group = group
        elif count >= 100:
            sampled_group = group.copy()
        else:
            sampled_group = resample(group, replace=True, n_samples=100, random_state=42)
        final_rows.append(sampled_group)
    df_upsampled = pd.concat(final_rows, ignore_index=True)
    df_upsampled.to_csv("train_balanced.csv", index=False, encoding="utf-8-sig")
    return (df_upsampled[['text', 'L1', 'L2', 'L3', 'L4']]
            .sample(frac=1, random_state=42).reset_index(drop=True))


# Class weights to offset the long-tailed label distribution;
# sorted(set(...)) matches the class order produced by LabelEncoder
def get_class_weights(labels):
    classes = np.array(sorted(set(labels)))
    labels_array = np.array(labels)
    class_weights = compute_class_weight(
        class_weight='balanced', classes=classes, y=labels_array)
    return torch.tensor(class_weights, dtype=torch.float)


class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=MAX_LENGTH):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, truncation=True, padding='max_length',
            max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Model training: one classifier per level
def train_level_model(train_texts, train_labels, val_texts, val_labels, level_name, num_labels):
    model_save_path = os.path.join(SAVE_DIR, level_name)
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels).to(DEVICE)

    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(train_labels)
    val_labels_encoded = label_encoder.transform(val_labels)

    train_dataset = TextDataset(train_texts, train_labels_encoded, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels_encoded, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    # Weighted loss (only applied to the L1 model) to counter class imbalance
    criterion = nn.CrossEntropyLoss(weight=get_class_weights(train_labels).to(DEVICE))

    best_f1 = 0.0
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        train_preds, train_true = [], []
        for batch in tqdm(train_loader, desc=f"Training {level_name}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            if level_name == 'L1':
                loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            train_preds.extend(preds)
            train_true.extend(labels.cpu().numpy())
        train_f1 = f1_score(train_true, train_preds, average='macro', zero_division=0)

        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=-1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        val_f1 = f1_score(val_true, val_preds, average='macro', zero_division=0)
        val_acc = accuracy_score(val_true, val_preds)

        # Keep the checkpoint with the best validation macro-F1
        if val_f1 > best_f1:
            best_f1 = val_f1
            model.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)
            np.save(os.path.join(model_save_path, 'labels.npy'), label_encoder.classes_)
    BEST_F1.append(best_f1)
    return model_save_path


# Cascaded prediction: walk L1 -> L4 and stop once a level predicts '非敏感'
def predict_hierarchy(text):
    path = []
    confidences = []
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    inputs = tokenizer(text, return_tensors='pt', truncation=True,
                       padding=True, max_length=MAX_LENGTH).to(DEVICE)
    for level in ['L1', 'L2', 'L3', 'L4']:
        model_path = os.path.join(SAVE_DIR, level)
        if not os.path.exists(model_path):
            break
        model = BertForSequenceClassification.from_pretrained(model_path).to(DEVICE)
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)
            pred_idx = torch.argmax(probs, dim=-1).item()
            confidence = probs[0][pred_idx].item()
        labels = np.load(os.path.join(model_path, 'labels.npy'), allow_pickle=True)
        pred_label = labels[pred_idx]
        if pred_label == '非敏感':
            confidences.append(round(confidence, 4))
            break
        path.append(pred_label)
        confidences.append(round(confidence, 4))
    result_path = "/".join(path) if path else ""
    result_confidence = round(np.mean(confidences), 4) if confidences else 0.0
    return result_path, result_confidence


def main():
    df = load_and_preprocess_data(DATA_PATH)
    levels = ['L1', 'L2', 'L3', 'L4']
    data_date = datetime.datetime.now().strftime("%Y%m%d")
    for level in levels:
        if level == 'L1':
            level_df = df.copy()
        else:
            # Deeper levels only train on rows that are still sensitive at that level
            level_df = df[df[level] != '非敏感']
        if len(level_df) == 0:
            continue
        X = level_df['text'].values
        y = level_df[level].values
        x_train, x_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        num_labels = len(np.unique(y_train))
        train_level_model(x_train, y_train, x_val, y_val, level, num_labels)


if __name__ == '__main__':
    main()
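For the upload page mentioned earlier, a minimal FastAPI sketch could look like the following. This is an assumption-heavy outline, not the actual service: the endpoint paths, the in-memory PROGRESS dict, and the response shape are all made up, and it simply reuses predict_hierarchy from the script above.

# Hedged sketch of the FastAPI upload/progress service; endpoint names,
# the PROGRESS dict and the response format are assumptions, not the real code.
import pandas as pd
from fastapi import FastAPI, File, UploadFile

app = FastAPI()
PROGRESS = {"done": 0, "total": 0, "avg_confidence": 0.0}

@app.post("/classify")
async def classify(file: UploadFile = File(...)):
    # Build the same "field_name:field_comment" text as in training
    df = pd.read_excel(file.file)
    df['text'] = df['*字段名'].astype(str) + ":" + df['*字段中文名'].astype(str)
    PROGRESS.update(done=0, total=len(df))
    results, confs = [], []
    for i, text in enumerate(df['text']):
        path, conf = predict_hierarchy(text)  # cascade predictor defined above
        results.append(path)
        confs.append(conf)
        PROGRESS["done"] = i + 1
        PROGRESS["avg_confidence"] = round(sum(confs) / len(confs), 4)
    return {"avg_confidence": PROGRESS["avg_confidence"],
            "results": [{"text": t, "path": p, "confidence": c}
                        for t, p, c in zip(df['text'], results, confs)]}

@app.get("/progress")
def progress():
    # Polled by the page to render a real-time progress bar
    return PROGRESS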
Problems
Validating against a held-out validation set revealed the following problems:
1. Some classes reach very high accuracy (over 95%), while others have poor confidence. This is most likely the data distribution: classes with too few training samples simply do not generalize well.
2. The cascaded model structure compounds errors. For example, if the L1 prediction only has a confidence of 0.6 and is already wrong, the predictions for the sub-levels afterwards cannot recover from it.
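A rough back-of-the-envelope illustration of how the cascade compounds per-level errors (the per-level accuracies below are made-up numbers, not measured results):

# Illustrative only: assumed per-level accuracies, not measured values.
level_acc = [0.60, 0.90, 0.90, 0.90]  # a weak L1 dominates the whole path
path_acc = 1.0
for acc in level_acc:
    path_acc *= acc
print(round(path_acc, 3))  # 0.437 -- the full L1/L2/L3/L4 path is correct well under half the time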
A follow-up is to try Tree-LSTM on this scenario and compare the results.
References
李福林, 《HuggingFace自然语言处理详解——基于BERT中文模型的任务实战》
