Deep Learning (Transformer Text Classification)
In a previous post I built a text classifier with an LSTM; this time the same task is implemented with a Transformer.
One difference: there is no word segmentation here. Instead, a character-level vocabulary is built directly from the training text, as sketched just below.
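To make the vocabulary step concrete, here is a minimal standalone sketch with a made-up string (the full get_voc in the code below applies the same loop to the actual CSV contents):

vocab = {}
for ch in "今天天气很好":  # toy text; ids are assigned in order of first appearance
    if ch not in vocab:
        vocab[ch] = len(vocab) + 1  # ids start at 1; 0 is reserved for padding
print(vocab)  # {'今': 1, '天': 2, '气': 3, '很': 4, '好': 5}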
Likewise, torchtext's BucketIterator is not used for padding. Instead, the sentences are first sorted by character length, and padding is then applied within each batch, as sketched just below.
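The point of sorting first is that each batch then contains sentences of similar length, so little computation is wasted on padding tokens. A minimal sketch with toy id sequences (train() in the code below does the same thing on the encoded dataset):

sentences = [[5, 2], [1], [3, 4, 6], [7]]
sentences.sort(key=len)                         # group similar lengths together
batch_size = 2
for i in range(0, len(sentences) - batch_size + 1, batch_size):
    batch = sentences[i:i + batch_size]
    max_len = max(len(s) for s in batch)        # longest sentence in this batch
    for s in batch:
        s.extend([0] * (max_len - len(s)))      # pad with 0 up to the batch maximum
print(sentences)  # [[1], [7], [5, 2, 0], [3, 4, 6]]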
An interactive test example is given at the end.
import torch
import torch.nn as nn
import pandas as pd
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add the fixed positional encoding to the token embeddings.
        x = x + self.pe[:, :x.size(1), :]
        return x


class ClsFormer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes, num_layers, nhead=8):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_encoder = PositionalEncoding(embed_size)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=nhead,
            dim_feedforward=4 * embed_size,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)            # (batch_size, seq_len, embed_size)
        x = self.pos_encoder(x)          # add positional encoding
        x = self.transformer_encoder(x)  # (batch_size, seq_len, embed_size)
        x = x.mean(dim=1)                # mean-pool over the sequence: (batch_size, embed_size)
        x = self.fc(x)                   # classification logits
        return x


# Read the dataset and build the character vocabulary.
def get_voc(df):
    full_text = ""
    for i in range(df.shape[0]):
        text = df.iloc[i, 0]
        cls = df.iloc[i, 1]
        full_text += text + cls  # the label string is appended too, so its characters also get ids
    voc = {}
    for ch in full_text:
        if ch not in voc:
            voc[ch] = len(voc) + 1  # ids start at 1; 0 is reserved for padding
    print("voc:", voc)
    return voc


def train():
    file_path = "data_single.csv"
    df = pd.read_csv(file_path)
    vocab = get_voc(df)
    vocab_size = len(vocab) + 1  # +1 for the padding id 0
    embed_size = 128
    batch_size = 32
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Compute each sentence's character length and sort ascending,
    # so sentences in the same batch have similar lengths.
    df_sorted = (
        df.assign(length=df['evaluation'].str.len())
          .sort_values('length', ascending=True)
          .drop('length', axis=1)
          .reset_index(drop=True)
    )

    # Encode each sentence as a list of character ids; map labels to 0/1.
    sentences = []
    labels = []
    for i in range(df_sorted.shape[0]):
        text = df_sorted.iloc[i, 0]
        sentences.append([vocab[ch] for ch in text if ch in vocab])
        labels.append(1 if df_sorted.iloc[i, 1] == '正面' else 0)  # '正面' = positive

    # Pad every batch to the length of its longest sentence.
    # The '+ 1' keeps a final full batch; only a genuinely incomplete
    # trailing batch is dropped.
    for i in range(0, len(sentences) - batch_size + 1, batch_size):
        max_len = max(len(sentences[j]) for j in range(i, i + batch_size))
        for j in range(i, i + batch_size):
            sentences[j].extend([0] * (max_len - len(sentences[j])))  # 0 = padding

    model = ClsFormer(vocab_size, embed_size, 2, 2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(10):
        model.train()
        for i in range(0, len(sentences) - batch_size + 1, batch_size):
            sentences_tensor = torch.tensor(sentences[i:i + batch_size]).to(device)
            labels_tensor = torch.tensor(labels[i:i + batch_size]).to(device)
            outputs = model(sentences_tensor)
            loss = criterion(outputs, labels_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Accuracy on the training set after each epoch.
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for i in range(0, len(sentences) - batch_size + 1, batch_size):
                sentences_tensor = torch.tensor(sentences[i:i + batch_size]).to(device)
                labels_tensor = torch.tensor(labels[i:i + batch_size]).to(device)
                outputs = model(sentences_tensor)
                _, predicted = torch.max(outputs, 1)
                total += labels_tensor.size(0)
                correct += (predicted == labels_tensor).sum().item()
        accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/10], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')

    torch.save(model.state_dict(), 'model.pth')


def test():
    file_path = "data_single.csv"
    df = pd.read_csv(file_path)
    vocab = get_voc(df)  # rebuild the same vocabulary used for training
    vocab_size = len(vocab) + 1
    embed_size = 128
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ClsFormer(vocab_size, embed_size, 2, 2)
    model.load_state_dict(torch.load('model.pth'))
    model.to(device)
    model.eval()

    while True:
        text = input("请输入要预测的文本:")  # prompt: "Enter text to classify:"
        if text == "exit":
            break
        sentence = [vocab[ch] for ch in text if ch in vocab]
        if not sentence:  # no known characters: the model has nothing to score
            print("输入中没有已知字符")  # "no known characters in the input"
            continue
        sentence_tensor = torch.tensor(sentence).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(sentence_tensor)
        _, predicted = torch.max(outputs, 1)
        print("正面" if predicted.item() == 1 else "负面")  # positive / negative


if __name__ == "__main__":
    train()
    test()
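One known limitation of the model above: both the self-attention and the mean pooling see the padding positions. nn.TransformerEncoder accepts a src_key_padding_mask argument that hides them from attention. Below is a sketch of one way ClsFormer could be extended; MaskedClsFormer and the masked-mean pooling are illustrative additions, not part of the original code, and pad_id=0 follows the padding convention used above.

class MaskedClsFormer(ClsFormer):
    def forward(self, x, pad_id=0):
        mask = (x == pad_id)                    # (batch, seq_len); True marks padding
        h = self.embedding(x)
        h = self.pos_encoder(h)
        h = self.transformer_encoder(h, src_key_padding_mask=mask)
        # Average over real tokens only: zero out padded positions,
        # then divide by each sentence's true length.
        keep = (~mask).unsqueeze(-1).float()    # (batch, seq_len, 1)
        h = (h * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)
        return self.fc(h)

It drops in as a replacement, e.g. model = MaskedClsFormer(vocab_size, embed_size, 2, 2), and trains with the same loop as before.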