Deep Learning (Transformer Text Classification)

In a previous post I built a text classifier with an LSTM; this post implements the same task with a Transformer.

This time no word segmentation is performed; instead, a character-level vocabulary is built directly from the training text.

Likewise, BucketIterator is not used for padding. Instead, the sentences are first sorted by character length, and padding is then applied within each batch (a small toy sketch of both ideas follows below).

A test example is given at the end.
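Before the full code, here is a minimal toy sketch of those two ideas, character-level indexing and length-sorted per-batch padding. The three sentences below are made up purely for illustration; the real script builds its vocabulary from data_single.csv.

# toy illustration only: build a char-level vocab, sort by length, pad per batch
texts = ["质量很好", "不错", "太差了,非常失望"]

vocab = {}
for ch in "".join(texts):
    if ch not in vocab:
        vocab[ch] = len(vocab) + 1                    # index 0 is reserved for padding

encoded = [[vocab[ch] for ch in t] for t in texts]
encoded.sort(key=len)                                 # similar lengths land in the same batch

batch_size = 2
for start in range(0, len(encoded) - len(encoded) % batch_size, batch_size):
    batch = encoded[start:start + batch_size]
    max_len = max(len(s) for s in batch)
    batch = [s + [0] * (max_len - len(s)) for s in batch]   # pad to the batch maximum
    print(batch)

The full implementation follows.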

import torch
import torch.nn as nn
import pandas as pd
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return x
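
# quick shape sanity check for the positional encoding (illustrative helper only;
# nothing in the training script calls it):
def _pe_shape_demo():
    pe = PositionalEncoding(d_model=128)
    x = torch.zeros(4, 10, 128)               # (batch_size, seq_len, d_model)
    assert pe(x).shape == (4, 10, 128)        # output keeps the input shape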

class ClsFormer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes, num_layers, nhead=8):
        super(ClsFormer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_encoder = PositionalEncoding(embed_size)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=nhead,
            dim_feedforward=4*embed_size,
            batch_first=True  
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)               # (batch_size, seq_len, embed_size)
        x = self.pos_encoder(x)             # add positional encoding
        x = self.transformer_encoder(x)     # Transformer encoding: (batch_size, seq_len, embed_size)
        x = x.mean(dim=1)                   # mean over all positions, padding included (see the note after this class) -> (batch_size, embed_size)
        x = self.fc(x)                      # classification logits
        return x
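
# note: forward() above mean-pools over every position, so padding tokens (index 0)
# also contribute to the sentence vector. The sketch below shows one possible variant
# that passes a key padding mask to the encoder and takes a masked mean instead.
# It is only an illustration and is not used by the rest of this script;
# masked_forward is an invented name, not part of the original model.
def masked_forward(model, x, pad_id=0):
    mask = (x == pad_id)                                   # (batch, seq_len), True = padding
    h = model.embedding(x)
    h = model.pos_encoder(h)
    h = model.transformer_encoder(h, src_key_padding_mask=mask)
    lengths = (~mask).sum(dim=1, keepdim=True).clamp(min=1)
    h = h.masked_fill(mask.unsqueeze(-1), 0.0).sum(dim=1) / lengths
    return model.fc(h)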

## Read the dataset and build the character vocabulary
def get_voc(df):
    rows = df.shape[0]
    full_text = ""
    for i in range(rows):
        text = df.iloc[i, 0]
        cls = df.iloc[i, 1]
        full_text += text + cls

    voc = {}
    for ch in full_text:
        if ch not in voc:
            voc[ch] = len(voc) + 1    # indices start at 1; 0 is reserved for padding

    print("voc:", voc)
    return voc
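
# train() and test() below repeat the same character-to-index loop. A small shared
# helper could look like this (illustrative only; the original code keeps the loop inline):
def encode(text, vocab):
    # unknown characters are skipped, matching the original behaviour
    return [vocab[ch] for ch in text if ch in vocab]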

def train():
    file_path = "data_single.csv"
    df = pd.read_csv(file_path)

    vocab = get_voc(df)
    vocab_size = len(vocab)+1
    embed_size = 128
    batch_size = 32
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # compute each sentence's character length and sort in ascending order,
    # so that sentences of similar length end up in the same batch
    df_sorted = (
        df.assign(length=df['evaluation'].str.len())
          .sort_values('length', ascending=True)
          .drop('length', axis=1)
    )
    df_sorted = df_sorted.reset_index(drop=True)

    scentences = []
    labels = []
    for i in range(df_sorted.shape[0]):
        text = df_sorted.iloc[i, 0]

        scentence = []
        for term in text:
            if term in vocab:
                scentence.append(vocab[term])
        scentences.append(scentence)

        if df_sorted.iloc[i, 1] == '正面':      # '正面' (positive) -> 1, everything else -> 0
            labels.append(1)
        else:
            labels.append(0)

    # pad sentences to the batch maximum; the final incomplete batch is simply dropped
    # (an alternative using pad_sequence is sketched right after this function)
    for i in range(0,len(scentences)-batch_size,batch_size):
        max_len = 0
        for j in range(i,i+batch_size):
            if len(scentences[j]) > max_len:
                max_len = len(scentences[j])

        for j in range(i,i+batch_size):
            if len(scentences[j]) < max_len:
                for k in range(max_len-len(scentences[j])):
                    scentences[j].append(0)             # padding

    model = ClsFormer(vocab_size, embed_size, num_classes=2, num_layers=2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(10):
        model.train()                       # back to training mode (eval() is set below)
        for i in range(0,len(scentences)-batch_size,batch_size):
           
            scentences_tensor = torch.tensor(scentences[i:i+batch_size]).to(device)
            labels_tensor = torch.tensor(labels[i:i+batch_size]).to(device)

            outputs = model(scentences_tensor)
            loss = criterion(outputs, labels_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
          
        # accuracy on the training set
        model.eval()                        # disable dropout while evaluating
        correct = 0
        total = 0
        with torch.no_grad():
            for i in range(0,len(scentences)-batch_size,batch_size):
                scentences_tensor = torch.tensor(scentences[i:i+batch_size]).to(device)
                labels_tensor = torch.tensor(labels[i:i+batch_size]).to(device)

                outputs = model(scentences_tensor)
                _, predicted = torch.max(outputs.data, 1)
                total += labels_tensor.size(0)
                correct += (predicted == labels_tensor).sum().item()
        accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/10], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')

    torch.save(model.state_dict(), 'model.pth')
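
# The per-batch padding loop inside train() could also be written with
# torch.nn.utils.rnn.pad_sequence. A hedged alternative sketch (not what the code
# above actually does):
def pad_batch(batch):
    # batch: list of lists of character indices -> LongTensor (batch_size, max_len)
    from torch.nn.utils.rnn import pad_sequence
    tensors = [torch.tensor(s, dtype=torch.long) for s in batch]
    return pad_sequence(tensors, batch_first=True, padding_value=0)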


def test():
    file_path = "data_single.csv"
    df = pd.read_csv(file_path)

    vocab = get_voc(df)
    vocab_size = len(vocab)+1
    embed_size = 128
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ClsFormer(vocab_size, embed_size, 2, 2)
    model.load_state_dict(torch.load('model.pth', map_location=device))
    model.to(device)
    model.eval()                            # inference only

    while True:
        text = input("请输入要预测的文本:")    # prompt: "enter the text to classify"
        if text == "exit":
            break

        scentence = []
        for term in text:
            if term in vocab:
                scentence.append(vocab[term])
        if not scentence:                   # none of the characters appear in the vocabulary
            print("no known characters in the input")
            continue

        scentence_tensor = torch.tensor(scentence).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(scentence_tensor)
        _, predicted = torch.max(outputs.data, 1)
        if predicted.item() == 1:
            print("正面")    # positive
        else:
            print("负面")    # negative

if __name__ == "__main__":

    train()
    test()
posted @ 2025-05-18 17:56  Dsp Tian