Chapter 8: Text Classification (1)

#One-hot text classification
#Mean pooling, and recovering the collapsed dimension in the backward pass: the tricky part of this chapter

import numpy as np
from torch.utils.data import Dataset, DataLoader


# read_data: load the dataset file
def read_data(path):
    with open(path, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")  # split the file into lines

    all_text = []
    all_label = []
    for data in all_data:
        data_s = data.split(" ")  # split each line on a single space
        if len(data_s) != 2:  # skip lines that don't split into exactly two fields
            continue
        text, label = data_s  # unpack into text and label
        all_text.append(text)  # collect the text
        all_label.append(int(label))  # cast the label to int and collect it

    return all_text, all_label
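

# A sketch of the file format read_data expects (the actual train3.txt is not shown
# in this chapter, so the sample lines below are hypothetical): one sample per line,
# text and a 0/1 label separated by a single space, e.g.
#   今天天气真好 1
#   这部电影很无聊 0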


# get_word_2_index: build the vocabulary
def get_word_2_index(all_text):
    word_2_index = {"PAD": 0}  # padding token, index 0
    for text in all_text:  # iterate over each line of text
        for word in text:  # no tokenizer here: each character counts as one token
            if word not in word_2_index:  # add unseen tokens to the vocabulary
                word_2_index[word] = len(word_2_index)  # next free index = current vocabulary size
    index_2_word = list(word_2_index.keys())  # index-to-token mapping
    return word_2_index, index_2_word  # return both mappings
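

# A quick illustration on hypothetical input:
# get_word_2_index(["你好", "好的"]) returns
#   word_2_index = {"PAD": 0, "你": 1, "好": 2, "的": 3}
#   index_2_word = ["PAD", "你", "好", "的"]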


# get_word_onehot: build the matrix of one-hot word vectors
def get_word_onehot(len_):
    onehot = np.zeros((len_, len_))  # one row per token
    # np.identity(len_) or np.eye(len_) would build this identity matrix directly
    for i in range(len(onehot)):  # walk the rows
        onehot[i][i] = 1  # set element i of row i to 1
    return onehot
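

# Equivalent one-liner, as the comment above suggests:
# def get_word_onehot(len_):
#     return np.eye(len_)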


# softmax, with the usual max-subtraction trick for numerical stability
def softmax(x):
    max_x = np.max(x, axis=-1, keepdims=True)
    x = x - max_x  # shifting by the row max leaves the result unchanged but avoids overflow in exp

    ex = np.exp(x)
    sum_ex = np.sum(ex, axis=1, keepdims=True)

    result = ex / sum_ex

    result = np.clip(result, 1e-10, 1e10)  # keep probabilities away from exact 0 so np.log below is safe
    return result
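

# Sanity check: each row sums to 1.
# softmax(np.array([[1.0, 2.0]])) ≈ [[0.269, 0.731]]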


# make_onehot: one-hot encode the integer labels
def make_onehot(labels, class_num):
    result = np.zeros((labels.shape[0], class_num))
    for idx, cls in enumerate(labels):
        result[idx, cls] = 1
    return result
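

# Example: make_onehot(np.array([0, 1, 1]), 2) -> [[1, 0], [0, 1], [0, 1]]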


# Dataset class, subclassing torch.utils.data.Dataset.
# Note: __getitem__ reads max_len, word_2_index and word_onehot as globals defined
# in __main__; that works because it only runs after they exist.
class MyDataset(Dataset):
    def __init__(self, all_text, all_label):
        self.all_text = all_text
        self.all_label = all_label

    def __getitem__(self, index):
        text = self.all_text[index][:max_len]  # fetch the text, truncated to max_len
        label = self.all_label[index]  # fetch the label

        # text -> token indices
        text_index = [word_2_index[i] for i in text]
        text_index = text_index + [0] * (
            max_len - len(text_index)
        )  # pad with 0 (PAD) up to max_len
        # indices -> one-hot vectors
        text_emb = [word_onehot[i] for i in text_index]
        # stack into a numpy array
        text_emb = np.array(text_emb)

        return text_emb, label

    def __len__(self):
        return len(self.all_text)
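

# Shape sketch for one sample, writing vocab_size for len(word_2_index):
#   text_emb: (max_len, vocab_size), one row per position; padded positions hold
#             the one-hot vector of PAD (index 0)
#   label:    after make_onehot in __main__, a one-hot vector of shape (2,)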


# Main entry point
if __name__ == "__main__":
    np.random.seed(1000)
    # load the data, unpacking into train_text and train_label
    train_text, train_label = read_data(r"D:\my code\Python\NLP basic\data\train3.txt")
    # convert the labels to one-hot (2 classes)
    train_label = make_onehot(np.array(train_label), 2)
    # build the vocabulary
    word_2_index, index_2_word = get_word_2_index(train_text)
    # build the one-hot word vectors
    word_onehot = get_word_onehot(len(word_2_index))

    # Hyperparameters
    max_len = 8
    batch_size = 3
    epoch = 10
    lr = 0.01
    shuffle = True
    w = np.random.normal(size=(len(word_2_index), 2))  # weight matrix, shape (vocab_size, 2)

    # Dataset and DataLoader
    train_dataset = MyDataset(train_text, train_label)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    # print(word_onehot)
    # Training loop

    for e in range(epoch):

        for batch_text_emb, batch_label in train_dataloader:
            batch_text_emb = batch_text_emb.numpy()
            batch_label = batch_label.numpy()

            # forward
            # batch_text_emb: (batch_size, max_len, vocab_size); w: (vocab_size, 2)
            # pre: (batch_size, max_len, 2)
            pre = batch_text_emb @ w
            # mean pooling over axis=1: the max_len dimension collapses; pre_mean: (batch_size, 2)
            pre_mean = np.mean(pre, axis=1)
            # p: (batch_size, 2)
            p = softmax(pre_mean)

            # loss
            # binary cross-entropy (BCE); with a two-class softmax and one-hot labels,
            # this mean works out to the categorical cross-entropy, which is why
            # G = p - batch_label below is the correct gradient w.r.t. pre_mean
            loss = -np.mean(batch_label * np.log(p) + (1 - batch_label) * np.log(1 - p))

            # backward
            # G: (batch_size, 2), the standard softmax + cross-entropy gradient w.r.t. pre_mean
            G = p - batch_label
            # dpre: (batch_size, max_len, 2)
            # The forward pass collapsed the max_len dimension into a mean; now expand it back.
            # Each of the max_len positions contributed 1/max_len of that mean, so each
            # position receives G scaled by 1/max_len
            dpre = np.zeros_like(pre)
            for i in range(len(G)):  # loop over the batch
                for j in range(G.shape[1]):  # loop over the 2 classes
                    # broadcast G[i][j] across the whole max_len dimension of dpre[i]
                    dpre[i][:, j] = G[i][j] / pre.shape[1]  # the 1/max_len factor from the mean
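
            # Vectorized equivalent of the two loops above:
            # dpre = np.repeat(G[:, None, :], pre.shape[1], axis=1) / pre.shape[1]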

            # update
            # In 2D, dw = x.T @ dpre. With a 3D batch, transpose only the last two axes,
            # do a batched matmul, then average over the batch dimension
            delta_w = batch_text_emb.transpose(0, 2, 1) @ dpre
            delta_w = np.mean(delta_w, axis=0)
            w = w - lr * delta_w
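
            # The same gradient computed with einsum, for reference:
            # delta_w = np.einsum("blv,blc->vc", batch_text_emb, dpre) / len(G)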

        print(loss)  # loss of the last batch in this epoch
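
    # A minimal inference sketch, assuming the characters of the new text were seen
    # during training (unseen ones fall back to PAD here, a simplifying assumption):
    def predict(text):
        idx = [word_2_index.get(ch, 0) for ch in text[:max_len]]
        idx = idx + [0] * (max_len - len(idx))  # pad to max_len
        emb = np.array([word_onehot[i] for i in idx])  # (max_len, vocab_size)
        prob = softmax(np.mean(emb @ w, axis=0, keepdims=True))  # (1, 2)
        return int(np.argmax(prob))  # predicted class, 0 or 1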