# One-hot text classification
# The tricky part: mean pooling collapses a dimension in the forward pass, and the backward pass has to "un-collapse" it
import numpy as np
from torch.utils.data import Dataset, DataLoader
# read_data: load texts and labels from a file
def read_data(path):
    with open(path, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")  # split the file into lines
    all_text = []
    all_label = []
    for data in all_data:
        data_s = data.split(" ")  # split each line on the space
        if len(data_s) != 2:  # skip malformed lines that don't yield exactly two fields
            continue
        text, label = data_s  # unpack into text and label
        all_text.append(text)  # collect the text
        all_label.append(int(label))  # cast the label to int and collect it
    return all_text, all_label
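# Expected file format, inferred from the parsing above: one sample per line as
# "<text> <label>" separated by a single space, e.g. a line like "天气很好 1"
# (illustrative example, not an actual line from train3.txt).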
# get_word_2_index: build the vocabulary
def get_word_2_index(all_text):
    word_2_index = {"PAD": 0}  # padding token gets index 0
    for text in all_text:  # iterate over every line of text
        for word in text:  # iterate character by character: no tokenizer here, so one character = one "word"
            if word not in word_2_index:  # add unseen characters to the vocabulary
                # the new entry's index is the current vocabulary size
                word_2_index[word] = len(word_2_index)
    index_2_word = list(word_2_index.keys())  # index-to-word mapping
    return word_2_index, index_2_word  # return both mappings
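# For example (illustrative input): get_word_2_index(["天气", "气温"]) returns
# word_2_index == {"PAD": 0, "天": 1, "气": 2, "温": 3} and
# index_2_word == ["PAD", "天", "气", "温"].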
# get_word_onehot: build the one-hot vector for every vocabulary index
def get_word_onehot(len_):
    onehot = np.zeros((len_, len_))  # one row per vocabulary entry
    # np.identity(len_) or np.eye(len_) would build this identity matrix directly
    for i in range(len(onehot)):  # walk the diagonal
        onehot[i][i] = 1  # row i is the one-hot vector for index i
    return onehot
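# For example, get_word_onehot(3) equals np.eye(3): rows [1,0,0], [0,1,0], [0,0,1].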
# Numerically stable softmax over the last axis
def softmax(x):
    max_x = np.max(x, axis=-1, keepdims=True)
    x = x - max_x  # subtracting the row max prevents overflow in exp
    ex = np.exp(x)
    sum_ex = np.sum(ex, axis=-1, keepdims=True)
    result = ex / sum_ex
    # clip away exact 0s and 1s so log(p) and log(1 - p) in the loss stay finite
    result = np.clip(result, 1e-10, 1 - 1e-10)
    return result
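# Sanity check: softmax(np.array([[1.0, 2.0]])) ≈ [[0.2689, 0.7311]]; each row sums to 1.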
# make_onehot: one-hot encode the integer labels
def make_onehot(labels, class_num):
    result = np.zeros((labels.shape[0], class_num))
    for idx, cls in enumerate(labels):
        result[idx, cls] = 1
    return result
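# For example: make_onehot(np.array([0, 1, 1]), 2) returns [[1, 0], [0, 1], [0, 1]].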
# Dataset class, subclassing torch.utils.data.Dataset
class MyDataset(Dataset):
    def __init__(self, all_text, all_label):
        self.all_text = all_text
        self.all_label = all_label

    def __getitem__(self, index):
        # note: max_len, word_2_index and word_onehot are globals defined in __main__
        text = self.all_text[index][:max_len]  # fetch the text, truncated to max_len characters
        label = self.all_label[index]  # fetch the (already one-hot) label
        # characters -> indices
        text_index = [word_2_index[i] for i in text]
        # pad with index 0 ("PAD") up to max_len
        text_index = text_index + [0] * (max_len - len(text_index))
        # indices -> one-hot vectors
        text_emb = [word_onehot[i] for i in text_index]
        # stack into a numpy array
        text_emb = np.array(text_emb)
        return text_emb, label

    def __len__(self):
        return len(self.all_text)
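# Each item is a (max_len, vocab_size) one-hot matrix plus a (2,) one-hot label;
# the DataLoader's default collate stacks them into torch tensors of shape
# (batch_size, max_len, vocab_size) and (batch_size, 2).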
# Entry point
if __name__ == "__main__":
    np.random.seed(1000)
    # read the data, unpacking into train_text and train_label
    train_text, train_label = read_data(r"D:\my code\Python\NLP basic\data\train3.txt")
    # one-hot encode the labels (2 classes)
    train_label = make_onehot(np.array(train_label), 2)
    # build the vocabulary
    word_2_index, index_2_word = get_word_2_index(train_text)
    # build the one-hot embedding matrix
    word_onehot = get_word_onehot(len(word_2_index))
    # hyperparameters
    max_len = 8
    batch_size = 3
    epoch = 10
    lr = 0.01
    shuffle = True
    # weight matrix of the single linear layer: (vocab_size, 2)
    w = np.random.normal(size=(len(word_2_index), 2))
    # dataset and dataloader
    train_dataset = MyDataset(train_text, train_label)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    # training loop
    for e in range(epoch):
        for batch_text_emb, batch_label in train_dataloader:
            batch_text_emb = batch_text_emb.numpy()
            batch_label = batch_label.numpy()
            # forward
            # batch_text_emb: (batch_size, max_len, vocab_size), w: (vocab_size, 2)
            # pre: (batch_size, max_len, 2), one score pair per character position
            pre = batch_text_emb @ w
            # mean pooling over axis=1 (the max_len axis): that axis collapses,
            # leaving pre_mean with shape (batch_size, 2)
            pre_mean = np.mean(pre, axis=1)
            # p: (batch_size, 2) class probabilities
            p = softmax(pre_mean)
            # loss
            # binary cross-entropy loss (BCE) over the two softmax outputs
            loss = -np.mean(batch_label * np.log(p) + (1 - batch_label) * np.log(1 - p))
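            # Note: with one-hot labels and p summing to 1 over the two classes,
            # each column of the expression above equals the per-sample
            # cross-entropy, so this mean is exactly the standard softmax
            # cross-entropy; that is why G = p - batch_label below is its
            # gradient with respect to pre_mean.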
            # backward
            # G: (batch_size, 2), gradient of the loss w.r.t. pre_mean
            G = p - batch_label
            # dpre: (batch_size, max_len, 2)
            # the forward pass collapsed axis 1 by taking its mean; backpropagating
            # through a mean hands each of the max_len positions an equal share
            # G / max_len of the gradient (the "un-collapsing" flagged as hard to
            # understand at the top of the file)
            dpre = np.zeros_like(pre)
            for i in range(len(G)):  # over the batch
                for j in range(G.shape[1]):  # over the 2 classes
                    # broadcast G[i][j] / max_len along the whole max_len axis of dpre[i]
                    dpre[i][:, j] = G[i][j] / max_len
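            # Equivalent vectorized form of the two loops (a sketch, same shapes):
            # dpre = np.repeat(G[:, None, :], max_len, axis=1) / max_len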
            # update
            # in 2-D, dW = X.T @ grad; with a 3-D batch, transpose only the last
            # two axes, giving one (vocab_size, 2) gradient per sample
            delta_w = batch_text_emb.transpose(0, 2, 1) @ dpre
            delta_w = np.mean(delta_w, axis=0)  # average the per-sample gradients over the batch
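            # Equivalent einsum form (a sketch): delta_w = np.einsum("bmv,bmc->vc", batch_text_emb, dpre) / len(G)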
            w = w - lr * delta_w
            print(loss)
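    # A minimal inference sketch after training (the predict helper below is a
    # hypothetical addition, not part of the training script): encode a text
    # exactly as MyDataset does, then take the argmax class.
    def predict(text):
        idx = [word_2_index.get(c, 0) for c in text[:max_len]]  # unknown chars fall back to PAD
        idx = idx + [0] * (max_len - len(idx))
        emb = np.array([word_onehot[i] for i in idx])  # (max_len, vocab_size)
        prob = softmax((emb @ w).mean(axis=0, keepdims=True))  # (1, 2)
        return int(np.argmax(prob))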