Transformer step by step

A rough, draft-style walkthrough that assembles the Transformer pipeline step by step, without worrying about production usability.

Step 1: Tokenizer

Train a tokenizer (using the Hugging Face tokenizers library):

from tokenizers import Tokenizer, models, trainers

# BPE model; note that no pre_tokenizer is set, so the model sees raw text
# and whitespace can end up inside learned tokens (visible in the output below)
tokenizer = Tokenizer(models.BPE())
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
    vocab_size=10000,
    min_frequency=2,
)

# train straight from a text file, then encode a sample sentence
tokenizer.train(["data.txt"], trainer)

encoded = tokenizer.encode("This is a test sentence")

print(encoded.tokens)

Dummy data for training (the contents of data.txt):

This is a sample text for tokenizer training. It includes various characters, numbers, and special symbols.
这是一些中文文本,用于测试分词器的训练效果。包含汉字、标点符号和数字123。
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 987 @$#% symbols!
Machine learning models need diverse text samples to properly learn tokenization.
Some technical terms: API, JSON, HTTP, REST, XML, OAuth, TCP/IP, DNS.
Email addresses like example@domain.com or URLs like https://www.example.org.
Special 文字 mixing with English in the 同一个 sentence to test 边界处理.
数字1234与letters混合Text以及标点符号!?.,;:'"(){}.
Multi-line
text with
different indentation
    levels and spacing.
Programming keywords: function, class, import, return, async, await, var, const.
This is
a student

Test:

[00:00:00] Pre-processing files (0 Mo)    ███████████████████████████████████████████████████████████                100%
[00:00:00] Tokenize words                 ███████████████████████████████████████████████████████████ 15       /       15
[00:00:00] Count pairs                    ███████████████████████████████████████████████████████████ 15       /       15
[00:00:00] Compute merges                 ███████████████████████████████████████████████████████████ 96       /       96
['This ', 'is ', 'a', ' t', 'es', 't', ' s', 'ent', 'en', 'c', 'e']
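
Note the whitespace baked into tokens like 'This ' and 'is ': no pre_tokenizer was set, so the BPE model learns merges over the raw text, spaces included. A minimal sketch of the more common setup, which splits on whitespace and punctuation before BPE runs (same trainer settings as above):

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()  # tokens can no longer span spaces

trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
    vocab_size=10000,
    min_frequency=2,
)
tokenizer.train(["data.txt"], trainer)
print(tokenizer.encode("This is a test sentence").tokens)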

Hand-written tokenizer implementation (TODO):

from collections import defaultdict, Counter
import re

class Bpe:
    def __init__(self, vocab_size=10):
        """Initialize the BPE tokenizer"""
        self.vocab_size = vocab_size  # target vocabulary size
        self.vocab = set()  # vocabulary
        self.word_freq = Counter()  # word frequency counts
        self.merges = []  # learned merge rules, in order

    def _initialize_corpus(self, corpus):
        """Initialize the corpus: split words into characters and append an end-of-word marker"""
        for line in corpus:
            words = line.split()
            for word in words:
                # split the word into a character sequence and append the </w> marker
                self.word_freq[' '.join(list(word) + ['</w>'])] += 1
        # initialize the vocabulary with every single character, plus </w> as one
        # symbol (not as the separate characters '<', '/', 'w', '>')
        self.vocab = set(''.join(corpus).replace(' ', ''))
        self.vocab.add('</w>')

    def _get_pairs(self):
        """Count the frequency of every adjacent symbol pair"""
        pairs = defaultdict(int)
        for word, freq in self.word_freq.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def _merge_pair(self, pair):
        """Merge the given symbol pair and update the corpus"""
        new_corpus = {}
        new_symbol = ''.join(pair)
        # match the pair only on symbol boundaries, so e.g. ('e', 's') cannot
        # fire across the boundary in a symbol sequence like 'we s'
        pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        for word, freq in self.word_freq.items():
            # wherever the pair occurs in the word, replace it with the new symbol
            new_word = pattern.sub(new_symbol, word)
            new_corpus[new_word] = freq
        self.word_freq = new_corpus
        return new_symbol

    def train(self, corpus):
        """Train the BPE model; accepts a list of raw strings or of file paths"""
        # handle the case where an item is a file path
        processed_corpus = []
        for item in corpus:
            # treat items ending in .txt as candidate file paths
            if isinstance(item, str) and item.endswith('.txt'):
                try:
                    with open(item, 'r', encoding='utf-8') as f:
                        processed_corpus.extend(f.read().splitlines())
                except (FileNotFoundError, IOError):
                    # if the file cannot be opened, fall back to treating it as plain text
                    processed_corpus.append(item)
            else:
                processed_corpus.append(item)

        # Step 1: initialize the corpus
        self._initialize_corpus(processed_corpus)

        # Step 2: merge iteratively until vocab_size is reached
        while len(self.vocab) < self.vocab_size:
            pairs = self._get_pairs()
            if not pairs:  # nothing left to merge
                break
            # pick the most frequent symbol pair
            best_pair = max(pairs, key=pairs.get)
            new_symbol = self._merge_pair(best_pair)
            self.vocab.add(new_symbol)
            self.merges.append(best_pair)
            # print(f"Merged {best_pair} -> {new_symbol}, Vocab size: {len(self.vocab)}")

    def _split_word(self, word):
        """Split a word into its initial character sequence"""
        return list(word) + ['</w>']

    def _merge_word(self, symbols):
        """Apply the merge rules learned during training, in order"""
        symbols = symbols.copy()
        for pair in self.merges:
            i = 0
            while i < len(symbols) - 1:
                if (symbols[i], symbols[i + 1]) == pair:
                    symbols[i:i + 2] = [''.join(pair)]
                    i = 0  # re-scan from the start
                else:
                    i += 1
        return symbols

    def tokenize(self, text):
        """Tokenize new text"""
        words = text.split()
        tokenized = []
        for word in words:
            # split into characters
            symbols = self._split_word(word)
            # apply the merge rules
            merged = self._merge_word(symbols)
            # drop the trailing </w> (optional)
            if merged[-1] == '</w>':
                merged = merged[:-1]
            tokenized.extend(merged)
        return tokenized

Test:

# create a BPE instance
bpe = Bpe(vocab_size=10)

# train
bpe.train(['data.txt'])

# print the results
print("Vocabulary:", sorted(bpe.vocab))
print("Merge rules:", bpe.merges)

# tokenize
text = "lowest newest"
tokens = bpe.tokenize(text)
print("Tokens:", tokens)

Output (the initial character vocabulary of data.txt already exceeds vocab_size=10, so no merges are learned and tokenization falls back to single characters):

Vocabulary: ['!', '"', '#', '$', '%', "'", '(', ')', ',', '-', '.', '/', '1', '2', '3', '4', '7', '8', '9', ':', ';', '</w>', '?', '@', 'A', 'C', 'D', 'E', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '、', '。', '一', '与', '个', '中', '于', '些', '以', '分', '包', '及', '号', '合', '同', '含', '和', '器', '处', '字', '效', '数', '文', '是', '本', '果', '标', '汉', '测', '混', '点', '理', '用', '界', '的', '符', '练', '训', '词', '试', '边', '这', ',']
Merge rules: []
Tokens: ['l', 'o', 'w', 'e', 's', 't', 'n', 'e', 'w', 'e', 's', 't']
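
Because vocab_size=10 leaves no headroom above the character vocabulary, nothing is merged above. A quick sketch with a tiny made-up corpus whose character set is small enough for merges to actually fire (10 characters plus </w>, so vocab_size=15 leaves room for 4 merges):

# tiny corpus: 11 initial symbols, so 4 merges get learned
bpe_demo = Bpe(vocab_size=15)
bpe_demo.train(["low low low lower lower newest newest newest newest widest"])
print("Merge rules:", bpe_demo.merges)       # most frequent pairs first, e.g. ('w', 'e')
print("Tokens:", bpe_demo.tokenize("lowest"))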

Step 2: Embedding layer

This layer assigns an initialized embedding vector to every entry of the vocabulary trained in the previous step: with n words of d dimensions each, that is an n×d matrix. During subsequent training this matrix is

  • used on the forward pass to look up the embedding vectors of a given sentence
  • updated on the backward pass (this is the process by which each word's embedding vector actually gets trained); see the one-hot sketch after this list
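
Both bullets follow from one fact: the lookup is mathematically a one-hot vector multiplied by the weight matrix, which is why the backward pass only touches the rows that were actually looked up. A minimal sketch of that equivalence (nn.Embedding indexes directly for efficiency rather than materializing one-hot vectors):

import torch
import torch.nn as nn
import torch.nn.functional as F

emb = nn.Embedding(3, 2)
ids = torch.tensor([0, 1])

# a (2, 3) one-hot matrix times the (3, 2) weight matrix selects rows 0 and 1
one_hot = F.one_hot(ids, num_classes=3).float()
assert torch.allclose(one_hot @ emb.weight, emb(ids))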

PyTorch example code:

import torch
import torch.nn as nn

# parameters
vocab_size = 3
embedding_dim = 2

# initialize the Embedding layer
torch.manual_seed(42)
embedding = nn.Embedding(vocab_size, embedding_dim)
nn.init.normal_(embedding.weight, mean=0, std=0.1)

# input and a simulated loss
input_ids = torch.tensor([0, 1])  # "A B"
optimizer = torch.optim.SGD(embedding.parameters(), lr=0.1)

# forward pass
output = embedding(input_ids)
print("Weights before training:")
print(embedding.weight.detach())

# grad_output stands in for the gradient flowing back from later layers;
# backward updates only the weight rows selected by the input indices.
grad_output = torch.tensor([[0.1, -0.2], [-0.3, 0.4]])

# backward pass
output.backward(grad_output)
optimizer.step()

print("\nWeights after training:")
print(embedding.weight.detach())

Output:

Weights before training:
tensor([[ 0.2208, -0.0638],
        [ 0.0462,  0.0267],
        [ 0.0535,  0.0809]])

Weights after training:
tensor([[ 0.2108, -0.0438],
        [ 0.0762, -0.0133],
        [ 0.0535,  0.0809]])
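
The update above is plain SGD applied to the two rows that were looked up; row 2 never appears in input_ids, so its gradient is zero and it stays unchanged. A quick sanity check of the arithmetic, with the values copied from the printout above:

import torch

old = torch.tensor([[0.2208, -0.0638],
                    [0.0462,  0.0267]])
grad = torch.tensor([[0.1, -0.2],
                     [-0.3, 0.4]])
# new_row = old_row - lr * grad_row, with lr = 0.1
print(old - 0.1 * grad)  # reproduces rows 0 and 1 of the weights after training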

Hand-written version:

import numpy as np

class Embedding:
    def __init__(self, vocab_size, embedding_dim, std=0.1):
        """Initialize the Embedding layer"""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # randomly initialize the weight matrix
        np.random.seed(42)
        self.weight = np.random.normal(loc=0, scale=std, size=(vocab_size, embedding_dim))
        # store the input indices for use in backward
        self.input_ids = None

    def forward(self, input_ids):
        """Forward pass: return the embedding vectors for the input indices"""
        self.input_ids = input_ids  # save the input so backward can use it
        return self.weight[input_ids]

    def backward(self, grad_output, learning_rate=0.1):
        """Backward pass: update the weights from the gradient"""
        # grad_output has shape (sequence_length, embedding_dim);
        # only the rows for words that appear in input_ids are updated
        for idx, grad in zip(self.input_ids, grad_output):
            self.weight[idx] -= learning_rate * grad

    def get_weight(self):
        """Return the current weight matrix"""
        return self.weight

# test code
# parameters
vocab_size = 3
embedding_dim = 2

# create an Embedding instance
embedding = Embedding(vocab_size, embedding_dim)

# input sequence
input_ids = np.array([0, 1])  # "A B"

# forward pass
output = embedding.forward(input_ids)
print("Weights before training:")
print(embedding.get_weight())

# gradient assumed to come back from later layers
grad_output = np.array([[0.1, -0.2], [-0.3, 0.4]])

# backward pass to update the weights
embedding.backward(grad_output, learning_rate=0.1)

print("\nWeights after training:")
print(embedding.get_weight())

Test output:

Weights before training:
[[ 0.04967142 -0.01382643]
 [ 0.06476885  0.15230299]
 [-0.02341534 -0.0234137 ]]

Weights after training:
[[ 0.03967142  0.00617357]
 [ 0.09476885  0.11230299]
 [-0.02341534 -0.0234137 ]]
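
One subtlety: when the same index appears more than once in input_ids, PyTorch accumulates the gradients for that row before the optimizer step, while the loop above applies them one by one; for plain SGD the two give the same result. A small sketch of that case (the gradient numbers are arbitrary):

# look up row 1 twice, then push back two gradients for it
emb2 = Embedding(vocab_size=3, embedding_dim=2)
emb2.forward(np.array([1, 1]))
before = emb2.get_weight()[1].copy()
emb2.backward(np.array([[0.1, -0.2], [0.3, 0.4]]), learning_rate=0.1)
# row 1 moves by -0.1 * ([0.1, -0.2] + [0.3, 0.4])
print(emb2.get_weight()[1] - before)  # ≈ [-0.04, -0.02]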

// To be continued
