Transformer step by step
A rough draft that pieces together the Transformer pipeline step by step, without worrying about usability.
step 1: Tokenizer
Train the tokenizer (using the Hugging Face tokenizers library)
from tokenizers import Tokenizer, models, trainers

# A BPE model wrapped by the tokenizers Tokenizer
tokenizer = Tokenizer(models.BPE())
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
    vocab_size=10000,
    min_frequency=2,
)
# Train on a local text file, then encode a sample sentence
tokenizer.train(["data.txt"], trainer)
encoded = tokenizer.encode("This is a test sentence")
print(encoded.tokens)
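To reuse the trained tokenizer in the later steps it can be persisted to disk; a minimal sketch (the file name tokenizer.json is just an assumption):
# Save the trained tokenizer and load it back later
tokenizer.save("tokenizer.json")
reloaded = Tokenizer.from_file("tokenizer.json")
print(reloaded.encode("This is a test sentence").tokens)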
Dummy data used for training (data.txt):
This is a sample text for tokenizer training. It includes various characters, numbers, and special symbols.
这是一些中文文本,用于测试分词器的训练效果。包含汉字、标点符号和数字123。
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 987 @$#% symbols!
Machine learning models need diverse text samples to properly learn tokenization.
Some technical terms: API, JSON, HTTP, REST, XML, OAuth, TCP/IP, DNS.
Email addresses like example@domain.com or URLs like https://www.example.org.
Special 文字 mixing with English in the 同一个 sentence to test 边界处理.
数字1234与letters混合Text以及标点符号!?.,;:'"(){}.
Multi-line
text with
different indentation
levels and spacing.
Programming keywords: function, class, import, return, async, await, var, const.
This is
a student
Test:
[00:00:00] Pre-processing files (0 Mo) ███████████████████████████████████████████████████████████ 100%
[00:00:00] Tokenize words ███████████████████████████████████████████████████████████ 15 / 15
[00:00:00] Count pairs ███████████████████████████████████████████████████████████ 15 / 15
[00:00:00] Compute merges ███████████████████████████████████████████████████████████ 96 / 96
['This ', 'is ', 'a', ' t', 'es', 't', ' s', 'ent', 'en', 'c', 'e']
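Note that some tokens above still contain spaces ('This ', ' t', ...), because no pre-tokenizer was configured, so BPE merges run over the raw string. A minimal sketch of splitting on whitespace first (assuming the Whitespace pre-tokenizer from tokenizers.pre_tokenizers; the tokenizer is rebuilt and retrained):
from tokenizers import Tokenizer, models, pre_tokenizers

# Split the input on whitespace/punctuation before BPE, so merges never cross word boundaries
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.train(["data.txt"], trainer)
print(tokenizer.encode("This is a test sentence").tokens)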
Hand-written Tokenizer implementation (TODO):
from collections import defaultdict, Counter
import re

class Bpe:
    def __init__(self, vocab_size=10):
        """Initialize the BPE tokenizer"""
        self.vocab_size = vocab_size   # target vocabulary size
        self.vocab = set()             # vocabulary
        self.word_freq = Counter()     # word frequency counts
        self.merges = []               # learned merge rules, in order

    def _initialize_corpus(self, corpus):
        """Initialize the corpus: split words into characters and append an end-of-word marker"""
        for line in corpus:
            words = line.split()
            for word in words:
                # Split the word into a character sequence and append the </w> marker
                self.word_freq[' '.join(list(word) + ['</w>'])] += 1
        # Initialize the vocabulary with every individual character plus the </w> marker
        self.vocab = set(''.join(corpus).replace(' ', '')) | {'</w>'}

    def _get_pairs(self):
        """Count the frequency of every adjacent symbol pair"""
        pairs = defaultdict(int)
        for word, freq in self.word_freq.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def _merge_pair(self, pair):
        """Merge the given symbol pair and update the corpus"""
        new_corpus = {}
        # Only match the pair on symbol boundaries, never inside a longer symbol
        pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        new_symbol = ''.join(pair)
        for word, freq in self.word_freq.items():
            # If the pair occurs in the word, replace it with the merged symbol
            new_word = pattern.sub(new_symbol, word)
            new_corpus[new_word] = freq
        self.word_freq = new_corpus
        return new_symbol

    def train(self, corpus):
        """Train the BPE model; accepts a list of raw strings or of file paths"""
        # Handle the case where an item is a file path
        processed_corpus = []
        for item in corpus:
            # Heuristic: treat strings ending in .txt as file paths
            if isinstance(item, str) and item.endswith('.txt'):
                try:
                    with open(item, 'r', encoding='utf-8') as f:
                        processed_corpus.extend(f.read().splitlines())
                except (FileNotFoundError, IOError):
                    # If the file cannot be opened, treat the item as plain text
                    processed_corpus.append(item)
            else:
                processed_corpus.append(item)
        # Step 1: initialize the corpus
        self._initialize_corpus(processed_corpus)
        # Step 2: merge iteratively until vocab_size is reached
        while len(self.vocab) < self.vocab_size:
            pairs = self._get_pairs()
            if not pairs:  # nothing left to merge
                break
            # Pick the most frequent pair
            best_pair = max(pairs, key=pairs.get)
            new_symbol = self._merge_pair(best_pair)
            self.vocab.add(new_symbol)
            self.merges.append(best_pair)
            # print(f"Merged {best_pair} -> {new_symbol}, Vocab size: {len(self.vocab)}")

    def _split_word(self, word):
        """Split a word into its initial character sequence"""
        return list(word) + ['</w>']

    def _merge_word(self, symbols):
        """Apply the merge rules learned during training"""
        symbols = symbols.copy()
        for pair in self.merges:
            i = 0
            while i < len(symbols) - 1:
                if (symbols[i], symbols[i + 1]) == pair:
                    symbols[i:i + 2] = [''.join(pair)]
                    i = 0  # restart from the beginning
                else:
                    i += 1
        return symbols

    def tokenize(self, text):
        """Tokenize new text"""
        words = text.split()
        tokenized = []
        for word in words:
            # Split into characters
            symbols = self._split_word(word)
            # Apply the merge rules
            merged = self._merge_word(symbols)
            # Drop the trailing </w> marker (optional)
            if merged[-1] == '</w>':
                merged = merged[:-1]
            tokenized.extend(merged)
        return tokenized
Test:
# Create a BPE instance
bpe = Bpe(vocab_size=10)
# Train
bpe.train(['data.txt'])
# Show the results
print("Vocabulary:", sorted(bpe.vocab))
print("Merge rules:", bpe.merges)
# Tokenize
text = "lowest newest"
tokens = bpe.tokenize(text)
print("Tokenization:", tokens)
Output:
Vocabulary: ['!', '"', '#', '$', '%', "'", '(', ')', ',', '-', '.', '/', '1', '2', '3', '4', '7', '8', '9', ':', ';', '<', '>', '?', '@', 'A', 'C', 'D', 'E', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '、', '。', '一', '与', '个', '中', '于', '些', '以', '分', '包', '及', '号', '合', '同', '含', '和', '器', '处', '字', '效', '数', '文', '是', '本', '果', '标', '汉', '测', '混', '点', '理', '用', '界', '的', '符', '练', '训', '词', '试', '边', '这', ',']
Merge rules: []
Tokenization: ['l', 'o', 'w', 'e', 's', 't', 'n', 'e', 'w', 'e', 's', 't']
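The merge-rule list is empty and the tokenization stays purely character-level because the initial character vocabulary of data.txt is already far larger than vocab_size=10, so the training loop exits before any merge is learned. A minimal sketch on a tiny corpus where merges can actually happen (the corpus string and vocab_size below are made up for illustration):
# Tiny corpus with only 8 distinct characters, so vocab_size=20 leaves room for merges
bpe_small = Bpe(vocab_size=20)
bpe_small.train(["low low low lower lowest newest newest new"])
print("Merge rules:", bpe_small.merges)          # should now contain pairs such as ('l', 'o')
print("Tokenization:", bpe_small.tokenize("lowest newest"))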
step 2: Embedding layer
This layer creates an initial embedding vector for every entry of the vocabulary trained in the previous step: with n tokens of d dimensions each, it is an n×d matrix. During subsequent training this matrix is
- used in the forward pass to look up the embedding vectors of a given sentence
- updated in the backward pass (this is how the embedding vector of each vocabulary entry gets trained)
PyTorch example code
import torch
import torch.nn as nn

# Parameters
vocab_size = 3
embedding_dim = 2

# Initialize the Embedding layer
torch.manual_seed(42)
embedding = nn.Embedding(vocab_size, embedding_dim)
nn.init.normal_(embedding.weight, mean=0, std=0.1)

# Input and optimizer
input_ids = torch.tensor([0, 1])  # "A B"
optimizer = torch.optim.SGD(embedding.parameters(), lr=0.1)

# Forward pass
output = embedding(input_ids)
print("Weights before training:")
print(embedding.weight.detach())

# grad_output is the gradient assumed to come back from the layers above;
# backward updates only the weight rows selected by the input indices.
grad_output = torch.tensor([[0.1, -0.2], [-0.3, 0.4]])

# Backward pass
output.backward(grad_output)
optimizer.step()

print("\nWeights after training:")
print(embedding.weight.detach())
Output:
Weights before training:
tensor([[ 0.2208, -0.0638],
[ 0.0462, 0.0267],
[ 0.0535, 0.0809]])
Weights after training:
tensor([[ 0.2108, -0.0438],
[ 0.0762, -0.0133],
[ 0.0535, 0.0809]])
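In a real model the gradient comes from a loss instead of being fed by hand; a minimal sketch with a made-up regression target, just to show the usual loss.backward() path:
# Hypothetical target vectors, only used to produce a gradient through a real loss
target = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
optimizer.zero_grad()
loss = ((embedding(input_ids) - target) ** 2).mean()
loss.backward()    # autograd fills embedding.weight.grad; only rows 0 and 1 are non-zero
optimizer.step()   # SGD updates those two rows, row 2 stays unchanged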
Hand-written version:
import numpy as np

class Embedding:
    def __init__(self, vocab_size, embedding_dim, std=0.1):
        """Initialize the Embedding layer"""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # Randomly initialize the weight matrix
        np.random.seed(42)
        self.weight = np.random.normal(loc=0, scale=std, size=(vocab_size, embedding_dim))
        # Store the input indices for the backward pass
        self.input_ids = None

    def forward(self, input_ids):
        """Forward pass: return the embedding vectors for the given indices"""
        self.input_ids = input_ids  # save the input for backward
        return self.weight[input_ids]

    def backward(self, grad_output, learning_rate=0.1):
        """Backward pass: update the weights from the incoming gradient"""
        # grad_output has shape (sequence_length, embedding_dim);
        # only the rows that appear in input_ids are updated
        for idx, grad in zip(self.input_ids, grad_output):
            self.weight[idx] -= learning_rate * grad

    def get_weight(self):
        """Return the current weight matrix"""
        return self.weight

# Test code
# Parameters
vocab_size = 3
embedding_dim = 2
# Create an Embedding instance
embedding = Embedding(vocab_size, embedding_dim)
# Input sequence
input_ids = np.array([0, 1])  # "A B"
# Forward pass
output = embedding.forward(input_ids)
print("Weights before training:")
print(embedding.get_weight())
# Gradient assumed to come back from the layers above
grad_output = np.array([[0.1, -0.2], [-0.3, 0.4]])
# Backward pass: update the weights
embedding.backward(grad_output, learning_rate=0.1)
print("\nWeights after training:")
print(embedding.get_weight())
Test output:
Weights before training:
[[ 0.04967142 -0.01382643]
[ 0.06476885 0.15230299]
[-0.02341534 -0.0234137 ]]
Weights after training:
[[ 0.03967142 0.00617357]
[ 0.09476885 0.11230299]
[-0.02341534 -0.0234137 ]]
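The row-wise update above is the special case of a matrix product: the lookup is equivalent to multiplying a one-hot matrix by the weight matrix, so the gradient w.r.t. the weights is one_hot.T @ grad_output and only the selected rows receive a non-zero gradient. A small check reusing the arrays from the test above:
# Equivalent view: embedding lookup as one_hot @ weight
one_hot = np.eye(vocab_size)[input_ids]          # shape (2, 3)
assert np.allclose(one_hot @ embedding.get_weight(), embedding.forward(input_ids))
# Gradient w.r.t. the weight matrix: rows not in input_ids stay all-zero
grad_weight = one_hot.T @ grad_output            # shape (3, 2)
print(grad_weight)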
// To be continued