nn.Embedding
Let me walk through a few concrete examples of how nn.Embedding is used.
1. Basic example: word embeddings
import torch
import torch.nn as nn

# Example 1: create a simple word embedding layer
# Suppose we have 5 words, each represented by a 3-dimensional vector
embedding = nn.Embedding(num_embeddings=5, embedding_dim=3)

# Weights are randomly initialized (from a standard normal distribution by default)
print("Original weight matrix:")
print(embedding.weight)
print(f"Shape: {embedding.weight.shape}")  # (5, 3)

# Look up the embeddings for a few word indices
word_indices = torch.tensor([0, 2, 4])  # take the 0th, 2nd and 4th words
word_vectors = embedding(word_indices)
print("\nWord vectors:")
print(word_vectors)
print(f"Shape: {word_vectors.shape}")  # (3, 3)
Sample output:
Original weight matrix:
tensor([[ 0.6614,  0.2669,  0.0617],
        [ 0.6213, -0.4519, -0.1661],
        [-1.5228,  0.3817, -1.0276],
        [-0.5631, -0.8923, -0.0583],
        [-0.1955, -0.9656,  0.4224]], requires_grad=True)
Shape: torch.Size([5, 3])

Word vectors:
tensor([[ 0.6614,  0.2669,  0.0617],
        [-1.5228,  0.3817, -1.0276],
        [-0.1955, -0.9656,  0.4224]], grad_fn=<EmbeddingBackward>)
Shape: torch.Size([3, 3])
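A detail worth spelling out (my addition, not part of the original example): the lookup above is nothing more than row indexing into embedding.weight, which in turn is equivalent to multiplying a one-hot matrix by the weight matrix. A small sketch, continuing from example 1:

import torch.nn.functional as F

# The embedding lookup is plain row indexing into the weight matrix
lookup = embedding(word_indices)
manual = embedding.weight[word_indices]
print(torch.allclose(lookup, manual))  # True

# Equivalently: one-hot vectors times the weight matrix
one_hot = F.one_hot(word_indices, num_classes=5).float()  # (3, 5)
print(torch.allclose(one_hot @ embedding.weight, lookup))  # True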
2. Practical application: a text classification model
# Example 2: build a simple text classifier
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # x shape: (batch_size, seq_len)
        x = self.embedding(x)  # shape: (batch_size, seq_len, embed_dim)
        # Average over the sequence dimension to get one representation per sentence
        x = x.mean(dim=1)  # shape: (batch_size, embed_dim)
        # Pass through the classification layers
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Create the model
vocab_size = 10000   # assume a vocabulary of 10000 words
embed_dim = 256      # each word represented by a 256-dimensional vector
hidden_size = 128
num_classes = 2      # binary classification (positive/negative)
model = TextClassifier(vocab_size, embed_dim, hidden_size, num_classes)

# Simulate a batch of data
batch_size = 4
seq_len = 10
input_data = torch.randint(0, vocab_size, (batch_size, seq_len))

# Forward pass
output = model(input_data)
print(f"Input shape: {input_data.shape}")
print(f"Output shape: {output.shape}")
print(f"Output values:\n{output}")
3. Using pretrained word vectors
# Example 3: loading pretrained GloVe-style word vectors
def load_pretrained_embeddings():
    # Create a small toy vocabulary
    vocab = ["<pad>", "<unk>", "apple", "banana", "orange", "fruit", "eat"]
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    # Simulated pretrained vectors (in practice these are loaded from a file)
    pretrained_vectors = {
        "apple": [0.8, 0.1, 0.2],
        "banana": [0.7, 0.3, 0.1],
        "orange": [0.9, 0.2, 0.3],
        "fruit": [0.5, 0.4, 0.2],
        "eat": [0.1, 0.9, 0.0]
    }
    return vocab, word_to_idx, pretrained_vectors

# Create the embedding layer
vocab, word_to_idx, pretrained_vectors = load_pretrained_embeddings()
vocab_size = len(vocab)
embed_dim = 3
embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

# Initialize with the pretrained vectors
embedding_matrix = torch.randn(vocab_size, embed_dim)  # start from random values
# Copy each pretrained vector into its row
for word, idx in word_to_idx.items():
    if word in pretrained_vectors:
        embedding_matrix[idx] = torch.tensor(pretrained_vectors[word])

# Load the pretrained weights into the embedding layer
embedding.weight.data.copy_(embedding_matrix)
embedding.weight.requires_grad = False  # freeze the word vectors (no updates)

# Test
sentence = ["apple", "banana", "fruit"]
indices = [word_to_idx[word] for word in sentence]
input_tensor = torch.tensor(indices).unsqueeze(0)  # (1, 3)
output = embedding(input_tensor)
print("Sentence vectors:")
print(output)
print(f"Shape: {output.shape}")
4. Handling variable-length sequences (with padding)
# Example 4: handling variable-length sequences (with padding)
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# A batch of sentences of different lengths
sentences = [
    [1, 2, 3, 4],        # length 4
    [5, 6, 7],           # length 3
    [8, 9, 10, 11, 12]   # length 5
]

# Create the embedding layer
embedding = nn.Embedding(num_embeddings=20, embedding_dim=4, padding_idx=0)

# Pad all sentences to the same length
padded_sentences = pad_sequence([torch.tensor(s) for s in sentences],
                                batch_first=True,
                                padding_value=0)
print("Padded sequences:")
print(padded_sentences)
print(f"Shape: {padded_sentences.shape}")

# Record the original lengths
lengths = torch.tensor([len(s) for s in sentences])

# Pass through the embedding layer
embedded = embedding(padded_sentences)
print(f"\nShape after embedding: {embedded.shape}")

# Use pack_padded_sequence to handle the variable lengths
packed_input = pack_padded_sequence(embedded, lengths,
                                    batch_first=True,
                                    enforce_sorted=False)
print(f"\nPacked data: {packed_input}")
5. Use in a Transformer
# Example 5: Transformer token embeddings combined with position embeddings
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len=512, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_len, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: (batch_size, seq_len)
        seq_len = x.size(1)
        # Token embeddings
        token_embeds = self.token_embedding(x)  # (batch_size, seq_len, embed_dim)
        # Position embeddings
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        position_embeds = self.position_embedding(positions)  # (1, seq_len, embed_dim)
        # Add them together (broadcast over the batch dimension)
        embeddings = token_embeds + position_embeds
        embeddings = self.dropout(embeddings)
        return embeddings

# Usage
vocab_size = 5000
embed_dim = 512
model = TransformerEmbedding(vocab_size, embed_dim)

# Simulated input
batch_size = 2
seq_len = 20
input_tensor = torch.randint(0, vocab_size, (batch_size, seq_len))
output = model(input_tensor)
print(f"Input shape: {input_tensor.shape}")
print(f"Output shape: {output.shape}")
6. Visualizing word-vector similarity
# Example 6: inspecting similarity between word vectors
from sklearn.metrics.pairwise import cosine_similarity

# Create an embedding layer and set its weights by hand
embedding = nn.Embedding(6, 4)
# Hand-picked weights so the similarities are easy to see
manual_weights = torch.tensor([
    [1.0, 0.0, 0.0, 0.0],  # index 0: "king"
    [0.8, 0.2, 0.0, 0.0],  # index 1: "queen"
    [0.0, 1.0, 0.0, 0.0],  # index 2: "woman"
    [0.0, 0.0, 1.0, 0.0],  # index 3: "man"
    [0.5, 0.5, 0.0, 0.0],  # index 4: "royal"
    [0.0, 0.0, 0.0, 1.0]   # index 5: "apple"
])
embedding.weight.data.copy_(manual_weights)

# Look up all vectors
indices = torch.arange(6)
vectors = embedding(indices).detach().numpy()

# Cosine similarity matrix
similarity_matrix = cosine_similarity(vectors)
print("Cosine similarity matrix of the word vectors:")
words = ["king", "queen", "woman", "man", "royal", "apple"]
for i in range(6):
    print(f"{words[i]:>6}: {similarity_matrix[i].round(3)}")
# Observation: "king" is highly similar to "queen" and dissimilar to "apple"
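The same matrix can also be computed in pure PyTorch without sklearn, which avoids the round trip through NumPy. A sketch using the embedding from example 6:

import torch.nn.functional as F

vecs = embedding.weight.detach()     # (6, 4)
normed = F.normalize(vecs, dim=1)    # unit-length rows
similarity = normed @ normed.T       # (6, 6) cosine similarity matrix
print(similarity)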
7. A full training example
# Example 7: a complete small training example
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# A simple synthetic dataset
class WordDataset(Dataset):
    def __init__(self, vocab_size=100, seq_len=5, num_samples=1000):
        self.data = torch.randint(0, vocab_size, (num_samples, seq_len))
        # Simple labeling rule: label 1 if the sequence contains word 0 or word 1, else 0
        self.labels = ((self.data == 0) | (self.data == 1)).any(dim=1).long()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# The model
class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x shape: (batch_size, seq_len)
        embeds = self.embedding(x)        # (batch_size, seq_len, embed_dim)
        embeds_mean = embeds.mean(dim=1)  # (batch_size, embed_dim)
        output = self.fc(embeds_mean)
        return output

# Training setup
vocab_size = 100
embed_dim = 10
num_classes = 2
batch_size = 32
epochs = 3

# Data
dataset = WordDataset(vocab_size=vocab_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss function, optimizer
model = SimpleClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    correct = 0
    total = 0
    for batch_data, batch_labels in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}, Accuracy = {accuracy:.2f}%")
Key takeaways:
- The input must be an integer tensor of indices
- The output is continuous vectors that are optimized during training
- Common use cases:
  - Text classification / sentiment analysis
  - Machine translation
  - Named entity recognition
  - Recommendation systems (user ID / item ID embeddings; see the sketch after this list)
  - Encoding categorical features
- Practical tips:
  - Use padding_idx to handle variable-length sequences
  - Embeddings can be initialized with pretrained vectors
  - The embedding layer can be frozen (requires_grad=False)
  - Embedding dimensions are typically 50, 100, 200, 300, and so on
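Below is a minimal sketch of the recommendation-system use case mentioned in the list: user IDs and item IDs each get their own embedding table, and a dot product scores each user-item pair. All sizes here are illustrative assumptions.

class DotProductRecommender(nn.Module):
    def __init__(self, num_users, num_items, embed_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)

    def forward(self, user_ids, item_ids):
        # user_ids, item_ids: (batch_size,) integer tensors
        u = self.user_embedding(user_ids)  # (batch_size, embed_dim)
        v = self.item_embedding(item_ids)  # (batch_size, embed_dim)
        return (u * v).sum(dim=1)          # (batch_size,) matching scores

recommender = DotProductRecommender(num_users=1000, num_items=500, embed_dim=16)
scores = recommender(torch.tensor([3, 7]), torch.tensor([42, 8]))
print(scores.shape)  # torch.Size([2])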
Through these examples, you can see how nn.Embedding is applied across a variety of practical scenarios.
