nn.Embedding

Let's walk through several concrete examples of how nn.Embedding is used:

1. Basic example: word embeddings

import torch
import torch.nn as nn

# Example 1: create a simple word-embedding layer
# Suppose we have 5 words, each represented by a 3-dimensional vector
embedding = nn.Embedding(num_embeddings=5, embedding_dim=3)

# Weights are randomly initialized (by default from a standard normal distribution)
print("Original weight matrix:")
print(embedding.weight)
print(f"Shape: {embedding.weight.shape}")  # (5, 3)

# Indices of the input words
word_indices = torch.tensor([0, 2, 4])  # look up words 0, 2, and 4
word_vectors = embedding(word_indices)

print("\nWord vectors:")
print(word_vectors)
print(f"Shape: {word_vectors.shape}")  # (3, 3)

Example output:

Original weight matrix:
tensor([[ 0.6614,  0.2669,  0.0617],
        [ 0.6213, -0.4519, -0.1661],
        [-1.5228,  0.3817, -1.0276],
        [-0.5631, -0.8923, -0.0583],
        [-0.1955, -0.9656,  0.4224]], requires_grad=True)
Shape: torch.Size([5, 3])

Word vectors:
tensor([[ 0.6614,  0.2669,  0.0617],
        [-1.5228,  0.3817, -1.0276],
        [-0.1955, -0.9656,  0.4224]], grad_fn=<EmbeddingBackward>)
Shape: torch.Size([3, 3])
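
Under the hood, looking up an index in nn.Embedding is equivalent to multiplying a one-hot vector by the weight matrix (the lookup is just faster and more memory-efficient). A minimal sketch, continuing from the example above, verifies this equivalence:

import torch.nn.functional as F

# One-hot encode the indices and multiply by the weight matrix manually
one_hot = F.one_hot(word_indices, num_classes=5).float()  # (3, 5)
manual_lookup = one_hot @ embedding.weight                # (3, 3)
print(torch.allclose(manual_lookup, word_vectors))        # True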

2. Practical application: a text-classification model

# Example 2: build a simple text classifier
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        x = self.embedding(x)  # shape: (batch_size, seq_len, embed_dim)

        # Average over the sequence dimension to get one vector per sentence
        x = x.mean(dim=1)  # shape: (batch_size, embed_dim)

        # Pass through the classification layers
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Create the model
vocab_size = 10000  # assume a vocabulary of 10,000 words
embed_dim = 256     # each word represented by a 256-dim vector
hidden_size = 128
num_classes = 2     # binary classification (positive/negative)

model = TextClassifier(vocab_size, embed_dim, hidden_size, num_classes)

# Simulate a batch of data
batch_size = 4
seq_len = 10
input_data = torch.randint(0, vocab_size, (batch_size, seq_len))

# Forward pass
output = model(input_data)
print(f"Input shape: {input_data.shape}")
print(f"Output shape: {output.shape}")
print(f"Output values:\n{output}")

3. Using pretrained word vectors

# Example 3: initialize the embedding with pretrained (GloVe-style) word vectors
def load_pretrained_embeddings():
    # Build a small mock vocabulary
    vocab = ["<pad>", "<unk>", "apple", "banana", "orange", "fruit", "eat"]
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    
    # Mock pretrained word vectors (in practice these are loaded from a file)
    pretrained_vectors = {
        "apple": [0.8, 0.1, 0.2],
        "banana": [0.7, 0.3, 0.1],
        "orange": [0.9, 0.2, 0.3],
        "fruit": [0.5, 0.4, 0.2],
        "eat": [0.1, 0.9, 0.0]
    }
    
    return vocab, word_to_idx, pretrained_vectors

# Create the embedding layer
vocab, word_to_idx, pretrained_vectors = load_pretrained_embeddings()
vocab_size = len(vocab)
embed_dim = 3

embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

# Initialize with the pretrained vectors
embedding_matrix = torch.randn(vocab_size, embed_dim)  # random initialization first

# Copy each pretrained vector into the matching row
for word, idx in word_to_idx.items():
    if word in pretrained_vectors:
        embedding_matrix[idx] = torch.tensor(pretrained_vectors[word])

# Load the pretrained weights into the embedding layer
embedding.weight.data.copy_(embedding_matrix)
embedding.weight.requires_grad = False  # freeze the embeddings (no updates during training)

# Test
sentence = ["apple", "banana", "fruit"]
indices = [word_to_idx[word] for word in sentence]
input_tensor = torch.tensor(indices).unsqueeze(0)  # (1, 3)

output = embedding(input_tensor)
print("句子向量:")
print(output)
print(f"形状: {output.shape}")

4. Handling variable-length sequences (with padding)

# Example 4: handling variable-length sequences (with padding)
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Prepare a batch of sentences of different lengths
sentences = [
    [1, 2, 3, 4],      # length 4
    [5, 6, 7],         # length 3
    [8, 9, 10, 11, 12] # length 5
]

# Create the embedding layer
embedding = nn.Embedding(num_embeddings=20, embedding_dim=4, padding_idx=0)

# Pad all sentences to the same length
padded_sentences = pad_sequence([torch.tensor(s) for s in sentences], 
                                batch_first=True, 
                                padding_value=0)
print("填充后的序列:")
print(padded_sentences)
print(f"形状: {padded_sentences.shape}")

# Record the original lengths
lengths = torch.tensor([len(s) for s in sentences])

# Pass through the embedding layer
embedded = embedding(padded_sentences)
print(f"\nShape after embedding: {embedded.shape}")

# Use pack_padded_sequence to handle the variable lengths
packed_input = pack_padded_sequence(embedded, lengths, 
                                    batch_first=True, 
                                    enforce_sorted=False)

print(f"\n打包后数据: {packed_input}")

5. Use in a Transformer

# Example 5: Transformer token embedding combined with positional encoding
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len=512, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_len, embed_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        seq_len = x.size(1)

        # Token embedding
        token_embeds = self.token_embedding(x)  # (batch_size, seq_len, embed_dim)

        # Position embedding
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        position_embeds = self.position_embedding(positions)  # (1, seq_len, embed_dim)

        # Add the two
        embeddings = token_embeds + position_embeds
        embeddings = self.dropout(embeddings)
        
        return embeddings

# Usage
vocab_size = 5000
embed_dim = 512
model = TransformerEmbedding(vocab_size, embed_dim)

# Simulated input
batch_size = 2
seq_len = 20
input_tensor = torch.randint(0, vocab_size, (batch_size, seq_len))

output = model(input_tensor)
print(f"输入形状: {input_tensor.shape}")
print(f"输出形状: {output.shape}")

6. Visualizing word-vector similarity

# Example 6: inspect the similarity between word vectors
from sklearn.metrics.pairwise import cosine_similarity

# Create an embedding layer and set its weights manually
embedding = nn.Embedding(6, 4)

# Set the weights by hand so the results are easy to interpret
manual_weights = torch.tensor([
    [1.0, 0.0, 0.0, 0.0],  # index 0: "king"
    [0.8, 0.2, 0.0, 0.0],  # index 1: "queen"
    [0.0, 1.0, 0.0, 0.0],  # index 2: "woman"
    [0.0, 0.0, 1.0, 0.0],  # index 3: "man"
    [0.5, 0.5, 0.0, 0.0],  # index 4: "royal"
    [0.0, 0.0, 0.0, 1.0]   # index 5: "apple"
])

embedding.weight.data.copy_(manual_weights)

# Compute the similarities
indices = torch.arange(6)
vectors = embedding(indices).detach().numpy()

# Cosine-similarity matrix
similarity_matrix = cosine_similarity(vectors)
print("Cosine-similarity matrix of the word vectors:")
for i in range(6):
    words = ["king", "queen", "woman", "man", "royal", "apple"]
    print(f"{words[i]:>6}: {similarity_matrix[i].round(3)}")

# Observation: king and queen are highly similar to each other, and both have low similarity to apple
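
The same similarity matrix can be computed without sklearn by L2-normalizing the embedding rows and taking their dot products. A small sketch that should reproduce the sklearn result up to floating-point precision:

import torch.nn.functional as F

# L2-normalize each embedding; the dot products of unit vectors are cosine similarities
normed = F.normalize(embedding.weight.detach(), p=2, dim=1)  # unit-length rows
torch_similarity = normed @ normed.T                         # (6, 6) cosine matrix
print(torch.allclose(torch_similarity,
                     torch.tensor(similarity_matrix, dtype=torch.float32),
                     atol=1e-5))  # True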

7. A complete training example

# Example 7: a complete, small end-to-end training example
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Create a simple dataset
class WordDataset(Dataset):
    def __init__(self, vocab_size=100, seq_len=5, num_samples=1000):
        self.data = torch.randint(0, vocab_size, (num_samples, seq_len))
        # Simple labeling rule: label 1 if the sequence contains token 0 or token 1, otherwise 0
        self.labels = ((self.data == 0) | (self.data == 1)).any(dim=1).long()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create the model
class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
    
    def forward(self, x):
        # x shape: (batch_size, seq_len)
        embeds = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        embeds_mean = embeds.mean(dim=1)  # (batch_size, embed_dim)
        output = self.fc(embeds_mean)
        return output

# Training setup
vocab_size = 100
embed_dim = 10
num_classes = 2
batch_size = 32
epochs = 3

# Create the data
dataset = WordDataset(vocab_size=vocab_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss function, optimizer
model = SimpleClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    correct = 0
    total = 0
    
    for batch_data, batch_labels in dataloader:
        optimizer.zero_grad()
        
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}, Accuracy = {accuracy:.2f}%")

Key takeaways:

  1. The input must be an integer tensor of indices.

  2. The output consists of dense, continuous vectors that are optimized during training.

  3. Common use cases

    • Text classification / sentiment analysis
    • Machine translation
    • Named-entity recognition
    • Recommender systems (user-ID / item-ID embeddings; see the sketch after this list)
    • Encoding categorical features
  4. Practical tips

    • Use padding_idx to handle variable-length sequences
    • Initialize with pretrained vectors when available
    • The embedding layer can be frozen (requires_grad=False)
    • Embedding dimensions are typically 50, 100, 200, or 300
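
The recommender-system use mentioned above follows the same pattern: user IDs and item IDs each get their own embedding table, and a score is computed from the two vectors. A minimal sketch (the table sizes and the dot-product scorer are illustrative assumptions, not a prescribed design):

# Hypothetical matrix-factorization-style recommender built on nn.Embedding
class DotProductRecommender(nn.Module):
    def __init__(self, num_users, num_items, embed_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)

    def forward(self, user_ids, item_ids):
        u = self.user_embedding(user_ids)  # (batch, embed_dim)
        v = self.item_embedding(item_ids)  # (batch, embed_dim)
        return (u * v).sum(dim=1)          # (batch,) relevance scores

recommender = DotProductRecommender(num_users=1000, num_items=500, embed_dim=16)
scores = recommender(torch.tensor([3, 42]), torch.tensor([7, 99]))
print(scores.shape)  # torch.Size([2])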

These examples show how nn.Embedding is applied across a range of practical scenarios.
