Annotated SGNS (skip-gram with negative sampling) implementation

# Defined in Section 5.2.3.3
# Skip-gram with negative sampling (SGNS)


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights


class SGNSDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None):
        # corpus: a list of sentences, each a list of token indices.
        # vocab: the vocabulary, mapping tokens to indices.
        # context_size: half-width of the context window (default 2).
        # n_negatives: number of negative samples per positive sample (default 5).
        # ns_dist: negative-sampling distribution; None (the default) means uniform.
        self.data = []  # each element is a (target word index, context index list) pair
        self.bos = vocab[BOS_TOKEN]  # index of the begin-of-sentence token
        self.eos = vocab[EOS_TOKEN]  # index of the end-of-sentence token
        self.pad = vocab[PAD_TOKEN]  # index of the padding token
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence)-1):
                # Model input: (w, context); output: 1/0 for whether context is a
                # true context word or a negative sample
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                # Pad contexts shorter than 2 * context_size
                context += [self.pad] * (2 * context_size - len(context))
                self.data.append((w, context))

        # Number of negative samples per context position
        self.n_negatives = n_negatives
        # Negative-sampling distribution; if ns_dist is None, sample uniformly
        self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab))

    def __len__(self):
        return len(self.data)  # number of samples in the dataset

    def __getitem__(self, i):
        # Returns the i-th sample: a (word index, context index list) pair
        return self.data[i]

    def collate_fn(self, examples):
        # examples: a batch of samples, each a (word index, context index list) pair
        words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)     # target word indices
        contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)  # context indices
        batch_size, context_size = contexts.shape  # number of samples, context window width
        neg_contexts = []  # negative-sample indices
        # Perform negative sampling separately for each sample in the batch
        for i in range(batch_size):
            # Ensure the negatives exclude the current sample's true context words:
            # index_fill(0, contexts[i], .0) sets the sampling weights of the true
            # context words to 0 (dim 0, row indices contexts[i], fill value 0.0)
            ns_dist = self.ns_dist.index_fill(0, contexts[i], .0)
            # multinomial draws n_negatives * context_size indices according to the
            # weights in ns_dist (all-ones weights would mean uniform sampling)
            neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True))
        neg_contexts = torch.stack(neg_contexts, dim=0)
        # Returns three tensors: word indices, context indices, negative-context indices
        return words, contexts, neg_contexts
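
A minimal sketch (with a hypothetical six-entry toy vocabulary, not the Reuters data) of the tensors collate_fn produces:

# Hypothetical toy vocabulary; this only illustrates the tensor shapes.
toy_vocab = {PAD_TOKEN: 0, BOS_TOKEN: 1, EOS_TOKEN: 2, 'a': 3, 'b': 4, 'c': 5}
toy_dataset = SGNSDataset([[3, 4, 5]], toy_vocab, context_size=2, n_negatives=5)
words, contexts, neg_contexts = toy_dataset.collate_fn([toy_dataset[0], toy_dataset[1]])
print(words.shape, contexts.shape, neg_contexts.shape)
# torch.Size([2]) torch.Size([2, 4]) torch.Size([2, 20])  # 20 = n_negatives * 2 * context_size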


class SGNSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        # vocab_size: vocabulary size; embedding_dim: word embedding dimension
        super(SGNSModel, self).__init__()
        # Target-word embeddings
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Context-word embeddings
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward_w(self, words):
        # Look up target-word embeddings
        w_embeds = self.w_embeddings(words)
        return w_embeds

    def forward_c(self, contexts):
        # Look up context-word embeddings
        c_embeds = self.c_embeddings(contexts)
        return c_embeds
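
SGNS keeps two separate embedding tables because every word plays two roles: target word (w_embeddings) and context word (c_embeddings). As a minimal sketch with hypothetical indices, the probability that a (word, context) pair is a true co-occurrence is the sigmoid of the dot product of the two lookups:

toy_model = SGNSModel(vocab_size=100, embedding_dim=8)  # hypothetical sizes
w = torch.tensor([3])  # hypothetical target word index
c = torch.tensor([7])  # hypothetical context word index
# sigmoid(v_w . v_c): probability that (w, c) is a true word-context pair
score = torch.sigmoid((toy_model.forward_w(w) * toy_model.forward_c(c)).sum(dim=1))
print(score)  # a single value in (0, 1); untrained, so near 0.5 on average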


def get_unigram_distribution(corpus, vocab_size):
    # Compute the unigram probability distribution over the given corpus.
    # corpus: a list of sentences (lists of token indices); vocab_size: vocabulary size.
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count)
    # Returns a tensor holding each token's unigram probability
    return unigram_dist
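
As an optional, equivalent variant (my sketch, not part of the original code): the per-token Python loop above is slow on large corpora, and torch.bincount computes the same counts in one vectorized call:

def get_unigram_distribution_fast(corpus, vocab_size):
    # Flatten the corpus into one long index tensor and count with bincount;
    # returns the same distribution as the loop-based version above
    tokens = torch.tensor([token for sentence in corpus for token in sentence])
    token_counts = torch.bincount(tokens, minlength=vocab_size).float()
    return token_counts / token_counts.sum()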

embedding_dim = 64
context_size = 2
hidden_dim = 128  # unused by SGNSModel
batch_size = 1024
num_epoch = 10
n_negatives = 10

# Load the text data
corpus, vocab = load_reuters()
# Compute the unigram probability distribution
unigram_dist = get_unigram_distribution(corpus, len(vocab))
# Derive the negative-sampling distribution from it: p(w) ** 0.75
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()  # normalize
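
The 0.75 exponent (the value used in the original word2vec paper) flattens the unigram distribution, shifting sampling mass from frequent words to rare ones. A tiny worked example with made-up probabilities:

p = torch.tensor([0.90, 0.09, 0.01])  # made-up unigram probabilities
q = p ** 0.75
q /= q.sum()
print(q)  # ~tensor([0.8251, 0.1467, 0.0282]): the head shrinks, the tail grows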
# Build the SGNS training dataset
dataset = SGNSDataset(
    corpus,
    vocab,
    context_size=context_size,
    n_negatives=n_negatives,
    ns_dist=negative_sampling_dist
)
data_loader = get_loader(dataset, batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        # Look up embeddings for the batch's words, contexts, and negative samples
        word_embeds = model.forward_w(words).unsqueeze(dim=2)
        context_embeds = model.forward_c(contexts)
        neg_context_embeds = model.forward_c(neg_contexts)
        # Log-likelihood of classifying the positive samples
        # word_embeds.shape = (batch_size, embedding_dim, 1)
        # context_embeds.shape = (batch_size, 2 * context_size, embedding_dim)
        context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # Log-likelihood of classifying the negative samples
        neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # Loss: negative log-likelihood
        loss = -(context_loss + neg_context_loss).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Sum the word and context embedding matrices as the final pretrained word vectors
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "sgns.vec")

K-means clustering and visualization

# Import the required libraries
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np


# Load the word vector model
word_vectors = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)

# Set the number of clusters K; 5 here as an example
K = 5

# Prepare the word vectors for clustering
vectors = word_vectors.vectors

# Run K-means clustering
kmeans = KMeans(n_clusters=K, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(vectors)

# Get each word's cluster label
labels = kmeans.labels_

# Visualize the clustering results
# First reduce to 2D with PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)

# Plot the clusters
plt.figure(figsize=(10, 8))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
markers = ['o', '^', 's', 'p', '*', 'h', 'x']

for i, color in zip(range(K), colors):
    # Select the points belonging to the current cluster
    class_member_mask = (labels == i)
    xy = reduced_vectors[class_member_mask]
    plt.scatter(xy[:, 0], xy[:, 1], 
                c=color, 
                marker=markers[i % len(markers)],
                alpha=0.5,
                label=f'Cluster {i}')

plt.title('K-means Clustering of Word Vectors')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(scatterpoints=1)
plt.grid(True)
plt.show()

print("完成聚类并可视化展示。")

Dimensionality reduction with t-SNE

import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

# Load the word2vec model (the .vec file is assumed to be in word2vec text format)
# Note: replace the path with the actual location of your sgns.vec file
model_path = 'sgns.vec'
# Note: the first line of a .vec file holds the vocabulary size and dimensionality;
# load_word2vec_format parses this header automatically
model = KeyedVectors.load_word2vec_format(model_path, binary=False)

# Get the word vectors
words = model.index_to_key[:1000]  # only the first 1000 words as an example; adjust as needed
vectors = np.array([model[word] for word in words])

# Reduce dimensionality with t-SNE, to 2D here
tsne = TSNE(n_components=2, random_state=42)
vectors_2d = tsne.fit_transform(vectors)

# Visualize the reduced vectors
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = vectors_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')

plt.title('t-SNE Visualization of Word Embeddings')
plt.show()
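
t-SNE layouts are sensitive to its settings and slow for many points. In recent scikit-learn versions (>= 1.0), PCA initialization and an automatic learning rate, an optional tweak not used above, usually give more stable results:

# Optional variant: PCA initialization and automatic learning rate
tsne = TSNE(n_components=2, init='pca', learning_rate='auto', random_state=42)
vectors_2d = tsne.fit_transform(vectors)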

Visualizing the embeddings of 40 selected words

import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

# Load the word2vec model
model_path = 'sgns.vec'
model = KeyedVectors.load_word2vec_format(model_path, binary=False)


plants = ['rose', 'oak', 'maple', 'bamboo', 'orchid', 'cactus', 'palm', 'iris', 'daisy', 'lotus']
titles = ['president', 'doctor', 'engineer', 'artist', 'teacher', 'lawyer', 'architect', 'nurse', 'writer', 'scientist']
honorifics = ['mr', 'mrs', 'ms', 'dr', 'professor', 'sir', 'madam', 'lord', 'lady', 'captain']
countries = ['usa', 'china', 'france', 'brazil', 'japan', 'germany', 'india', 'australia', 'canada', 'russia']

# Merge all the word lists
selected_words = plants + titles + honorifics + countries

# Keep only the words that are present in the model's vocabulary
valid_words = [word for word in selected_words if word in model.index_to_key]

# Get the embeddings of the selected words
vectors = np.array([model[word] for word in valid_words])

# Reduce to 2D with t-SNE; perplexity must be <= len(valid_words) - 1
tsne = TSNE(n_components=2, perplexity=min(30, len(valid_words) - 1), random_state=42)
vectors_2d = tsne.fit_transform(vectors)

# Visualize
plt.figure(figsize=(12, 8))

# Assign a color and a marker to each word according to its category
word_to_color = {word: 'r' if word in plants else 'g' if word in titles else 'b' if word in honorifics else 'y' for word in valid_words}
word_to_marker = {word: 'o' if word in plants else '^' if word in titles else 's' if word in honorifics else '*' for word in valid_words}

for i, word in enumerate(valid_words):
    x, y = vectors_2d[i]
    plt.scatter(x, y, c=word_to_color[word], marker=word_to_marker[word], alpha=0.6)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')

plt.title('t-SNE Visualization of Selected Word Embeddings')
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Plants', markerfacecolor='r', markersize=10),
    plt.Line2D([0], [0], marker='^', color='w', label='Titles', markerfacecolor='g', markersize=10),
    plt.Line2D([0], [0], marker='s', color='w', label='Honorifics', markerfacecolor='b', markersize=10),
    plt.Line2D([0], [0], marker='*', color='w', label='Countries', markerfacecolor='y', markersize=10)
])
plt.show()
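
Besides plots, nearest-neighbor queries are a quick way to eyeball embedding quality; most_similar is part of gensim's KeyedVectors API (the two probe words below are arbitrary examples):

for probe in ['china', 'doctor']:  # arbitrary probe words
    if probe in model.key_to_index:
        print(probe, '->', model.most_similar(probe, topn=5))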