Annotated SGNS (skip-gram with negative sampling) implementation
# Defined in Section 5.2.3.3
# Based on negative sampling
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights
class SGNSDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None):
        # corpus: a list of sentences, each sentence being a list of token indices.
        # vocab: the vocabulary, mapping tokens to indices.
        # context_size=2: size of the context window (default 2).
        # n_negatives=5: number of negative samples per positive sample (default 5).
        # ns_dist=None: negative-sampling distribution; None means a uniform distribution.
        self.data = []  # each element is a (word index, list of context word indices) pair
        self.bos = vocab[BOS_TOKEN]  # vocabulary index of the BOS token
        self.eos = vocab[EOS_TOKEN]  # vocabulary index of the EOS token
        self.pad = vocab[PAD_TOKEN]  # vocabulary index of the PAD token
        for sentence in tqdm(corpus, desc="Dataset Construction"):  # iterate over sentences
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence)-1):
                # Model input: (w, context); the output is 0/1, indicating whether context is a negative sample
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                # Pad contexts that are shorter than 2 * context_size
                context += [self.pad] * (2 * context_size - len(context))
                self.data.append((w, context))
        # Number of negative samples per positive sample
        self.n_negatives = n_negatives
        # Negative-sampling distribution: if ns_dist is None, sample uniformly
        self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab))

    def __len__(self):
        return len(self.data)  # size of the dataset

    def __getitem__(self, i):  # i: an integer index
        return self.data[i]  # the i-th sample, i.e. (word index, list of context word indices)

    def collate_fn(self, examples):
        # examples: the samples in a batch, each a (word index, list of context word indices) pair
        words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)  # word indices
        contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)  # context word indices
        batch_size, context_size = contexts.shape  # number of samples, context window size
        neg_contexts = []  # negative-sample indices
        # Perform negative sampling separately for each sample in the batch
        for i in range(batch_size):
            # Make sure the negative samples do not contain the current sample's context words:
            # index_fill(0, contexts[i], .0) fills along dim 0 at the positions in contexts[i]
            # with the value .0, i.e. the true context words get sampling weight 0
            ns_dist = self.ns_dist.index_fill(0, contexts[i], .0)
            # Sample according to the weights in ns_dist (all ones means uniform sampling);
            # the sample size is self.n_negatives * context_size; multinomial returns indices
            neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True))
        neg_contexts = torch.stack(neg_contexts, dim=0)
        return words, contexts, neg_contexts  # word indices, context indices, negative-context indices
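The two tensor operations in collate_fn can look opaque; a tiny standalone sketch (with made-up numbers, independent of the dataset above and reusing the torch import from the top of the file) shows how zeroing out the true context with index_fill and then drawing from torch.multinomial yields negatives that never collide with the real context:

# Toy illustration of the negative-sampling step in collate_fn (made-up numbers)
vocab_size = 10
weights = torch.ones(vocab_size)                  # uniform sampling weights
true_context = torch.tensor([2, 3, 5, 7])         # indices of the true context words
masked = weights.index_fill(0, true_context, .0)  # zero out the true context words
negatives = torch.multinomial(masked, 8, replacement=True)
print(masked)     # tensor([1., 1., 0., 0., 1., 0., 1., 0., 1., 1.])
print(negatives)  # e.g. tensor([0, 9, 6, 1, 8, 0, 4, 9]) -- never 2, 3, 5, or 7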
class SGNSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        # vocab_size: vocabulary size; embedding_dim: word-embedding dimension
        super(SGNSModel, self).__init__()
        # Word embeddings
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Context embeddings
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        return w_embeds

    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        return c_embeds
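As a quick shape check (a standalone sketch with made-up sizes, not part of the training script), forward_w returns one embedding per target word, while forward_c returns one embedding per context position:

# Shape check with made-up sizes (not part of the training script)
m = SGNSModel(vocab_size=100, embedding_dim=8)
w = torch.randint(0, 100, (4,))    # 4 target words
c = torch.randint(0, 100, (4, 6))  # 4 samples x 6 context positions
print(m.forward_w(w).shape)        # torch.Size([4, 8])
print(m.forward_c(c).shape)        # torch.Size([4, 6, 8])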
def get_unigram_distribution(corpus, vocab_size):
    # corpus: list of sentences (lists of token indices); vocab_size: vocabulary size
    # Estimate the unigram probability distribution from the given corpus
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count)
    return unigram_dist  # a tensor holding the unigram probability of each token
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10
n_negatives = 10
# Load the text data
corpus, vocab = load_reuters()
# Compute the unigram probability distribution
unigram_dist = get_unigram_distribution(corpus, len(vocab))
# Derive the negative-sampling distribution from the unigram distribution: p(w)**0.75
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()  # normalize
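# Raising the unigram probabilities to the 0.75 power (as in the original word2vec
# paper) flattens the distribution, so rare words are drawn as negatives more often
# than they would be under the raw unigram distribution.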
# Build the SGNS training dataset
dataset = SGNSDataset(
    corpus,
    vocab,
    context_size=context_size,
    n_negatives=n_negatives,
    ns_dist=negative_sampling_dist
)
data_loader = get_loader(dataset, batch_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        # Look up the vector representations of the words, contexts, and negative samples in the batch
        word_embeds = model.forward_w(words).unsqueeze(dim=2)
        context_embeds = model.forward_c(contexts)
        neg_context_embeds = model.forward_c(neg_contexts)
        # Log-likelihood of classifying the positive samples
        # word_embeds.shape = (batch_size, embedding_dim, 1)
        # context_embeds.shape = (batch_size, context_size, embedding_dim)
        context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # Log-likelihood of classifying the negative samples
        neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # Loss: negative log-likelihood
        loss = -(context_loss + neg_context_loss).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
# Sum the word embedding matrix and the context embedding matrix as the final pretrained word vectors
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "sgns.vec")
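As a reading aid, the quantity minimized in the loop above can be restated as the skip-gram negative-sampling objective: writing $\mathbf{v}_w$ for the word embedding, $\mathbf{u}_{c_j}$ for the embedding of the $j$-th context word ($j = 1, \dots, 2C$, padding positions included as the code is written), and $\mathbf{u}_{n_{j,k}}$ for the $k$-th of $K$ negative samples drawn per context position, the per-word loss is

\[ \ell(w) = -\frac{1}{2C} \sum_{j=1}^{2C} \Big[ \log\sigma\big(\mathbf{u}_{c_j}^{\top}\mathbf{v}_w\big) + \sum_{k=1}^{K} \log\sigma\big(-\mathbf{u}_{n_{j,k}}^{\top}\mathbf{v}_w\big) \Big], \]

where $\sigma$ is the logistic sigmoid; the batch loss is the mean of $\ell(w)$ over the batch.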
K-means clustering and visualization
# Import the required libraries
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
# Load the word vectors
word_vectors = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)
# Set the number of clusters K; 5 is used here as an example
K = 5
# Prepare the word-vector matrix for clustering
vectors = word_vectors.vectors
# Run K-means clustering
kmeans = KMeans(n_clusters=K, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(vectors)
# Get the cluster label of each word
labels = kmeans.labels_
# Visualize the clustering result
# First reduce the dimensionality to 2D with PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)
# Plot the clusters
plt.figure(figsize=(10, 8))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
markers = ['o', '^', 's', 'p', '*', 'h', 'x']
for i, color in zip(range(K), colors):
    # Select the points belonging to the current cluster
    class_member_mask = (labels == i)
    xy = reduced_vectors[class_member_mask]
    plt.scatter(xy[:, 0], xy[:, 1],
                c=color,
                marker=markers[i % len(markers)],
                alpha=0.5,
                label=f'Cluster {i}')
plt.title('K-means Clustering of Word Vectors')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(scatterpoints=1)
plt.grid(True)
plt.show()
print("完成聚类并可视化展示。")
Dimensionality reduction with t-SNE
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
# Load the word2vec model (the vec file is assumed to be in the standard word2vec text format)
# Note: replace the path with the actual path of your sgns.vec file
model_path = 'sgns.vec'
# Note: the code below assumes the first line of the vec file holds the word count and vector dimension, which is skipped as a header
model = KeyedVectors.load_word2vec_format(model_path, binary=False)
# Get the word vectors
words = model.index_to_key[:1000]  # only the first 1,000 words are used as an example; adjust as needed
vectors = np.array([model[word] for word in words])
# Reduce the dimensionality with t-SNE, to 2D in this example
tsne = TSNE(n_components=2, random_state=42)
vectors_2d = tsne.fit_transform(vectors)
# Visualize the reduced vectors
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = vectors_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')
plt.title('t-SNE Visualization of Word Embeddings')
plt.show()
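Annotating all 1,000 points usually produces an unreadable plot; one option (a sketch reusing words and vectors_2d from above) is to draw every point but label only a subset:

# Label only every 25th word to keep the plot readable (reuses words and vectors_2d from above)
plt.figure(figsize=(10, 8))
plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], s=5, alpha=0.5)
for i in range(0, len(words), 25):
    plt.annotate(words[i], xy=(vectors_2d[i, 0], vectors_2d[i, 1]), fontsize=8)
plt.title('t-SNE Visualization of Word Embeddings (subset of labels)')
plt.show()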
Selecting 40 words (plants, titles, honorifics, countries) for word-vector visualization
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
# Load the word2vec model
model_path = 'sgns.vec'
model = KeyedVectors.load_word2vec_format(model_path, binary=False)
plants = ['rose', 'oak', 'maple', 'bamboo', 'orchid', 'cactus', 'palm', 'iris', 'daisy', 'lotus']
titles = ['president', 'doctor', 'engineer', 'artist', 'teacher', 'lawyer', 'architect', 'nurse', 'writer', 'scientist']
honorifics = ['mr', 'mrs', 'ms', 'dr', 'professor', 'sir', 'madam', 'lord', 'lady', 'captain']
countries = ['usa', 'china', 'france', 'brazil', 'japan', 'germany', 'india', 'australia', 'canada', 'russia']
# Merge all the word lists
selected_words = plants + titles + honorifics + countries
# Keep only the words that are present in the model
valid_words = [word for word in selected_words if word in model.index_to_key]
# Get the vectors of the selected words
vectors = np.array([model[word] for word in valid_words])
# Reduce to 2D with t-SNE; make sure perplexity <= len(valid_words) - 1
tsne = TSNE(n_components=2, perplexity=min(30, len(valid_words) - 1), random_state=42)
vectors_2d = tsne.fit_transform(vectors)
# Visualization
plt.figure(figsize=(12, 8))
# Assign a color and a marker to each word according to its category
word_to_color = {word: 'r' if word in plants else 'g' if word in titles else 'b' if word in honorifics else 'y' for word in valid_words}
word_to_marker = {word: 'o' if word in plants else '^' if word in titles else 's' if word in honorifics else '*' for word in valid_words}
for i, (word, color, marker) in enumerate(zip(valid_words, word_to_color.values(), word_to_marker.values())):
    x, y = vectors_2d[i]
    plt.scatter(x, y, c=color, marker=marker, alpha=0.6)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')
plt.title('t-SNE Visualization of Selected Word Embeddings')
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Plants', markerfacecolor='r', markersize=10),
    plt.Line2D([0], [0], marker='^', color='w', label='Titles', markerfacecolor='g', markersize=10),
    plt.Line2D([0], [0], marker='s', color='w', label='Honorifics', markerfacecolor='b', markersize=10),
    plt.Line2D([0], [0], marker='*', color='w', label='Countries', markerfacecolor='y', markersize=10)
])
plt.show()
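Beyond the 2D projection, a quick sanity check is to inspect nearest neighbors directly in the original embedding space; a minimal sketch using gensim's most_similar (reusing the model loaded above; the query words below are only examples and must exist in the vocabulary):

# Nearest-neighbor sanity check in the original embedding space
# (reuses `model` from above; the query words are examples and must be in the vocabulary)
for query in ['china', 'doctor', 'rose']:
    if query in model.key_to_index:
        neighbors = model.most_similar(query, topn=5)
        print(query, '->', [w for w, _ in neighbors])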