Obtaining Sentence Representation Vectors with BERT (PyTorch)

 

In text classification and text similarity matching, the pretrained language model BERT is often used to obtain sentence representation vectors. The method for doing this in a PyTorch environment is given below:

  • This uses the BERT implementation from Hugging Face's transformers library, so install that dependency first (pip install transformers).
  • The full implementation is given in the code block below.

 

    import torch
    import joblib
    import numpy as np
    from tqdm import tqdm
    from torch.utils.data import DataLoader, Dataset
    from sklearn.datasets import fetch_20newsgroups
    from transformers import BertTokenizer, BertModel

    class NewDataset(Dataset):
        def __init__(self, bert_train, mask_train=None, seg_ids_train=None):
            self.bert_train = bert_train
            self.mask_train = mask_train
            self.seg_ids_train = seg_ids_train

        def __getitem__(self, i):
            return torch.LongTensor(self.bert_train[i]), \
                torch.LongTensor(self.mask_train[i]), \
                torch.LongTensor(self.seg_ids_train[i])

        def __len__(self):
            return len(self.bert_train)


    newsgroups_train = fetch_20newsgroups(subset='train').data
    newsgroups_test = fetch_20newsgroups(subset='test').data
    train_label = fetch_20newsgroups(subset='train').target
    test_label = fetch_20newsgroups(subset='test').target

    L = 512                      # max input length for BERT
    N = len(newsgroups_train)
    bert_train, mask_train, seg_ids_train = [], [], []
    all_sents = newsgroups_train + newsgroups_test
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    for sent in tqdm(all_sents):
        tokens = tokenizer.tokenize(sent)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        # truncate to L tokens and pad with [PAD] up to length L
        padded_tokens = tokens[:L] + ['[PAD]' for _ in range(L - len(tokens))]
        attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
        sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
        seg_ids = [0 for _ in range(len(padded_tokens))]
        bert_train.append(sent_ids)
        mask_train.append(attn_mask)
        seg_ids_train.append(seg_ids)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    data = NewDataset(bert_train, mask_train=mask_train, seg_ids_train=seg_ids_train)
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

    reps = []
    batchsize = 5
    for batch in tqdm(DataLoader(data, shuffle=False, batch_size=batchsize)):
        bert_train, mask_train, seg_ids_train = batch
        output = bert_model(bert_train.to(device),
                            attention_mask=mask_train.to(device),
                            token_type_ids=seg_ids_train.to(device))
        # pooler_output is the [CLS] representation after BERT's pooling layer,
        # used here as the sentence vector
        reps += list(output.pooler_output.detach().cpu().numpy())

    reps_train = reps[:N]
    reps_test = reps[N:]

    newsgroups_data = {'train_vecs': reps_train, 'train_label': train_label,
                       'test_vecs': reps_test, 'test_label': test_label}
    joblib.dump(newsgroups_data, "newsgroups_data.pkl")
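To show how the saved vectors can serve the two tasks mentioned at the top, here is a minimal sketch of downstream use; the LogisticRegression classifier and the cosine-similarity pairing are illustrative choices, not part of the original script:

    import joblib
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    # load the vectors produced by the script above
    data = joblib.load("newsgroups_data.pkl")
    train_vecs = np.array(data['train_vecs'])
    test_vecs = np.array(data['test_vecs'])

    # text classification: a linear classifier on top of the frozen BERT vectors
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs, data['train_label'])
    pred = clf.predict(test_vecs)
    print("test accuracy:", accuracy_score(data['test_label'], pred))

    # text similarity matching: cosine similarity between two sentence vectors
    a, b = test_vecs[0], test_vecs[1]
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print("cosine similarity of first two test sentences:", cos_sim)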

 

For reference, the following variant uses a prompt to obtain the sentence representation, extracting the feature vector at the [MASK] position as the sentence feature:

    import torch
    import joblib
    import numpy as np
    from tqdm import tqdm
    from torch.utils.data import DataLoader,Dataset
    from sklearn.datasets import fetch_20newsgroups
    from transformers import BertTokenizer,BertModel
    
    class NewDataset(Dataset):
        def __init__(self, bert_train, mask_train=None, seg_ids_train=None):
            self.bert_train = bert_train
            self.mask_train = mask_train
            self.seg_ids_train = seg_ids_train
        def __getitem__(self, i):
            return torch.LongTensor(self.bert_train[i]), \
                torch.LongTensor(self.mask_train[i]), \
                torch.LongTensor(self.seg_ids_train[i])
    
        def __len__(self):
            return len(self.bert_train)
    
    
    newsgroups_train = fetch_20newsgroups(subset='train').data
    newsgroups_test = fetch_20newsgroups(subset='test').data
    train_label = fetch_20newsgroups(subset='train').target
    test_label = fetch_20newsgroups(subset='test').target
    
    L=512
    N = len(newsgroups_train)
    bert_train,mask_train,seg_ids_train = [], [],[]
    all_sents = newsgroups_train+newsgroups_test
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    prompt = "The sentence's topic is [MASK]."
    prompt_tokens = tokenizer.tokenize(prompt)
    LP = len(prompt_tokens)

    for sent in tqdm(all_sents):
        tokens = tokenizer.tokenize(sent)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        # truncate the sentence to L-LP tokens, append the prompt, then pad to length L
        padded_tokens = tokens[:L-LP] + prompt_tokens + ['[PAD]' for _ in range(L-LP - len(tokens))]
        attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
        sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
        seg_ids = [0 for _ in range(len(padded_tokens))]
        bert_train.append(sent_ids)
        mask_train.append(attn_mask)
        seg_ids_train.append(seg_ids)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    data = NewDataset(bert_train,mask_train=mask_train,seg_ids_train=seg_ids_train)
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    

    reps = []
    batchsize = 5
    for batch in tqdm(DataLoader(data, shuffle=False, batch_size=batchsize)):
        bert_train, mask_train, seg_ids_train = batch
        input_ids = bert_train.to(device)
        output = bert_model(input_ids,
                            attention_mask=mask_train.to(device),
                            token_type_ids=seg_ids_train.to(device))
        # take the hidden state at the [MASK] position as the sentence feature;
        # 103 is the token id of [MASK], so the result shape is (batchsize, hiddensize)
        last_hidden_state = output.last_hidden_state[input_ids == 103]
        reps += list(last_hidden_state.detach().cpu().numpy())
 
    reps_train = reps[:N]
    reps_test = reps[N:]
    
    newsgroups_data = {'train_vecs': reps_train, 'train_label': train_label, 'test_vecs': reps_test,'test_label': test_label}
    joblib.dump(newsgroups_data,"newsgroups_data.pkl")
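
One note on the hard-coded id 103 used above: the tokenizer exposes the [MASK] id directly, so a quick check like the following (a small sketch, not part of the original post) avoids relying on a magic number and confirms the prompt keeps [MASK] as a single token:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # mask_token_id is 103 for bert-base-uncased; reading it from the tokenizer
    # is safer than hard-coding the number
    print(tokenizer.mask_token_id)
    # [MASK] should survive tokenization as a single token inside the prompt
    print(tokenizer.tokenize("The sentence's topic is [MASK]."))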

  

 

posted on 2021-06-17 15:46 by 华东博客