Basic BERT Usage

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import logging
logging.set_verbosity_error()

############################################# Preprocessing #############################################
# Load the pretrained tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_sentence_len = 10

# Tokenize the input
text = "[CLS] I love you [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Convert tokens to ids (note the direction: tokens to ids, not the reverse)
tokenized_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
num_tokens = len(tokenized_ids)  # number of real tokens, recorded before padding
tokenized_ids += [0] * (max_sentence_len - num_tokens)  # pad the sentence to max_sentence_len with [PAD] (id 0)
print(tokenized_ids)

# Define attention_mask: 1 for real tokens, 0 for [PAD] tokens
att_mask = [1] * num_tokens
att_mask += [0] * (max_sentence_len - num_tokens)

# Convert to tensors of shape (1, N)
tokenized_ids = torch.tensor(tokenized_ids).reshape(1, -1)
att_mask = torch.tensor(att_mask).reshape(1, -1)

############################################# Using the model #############################################
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=False)
hidden_rep, cls_head = model(tokenized_ids, attention_mask=att_mask, return_dict=False)  # attention_mask marks real tokens (1) vs. [PAD] tokens (0)
print(hidden_rep.shape)  # torch.Size([1, 10, 768])  1: batch size (number of sentences), 10: number of tokens, 768: embedding dimension per token
print(cls_head.shape)    # torch.Size([1, 768])  aggregated representation of the whole sentence; 1: batch size, 768: sentence embedding dimension
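
As a side note, the manual padding and attention-mask bookkeeping above can be delegated to the tokenizer itself. A minimal sketch, assuming a reasonably recent transformers version (encoded and outputs are just illustrative names): calling the tokenizer directly adds [CLS]/[SEP], pads to the requested length, and returns tensors ready for the model.

# The tokenizer call builds input_ids and attention_mask in one step
encoded = tokenizer("I love you",
                    padding="max_length",
                    max_length=max_sentence_len,
                    truncation=True,
                    return_tensors="pt")
print(encoded["input_ids"])       # [CLS] I love you [SEP] followed by [PAD] ids
print(encoded["attention_mask"])  # 1 for real tokens, 0 for [PAD]

outputs = model(input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                return_dict=True)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 10, 768])
print(outputs.pooler_output.shape)      # torch.Size([1, 768])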

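output_hidden_states=False above means the model only exposes the final layer. A small sketch of what setting it to True gives instead: the token embeddings from every layer (the embedding layer plus the 12 encoder layers of bert-base), reusing the tokenized_ids and att_mask built earlier; model_all and out are illustrative names.

model_all = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
with torch.no_grad():
    out = model_all(tokenized_ids, attention_mask=att_mask, return_dict=True)
print(len(out.hidden_states))       # 13 = embedding output + 12 encoder layers
print(out.hidden_states[-1].shape)  # torch.Size([1, 10, 768]), identical to out.last_hidden_state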
 

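BertForMaskedLM is imported at the top but never used in this snippet. As a rough sketch of what it is for, the same bert-base-uncased checkpoint can predict the word behind a [MASK] token (variable names here are illustrative):

mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
mlm_model.eval()

masked = tokenizer("I [MASK] you", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**masked, return_dict=True).logits  # shape: [1, seq_len, vocab_size]

# Locate the [MASK] position and take the highest-scoring vocabulary entry there
mask_pos = (masked["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
predicted_id = logits[0, mask_pos].argmax(-1).item()
print(tokenizer.convert_ids_to_tokens(predicted_id))  # the model's best guess for the blank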
