Basic Usage of BERT
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import logging
logging.set_verbosity_error()

############################################# Preprocessing #############################################
# Load the pre-trained tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_sentence_len = 10

# Tokenize the input
text = "[CLS] I love you [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Tokens to ids (note the direction: convert_tokens_to_ids, not the other way around)
tokenized_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
real_len = len(tokenized_ids)                                  # length before padding
tokenized_ids += [0] * (max_sentence_len - real_len)           # pad short sentences with [PAD] (id 0)
print(tokenized_ids)

# attention_mask marks which positions are real tokens (1) and which are [PAD] (0);
# it must be built from the unpadded length, otherwise the [PAD] positions are also marked 1
att_mask = [1] * real_len + [0] * (max_sentence_len - real_len)

# Convert to tensors of shape (1, N)
tokenized_ids = torch.tensor(tokenized_ids).reshape(1, -1)
att_mask = torch.tensor(att_mask).reshape(1, -1)

############################################# Using the model ###########################################
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=False)
hidden_rep, cls_head = model(tokenized_ids, attention_mask=att_mask, return_dict=False)
print(hidden_rep.shape)  # torch.Size([1, 10, 768])  1: batch size, 10: number of tokens, 768: embedding size per token
print(cls_head.shape)    # torch.Size([1, 768])      pooled representation of the whole sentence; 1: batch size, 768: sentence embedding size
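For comparison, here is a minimal sketch of the same preprocessing done through the tokenizer's call interface, which inserts [CLS]/[SEP], pads to max_length and builds the attention mask in one step. It assumes the tokenizer, model and max_sentence_len objects defined above; the printed shapes are what the earlier listing also produces.

# -- Sketch: same preprocessing via tokenizer(...) (continues from the listing above) --
inputs = tokenizer("I love you",
                   padding="max_length",
                   max_length=max_sentence_len,
                   truncation=True,
                   return_tensors="pt")      # dict with input_ids / attention_mask / token_type_ids
print(inputs["input_ids"])                   # [CLS]/[SEP] added automatically, then padded to length 10
print(inputs["attention_mask"])              # 1 for real tokens, 0 for [PAD]

outputs = model(**inputs)                    # return_dict defaults to True here
print(outputs.last_hidden_state.shape)       # torch.Size([1, 10, 768])
print(outputs.pooler_output.shape)           # torch.Size([1, 768])

The listing imports BertForMaskedLM but does not use it; the sketch below shows one possible way to use it to predict a masked word. The example sentence and the printed prediction are only illustrative, and it again reuses the tokenizer loaded above.

# -- Sketch: predicting a [MASK] token with BertForMaskedLM (illustrative) --
mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
mlm_model.eval()

mlm_inputs = tokenizer("I love [MASK] .", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**mlm_inputs).logits                      # [1, seq_len, vocab_size]

# locate the [MASK] position and take the highest-scoring vocabulary id
mask_pos = (mlm_inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
pred_ids = logits[0, mask_pos].argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(pred_ids.tolist()))        # e.g. ['you']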
