from torchtext.legacy import data
from torchtext import datasets
import torch
import torch.nn as nn
from transformers import BertTokenizer,BertModel
import torch.optim as optim
from tqdm import tqdm
import time
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
bert=BertModel.from_pretrained('bert-base-uncased')
max_input_length=tokenizer.max_model_input_sizes['bert-base-uncased']
#print(max_input_length)  # 512, the maximum input length for a single sequence
def tokenizer_and_cut(sentence):
    # tokenize and truncate so that the [CLS] and [SEP] ids added later still fit within 512
    tokens=tokenizer.tokenize(sentence)
    tokens=tokens[:max_input_length-2]
    return tokens
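#A small sketch of what tokenizer_and_cut returns (example sentence and output are illustrative):
#tokenizer_and_cut('This movie was great!')  ->  ['this', 'movie', 'was', 'great', '!']
#Long reviews are cut to max_input_length-2 = 510 tokens so that the [CLS]/[SEP] ids
#added by the REVIEW field below still fit within BERT's 512-token limit.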
REVIEW=data.Field(
batch_first=True,
use_vocab=False,
tokenize=tokenizer_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,  # convert tokens to vocabulary ids
init_token=tokenizer.cls_token_id,#101
eos_token=tokenizer.sep_token_id,#102
pad_token=tokenizer.pad_token_id,#0
unk_token=tokenizer.unk_token_id#100
)
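#Because use_vocab=False, the field stores the integer ids produced by preprocessing directly,
#which is why init_token/eos_token/pad_token/unk_token are given as ids (101/102/0/100) rather than strings.
#Sketch of one processed example (ids reuse the encode example shown further below):
#'Hello you where you are'  ->  [101, 7592, 2017, 2073, 2017, 2024, 102]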
SENTIMENT=data.LabelField(dtype=torch.float)
#PreTrainedTokenizer(
# name_or_path='bert-base-uncased',
# vocab_size=30522,
# model_max_len=512,
# is_fast=False,
# padding_side='right',
# special_tokens={
# 'unk_token': '[UNK]',
# 'sep_token': '[SEP]',
# 'pad_token': '[PAD]',
# 'cls_token': '[CLS]',
# 'mask_token': '[MASK]'})
fields={'review':('r',REVIEW),'sentiment':('s',SENTIMENT)}
#sen_code=tokenizer.tokenize('Hello you where you are')
#['hello', 'you', 'where', 'you', 'are']  -> returns tokens
#sen_code=tokenizer.encode('Hello you where you are')
#[101, 7592, 2017, 2073, 2017, 2024, 102]  -> returns the vocabulary ids
#[<CLS> Hello you where you are <SEP>]
#sen_code=tokenizer.decode(102)
#[ S E P ]
IMDB_data=data.TabularDataset(
path='/mnt/disk2/std2021/hejiabang-data/transformer_data/IMDB_Dataset.csv',
format='csv',
fields=fields
)
train_data,valid_data,test_data=IMDB_data.split(
split_ratio=[0.8,0.1,0.1]
)
SENTIMENT.build_vocab(train_data)
#print(len(SENTIMENT.vocab))
# 2
#print(SENTIMENT.vocab.itos[:])
# ['positive', 'negative']
#print(SENTIMENT.vocab.stoi)
# defaultdict(None, {'positive': 0, 'negative': 1})
#print(SENTIMENT.vocab.stoi['positive'])
# 0
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE=16
train_iterator,valid_iterator,test_iterator=data.BucketIterator.splits(
(train_data,valid_data,test_data),
sort=False,
batch_size=BATCH_SIZE,
device=device
)
'''
for batch in train_iterator:
print(batch)
print(batch.s)
break
for batch in valid_iterator:
print(batch)
print(batch.s)
break
[torchtext.legacy.data.batch.Batch of size 16]
[.r]:[torch.LongTensor of size 16x512]
[.s]:[torch.FloatTensor of size 16]
tensor([1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0.])
[torchtext.legacy.data.batch.Batch of size 16]
[.r]:[torch.LongTensor of size 16x512]
[.s]:[torch.FloatTensor of size 16]
tensor([1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0.])
'''
class BERTGRUSentiment(nn.Module):
def __init__(self,
bert,
hidden_dim,
output_dim,
n_layers,
bidirectional,
dropout):
super(BERTGRUSentiment, self).__init__()
self.bert=bert
self.embedding_dim=bert.config.to_dict()['hidden_size']#768
self.rnn=nn.GRU(
self.embedding_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
batch_first=True,
dropout=0 if n_layers<2 else dropout
)
#GRU(768, 256, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
self.output=nn.Linear(hidden_dim*2 if bidirectional else hidden_dim,output_dim)
self.dropout=nn.Dropout(dropout)
    def forward(self,text):
        #text: [batch_size, seq_len], e.g. [16,512]
        with torch.no_grad():
            #the BERT output tuple: [0] last_hidden_state, [1] pooler_output, ([2] hidden_states)
            embedded=self.bert(text)[0]
            #embedded: [batch_size, seq_len, emb_dim], e.g. torch.Size([16, 512, 768])
        _,hidden=self.rnn(embedded)
        #hidden: [n_layers*n_directions, batch_size, hidden_dim], e.g. torch.Size([4, 16, 256])
        if self.rnn.bidirectional:
            #concatenate the final forward and backward hidden states: [batch_size, hidden_dim*2]
            hidden=self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
        else:
            hidden=self.dropout(hidden[-1,:,:])
        #hidden: [batch_size, hidden_dim]
        output=self.output(hidden)
        #output: [batch_size, output_dim], i.e. [16,1]
        return output
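#Quick shape check for the forward pass (a sketch, not executed here; hyperparameters match those defined below):
#dummy=torch.randint(0,tokenizer.vocab_size,(2,20))          # fake batch: 2 sequences of 20 token ids
#BERTGRUSentiment(bert,256,1,2,True,0.25)(dummy).shape       # -> torch.Size([2, 1])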
HIDDEN_DIM=256
OUTPUT_DIM=1
N_LAYERS=2
BIDIRECTIONAL=True
DROPOUT=0.25
model=BERTGRUSentiment(bert,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT)
for name, param in model.named_parameters():
    if name.startswith('bert'):
        #freeze BERT; it is used only as a fixed feature extractor
        param.requires_grad=False
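#Optional check (a sketch): count the parameters that remain trainable after freezing BERT;
#for this GRU + linear head it is roughly 2.76M.
#print(sum(p.numel() for p in model.parameters() if p.requires_grad))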
optimizer=optim.Adam(model.parameters())
criterion=nn.BCEWithLogitsLoss()
model=model.to(device)
criterion=criterion.to(device)
def binary_accuracy(preds,y):
rounded_preds=torch.round(torch.sigmoid(preds))
correct=(rounded_preds==y).float()
acc=correct.sum()/len(correct)
return acc
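#A small worked example of binary_accuracy (values are hypothetical, only to illustrate sigmoid -> round -> compare):
#preds=torch.tensor([0.29,-0.03,0.53,-0.10])   # raw logits
#y=torch.tensor([1.,0.,1.,1.])
#sigmoid(preds) -> approx [0.57,0.49,0.63,0.48]; round -> [1.,0.,1.,0.]
#binary_accuracy(preds,y) -> 3 correct out of 4 = 0.75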
def train(model,iterator,optimizer,criterion):
    epoch_loss=0
    epoch_acc=0
    model.train()
    with tqdm(total=len(iterator)) as pbar:
        for batch in iterator:
            optimizer.zero_grad()
            input_tensor=batch.r
            #[batch_size, seq_len], e.g. [16,512]
            ground_y=batch.s.squeeze(0)
            #[batch_size], e.g. [16]
            predictions=model(input_tensor).squeeze(1)
            loss=criterion(predictions,ground_y)
            acc=binary_accuracy(predictions,ground_y)
            loss.backward()
            optimizer.step()
            epoch_loss+=loss.item()
            epoch_acc+=acc.item()
            pbar.update(1)
    return epoch_loss/len(iterator),epoch_acc/len(iterator)
def evaluate(model,iterator,criterion):
    epoch_loss=0
    epoch_acc=0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            input_tensor=batch.r
            ground_y=batch.s.squeeze(0)
            predictions=model(input_tensor).squeeze(1)
            loss=criterion(predictions,ground_y)
            acc=binary_accuracy(predictions,ground_y)
            epoch_loss+=loss.item()
            epoch_acc+=acc.item()
    return epoch_loss/len(iterator),epoch_acc/len(iterator)
def epoch_time(start_time,end_time):
elapsed_time=end_time-start_time
elapsed_m=int(elapsed_time/60)
elapsed_s=int(elapsed_time%60)
return elapsed_m,elapsed_s
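#Worked example:
#epoch_time(0.0,125.4) -> (2, 5), i.e. 2m 5s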
if __name__=='__main__':
N_EPOCH=5
best_valid_loss=float('inf')
for epoch in range(N_EPOCH):
start_time=time.time()
train_loss,train_acc=train(model,train_iterator,optimizer,criterion)
valid_loss,valid_acc=evaluate(model,valid_iterator,criterion)
end_time=time.time()
epoch_m,epoch_s=epoch_time(start_time,end_time)
if valid_loss<best_valid_loss:
best_valid_loss=valid_loss
torch.save(model.state_dict(),'tu6_model.pt')
print(f'Epoch:{epoch+1:02} | Epoch Time:{epoch_m}m {epoch_s}s')
print(f'\tTrain Loss:{train_loss:.3f} | Train Acc:{train_acc*100:.2f}%')
        print(f'\tVal Loss:{valid_loss:.3f} | Val Acc:{valid_acc*100:.2f}%')
'''
0%| | 0/2500 [00:00<?, ?it/s]tensor([1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0.])
0%| | 0/2500 [00:34<?, ?it/s]tensor([[ 0.2931],
[ 0.1090],
[-0.0323],
[ 0.5297],
[-0.0970],
[ 0.1319],
[ 0.2649],
[ 0.2163],
[-0.0164],
[ 0.0005],
[ 0.1306],
[ 0.0286],
[ 0.1215],
[ 0.0406],
[ 0.0540],
[-0.0504]], grad_fn=<AddmmBackward0>)
a=torch.Tensor([[ 0.2931],
[ 0.1090],
[-0.0323],
[ 0.5297],
[-0.0970],
[ 0.1319],
[ 0.2649],
[ 0.2163],
[-0.0164],
[ 0.0005],
[ 0.1306],
[ 0.0286],
[ 0.1215],
[ 0.0406],
[ 0.0540],
[-0.0504]])
print(torch.sigmoid(a))
print(torch.round(torch.sigmoid(a)))
tensor([[0.5728],
[0.5272],
[0.4919],
[0.6294],
[0.4758],
[0.5329],
[0.5658],
[0.5539],
[0.4959],
[0.5001],
[0.5326],
[0.5071],
[0.5303],
[0.5101],
[0.5135],
[0.4874]])
tensor([[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.]])
'''

import torch
import torch.nn as nn
from transformers import BertTokenizer,BertModel
from torchtext.legacy import data
from torchtext import datasets
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
bert=BertModel.from_pretrained('bert-base-uncased')
max_input_length=tokenizer.max_model_input_sizes['bert-base-uncased']
def tokenize_and_cut(sentence):
tokens=tokenizer.tokenize(sentence)
tokens=tokens[:max_input_length-2]
return tokens
REVIEW=data.Field(
batch_first=True,
use_vocab=False,
tokenize=tokenize_and_cut,
preprocessing=tokenizer.convert_tokens_to_ids,
init_token=tokenizer.cls_token_id,
eos_token=tokenizer.sep_token_id,
pad_token=tokenizer.pad_token_id,
unk_token=tokenizer.unk_token_id
)
SENTIMENT=data.LabelField(dtype=torch.float)
fields={'review':('r',REVIEW),'sentiment':('s',SENTIMENT)}
IMDB_data=data.TabularDataset(
path='/mnt/disk2/std2021/hejiabang-data/transformer_data/IMDB_Dataset.csv',
format='csv',
fields=fields)
train_data,valid_data,test_data=IMDB_data.split(split_ratio=[0.8,0.1,0.1])
SENTIMENT.build_vocab(train_data)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE=16
train_iterator,valid_iterator,test_iterator=data.BucketIterator.splits(
(train_data,valid_data,test_data),
sort=False,
batch_size=BATCH_SIZE,
device=device)
class BERTGRUSentiment(nn.Module):
def __init__(self,
bert,
hidden_dim,
output_dim,
n_layers,
bidirectional,
dropout):
super().__init__()
self.bert = bert
embedding_dim = bert.config.to_dict()['hidden_size']
self.rnn = nn.GRU(embedding_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
batch_first=True,
dropout=0 if n_layers < 2 else dropout)
self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, text):
# text = [batch size, sent len]
with torch.no_grad():
embedded = self.bert(text)[0]
# embedded = [batch size, sent len, emb dim]
_, hidden = self.rnn(embedded)
        # hidden = [n layers * n directions, batch size, hid dim]
if self.rnn.bidirectional:
hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
else:
hidden = self.dropout(hidden[-1, :, :])
# hidden = [batch size, hid dim]
out = self.out(hidden)
# output = [batch size, out dim]
return out
HIDDEN_DIM=256
OUTPUT_DIM=1
N_LAYERS=2
BIDIRECTIONAL=True
DROPOUT=0.25
model=BERTGRUSentiment(
bert,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT)
def binary_acc(preds,y):
rounded_preds=torch.round(torch.sigmoid(preds))
correct=(rounded_preds==y).float()
acc=correct.sum()/len(correct)
return acc
def evaluate(model,iterator,criterion):
epoch_loss=0
epoch_acc=0
model.eval()
with torch.no_grad():
for batch in iterator:
input_tensor=batch.r
ground_y=batch.s.squeeze(0)
predictions=model(input_tensor).squeeze(1)
loss=criterion(predictions,ground_y)
acc=binary_acc(predictions,ground_y)
epoch_loss+=loss.item()
epoch_acc+=acc.item()
return epoch_loss/len(iterator),epoch_acc/len(iterator)
#strict=False is needed so that keys in the checkpoint that do not match the current model are ignored when loading
model.load_state_dict(torch.load('./tu6_model.pt'),strict=False)
criterion=nn.BCEWithLogitsLoss()
model.to(device)
criterion.to(device)
test_loss,test_acc=evaluate(model,test_iterator,criterion)
print(f'Test Loss:{test_loss:.3f} | Test Acc:{test_acc * 100:.2f}%')
def prediction_sentiment(model,tokenizer,sentence):
    model.eval()
    tokens=tokenizer.tokenize(sentence)
    tokens=tokens[:max_input_length-2]  # leave room for [CLS] and [SEP]
    indexes=[tokenizer.cls_token_id]+tokenizer.convert_tokens_to_ids(tokens)+[tokenizer.sep_token_id]
    tensor=torch.LongTensor(indexes).to(device)
    tensor=tensor.unsqueeze(0)  # [batch, seq_len]
    prediction=torch.round(torch.sigmoid(model(tensor))).item()
    result=SENTIMENT.vocab.itos[int(prediction)]
    return result
print(prediction_sentiment(model,tokenizer,"Wanyin is the most beautiful girl!"))
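#Sketch of what prediction_sentiment feeds to the model, reusing the encode example above (illustrative):
#sentence='Hello you where you are'
#tokens  -> ['hello', 'you', 'where', 'you', 'are']
#indexes -> [101, 7592, 2017, 2073, 2017, 2024, 102]    (101 = [CLS], 102 = [SEP])
#tensor shape after unsqueeze(0) -> [1, 7], i.e. [batch, seq_len]
#the single output logit is passed through sigmoid, rounded to 0 or 1, and looked up in SENTIMENT.vocab.itos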

