2. Sentiment_Analysis

1. Code link

from torchtext.legacy import data
from torchtext import datasets

import torch
import torch.nn as nn
from transformers import BertTokenizer,BertModel
import torch.optim as optim
from tqdm import tqdm
import time

tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
bert=BertModel.from_pretrained('bert-base-uncased')

max_input_length=tokenizer.max_model_input_sizes['bert-base-uncased']
#print(max_input_length)#512, the maximum input length for a single sequence

def tokenize_and_cut(sentence):
    tokens=tokenizer.tokenize(sentence)
    tokens=tokens[:max_input_length-2]
    return tokens
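
#A quick check of the cutter (hypothetical input; wordpiece output shown for illustration):
#tokenize_and_cut('This movie is wonderful!')
#['this', 'movie', 'is', 'wonderful', '!']
#The -2 reserves two positions for the [CLS]/[SEP] ids that the Field below
#attaches via init_token/eos_token, so every example stays within the 512-id limit.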

REVIEW=data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,#convert tokens to ids
    init_token=tokenizer.cls_token_id,#101
    eos_token=tokenizer.sep_token_id,#102
    pad_token=tokenizer.pad_token_id,#0
    unk_token=tokenizer.unk_token_id#100
)
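
#With use_vocab=False the Field does no vocabulary lookup of its own: each
#example is run through tokenize_and_cut, then convert_tokens_to_ids, and the
#special-token ids above wrap the result, e.g. (ids shown for illustration)
#'great film' -> [101, 2307, 2143, 102], i.e. [CLS] great film [SEP].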

SENTIMENT=data.LabelField(dtype=torch.float)
#PreTrainedTokenizer(
# name_or_path='bert-base-uncased',
# vocab_size=30522,
# model_max_len=512,
# is_fast=False,
# padding_side='right',
# special_tokens={
# 'unk_token': '[UNK]',
# 'sep_token': '[SEP]',
# 'pad_token': '[PAD]',
# 'cls_token': '[CLS]',
# 'mask_token': '[MASK]'})
fields={'review':('r',REVIEW),'sentiment':('s',SENTIMENT)}

#sen_code=tokenizer.tokenize('Hello you where you are')
#['hello', 'you', 'where', 'you', 'are'] (returns tokens)
#sen_code=tokenizer.encode('Hello you where you are')
#[101, 7592, 2017, 2073, 2017, 2024, 102] (returns vocabulary ids)
#[<CLS> Hello you where you are <SEP>]
#sen_code=tokenizer.decode(102)
#'[SEP]'


IMDB_data=data.TabularDataset(
    path='/mnt/disk2/std2021/hejiabang-data/transformer_data/IMDB_Dataset.csv',
    format='csv',
    fields=fields
)

train_data,valid_data,test_data=IMDB_data.split(
    split_ratio=[0.8,0.1,0.1]
)

SENTIMENT.build_vocab(train_data)
#print(len(SENTIMENT.vocab))
# 2
#print(SENTIMENT.vocab.itos[:])
# ['positive', 'negative']
#print(SENTIMENT.vocab.stoi)
# defaultdict(None, {'positive': 0, 'negative': 1})
#print(SENTIMENT.vocab.stoi['positive'])
# 0

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE=16

train_iterator,valid_iterator,test_iterator=data.BucketIterator.splits(
    (train_data,valid_data,test_data),
    sort=False,
    batch_size=BATCH_SIZE,
    device=device
)
'''
for batch in train_iterator:
  print(batch)
  print(batch.s)
  break
for batch in valid_iterator:
  print(batch)
  print(batch.s)
  break
  
[torchtext.legacy.data.batch.Batch of size 16]
	[.r]:[torch.LongTensor of size 16x512]
	[.s]:[torch.FloatTensor of size 16]
tensor([1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0.])

[torchtext.legacy.data.batch.Batch of size 16]
	[.r]:[torch.LongTensor of size 16x512]
	[.s]:[torch.FloatTensor of size 16]
tensor([1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0.])
'''
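
#Note: .r is 16x512 in both samples because each batch is padded
#(pad_token_id=0) to its longest review, and here the longest one hits the
#510-wordpiece cap; a batch of only short reviews would come out narrower.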


class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        super(BERTGRUSentiment, self).__init__()
        self.bert=bert
        self.embedding_dim=bert.config.to_dict()['hidden_size']#768
        self.rnn=nn.GRU(
            self.embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=0 if n_layers<2 else dropout
        )
        #GRU(768, 256, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
        self.output=nn.Linear(hidden_dim*2 if bidirectional else hidden_dim,output_dim)
        self.dropout=nn.Dropout(dropout)

    def forward(self,text):
        #text:[16,512]
        with torch.no_grad():
            #last_hidden_state:[0]
            #pooler_output:[1]
            #hidden_states:[2]
            #[batch_size,seq_len,embed_dim]
            embeded=self.bert(text)[0]#torch.Size([16, 512, 768]) [batch,seq_len,emb_dim]

        _,hidden=self.rnn(embeded)
        #hidden: torch.Size([4, 16, 256])
        #[n_layers*n_directional,batch,hidden_dim]
            
        if self.rnn.bidirectional:
            #torch.Size([16, 256*2])
            hidden=self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
        else:
            hidden=self.dropout(hidden[-1,:,:])
        #hidden_dim=[batch_size,hid_dim]

        output=self.output(hidden)
        #[batch,1]
        return output
HIDDEN_DIM=256
OUTPUT_DIM=1
N_LAYERS=2
BIDIRECTIONAL=True
DROPOUT=0.25

model=BERTGRUSentiment(bert,
                       HIDDEN_DIM,
                       OUTPUT_DIM,
                       N_LAYERS,
                       BIDIRECTIONAL,
                       DROPOUT)
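
#Shape sanity check (a sketch with random token ids; expect [batch,1] logits):
#dummy=torch.randint(0,30522,(2,16))
#print(model(dummy).shape)
#torch.Size([2, 1])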

for name, param in model.named_parameters():
    if name.startswith('bert'):
        #freeze BERT: use it only as a fixed feature extractor
        param.requires_grad=False
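
#After freezing, only the GRU and the linear head remain trainable, roughly
#2.8M parameters versus ~110M frozen BERT weights:
#print(sum(p.numel() for p in model.parameters() if p.requires_grad))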

optimizer=optim.Adam(model.parameters())

criterion=nn.BCEWithLogitsLoss()

model=model.to(device)
criterion=criterion.to(device)

def binary_accuracy(preds,y):
    rounded_preds=torch.round(torch.sigmoid(preds))
    correct=(rounded_preds==y).float()
    acc=correct.sum()/len(correct)
    return acc
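
#e.g. binary_accuracy(torch.tensor([0.3,-0.2]),torch.tensor([1.,0.])) -> 1.0:
#sigmoid maps logits to probabilities and rounding at 0.5 yields hard labels;
#the commented block at the end of this file walks through a real batch.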

def train(model,iterator,optimizer,criterion):
    epoch_loss=0
    epoch_acc=0

    model.train()
    with tqdm(total=len(iterator)) as pbar:
        for batch in iterator:
            optimizer.zero_grad()

            input_tensor=batch.r
            #[16,512]
            ground_y=batch.s.squeeze(0)
            #[16]
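            #model returns [16,1]; squeeze(1) -> [16], matching the targets BCEWithLogitsLoss expects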
            predictions=model(input_tensor).squeeze(1)

            loss=criterion(predictions,ground_y)
            acc=binary_accuracy(predictions,ground_y)

            loss.backward()

            optimizer.step()

            epoch_loss+=loss.item()
            epoch_acc+=acc.item()

            pbar.update(1)
        return epoch_loss/len(iterator),epoch_acc/len(iterator)

def evaluate(model,iterator,criterion):
    epoch_loss=0
    epoch_acc=0

    model.eval()
    with torch.no_grad():
        for batch in iterator:
             input_tensor=batch.r
             ground_y=batch.s.squeeze(0)

             predictions=model(input_tensor).squeeze(1)

             loss=criterion(predictions,ground_y)
             acc=binary_accuracy(predictions,ground_y)

             epoch_loss+=loss.item()
             epoch_acc+=acc.item()
        return epoch_loss/len(iterator),epoch_acc/len(iterator)

def epoch_time(start_time,end_time):
    elapsed_time=end_time-start_time
    elapsed_m=int(elapsed_time/60)
    elapsed_s=int(elapsed_time%60)
    return elapsed_m,elapsed_s

if __name__=='__main__':

    N_EPOCH=5

    best_valid_loss=float('inf')

    for epoch in range(N_EPOCH):
        start_time=time.time()

        train_loss,train_acc=train(model,train_iterator,optimizer,criterion)
        valid_loss,valid_acc=evaluate(model,valid_iterator,criterion)

        end_time=time.time()
        epoch_m,epoch_s=epoch_time(start_time,end_time)

        if valid_loss<best_valid_loss:
            best_valid_loss=valid_loss
            torch.save(model.state_dict(),'tu6_model.pt')

        print(f'Epoch:{epoch+1:02}   |   Epoch Time:{epoch_m}m {epoch_s}s')
        print(f'\tTrain Loss:{train_loss:.3f}   |   Train Acc:{train_acc*100:.2f}%')
        print(f'\tVal Loss:{valid_loss:.3f}     |   Val Acc:{valid_acc*100:.2f}%')

'''
0%|          | 0/2500 [00:00<?, ?it/s]tensor([1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0.])
  0%|          | 0/2500 [00:34<?, ?it/s]tensor([[ 0.2931],
        [ 0.1090],
        [-0.0323],
        [ 0.5297],
        [-0.0970],
        [ 0.1319],
        [ 0.2649],
        [ 0.2163],
        [-0.0164],
        [ 0.0005],
        [ 0.1306],
        [ 0.0286],
        [ 0.1215],
        [ 0.0406],
        [ 0.0540],
        [-0.0504]], grad_fn=<AddmmBackward0>)
a=torch.Tensor([[ 0.2931],
        [ 0.1090],
        [-0.0323],
        [ 0.5297],
        [-0.0970],
        [ 0.1319],
        [ 0.2649],
        [ 0.2163],
        [-0.0164],
        [ 0.0005],
        [ 0.1306],
        [ 0.0286],
        [ 0.1215],
        [ 0.0406],
        [ 0.0540],
        [-0.0504]])
print(torch.sigmoid(a))
print(torch.round(torch.sigmoid(a)))
tensor([[0.5728],
        [0.5272],
        [0.4919],
        [0.6294],
        [0.4758],
        [0.5329],
        [0.5658],
        [0.5539],
        [0.4959],
        [0.5001],
        [0.5326],
        [0.5071],
        [0.5303],
        [0.5101],
        [0.5135],
        [0.4874]])
tensor([[1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.]])
'''

2. Code link

import torch
import torch.nn as nn
from transformers import BertTokenizer,BertModel

from torchtext.legacy import data
from torchtext import datasets


tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
bert=BertModel.from_pretrained('bert-base-uncased')
max_input_length=tokenizer.max_model_input_sizes['bert-base-uncased']

def tokenize_and_cut(sentence):
  tokens=tokenizer.tokenize(sentence)
  tokens=tokens[:max_input_length-2]
  return tokens

REVIEW=data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=tokenizer.cls_token_id,
    eos_token=tokenizer.sep_token_id,
    pad_token=tokenizer.pad_token_id,
    unk_token=tokenizer.unk_token_id
)
SENTIMENT=data.LabelField(dtype=torch.float)

fields={'review':('r',REVIEW),'sentiment':('s',SENTIMENT)}

IMDB_data=data.TabularDataset(
    path='/mnt/disk2/std2021/hejiabang-data/transformer_data/IMDB_Dataset.csv',
    format='csv',
    fields=fields)

train_data,valid_data,test_data=IMDB_data.split(split_ratio=[0.8,0.1,0.1])


SENTIMENT.build_vocab(train_data)

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE=16

train_iterator,valid_iterator,test_iterator=data.BucketIterator.splits(
    (train_data,valid_data,test_data),
    sort=False,
    batch_size=BATCH_SIZE,
    device=device)

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = bert

        embedding_dim = bert.config.to_dict()['hidden_size']

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        # text = [batch size, sent len]

        with torch.no_grad():
            embedded = self.bert(text)[0]

        # embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        # hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        # hidden = [batch size, hid dim]

        out = self.out(hidden)

        # output = [batch size, out dim]

        return out
HIDDEN_DIM=256
OUTPUT_DIM=1
N_LAYERS=2
BIDIRECTIONAL=True
DROPOUT=0.25

model=BERTGRUSentiment(
    bert,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT)

def binary_acc(preds,y):
  rounded_preds=torch.round(torch.sigmoid(preds))
  correct=(rounded_preds==y).float()
  acc=correct.sum()/len(correct)
  return acc

def evaluate(model,iterator,criterion):
  epoch_loss=0
  epoch_acc=0

  model.eval()

  with torch.no_grad():
    for batch in iterator:
      input_tensor=batch.r
      ground_y=batch.s.squeeze(0)

      predictions=model(input_tensor).squeeze(1)
      loss=criterion(predictions,ground_y)
      acc=binary_acc(predictions,ground_y)

      epoch_loss+=loss.item()
      epoch_acc+=acc.item()
  return epoch_loss/len(iterator),epoch_acc/len(iterator)

#strict=False: allow a non-strict match when loading the checkpoint
model.load_state_dict(torch.load('./tu6_model.pt'),strict=False)

criterion=nn.BCEWithLogitsLoss()
model.to(device)
criterion.to(device)

test_loss,test_acc=evaluate(model,test_iterator,criterion)

print(f'Test Loss:{test_loss:.3f}   |   Test Acc:{test_acc * 100:.2f}%')


def prediction_sentiment(model,tokenizer,sentence):
    model.eval()
    tokens=tokenizer.tokenize(sentence)
    tokens=tokens[:max_input_length-2]#leave room for [CLS] and [SEP]
    indexes=[tokenizer.cls_token_id]+tokenizer.convert_tokens_to_ids(tokens)+[tokenizer.sep_token_id]
    tensor=torch.LongTensor(indexes).to(device)
    tensor=tensor.unsqueeze(0)#[batch,seq_len]
    prediction=torch.round(torch.sigmoid(model(tensor))).item()
    result=SENTIMENT.vocab.itos[int(prediction)]
    return result
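
#Note: the rounded 0/1 indexes SENTIMENT.vocab.itos, which (per the
#build_vocab check in part 1) is ['positive', 'negative'], so 0 -> 'positive'
#and 1 -> 'negative'.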

print(prediction_sentiment(model,tokenizer,"Wanyin is the most beautiful girl!"))
