NTU ML2023Spring Part3.7 BERT Fine-tuning
There are these lines in evaluate:
def evaluate(data, output):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing
    # Hint: Open your prediction file to see what is wrong
At first I thought the problem was that the windows were too small, so I tried taking the highest-probability value from every window. The results were unbearable to look at:
ID,Answer
0,國以1-
1,[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
2,[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
3,[PAD][PAD]
4,部、
5,[PAD][PAD][PAD]
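Opening the prediction file like this makes the actual bug easier to see: the provided postprocessing picks the start and end positions independently in each window, so the predicted end can land before the start, or both can land inside the padding, which yields empty or [PAD]-filled answers. A minimal sketch of a fix, assuming the sample code's evaluate (with tokenizer in scope as a global, as in the sample) and restricting the end search to positions at or after the start:

import torch

def evaluate(data, output):
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]

    for k in range(num_of_windows):
        # Most probable start position in window k
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        # Search for the end only at or after the start, so end >= start always holds
        end_prob, end_index = torch.max(output.end_logits[k][start_index:], dim=0)
        end_index += start_index
        prob = start_prob + end_prob
        if prob > max_prob:
            max_prob = prob
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])

    # Remove spaces that decode inserts between tokens (e.g. "大 金" --> "大金")
    return answer.replace(' ', '')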
Going back to the originally provided version, I found some [UNK] tokens in the results. These appear when a character is not in the tokenizer's vocabulary, so decoding the predicted token span loses the original character; they should be restored from the original text.
215,白[UNK]紀中期
To restore them, I reworked class QA_Dataset so that it also returns information about the original paragraph during validation/testing.
import torch
from torch.utils.data import Dataset

# Note: this relies on the paragraphs being tokenized with
# return_offsets_mapping=True, so that offset_mapping maps each paragraph
# token back to its character span in the original text.
class QA_Dataset(Dataset):
    def __init__(self, split, questions, paragraphs, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.paragraphs = paragraphs
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 60
        self.max_paragraph_len = 150

        ##### TODO: Change value of doc_stride #####
        #self.doc_stride = 150
        self.doc_stride = 50

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        paragraph = self.paragraphs[question["paragraph_id"]]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]
        offset_mapping = self.tokenized_paragraphs.offset_mapping[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of paragraph containing the answer;
            # the answer is placed at a random position inside the window instead of always being
            # centered, so the model cannot learn that the answer always sits in the middle
            mid = answer_start_token + round(float((answer_end_token - answer_start_token) * torch.rand(1)))
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list, token_offset_list = [], [], [], []

            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]

                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
                # Offset that converts a token index inside this window back to an index
                # in tokenized_paragraph: paragraph_token = window_token + offset
                token_offset_list.append(-len(input_ids_question) + i)

            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list), paragraph, offset_mapping, token_offset_list

    def padding(self, input_ids_question, input_ids_paragraph):
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        return input_ids, token_type_ids, attention_mask
train_set = QA_Dataset("train", train_questions, train_paragraphs, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_paragraphs, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_paragraphs, test_questions_tokenized, test_paragraphs_tokenized)
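The matching evaluate rewrite is left implicit above, but with the extra return values the idea is: keep the best predicted token span, convert its window-token indices back to paragraph-token indices via token_offset_list, then use offset_mapping to slice the answer straight out of the original paragraph string, so characters the tokenizer turned into [UNK] come back intact. A minimal sketch under those assumptions (data here is one uncollated dataset item; with a DataLoader at batch_size=1 you would first unwrap paragraph, offset_mapping and token_offsets from the batch), combining the end >= start fix from earlier:

def evaluate(data, output):
    # Unpack one item exactly as QA_Dataset returns it for validation/testing
    input_ids, _, _, paragraph, offset_mapping, token_offsets = data
    answer = ''
    max_prob = float('-inf')
    num_of_windows = input_ids.shape[0]

    for k in range(num_of_windows):
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k][start_index:], dim=0)
        end_index += start_index
        prob = start_prob + end_prob
        if prob > max_prob:
            # Map window token indices back to paragraph token indices
            para_start = start_index + token_offsets[k]
            para_end = end_index + token_offsets[k]
            # Discard spans that fall outside the paragraph (question/padding region)
            if 0 <= para_start <= para_end < len(offset_mapping):
                # Slice the answer out of the original paragraph by character
                # offsets, so characters decoded as [UNK] are preserved
                max_prob = prob
                answer = paragraph[offset_mapping[para_start][0] : offset_mapping[para_end][1]]

    return answer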
Now it looks right:
215,白堊紀中期
The code also leaves a few TODOs for gradient accumulation and linear learning rate decay. I tried train_batch_size=32 with gradient_accumulation_steps=32, set the scheduler to scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=1000), cranked num_epoch up to 10, and ran it on pure faith. It flopped: the model overfit, with train acc 0.88 but valid acc only 0.68, and after submitting, the private score was a mere 0.64, not even past the medium baseline. Two hours of training did worse than the earlier 10-minute Colab run.
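For reference, a minimal sketch of how those two TODOs fit into the training loop (model, train_loader, device and num_epoch follow the sample code's names; the learning rate and the other hyperparameters are just the ones from my attempt above, not recommendations):

from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=1000)
gradient_accumulation_steps = 32

model.train()
for epoch in range(num_epoch):
    for step, data in enumerate(train_loader):
        data = [d.to(device) for d in data]
        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2],
                       start_positions=data[3], end_positions=data[4])

        # Scale the loss so the accumulated gradient matches one large-batch update
        loss = output.loss / gradient_accumulation_steps
        loss.backward()

        # Only step the optimizer (and the scheduler) once every
        # gradient_accumulation_steps mini-batches
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()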
I'm calling it quits here.
