NTU ML2023Spring Part 3.7: BERT Fine-tuning

The evaluate function comes with this hint:

def evaluate(data, output):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing
    # Hint: Open your prediction file to see what is wrong

At first I thought the bug was caused by the window being too small, so I tried taking the highest-probability value from each window. The results were unbearable to look at:

ID,Answer
0,國以1-
1,[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
2,[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
3,[PAD][PAD]
4,部、
5,[PAD][PAD][PAD]

Going back to the originally provided version, I noticed some [UNK] tokens in the output; they should be restored to the original characters.

215,白[UNK]紀中期
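This happens because rare characters like 堊 are simply not in the tokenizer's vocabulary: they become [UNK] before the model ever sees them, so decoding the predicted token ids cannot bring them back. A quick sanity check, assuming the sample code's bert-base-chinese tokenizer (my assumption; swap in whatever model you actually use):

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
# A rare character such as 堊 is expected to fall outside the vocabulary and tokenize as [UNK]
print(tokenizer.tokenize("白堊紀中期"))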

To restore them, I reworked class QA_Dataset so that during validation/testing it also returns information about the original text.

class QA_Dataset(Dataset):
    def __init__(self, split, questions, paragraphs, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.paragraphs = paragraphs
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 60
        self.max_paragraph_len = 150

        ##### TODO: Change value of doc_stride #####
        #self.doc_stride = 150
        self.doc_stride = 50

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        paragraph = self.paragraphs[question["paragraph_id"]]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]
        # Character-level span of each paragraph token in the original text
        # (requires the tokenizer call to use return_offsets_mapping=True)
        offset_mapping = self.tokenized_paragraphs.offset_mapping[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn
        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of paragraph containing the answer
            # Pick a random point inside the answer span as the window centre,
            # so the answer is not always in the middle of the training window
            mid = answer_start_token + round(float((answer_end_token - answer_start_token) * torch.rand(1)))
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list, token_offset_list = [], [], [], []

            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]

                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)

                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
                # Offset mapping a token index inside this window back to its index in
                # tokenized_paragraph (window index + offset = paragraph token index)
                token_offset_list.append(i - len(input_ids_question))

            # Besides the model inputs, return the raw paragraph text, its offset_mapping
            # and the per-window offsets so postprocessing can recover the original answer span
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list), paragraph, offset_mapping, token_offset_list

    def padding(self, input_ids_question, input_ids_paragraph):
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask

train_set = QA_Dataset("train", train_questions, train_paragraphs, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_paragraphs, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_paragraphs, test_questions_tokenized, test_paragraphs_tokenized)
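With the extra values returned by the dataset, the postprocessing in evaluate can slice the answer straight out of the original paragraph instead of decoding token ids, which is what restores the [UNK] characters. A minimal sketch of that span-to-text step (the helper name span_to_text and the way the extra fields are unpacked from the dev/test dataloader are my own assumptions, not part of the official code):

def span_to_text(paragraph, offset_mapping, token_offset, start_index, end_index):
    # Map window-local token positions back to paragraph-level token positions
    start_tok = start_index + token_offset
    end_tok = end_index + token_offset
    # Discard spans that land on question, special or padding tokens
    if start_tok < 0 or end_tok >= len(offset_mapping) or start_tok > end_tok:
        return ""
    # offset_mapping[t] = (char_start, char_end) of token t in the original paragraph,
    # so slicing the paragraph text keeps characters the tokenizer mapped to [UNK]
    char_start = offset_mapping[start_tok][0]
    char_end = offset_mapping[end_tok][1]
    return paragraph[char_start:char_end]

In evaluate, token_offset for window k comes from token_offset_list[k], and start_index / end_index are the argmax positions of that window's start/end logits.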

Now the output looks right:

215,白堊紀中期

There are also a few TODOs in the code asking for gradient accumulation and linear learning rate decay. I tried train_batch_size=32 with gradient_accumulation_steps=32, set the scheduler to scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=1000), cranked num_epoch up to 10, and launched a leap-of-faith run. It flopped: the model overfit, with train acc 0.88 but valid acc only 0.68, and the submission's private score was just 0.64, which does not even pass the medium baseline. Two hours of training ended up worse than the earlier 10-minute run on Colab.
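For reference, the training loop I used for those two TODOs looked roughly like this (a sketch only; model, optimizer, train_loader and device are assumed to come from the homework notebook, and the fp16 accelerator calls are left out):

from torch.optim.lr_scheduler import LinearLR

gradient_accumulation_steps = 32
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=1000)

model.train()
for step, data in enumerate(train_loader):
    data = [d.to(device) for d in data]
    output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2],
                   start_positions=data[3], end_positions=data[4])
    # Scale the loss so accumulating 32 small batches behaves like one large batch
    loss = output.loss / gradient_accumulation_steps
    loss.backward()
    # Step the optimizer and decay the learning rate once per effective (accumulated) batch
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()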

Calling it quits here.
