[Knowledge Graph Project -- Entity Recognition Module] Entity Recognition for Twitter Threat Intelligence

This project builds an entity recognition experiment on a dataset of Twitter threat intelligence.

Goal: build an automated entity recognition model for threat intelligence, the first step toward constructing a knowledge graph.

Data: collected from Twitter.

Models: bert, bert-crf, bert-bilstm-crf

Environment: Python 3.7, torch 1.7.1, transformers 4.2.1

Background:

Prerequisites:

  • working knowledge of the BERT, BiLSTM, and CRF models
  • familiarity with the STIX 2 threat-intelligence entity types
  • familiarity with the NLP NER (named entity recognition) task

Related papers:

  1. Cyberthreat Detection from Twitter using Deep Neural Networks
  2. Attention Is All You Need
  3. BERT: the bidirectional Transformer pre-trained model
  4. CRF: conditional random fields
  5. Research on Threat Intelligence Entity Extraction Based on the STIX Standard

Data preparation:

Data collection module

The data collection module crawls tweets that users post. To fetch tweets you need a valid Twitter developer account and the Tweepy library installed. This link describes the steps for obtaining developer status and the corresponding consumer key, consumer secret, access token, and access token secret.
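A minimal collection sketch, assuming the Tweepy 3.x API; every credential value, the search query, and the output file name are placeholders:

import tweepy

consumer_key = "YOUR_CONSUMER_KEY"            # all four values come from the
consumer_secret = "YOUR_CONSUMER_SECRET"      # Twitter developer portal
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Pull recent English tweets matching a security keyword, one tweet per line.
with open("tweets.txt", "w", encoding="utf-8") as f:
    for status in tweepy.Cursor(api.search, q="vulnerability", lang="en",
                                tweet_mode="extended").items(200):
        f.write(status.full_text.replace("\n", " ") + "\n")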

Text classification module

The text classification module filters out text that contains no threat intelligence and keeps only threat-related text; it operates at both the article level and the sentence level. Classification approach: TextCNN.
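The classifier itself is not shown in this post; as a reference, here is a minimal PyTorch sketch of the TextCNN architecture (embedding, parallel convolutions over n-grams, max-over-time pooling, linear layer). The vocabulary size, embedding dimension, and filter settings are illustrative assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Binary sentence classifier: threat-related vs. not."""

    def __init__(self, vocab_size, embed_dim=128, num_classes=2,
                 kernel_sizes=(3, 4, 5), num_filters=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # One Conv1d per n-gram width, applied in parallel.
        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_dim, num_filters, k) for k in kernel_sizes])
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids):                      # (batch, seq_len)
        x = self.embedding(input_ids).transpose(1, 2)  # (batch, embed, seq_len)
        # Max-over-time pooling of each convolution's feature maps.
        pooled = [F.relu(conv(x)).max(dim=2).values for conv in self.convs]
        return self.fc(torch.cat(pooled, dim=1))       # (batch, num_classes)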

Dataset construction

  1. Text: each line of an article is treated as a sentence and split into tokens; the BERT tokenizer is used, so words are further broken into WordPiece sub-tokens.
  2. Tag: the entity labels are (O, ORG, PRO, VER, VUL, ID); their meanings are listed in the table below:

Tag    Description
O      not part of any entity of interest
ORG    a company or organization
PRO    a product or item
VER    the version number of a product or item
VUL    a possible threat or vulnerability
ID     an identifier from a public repository, such as the National Vulnerability Database (NVD), or from an update or patch
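For example, in a tweet such as "Adobe patches vulnerability CVE-2016-1019 in Flash Player 21.0", a plausible tagging would mark Adobe as ORG, Flash Player as PRO, 21.0 as VER, vulnerability as VUL, and CVE-2016-1019 as ID (an illustrative example, not taken from the dataset).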

Model preparation

Three model architectures are tested:

  • bert-Linear: BERT pre-training plus a linear multi-class classification layer
  • bert-crf: BERT pre-training plus a conditional random field
  • bert-bilstm-crf: BERT pre-training + bidirectional LSTM + conditional random field

Experiments

Data preprocessing:
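The preprocessing class below relies on a few globals that never appear in the post: a tagdict mapping tag names to integer ids (including a PAD entry for padded positions) and a BERT tokenizer. A minimal sketch, with the id order and the bert-base-cased checkpoint being assumptions:

from transformers import BertTokenizer

# Tag inventory; "PAD" marks padded positions. Any consistent ordering works.
tag_values = ["O", "ORG", "PRO", "VER", "VUL", "ID", "PAD"]
tagdict = {t: i for i, t in enumerate(tag_values)}

# Tokenizer matching the pretrained checkpoint loaded later (an assumption).
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)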

import re

# Keras padding utility (or tensorflow.keras.preprocessing.sequence);
# aliased so the method of the same name below does not shadow it.
from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

class pre_datas():
    def __init__(self):
        # Global tag -> id mapping (see the sketch above).
        self.tagdict = tagdict

    def datas(self, path):
        # One sentence (or one space-separated tag sequence) per line.
        with open(path, 'r', encoding='utf-8') as f:
            contexts = f.readlines()
        return contexts

    def tokenize_and_preserve_labels(self, sentence, text_labels):
        sentence = sentence.split(" ")
        text_labels = re.sub('\n', '', text_labels)
        text_labels = text_labels.split(" ")

        tokenized_sentence = []
        labels = []

        for word, label in zip(sentence, text_labels):
            # WordPiece may split one word into several sub-tokens;
            # repeat the word-level label once per sub-token.
            tokenized_word = tokenizer.tokenize(word)
            n_subwords = len(tokenized_word)
            tokenized_sentence.extend(tokenized_word)
            labels.extend([label] * n_subwords)

        # Prepend BERT's [CLS] token and give it the neutral 'O' tag.
        tokenized_sentence = ['[CLS]'] + tokenized_sentence
        labels = ['O'] + labels
        return tokenized_sentence, labels

    def pad_sequences(self, texts, tags):
        # Pad/truncate every sequence to a fixed length of 50.
        input_ids = keras_pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in texts],
                                        maxlen=50, dtype="long", value=0.0,
                                        truncating="post", padding="post")

        tags = keras_pad_sequences([[self.tagdict[l] for l in lab] for lab in tags],
                                   maxlen=50, value=self.tagdict["PAD"], padding="post",
                                   dtype="long", truncating="post")
        # Attention mask: 1 for real tokens, 0 for padding (token id 0).
        attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]
        return (input_ids, tags, attention_masks)

    def tokenize_texts_and_labels(self, _texts, _tags):
        assert (len(_texts) == len(_tags))
        texts, tags = [], []
        for _text, _tag in zip(_texts, _tags):
            text, label = self.tokenize_and_preserve_labels(_text, _tag)
            assert len(text) == len(label)
            texts.append(text)
            tags.append(label)
        return (texts, tags)
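Putting the class to work, here is a sketch of how the tensors and the train_dataloader used below might be produced; the file names and batch size are assumptions:

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

pre = pre_datas()
# Hypothetical layout: one sentence per line, plus a parallel file of tag lines.
sentences = pre.datas("train_sentences.txt")
tag_lines = pre.datas("train_tags.txt")

texts, tags = pre.tokenize_texts_and_labels(sentences, tag_lines)
input_ids, tag_ids, attention_masks = pre.pad_sequences(texts, tags)

dataset = TensorDataset(torch.tensor(input_ids),
                        torch.tensor(attention_masks),
                        torch.tensor(tag_ids))
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=32)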

Model construction:

  • bert-crf and bert-bilstm-crf share the class below; the need_bilstm flag toggles the BiLSTM layer
import torch.nn as nn
from torchcrf import CRF                    # pip install pytorch-crf
from transformers import BertModel, BertPreTrainedModel

class BertLstmCrf(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config, need_bilstm=False, rnn_dim=128):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_dim = config.hidden_size
        self.need_bilstm = need_bilstm
        if need_bilstm:
            # Optional BiLSTM over BERT's token representations; its output
            # width is 2 * rnn_dim because the two directions are concatenated.
            self.bilstm = nn.LSTM(config.hidden_size, rnn_dim, num_layers=1,
                                  bidirectional=True, batch_first=True)
            self.out_dim = 2 * rnn_dim
        self.linear = nn.Linear(self.out_dim, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        sequence_output = outputs[0]
        if self.need_bilstm:
            sequence_output, _ = self.bilstm(sequence_output)
        sequence_output = self.dropout(sequence_output)
        emissions = self.linear(sequence_output)

        # CRF negative log-likelihood as the loss (only when labels are given);
        # decode() returns the Viterbi-best tag sequence for each sentence.
        loss = None
        if labels is not None:
            loss = -1 * self.crf(emissions, labels, mask=attention_mask.byte())
        output = self.crf.decode(emissions, attention_mask.byte())
        return [loss, output] if loss is not None else output
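Both CRF variants come from this one class; a sketch of instantiation, where the bert-base-cased checkpoint is an assumption and num_labels counts the PAD tag so that padded label ids stay in range for the CRF:

# BERT-CRF: no BiLSTM between BERT and the CRF layer.
model = BertLstmCrf.from_pretrained("bert-base-cased", num_labels=len(tagdict))

# BERT-BiLSTM-CRF: a bidirectional LSTM is inserted before the CRF.
model = BertLstmCrf.from_pretrained("bert-base-cased", num_labels=len(tagdict),
                                    need_bilstm=True)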

Building the optimizer

from transformers import AdamW, get_linear_schedule_with_warmup

# Illustrative hyper-parameters; the post does not report the values used.
FULL_FINETUNING = True
lr, eps, epoch = 3e-5, 1e-8, 20

if FULL_FINETUNING:
    # Fine-tune every weight; biases and LayerNorm terms get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},   # transformers' AdamW reads the 'weight_decay' key
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Train only the classification head (named `linear` in the model above).
    param_optimizer = list(model.linear.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=lr,
    eps=eps
)

total_steps = len(train_dataloader) * epoch

# Linear learning-rate decay with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Training:

def train(train_dataloader, model, optimizer, scheduler):
    max_grad_norm = 1.0
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        # Everything runs on CPU here; for GPU, move each tensor with .to(device).
        batch = tuple(t for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids.type(torch.LongTensor),
                        attention_mask=b_input_mask.type(torch.LongTensor),
                        labels=b_labels.type(torch.LongTensor))
        loss = outputs[0]  # CRF negative log-likelihood
        loss.backward()
        total_loss += loss.item()
        # Clip gradients to stabilise fine-tuning.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

Evaluation

import numpy as np
from seqeval.metrics import f1_score

def eval(valid_dataloader, model, use_official_model=False):
    if use_official_model:
        # Branch for models that return raw logits (e.g. BertForTokenClassification).
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids.type(torch.LongTensor),
                                attention_mask=b_input_mask.type(torch.LongTensor),
                                labels=b_labels.type(torch.LongTensor))

            logits = outputs[1].detach().numpy()
            label_ids = b_labels.numpy()

            eval_loss += outputs[0].mean().item()
            eval_accuracy += flat_accuracy(logits, label_ids)
            # Pick the highest-scoring tag at every position.
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.extend(label_ids)

            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1

        # Flatten to tag names, dropping PAD positions before scoring.
        pred_tags = [[tag_values[p_i] for p, l in zip(predictions, true_labels)
                      for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]]
        valid_tags = [[tag_values[l_i] for l in true_labels
                       for l_i in l if tag_values[l_i] != "PAD"]]
        print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

    else:
        # Branch for the CRF models above: outputs[1] is already the decoded tag ids.
        model.eval()
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids.type(torch.LongTensor),
                                attention_mask=b_input_mask.type(torch.LongTensor),
                                labels=b_labels.type(torch.LongTensor))
            logits = outputs[1]
            label_ids = b_labels.numpy()

            predictions.extend(logits)
            true_labels.extend(list(label_ids))

        pred_tags = [[tag_values[p_i] for p, l in zip(predictions, true_labels)
                      for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]]
        valid_tags = [[tag_values[l_i] for l in true_labels
                       for l_i in l if tag_values[l_i] != "PAD"]]
        print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Results

BERT-BiLSTM-CRF:

Step    Train loss    Validation F1-Score
1       513.742       0.626
2       153.405       0.767
5       46.638        0.785
10      15.026        0.823
20      2.302         0.827

BERT-CRF:

Step    Train loss    Validation F1-Score
1       308.777       0.723
2       62.385        0.795
5       10.9026       0.798
10      5.498         0.820
20      0.803         0.817

posted @ 2021-03-09 12:36  ripking