基于word2vec+TextCNN 作文本分类

基于TextCNN作文本分类模型

一. 准备工作:

  1. 环境:python3.7+torch+GPU

  2. 数据集:网上下载的4分类中文文本,如下图:

  3. 模块使用:

import os
import jieba
import torch
import joblib
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
  4. 模型参数配置
# Model configuration parameters
class Config:
    """Static configuration shared by the dataset, the vocabulary builder and the model.

    NOTE: the stop-word file and the pickled vocabulary are read while this
    class body executes, so both files must already exist when the class is
    defined.
    """

    # Maps each category name to its integer class index.
    label_map_dict = {'体育': 0, '女性': 1, '文学': 2, '校园': 3}
    # Stop words, one per line; ``with`` closes the file handle right away
    # instead of leaving it open for the lifetime of the process.
    with open('./datasets/stop/stopword.txt', encoding='utf-8') as _stopword_file:
        stopword = [line.strip() for line in _stopword_file]
    del _stopword_file  # keep the temporary handle out of the class namespace
    sentence_lenth = 50  # every sentence is cut or padded to this many tokens
    batch_size = 10  # number of samples per batch
    output_channel = 3  # number of convolution output channels
    embedding_size = 5  # dimension of each word-embedding vector
    # Word -> index mapping loaded from the file written with joblib.dump.
    vocab_dict_ = joblib.load('./models/vocab_dict')
  5. 中文分词后生成词汇表
# Build the vocabulary
def vocab_dict(*args, base_path='./datasets/train'):
    """Build a word -> index mapping from the documents of the given categories.

    Args:
        *args: Category names; each one is a sub-directory of ``base_path``.
        base_path: Directory that contains one sub-directory per category.

    Returns:
        dict: Every distinct token mapped to an index starting at 1, plus the
        ``'unkown'`` key mapped to 0 for out-of-vocabulary words.
    """
    words = set()
    for category in args:
        dataset = Dataset_(base_path, category)
        # Tokenise the raw text directly: indexing the dataset would return
        # already-encoded, padded id lists rather than the words themselves.
        for file_name in dataset.items:
            # Opened the same way Dataset_.__getitem__ opens it (platform
            # default encoding), so both see identical text.
            with open(os.path.join(dataset.items_path, file_name)) as text_file:
                words.update(Dataset_.cut_word(text_file.read()))
    # Sort before numbering so indices are reproducible between runs; plain
    # set iteration order is not stable for strings.
    word_to_idx = {word: idx for idx, word in enumerate(sorted(words), start=1)}
    word_to_idx['unkown'] = 0  # this exact key spelling is what Dataset_ looks up
    return word_to_idx
# Persist the resulting vocabulary so the network's Embedding layer can use it
joblib.dump(
    vocab_dict('体育', '女性', '文学', '校园'),
    './models/vocab_dict',
)

二. 数据加载器

  1. 数据集类编写

    class Dataset_(Dataset):
        """Map-style dataset over the text files of one category directory.

        Each item is ``(token_ids, label)``: the file is tokenised with jieba,
        the tokens are mapped through ``Config.vocab_dict_`` and the id list is
        cut or zero-padded to exactly ``Config.sentence_lenth`` entries.
        """

        def __init__(self, base_path, label_path):
            """Index the files found in ``base_path/label_path``.

            Args:
                base_path: Directory holding one sub-directory per category.
                label_path: Category name; also the sub-directory to read.
            """
            self.base_path = base_path
            self.label = label_path
            self.items_path = os.path.join(base_path, self.label)
            self.items = os.listdir(self.items_path)

        def __getitem__(self, index):
            """Return ``(token_ids, label)`` for the ``index``-th file."""
            # ``.get`` yields None for a category missing from the label map.
            label = Config.label_map_dict.get(self.label)
            # Read inside ``with`` so the handle is closed instead of leaked.
            with open(os.path.join(self.items_path, self.items[index])) as text_file:
                tokens = self.cut_word(text_file.read())
            vocab = Config.vocab_dict_
            # Words that are missing (or mapped to 0) fall back to the
            # 'unkown' index.
            token_ids = [vocab.get(token) or vocab['unkown'] for token in tokens]
            # Cut long sentences, then pad short ones with 0 up to the fixed length.
            token_ids = token_ids[:Config.sentence_lenth]
            token_ids.extend([0] * (Config.sentence_lenth - len(token_ids)))
            return token_ids, label

        def __len__(self):
            """Return the number of files in the category directory."""
            return len(self.items)

        @staticmethod
        def cut_word(text):
            """Tokenise ``text`` with jieba, dropping stop words and empty tokens.

            Args:
                text: Raw document text.

            Returns:
                list: Stripped, non-empty tokens that are not stop words.
            """
            # Build the set once per call for O(1) membership tests.
            stopwords = set(Config.stopword)
            stripped = (raw.strip() for raw in jieba.cut(text))
            # ``strip()`` never returns None, so whitespace-only tokens must be
            # filtered by truthiness rather than by an ``is not None`` check.
            return [token for token in stripped if token and token not in stopwords]
    
  2. 数据加载器格式化输出

    # Custom collate function that defines the DataLoader's batch output format
    def truncate(data_list):
        """Collate one batch of ``(features, label)`` samples into two tensors.

        Args:
            data_list: The samples of one batch; each is indexed as
                ``sample[0]`` (features) and ``sample[1]`` (label).

        Returns:
            tuple: ``(torch.tensor(features), torch.tensor(labels))``.
        """
        features = [sample[0] for sample in data_list]
        targets = [sample[1] for sample in data_list]
        return torch.tensor(features), torch.tensor(targets)
    
    
    # One loader per category. The batch size is taken from Config instead of
    # a repeated literal, so changing Config.batch_size affects every loader.
    loader_t = DataLoader(Dataset_('./datasets/train', '体育'), batch_size=Config.batch_size, collate_fn=truncate)
    loader_n = DataLoader(Dataset_('./datasets/train', '女性'), batch_size=Config.batch_size, collate_fn=truncate)
    loader_w = DataLoader(Dataset_('./datasets/train', '文学'), batch_size=Config.batch_size, collate_fn=truncate)
    loader_x = DataLoader(Dataset_('./datasets/train', '校园'), batch_size=Config.batch_size, collate_fn=truncate)
    

    三. 模型搭建

    1. TextCNN类

      # Model definition
      class TextCNN(nn.Module):
          """Single-kernel TextCNN: embedding -> Conv2d -> ReLU -> max-pool -> linear -> softmax."""

          def __init__(self):
              """Build the layers from the sizes declared in ``Config``."""
              super().__init__()
              self.embeding = nn.Embedding(num_embeddings=len(Config.vocab_dict_), embedding_dim=Config.embedding_size)
              kernel_height, pool_height, pool_stride = 2, 2, 2
              self.cord = nn.Sequential(
                  # The kernel spans the full embedding width, so the output width is 1.
                  nn.Conv2d(in_channels=1, out_channels=Config.output_channel,
                            kernel_size=(kernel_height, Config.embedding_size)),
                  nn.ReLU(),
                  # A 2-D convolution output needs 2-D pooling (MaxPool1d does not fit here).
                  nn.MaxPool2d((pool_height, 1), stride=pool_stride)
              )
              # Derive the flattened length from the configured sentence length
              # instead of hard-coding 24, which only holds for sentence_lenth == 50.
              conv_len = Config.sentence_lenth - kernel_height + 1
              pooled_len = (conv_len - pool_height) // pool_stride + 1
              self.fc = nn.Linear(Config.output_channel * pooled_len, len(Config.label_map_dict))
              self.sf = nn.Softmax(dim=1)

          def forward(self, X):
              """Return per-class probabilities for a batch of token-id sequences.

              Args:
                  X: Integer tensor of token ids; expected shape is
                      (batch, Config.sentence_lenth), as produced by the collate
                      function — TODO confirm against the caller.

              Returns:
                  Tensor of shape (batch, number of labels) whose rows sum to 1.
              """
              batch_size = X.shape[0]
              embedding_x = self.embeding(X)
              embedding_x = embedding_x.unsqueeze(1)  # Conv2d needs a 4-D input: add the channel axis
              conved = self.cord(embedding_x)
              flatten = conved.view(batch_size, -1)  # flatten everything except the batch axis
              # NOTE(review): if `criterion` is nn.CrossEntropyLoss, which applies
              # log-softmax itself, this Softmax should be removed — confirm
              # against the training setup.
              output = self.sf(self.fc(flatten))
              return output
      

      四. 训练模型

      # NOTE: `textCNN`, `device`, `criterion` and `optimizer` are not defined in
      # the visible part of this file; they must be created before this loop runs.
      for epoch in range(5000):
          # Iterate the loader objects directly instead of resolving their
          # names through eval().
          for loader in (loader_t, loader_n, loader_w, loader_x):
              for X_train, y_train in loader:
                  pred = textCNN(X_train.to(device))
                  loss = criterion(pred, y_train.to(device))

                  optimizer.zero_grad()
                  loss.backward()
                  optimizer.step()
          # Reports only the loss of the epoch's last batch; .item() extracts
          # the Python float from the 0-dim loss tensor.
          print(f'轮训第{epoch+1}遍,损失{loss.item():.6f}')
      

      五. 模型保存

       # Option 1: persist only the learned parameters.
       # Save the state_dict of the trained instance (`textCNN`, the object the
       # training loop calls). The path uses forward slashes: the previous
       # '\parameter.pkl' literal contained an invalid '\p' escape sequence.
       torch.save(textCNN.state_dict(), './models/parameter.pkl')
       # Load: rebuild the architecture first, then restore the weights into it.
       model = TextCNN()
       model.load_state_dict(torch.load('./models/parameter.pkl'))

       # Option 2: persist the complete model object.
       # Save the trained instance, not the TextCNN class itself.
       torch.save(textCNN, './models/model.pkl')
       # Load. NOTE(review): recent PyTorch releases default torch.load to
       # weights_only=True, in which case loading a fully pickled module needs
       # weights_only=False — confirm for the installed version.
       model = torch.load('./models/model.pkl')
      

      训练如下图:

posted @ 2021-04-16 09:29  旁人怎会懂  阅读(516)  评论(0)  编辑  收藏  举报