paddlepaddle(3)


1. Background

This project tackles sentiment classification of Weibo posts written during the COVID-19 epidemic. PaddleHub is PaddlePaddle's pre-trained model management and transfer learning toolkit: it gives convenient access to pre-trained models in the PaddlePaddle ecosystem, handles model management, and supports one-line prediction. Combined with its Fine-tune API, transfer learning on top of large-scale pre-trained models can be completed quickly, so the pre-trained model serves a user's specific scenario better. This project therefore uses Baidu's PaddleHub fine-tuning toolkit to build the competition solution quickly.
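As a side note, the "one-line prediction" mentioned above can be tried without any fine-tuning. The sketch below is only an illustration and is not part of the competition pipeline: it assumes PaddleHub 1.x with the off-the-shelf senta_bilstm sentiment module, and the exact argument names of sentiment_classify may differ between PaddleHub versions.

import paddlehub as hub

# Illustration only: load a ready-made sentiment model from the PaddleHub model zoo
senta = hub.Module(name="senta_bilstm")
# One call returns a label and class probabilities for each text
for r in senta.sentiment_classify(texts=["今天心情很好", "这也太糟糕了"]):
    print(r)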

2. Code

# # Unzip the datasets
# !cd data/data22724 && unzip test_dataset.zip
# !cd data/data22724 && unzip "train_ dataset.zip"
# !hub install ernie
import pandas as pd
import numpy as np

import jieba
import re

import paddlehub as hub
from sklearn.model_selection import StratifiedKFold
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

from matplotlib import pyplot as plt
%matplotlib inline

unuseful = ['\t', '\n', '2[\u4e00-\u9fa5]{2,7}·.*?\?', '【.*?】', '//@.*?:', '//@.*?:', '回复@.*?:', 'O网页链接', '\?展开全文c', '我免费围观了.*?~O微博问答?', '#.*?#', '\?', '[A-Za-z0-9]', 
            '/u0800-/u4e00', '-', '、','~','『','』','—','(.*?)','年','月','日','\(.*?\)', '◎', '"','"']

# Load the labelled training set, keep only valid labels, and strip the noise patterns in `unuseful`
with open(file='/home/aistudio/data/data22724/nCoV_100k_train.labled.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
    train_labled = pd.read_csv(fp)
    train_labled = train_labled[train_labled['情感倾向'].isin(['-1', '0', '1'])]
    for content in unuseful:
        train_labled['微博中文内容'] = train_labled['微博中文内容'].str.replace(content, '', regex=True)
    # Collapse repeated punctuation and drop leftover marks (str.replace, so the substitution works inside each string)
    for pat, rep in [('!{2,}', '!'), ('。{2,}', '。'), ('\.{2,}', '.'), ('?{2,}', '?')]:
        train_labled['微博中文内容'] = train_labled['微博中文内容'].str.replace(pat, rep, regex=True)
    for lit in ['《', '》', '【', '】', ' ']:
        train_labled['微博中文内容'] = train_labled['微博中文内容'].str.replace(lit, '', regex=False)

# Load and clean the test set in the same way
with open(file='/home/aistudio/data/data22724/nCov_10k_test.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
    test = pd.read_csv(fp)
    for content in unuseful:
        test['微博中文内容'] = test['微博中文内容'].str.replace(content, '', regex=True)
    for pat, rep in [('!{2,}', '!'), ('。{2,}', '。'), ('\.{2,}', '.'), ('?{2,}', '?')]:
        test['微博中文内容'] = test['微博中文内容'].str.replace(pat, rep, regex=True)
    for lit in ['《', '》', '【', '】', ' ']:
        test['微博中文内容'] = test['微博中文内容'].str.replace(lit, '', regex=False)
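An optional sanity check like the one below (not part of the original pipeline) confirms the cleaning and shows the class balance:

# Optional check: label distribution and a cleaned sample
print(train_labled['情感倾向'].value_counts())
print(train_labled['微博中文内容'].iloc[0])
print(test['微博中文内容'].head())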
# train_labled[['微博中文内容', '情感倾向']].to_csv('train.txt')
# Remove noise/meaningless characters from a single sentence
def del_reply_mark(sentence):
    output = re.sub(unuseful[0], '', sentence)
    for cont in unuseful[1:]:
        output = re.sub(cont, '', output)
    # for cont in stars[1:]:
    #     output = re.sub(cont, '', output)
    if output == "":
        output = "***"
    return output

# Normalize full-width punctuation and other characters
def rep_chn_punc(sentence):
    table = {ord(f): ord(t) for f, t in zip(
        u',。!?【】()%#@&1234567890①②③④⑤、·:[]():;',
        u',.!?....%#@&123456789012345,........')}
    output = sentence.translate(table).replace("...", "…").replace("《", "").replace("》", "").replace("℃", "度")\
        .replace("——", "").replace("..", "…").replace("「", "").replace("」", "").replace("....", "…")\
        .replace(".....", "…").replace("T T", "TT")\
        .replace("T_T", "TT")
    return output

# Tokenize with jieba and drop stop words; assumes a `stopwords` collection has been
# loaded elsewhere (this helper is not called in the pipeline below)
def chn_tokenize(sentence):
    line_list = jieba.lcut(sentence, HMM=True)
    out_str = ''
    for word in line_list:
        if word not in stopwords:
            if word != '\t':
                out_str += word
    return out_str
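The helpers above (del_reply_mark, rep_chn_punc, chn_tokenize) are defined but not wired into the pandas cleaning earlier; a quick call shows what the first two do. The sample strings here are invented purely for illustration:

# Illustration only: the sample strings are made up
print(del_reply_mark('今天跑了5km O网页链接 #武汉加油#'))
print(rep_chn_punc('今天38℃,(太热了)!'))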


# 5-fold stratified split; each fold's train/valid parts are written as tab-separated files
folds = 5
sfolder = StratifiedKFold(n_splits=folds, random_state=1, shuffle=True)
train_labled = train_labled[['微博中文内容', '情感倾向']]
fold=0
for train_index, valid_index in sfolder.split(train_labled['微博中文内容'], train_labled['情感倾向']):
    train = train_labled.iloc[train_index.tolist()]
    valid = train_labled.iloc[valid_index.tolist()]
    train.to_csv('train_' + str(fold) + '.txt', index=False, header=False, sep='\t')
    valid.to_csv('valid_' + str(fold) + '.txt', index=False, header=False, sep='\t')
    fold +=1
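To confirm the stratified split kept the label ratio in each fold, one of the written files can be read back (optional check):

# Optional check: label balance in one fold file
fold_check = pd.read_csv('train_0.txt', sep='\t', header=None, names=['text', 'label'])
print(fold_check['label'].value_counts(normalize=True))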
class MyDataset(BaseNLPDataset):
    """Three-class Weibo sentiment dataset for one fold."""
    def __init__(self, train_file_path="train_0.txt", dev_file_path="valid_0.txt"):
        # Directory holding the fold files written above
        self.dataset_dir = "./"
        super(MyDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file=train_file_path,
            dev_file=dev_file_path,
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # Label set of the dataset
            label_list=["-1", "0", "1"])


# One fold is fine-tuned per run: set p_idx to the fold to train (the ensemble at the
# end loads proba0/1/2, so the notebook was run three times with p_idx = 0, 1, 2)
p_idx = 2
for fold in range(0, folds):
    if fold != p_idx:
        continue
    # Load the ERNIE pre-trained module and the fine-tuning strategy
    module = hub.Module(name="ernie")
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01,
        warmup_proportion=0.1,
        learning_rate=5e-5)
    # Test texts to predict once fine-tuning finishes
    data = test[['微博中文内容']].fillna(' ').values.tolist()
    dataset = MyDataset(train_file_path='train_' + str(fold) + '.txt', dev_file_path='valid_' + str(fold) + '.txt')
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path(),
        max_seq_len=170)
    inv_label_map = {val: key for key, val in reader.label_map.items()}
    config = hub.RunConfig(
        use_cuda=True,
        num_epoch=3,
        checkpoint_dir="model_"+str(fold),
        batch_size=64,
        eval_interval=500,
        strategy=strategy)
    
    # Build the ERNIE program; max_seq_len must match the ClassifyReader above (170)
    inputs, outputs, program = module.context(trainable=True, max_seq_len=170)
    pooled_output = outputs["pooled_output"]

    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    cls_task = hub.TextClassifierTask(
            data_reader=reader,
            feature=pooled_output,
            feed_list=feed_list,
            num_classes=dataset.num_labels,
            config=config,
            metrics_choices=["f1"])
    
    run_states = cls_task.finetune_and_eval()
    # Predict the test set with the fine-tuned model and keep the class probabilities
    run_states = cls_task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    try:
        proba += np.vstack([r[0] for r in results]) / 5
    except NameError:
        proba = np.vstack([r[0] for r in results]) / 5

# Convert the probabilities of this single-fold run into label strings
prediction = list(np.argmax(proba, axis=1))
prediction = [inv_label_map[p] for p in prediction]
        
submission = pd.DataFrame()
submission['id'] = test['微博id'].values
submission['id'] = submission['id'].astype(str) + ' '
submission['y'] = prediction
np.save('proba' + str(p_idx) +'.npy', proba)
submission.to_csv('result.csv', index=False)
submission['text'] = test[['微博中文内容']].fillna(' ').values
submission['label'] = submission['y'].map({'-1': '消极', '0': '中性', '1': '积极'})  # labels are strings, so map on string keys
result = pd.read_csv('result.csv')
result.isna().sum()
len(result)
sub = pd.read_csv('/home/aistudio/data/data22724/submit_example.csv')
result['id'] = sub['id']
result.to_csv('result.csv', index=False)
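# Ensemble: the notebook was run three times with p_idx = 0, 1, 2; each run saved probaN.npy,
# and the three probability matrices are summed before the final argmax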
proba0 = np.load('proba0.npy')
proba1 = np.load('proba1.npy')
proba2 = np.load('proba2.npy')
proba = proba0 + proba1 + proba2
prediction = list(np.argmax(proba, axis=1))
prediction = [inv_label_map[p] for p in prediction]
        
submission = pd.DataFrame()
submission['id'] = test['微博id'].values
submission['id'] = submission['id'].astype(str) + ' '
submission['y'] = prediction
# np.save('proba' + str(p_idx) + '.npy', proba)  # leftover from the per-fold block above; would overwrite proba2.npy with the ensemble sum
submission.to_csv('result.csv', index=False)

submission['text'] = test[['微博中文内容']].fillna(' ').values
submission['label'] = submission['y'].map({'-1': '消极', '0': '中性', '1': '积极'})
result = pd.read_csv('result.csv')
result.isna().sum()   # quick NA check (shown when run in a notebook)
len(result)
# Align the ids with the official sample submission before the final write
sub = pd.read_csv('/home/aistudio/data/data22724/submit_example.csv')
result['id'] = sub['id']
result.to_csv('result.csv', index=False)

3. Summary

Over this break I studied PaddlePaddle, Python, and Linux. Honestly I have only scratched the surface: I picked up some Python basics, but I am not yet very familiar with how PaddleHub is applied and cannot yet reason through this kind of code completely on my own. I need to keep studying, find more resources, and turn new and unfamiliar knowledge into knowledge of my own.
