Task04: Paper Category Classification

Task4: Paper Category Classification (3 days) Link

  • Learning topic: paper category classification (a data-modeling task): build a model on the existing data and classify new papers;
  • Learning content: classify papers into categories using their titles (the code below also concatenates the abstracts);
  • Learning outcome: learn the basics of text classification, TF-IDF, etc.;
# Import the required packages
import seaborn as sns            # plotting
from bs4 import BeautifulSoup    # for scraping the arXiv data
import re                        # regular expressions, for matching string patterns
import requests                  # network requests, fetching data by URL
import json                      # our data is stored in JSON format
import pandas as pd              # data processing and analysis
import matplotlib.pyplot as plt  # plotting
import time
json_filename = 'D:/BaiduNetdiskDownload/archive/arxiv-metadata-oai-snapshot.json'
data = []

# Advantages of the with statement: 1. the file handle is closed automatically;
# 2. exceptions raised while reading are handled cleanly
with open(json_filename, 'r') as f:
    for idx, line in enumerate(f):
        d = json.loads(line)
        # keep only the fields needed for this task
        d = {'title': d['title'], 'categories': d['categories'], 'abstract': d['abstract']}
        data.append(d)

        # only load a subset of the data
        if idx > 200000:
            break

data = pd.DataFrame(data)  # convert the list to a DataFrame so pandas can be used for analysis
# combine title and abstract into a single text field
# (a space is inserted so the last word of the title cannot fuse with the abstract)
data['text'] = data['title'] + ' ' + data['abstract']
data['text'] = data['text'].apply(lambda x: x.replace('\n', ' '))  # strip newlines
data['text'] = data['text'].apply(lambda x: x.lower())             # lowercase
data = data.drop(['abstract', 'title'], axis=1)
# full category lists, including sub-categories
data['categories'] = data['categories'].apply(lambda x: x.split(' '))

# top-level categories only, sub-categories stripped (e.g. 'cs.CL' -> 'cs')
data['categories_big'] = data['categories'].apply(lambda x: [xx.split('.')[0] for xx in x])
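As a quick sanity check (my own sketch, not in the original post), we can count how many distinct top-level categories survive the split above; the classifiers below predict one binary output per category found here.

# A sanity-check sketch (not in the original post): count the distinct
# top-level categories. The number found here (19 in this run) is the
# number of output columns the classifiers below predict.
from collections import Counter
cat_counter = Counter(c for cats in data['categories_big'] for c in cats)
print(len(cat_counter))            # number of distinct top-level categories
print(cat_counter.most_common(5))  # the five most frequent, with their counts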
# encode the category lists as a binary indicator matrix
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'])
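MultiLabelBinarizer turns each list of categories into a fixed-length 0/1 vector with one column per class. A small inspection sketch (my addition):

# Column i of data_label corresponds to mlb.classes_[i]; the class
# indices 0-18 in the classification report below follow this ordering.
print(data_label.shape)  # (n_samples, n_classes)
print(mlb.classes_)      # sorted top-level category names, e.g. 'astro-ph', 'cs', ...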
# TF-IDF features over the 4000 most frequent terms
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'])
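TF-IDF weights each term by how often it occurs in a document (term frequency), scaled down for terms that occur in many documents (inverse document frequency). With scikit-learn's defaults this is tf-idf(t, d) = tf(t, d) * idf(t), where idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each row is then L2-normalised. A quick shape check (my addition, not in the original post):

# Each paper is now a sparse 4000-dimensional TF-IDF vector.
print(data_tfidf.shape)  # (n_samples, 4000)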
# split into training and validation sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_tfidf, data_label,
                                                    test_size=0.2, random_state=1)

# Build the multi-label classification model:
# MultiOutputClassifier fits one independent MultinomialNB per label column
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(x_test)))
             precision    recall  f1-score   support

          0       0.95      0.85      0.90      7872
          1       0.85      0.78      0.81      7329
          2       0.77      0.72      0.74      2970
          3       0.00      0.00      0.00         2
          4       0.72      0.47      0.57      2149
          5       0.51      0.67      0.58       993
          6       0.89      0.35      0.50       538
          7       0.71      0.68      0.70      3657
          8       0.75      0.62      0.68      3382
          9       0.85      0.88      0.86     10809
         10       0.41      0.11      0.18      1796
         11       0.80      0.04      0.07       737
         12       0.44      0.33      0.38       540
         13       0.52      0.34      0.41      1070
         14       0.70      0.15      0.25      3435
         15       0.83      0.19      0.31       687
         16       0.88      0.18      0.30       249
         17       0.89      0.43      0.58      2565
         18       0.79      0.36      0.49       689

avg / total       0.80      0.65      0.69     51469
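Once fitted, the same vectorizer and classifier can label unseen papers. A minimal prediction sketch (my addition; the sample text below is made up for illustration):

# A minimal prediction sketch (not in the original post).
# The input text is a made-up example, preprocessed like the training data.
new_text = ['a survey of deep learning methods for image classification']
new_tfidf = vectorizer.transform(new_text)  # reuse the fitted vocabulary
pred = clf.predict(new_tfidf)               # one 0/1 vector per sample
print(mlb.inverse_transform(pred))          # map back to category names, e.g. [('cs',)]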
# Second approach: a deep-learning model on raw token sequences.
# Re-split, this time keeping the raw text instead of the TF-IDF features
# (same test_size and random_state, so the split matches the one above).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['text'], data_label,
                                                    test_size=0.2, random_state=1)

# parameters
max_features = 500
max_len = 150
embed_size = 100
batch_size = 128
epochs = 5

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

# map each word to an integer id, keeping only the max_features most frequent words
tokens = Tokenizer(num_words=max_features)
tokens.fit_on_texts(list(x_train) + list(x_test))

x_sub_train = tokens.texts_to_sequences(x_train)
x_sub_test = tokens.texts_to_sequences(x_test)

# pad/truncate every sequence to exactly max_len tokens
x_sub_train = sequence.pad_sequences(x_sub_train, maxlen=max_len)
x_sub_test = sequence.pad_sequences(x_sub_test, maxlen=max_len)
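Each text is now a fixed-length integer sequence: words outside the max_features most frequent are dropped, and shorter sequences are zero-padded (at the front, by pad_sequences' default). A quick check (my addition):

print(x_sub_train.shape)    # (n_train, 150)
print(x_sub_train[0][:10])  # first ten token ids of the first sample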
# LSTM model
# Keras layers:
from tensorflow.keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU
from tensorflow.keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, concatenate, SpatialDropout1D
# Keras callbacks:
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
sequence_input = Input(shape=(max_len, ))
# trainable=False freezes the randomly initialised embeddings
# (they appear as non-trainable parameters in the summary below)
x = Embedding(max_features, embed_size, trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# sigmoid (not softmax) outputs: this is multi-label, so each of the
# 19 top-level categories is predicted independently
preds = Dense(19, activation="sigmoid")(x)
model = Model(sequence_input, preds)
# binary_crossentropy matches the per-label sigmoid outputs
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])
#model.fit(x_sub_train, y_train, batch_size=batch_size, epochs=epochs)
model.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 100)     50000       input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 150, 100)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 150, 256)     176640      spatial_dropout1d[0][0]          
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 148, 64)      49216       bidirectional[0][0]              
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 64)           0           conv1d[0][0]                     
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 64)           0           conv1d[0][0]                     
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 128)          0           global_average_pooling1d[0][0]   
                                                                 global_max_pooling1d[0][0]       
__________________________________________________________________________________________________
dense (Dense)                   (None, 19)           2451        concatenate[0][0]                
==================================================================================================
Total params: 278,307
Trainable params: 228,307
Non-trainable params: 50,000
__________________________________________________________________________________________________
model.fit(x_sub_train, y_train, batch_size=batch_size, epochs=epochs)
Epoch 1/5
1251/1251 [==============================] - 2021s 2s/step - loss: 0.2254 - accuracy: 0.3126
Epoch 2/5
1251/1251 [==============================] - 2283s 2s/step - loss: 0.1701 - accuracy: 0.4879
Epoch 3/5
1251/1251 [==============================] - 2305s 2s/step - loss: 0.1416 - accuracy: 0.5835
Epoch 4/5
1251/1251 [==============================] - 3886s 3s/step - loss: 0.1306 - accuracy: 0.6236
Epoch 5/5
1251/1251 [==============================] - 2276s 2s/step - loss: 0.1238 - accuracy: 0.6444
<tensorflow.python.keras.callbacks.History at 0x1fc82f91390>
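The post ends after training. For completeness, a hedged sketch (my addition) of how this model could be scored on the held-out split, thresholding the sigmoid outputs at 0.5 so the same multi-label report as the TF-IDF baseline applies:

# An evaluation sketch (not in the original post): binarise the sigmoid
# outputs at 0.5 and score with the same multi-label report as above.
from sklearn.metrics import classification_report
y_pred = (model.predict(x_sub_test) > 0.5).astype(int)
print(classification_report(y_test, y_pred))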