An introduction to text sentiment analysis with Keras: testing sentiment classification with MLP, RNN, and LSTM models

# coding: utf-8

# In[1]:


import urllib.request
import os
import tarfile


# In[2]:


# Download the IMDb dataset if it is not already present, then unpack it
url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="example/data/aclImdb_v1.tar.gz"
os.makedirs("example/data", exist_ok=True)
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)
if not os.path.exists("example/data/aclImdb_v1/aclImdb"):
    tfile = tarfile.open(filepath, 'r:gz')
    result=tfile.extractall('example/data/aclImdb_v1/')

# In[3]:


from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer


# In[4]:


# Strip HTML tags (e.g. <br />) from the raw review text
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

# In[5]:


import os
def read_files(filetype):
    path = "example/data/aclImdb_v1/aclImdb/"
    file_list=[]

    positive_path=path + filetype+"/pos/"
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]

    negative_path=path + filetype+"/neg/"
    for f in os.listdir(negative_path):
        file_list+=[negative_path+f]

    print('read',filetype, 'files:',len(file_list))
    # positive files come first in file_list: 12500 pos + 12500 neg per split
    all_labels = ([1] * 12500 + [0] * 12500)

    all_texts  = []
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]

    return all_labels,all_texts

# In[6]:


y_train,train_text=read_files("train")


# In[7]:


y_test,test_text=read_files("test")


# In[8]:


train_text[0]


# In[9]:


y_train[0]


# In[10]:


train_text[12500]


# In[11]:


y_train[12500]

# In[12]:


token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)


# In[13]:


print(token.document_count)
print(token.word_index)


# In[14]:


x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)


# In[15]:


print(x_train_seq[0])


# In[16]:


x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen=100)


# In[17]:


x_train[0]

# In[18]:


from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units=256,
                activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1,
                activation='sigmoid'))
model.summary()


# In[19]:


model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)

# In[20]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


# In[21]:


show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')

# In[22]:


scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]


# In[23]:


probability=model.predict(x_test)


# In[24]:


probability[:10]


# In[25]:


probability[12500:12510]


# In[26]:


predict=model.predict_classes(x_test)


# In[27]:


predict_classes=predict.reshape(-1)


# In[28]:


SentimentDict={1:'positive',0:'negative'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('label:',SentimentDict[y_test[i]],
          'prediction:',SentimentDict[predict_classes[i]])


# In[29]:


display_test_Sentiment(2)


# In[30]:


display_test_Sentiment(12505)

# In[31]:


from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.35))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256,activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1,activation='sigmoid'))
model.summary()


# In[32]:


model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)


# In[33]:


scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]

# In[34]:


from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dense(units=256,
                activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1,
                activation='sigmoid'))
model.summary()


# In[35]:


model.compile(loss='binary_crossentropy',
              #optimizer='rmsprop',
              optimizer='adam',
              metrics=['accuracy'])
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)


# In[36]:


show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]

The text comes from the IMDb movie review dataset. Download it, put it under the appropriate path, and begin.

Filter out HTML tags, since the raw reviews contain them:
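A quick sanity check of rm_tags on a made-up snippet (the input string here is purely illustrative):

rm_tags("I <b>loved</b> this film.<br /><br />Great cast.")
# -> 'I loved this film.Great cast.'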

Next, read in all the texts and their target labels, then build the word dictionary:
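To make the dictionary concrete, here is a minimal sketch of what fit_on_texts builds, using two invented sentences instead of the real reviews; the exact indices depend on word frequency, so treat the output as illustrative:

from keras.preprocessing.text import Tokenizer
toy = Tokenizer(num_words=2000)
toy.fit_on_texts(['a great great movie', 'a dull movie'])
print(toy.document_count)   # 2
print(toy.word_index)       # e.g. {'a': 1, 'great': 2, 'movie': 3, 'dull': 4}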


Convert the texts into sequences of integer indices:

Normalize every sequence to a fixed length of 100:
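Continuing the toy tokenizer above, a short sketch of both steps: texts_to_sequences replaces each known word with its dictionary index, and pad_sequences left-pads (or truncates) every sequence to the same length:

from keras.preprocessing import sequence
seqs = toy.texts_to_sequences(['a great movie'])
print(seqs)                                    # [[1, 2, 3]]
print(sequence.pad_sequences(seqs, maxlen=5))  # [[0 0 1 2 3]]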

Build the MLP model. The embedding layer turns each 100-index sequence into 100 vectors of 32 dimensions each, mapping every word into a multidimensional vector space so that words used in similar contexts can acquire related representations.
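As a rough sanity check on this architecture, the shapes and parameter counts that model.summary() should report can be worked out by hand:

# Embedding: 2000 words x 32 dims             ->  64,000 params, output (None, 100, 32)
# Flatten:   100 positions x 32 dims          ->   output (None, 3200), no params
# Dense 256: 3200 x 256 weights + 256 biases  -> 819,456 params
# Dense 1:   256 x 1 weights + 1 bias         ->     257 params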

Compile, train, plot the training history, and evaluate; the accuracy after evaluation:


Build the RNN model. For an introduction to RNNs, see: https://www.cnblogs.com/bai2018/p/10466418.html
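One way to see how lightweight this recurrent layer is: a SimpleRNN with 16 units reading 32-dimensional embeddings needs units * (input_dim + units + 1) parameters, which is easy to verify against model.summary():

# SimpleRNN: 16 * (32 + 16 + 1) = 784 params
#   (input weights 32x16, recurrent weights 16x16, 16 biases)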


Test evaluation:

Build the LSTM model. For background on LSTM, see: https://www.cnblogs.com/bai2018/p/10466497.html
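The LSTM layer keeps four sets of gate weights, so with 32 units over the same 32-dimensional embeddings the parameter count (again checkable against model.summary()) works out to:

# LSTM: 4 * 32 * (32 + 32 + 1) = 8320 params
#   (input weights, recurrent weights, and biases for each of the four gates)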


Accuracy:

