Text Classification with an RNN (Bidirectional LSTM)

The script below trains a binary sentiment classifier on the IMDB review dataset with Keras: reviews are cleaned with BeautifulSoup, tokenized, padded to a fixed length, embedded with pre-trained GloVe vectors, and fed through a bidirectional LSTM.
#!/usr/bin/env python
# coding=utf-8
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Dense, Input, Embedding, LSTM, Bidirectional
from sklearn.model_selection import train_test_split

MAX_SEQUENCE_LENGTH = 1000   # pad/truncate every review to 1000 tokens
MAX_NB_WORDS = 2000          # keep only the 2000 most frequent words
EMBEDDING_DIM = 100          # must match the GloVe file used below


def load_data(path='~/workspace/data/imdb/labeledTrainData.tsv'):
    """Load the IMDB review training data."""
    def clean_str(sent):
        sent = re.sub(r"\\|\'|\"", '', sent)
        return sent.strip().lower()

    data, label = [], []
    df = pd.read_csv(path, sep='\t')
    print(df.shape)
    for idx in range(df['review'].shape[0]):
        # Strip the HTML markup that IMDB reviews contain.
        text = BeautifulSoup(df['review'][idx], 'lxml').get_text()
        text = text.encode('ascii', 'ignore').decode('ascii')
        data.append(clean_str(text))
        label.append(int(df['sentiment'][idx]))
        if idx > 1000:   # only use the first ~1000 reviews for a quick demo
            break
    return data, label


def load_weights(fname, word_index):
    """Load pre-trained GloVe word vectors into an embedding matrix."""
    emb_weights = {}
    with open(fname) as fr:
        for line in fr:
            values = line.strip().split()
            emb_weights[values[0]] = np.asarray(values[1:], dtype='float32')
    # Words without a GloVe vector keep a random initialization.
    emb_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in emb_weights:
            emb_matrix[i] = emb_weights[word]
    return emb_matrix


texts, label = load_data()

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
label_onehot = to_categorical(label)

x_train, x_test, y_train, y_test = train_test_split(data, label_onehot, test_size=0.1)
print('Train class counts [neg, pos]: %s' % y_train.sum(axis=0))
print('Test class counts [neg, pos]: %s' % y_test.sum(axis=0))

emb_matrix = load_weights(fname='/home/jkmiao/workspace/data/glove/glove.6B.100d.txt',
                          word_index=word_index)
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM,
                            weights=[emb_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)   # fine-tune the GloVe vectors during training

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
bi_lstm = Bidirectional(LSTM(100))(embedded_sequences)
output = Dense(2, activation='softmax')(bi_lstm)

model = Model(inputs=sequence_input, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print('model fitting - Bidirectional LSTM')
model.summary()
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=32)

os.makedirs('model', exist_ok=True)
model.save('model/text_clf_lstm.h5')
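After training, the saved model can be reloaded for inference. The sketch below is a minimal, illustrative example and not part of the original script: sample_review is a made-up input, and it reuses the in-memory tokenizer from the training run, which a real deployment would need to persist (e.g. with pickle) alongside the model.

from keras.models import load_model

# Reload the trained model from disk (path matches the save call above).
clf = load_model('model/text_clf_lstm.h5')

# Hypothetical input; preprocessing must mirror the training pipeline.
sample_review = "A surprisingly good film with strong acting and a tight plot."
seq = tokenizer.texts_to_sequences([sample_review])
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

probs = clf.predict(padded)[0]   # [P(negative), P(positive)]
print('positive' if probs.argmax() == 1 else 'negative', probs)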
A small step every day adds up to a big step in life! Good luck~