from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas
import numpy
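# Text classification demo: a TF-IDF + Naive Bayes baseline and a CNN on pre-trained word embeddings.
# Each line of ./data/corpus is expected to be "<label> <text ...>" (whitespace-separated).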
def load_data():
with open('./data/corpus', encoding='utf-8') as f:
data = f.read()
labels, texts = [], []
    for line in data.split("\n"):
        content = line.split()
        if not content:  # skip blank lines (e.g. the trailing newline)
            continue
        labels.append(content[0])
        texts.append(" ".join(content[1:]))
trainDF = pandas.DataFrame({'label': labels, 'text': texts})
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.25, random_state=42)
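    # Encode the string labels as integers so they work with both sklearn and the Keras losses below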
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)
return trainDF, train_x, valid_x, train_y, valid_y
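# Build word-level, word n-gram, and character n-gram TF-IDF features and stack them into one sparse matrix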
def extract_feature(trainDF, train_x, valid_x):
    # TF-IDF (Term Frequency - Inverse Document Frequency) scores how important a word is to a document relative to the whole corpus
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)
# n-gram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)
# Combine all features (word-level, ngram, char-level)
xtrain_combined = hstack([xtrain_tfidf, xtrain_tfidf_ngram, xtrain_tfidf_ngram_chars])
xvalid_combined = hstack([xvalid_tfidf, xvalid_tfidf_ngram, xvalid_tfidf_ngram_chars])
return xtrain_combined, xvalid_combined
# Train a classifier and report its accuracy on the validation set
def train_model(classifier, feature_vector_train, label_train, feature_vector_valid, label_valid, is_neural_net=False):
classifier.fit(feature_vector_train, label_train)
predictions = classifier.predict(feature_vector_valid)
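    # Keras models return class probabilities; argmax converts them to predicted label indices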
if is_neural_net:
predictions = predictions.argmax(axis=-1)
return metrics.accuracy_score(label_valid, predictions)
# Turn texts into padded integer sequences and build the pre-trained embedding matrix
def prepare_embedding(train_texts, valid_texts, embedding_path='./data/wiki-news-300d-1M.vec', max_len=70):
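    # Fit the tokenizer on the training texts only, then convert both splits to fixed-length integer sequences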
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_texts))
train_seq = tokenizer.texts_to_sequences(train_texts)
valid_seq = tokenizer.texts_to_sequences(valid_texts)
train_seq = pad_sequences(train_seq, maxlen=max_len)
valid_seq = pad_sequences(valid_seq, maxlen=max_len)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
    # Load the pre-trained 300-dimensional word vectors
embedding_matrix = numpy.zeros((vocab_size, 300))
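    # Row i holds the pre-trained vector for the word with index i; words without a pre-trained vector stay all zeros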
    with open(embedding_path, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            idx = word_index.get(values[0])
            # Skip the .vec header line, malformed rows, and words outside the training vocabulary
            if idx is not None and len(values) == 301:
                embedding_matrix[idx] = numpy.asarray(values[1:], dtype='float32')
return train_seq, valid_seq, vocab_size, embedding_matrix
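# Build a 1D convolutional text classifier on top of frozen pre-trained embeddings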
def create_cnn(input_length, vocab_size, embedding_matrix, num_classes):
input_layer = layers.Input((input_length,))
    # Map each word index to its 300-dimensional word vector
embedding_layer = layers.Embedding(
input_dim=vocab_size,
output_dim=300,
weights=[embedding_matrix],
trainable=False
)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)  # randomly drop 30% of the embedding feature channels during training
    # Convolution layer: 100 filters of width 3 slide over the sequence;
    # each filter acts as a feature detector for a particular 3-gram pattern
    conv_layer = layers.Conv1D(100, 3, activation='relu')(embedding_layer)
    # Pooling layer: keep only the strongest response of each filter across the whole sequence
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
    dense_layer = layers.Dense(50, activation='relu')(pooling_layer)  # compress the 100 filter features down to 50 dimensions
dense_layer = layers.Dropout(0.25)(dense_layer)
    # Choose the output layer and loss according to the number of classes
    if num_classes == 2:  # binary classification: output a single probability with sigmoid
        output_layer = layers.Dense(1, activation='sigmoid')(dense_layer)
        loss = 'binary_crossentropy'
    else:  # multi-class: output a probability distribution over classes with softmax
        output_layer = layers.Dense(num_classes, activation='softmax')(dense_layer)
        loss = 'sparse_categorical_crossentropy'
    # Build and compile the model
model = models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=optimizers.Adam(), loss=loss, metrics=['accuracy'])  # Adam optimizer, the loss chosen above, accuracy as the evaluation metric
return model
if __name__ == "__main__":
    # 1. Load the dataset and split it into training and validation sets
train_DF, trainx, validx, trainy, validy = load_data()
    # 2. Extract text features (TF-IDF)
xtrainCombined, xvalidCombined = extract_feature(train_DF, trainx, validx)
    # 3. Train a Naive Bayes classifier
nb_accuracy = train_model(naive_bayes.MultinomialNB(), xtrainCombined, trainy, xvalidCombined, validy)
print(f"Naive Bayes Validation Accuracy: {nb_accuracy:.4f}")
    # 4. Prepare padded word sequences and the pre-trained embedding matrix
maxLen = 70
trainSeq, validSeq, vocabSize, embeddingMatrix = prepare_embedding(trainx, validx, max_len=maxLen)
numClasses = len(numpy.unique(trainy))
    # 5. Build a text CNN
    cnn_model = create_cnn(input_length=maxLen, vocab_size=vocabSize, embedding_matrix=embeddingMatrix, num_classes=numClasses)
    # 6. Train the CNN for 5 epochs
    cnn_model.fit(trainSeq, trainy, validation_data=(validSeq, validy), epochs=5, batch_size=32)
cnn_acc = cnn_model.evaluate(validSeq, validy)[1]
print(f"CNN Validation Accuracy: {cnn_acc:.4f}")