# 基于LSTM的垃圾短信分类 (LSTM-based SMS spam classification)

import csv

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the SMSSpamCollection dataset: a tab-separated file without a header
# row, read here as two columns, 'label' ("ham" or "spam") and 'message'.
# Message text is free-form and may contain stray double quotes. With the
# default quoting pandas would treat such a quote as the start of a quoted
# field and merge the following lines into one row, so quoting is disabled.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

# Encode the labels as a binary target: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
# map() turns any label outside the mapping into NaN, which would silently
# corrupt training; fail early with a clear message instead.
if df['label'].isna().any():
    raise ValueError(
        "SMSSpamCollection contains labels other than 'ham' and 'spam'"
    )

# Split into training and test sets. The split is stratified on the label so
# both sets keep the same ham/spam ratio; random_state keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    random_state=1,
    stratify=df['label'],
)

# Text preprocessing settings: the word-index cap handed to the tokenizer and
# the fixed length every encoded message is padded/truncated to.
vocab_size, max_len = 5000, 100

# Fit the tokenizer on the training messages only, so the test messages do
# not influence the learned word index.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer sequences with the same fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to length max_len, using padding='post'.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

# Build the LSTM model: Embedding -> LSTM(64) -> Dense(1, sigmoid).
# mask_zero=True makes the Embedding layer treat index 0 as padding, so the
# LSTM skips the zeros that pad_sequences(..., padding='post') appends.
# Without the mask, the LSTM's final state for a short message is computed
# only after a long run of padding steps, which hurts learning.
# The deprecated `input_length` argument is intentionally not passed; the
# sequence length is taken from the input data.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=64,
        mask_zero=True,
    ),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model for binary classification (single sigmoid output).
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train the model.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are not independent of the final test score.
# Consider carving a separate validation set out of the training data.
model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
    verbose=1,
)

# Evaluate on the test set. evaluate() returns the loss followed by the
# compiled metrics; only the second entry (accuracy) is reported.
test_metrics = model.evaluate(X_test_pad, y_test, verbose=0)
accuracy = test_metrics[1]
print(f"Accuracy: {accuracy:.2f}")