# 1. 读取数据
# Load the SMS corpus from a tab-separated file: column 0 is the label,
# column 1 is the message text. Each message is run through `preprocessing`
# as it is read.
# NOTE: `preprocessing` must already be defined when this section executes.
file_path = r'C:\Users\jiqixuexi\SMSSpamCollection'
sms_data = []   # preprocessed message texts
sms_label = []  # labels, parallel to sms_data

# newline='' is what the csv module recommends for files passed to csv.reader.
with open(file_path, 'r', encoding='utf-8', newline='') as sms:
    # QUOTE_NONE: treat double quotes inside messages as ordinary characters,
    # so an unbalanced quote cannot merge several lines into one record.
    csv_reader = csv.reader(sms, delimiter='\t', quoting=csv.QUOTE_NONE)
    for line in csv_reader:
        if len(line) < 2:
            # Blank or malformed row: no label/message pair to keep.
            continue
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))
# 2. 数据预处理
def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the WordNet part-of-speech constant.

    Only the first character of the tag is inspected.

    Args:
        treebank_tag: Tag string, e.g. the second element of a pair
            returned by ``nltk.pos_tag``.

    Returns:
        ``wordnet.ADJ`` for tags starting with 'J', ``wordnet.VERB`` for 'V',
        ``wordnet.ADV`` for 'R', and ``wordnet.NOUN`` for everything else
        (including 'N' tags and the empty string).
    """
    wordnet = nltk.corpus.wordnet
    by_initial = {
        'J': wordnet.ADJ,   # adjective
        'V': wordnet.VERB,  # verb
        'R': wordnet.ADV,   # adverb
    }
    # Nouns and any unrecognised tag both fall back to NOUN.
    return by_initial.get(treebank_tag[:1], wordnet.NOUN)
def preprocessing(text):
    """Tokenise, filter and lemmatise one message.

    Steps: sentence/word tokenisation, lowercasing, removal of English stop
    words and of tokens shorter than three characters, POS tagging, and
    POS-aware WordNet lemmatisation.

    Args:
        text: Raw message text.

    Returns:
        The surviving lemmas joined by single spaces, so that a downstream
        vectoriser can still split the result into words. An empty string
        if no token survives filtering.
    """
    # Lowercase before the stop-word test; otherwise capitalised stop words
    # such as "The" slip through (the NLTK list is expected to be lowercase).
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    stops = set(stopwords.words('english'))  # set: O(1) membership tests
    tokens = [tok for tok in tokens if len(tok) >= 3 and tok not in stops]

    tagged = nltk.pos_tag(tokens)  # list of (token, tag) pairs
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in tagged
    ]
    # Join with a space: an empty separator would fuse the whole message
    # into a single unusable "word".
    return ' '.join(lemmas)
# 3. 数据划分——训练集和测试集划分
# Split messages and labels into training and test sets (20% held out),
# stratified on the label so both sets keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    sms_data, sms_label, test_size=0.2, stratify=sms_label
)

# Report the size of the full set and of each partition.
print('总集:', len(sms_label))
print('训练集:', len(x_train))
print('测试集:', len(y_test))
浙公网安备 33010602011771号