用sklearn进行文本特征提取
最近在做微博的转发预测,给的数据只有用户id,微博id,微博文本,和t时刻的转发、评论、点赞数。
考虑从文本着手来寻找特征。百度了一下大概方法有jieba分词和sklearn,这篇就是记录一下如何提取微博文本特征,顺便进一步学习sklearn中的各种方法。
参考sklearn user guide: http://scikit-learn.org/stable/modules/feature_extraction.html
# coding: utf-8
"""Extract TF-IDF features from Weibo post text and predict repost, comment
and like counts, fitting one logistic-regression model per target."""
import numpy as np
import jieba
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


def weibo_text_list(train_path, predict_path, text_path):
    """Collect the post text of every row in the train and predict files.

    Parameters
    ----------
    train_path : training file; each whitespace-split line carries the
        repost/comment/like labels in columns 4-6 and the post text from
        column 7 on.
    predict_path : predict file; the post text starts at column 4.
    text_path : output path — every collected text line is echoed here.

    Returns
    -------
    (text_list, train_num, predict_num, y_repost, y_comment, y_like)
    where ``text_list`` holds the training texts followed by the predict
    texts, and the ``y_*`` lists are the (string) label columns.
    """
    text_list = []
    y_repost, y_comment, y_like = [], [], []
    train_num = predict_num = 0
    # 'with' guarantees the handles are closed and flushed
    # (the original returned without closing any of them).
    with open(train_path) as trainf, \
            open(predict_path) as predictf, \
            open(text_path, 'w') as textf:
        for line in trainf:
            train_num += 1
            fields = line.split()
            y_repost.append(fields[4])
            y_comment.append(fields[5])
            y_like.append(fields[6])
            # Keep the original " word word ..." format (leading space).
            text_str = "".join(" " + t for t in fields[7:])
            text_list.append(text_str)
            textf.write(text_str + "\n")
        for line in predictf:
            predict_num += 1
            fields = line.split()
            text_str = "".join(" " + t for t in fields[4:])
            text_list.append(text_str)
            textf.write(text_str + "\n")
    return text_list, train_num, predict_num, y_repost, y_comment, y_like


def train(text, train_n, predict_n, y_re, y_com, y_lik):
    """Compute per-post TF-IDF weights, fit one LogisticRegression per
    target (repost / comment / like) and write predictions to result.txt.

    Parameters
    ----------
    text : list of post texts, training rows first (see weibo_text_list).
    train_n : number of training rows at the head of ``text``.
    predict_n : number of predict rows at the tail of ``text``.
    y_re, y_com, y_lik : training labels (strings) for the three targets.
    """
    print("segregate every line in train and predict data...")
    seg_list = []
    for t in text:
        # jieba accurate-mode segmentation; drop separator-like tokens.
        words = [w for w in jieba.cut(t, cut_all=False)
                 if w not in ("\t", ",", "/")]
        # Same "\tword\tword..." layout the original produced.
        seg_list.append("".join("\t" + w for w in words))

    print("count TF-IDF of segregated words...")
    # CountVectorizer builds the term-frequency matrix (a[i][j] = count of
    # term j in document i); TfidfTransformer turns it into TF-IDF weights.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tf_array = vectorizer.fit_transform(seg_list)
    tfidf = transformer.fit_transform(tf_array)
    # Dense matrix: weight[i][j] is the TF-IDF weight of term j in post i.
    # NOTE(review): toarray() may be very large for real corpora — the
    # sparse matrix could be passed to LogisticRegression directly.
    weight = tfidf.toarray()

    print("logistic regression...")
    # BUG FIX: the original sliced with the module-global `train_num`
    # instead of the `train_n` parameter.
    train_feature = weight[:train_n, :]
    pred_feature = weight[train_n:, :]

    # sklearn expects 1-D label arrays; the original's (n, 1) reshape
    # triggers a DataConversionWarning.
    print("predict repost...")
    clf1 = LogisticRegression()
    clf1.fit(train_feature, np.array(y_re))
    repost_pred = clf1.predict(pred_feature)

    print("predict comment...")
    clf2 = LogisticRegression()
    clf2.fit(train_feature, np.array(y_com))
    # BUG FIX: the original predicted with clf1 here, discarding clf2.
    comment_pred = clf2.predict(pred_feature)

    print("predict like...")
    clf3 = LogisticRegression()
    clf3.fit(train_feature, np.array(y_lik))
    # BUG FIX: the original predicted with clf1 here too, discarding clf3.
    like_pred = clf3.predict(pred_feature)

    # BUG FIX: write ONE prediction per row. The original concatenated the
    # entire prediction arrays into every output line (str + ndarray is a
    # TypeError) and never closed `predf` either.
    with open("weibo_predict_data.txt") as predf, \
            open("result.txt", "w") as result:
        for i, line in enumerate(predf):
            fields = line.split()
            result.write("%s\t%s\t%s,%s,%s\n"
                         % (fields[0], fields[1],
                            repost_pred[i], comment_pred[i], like_pred[i]))
    return 0


if __name__ == "__main__":
    train_path = "weibo_train_data.txt"
    predict_path = "weibo_predict_data.txt"
    text_path = "weibo_text.txt"
    text_list, train_num, predict_num, y_repost, y_comment, y_like = \
        weibo_text_list(train_path, predict_path, text_path)
    end = train(text_list, train_num, predict_num,
                y_repost, y_comment, y_like)
浙公网安备 33010602011771号