用sklearn进行文本特征提取

最近在做微博的转发预测,给的数据只有用户id,微博id,微博文本,和t时刻的转发、评论、点赞数。

考虑从文本着手来寻找特征。百度了一下大概方法有jieba分词和sklearn,这篇就是记录一下如何提取微博文本特征的方法,顺便进一步学习sklearn中的各种方法。

参考sklearn user guide: http://scikit-learn.org/stable/modules/feature_extraction.html

 

# coding: utf-8
import numpy as np
import jieba
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
import sys
reload(sys)
sys.setdefaultencoding('utf8')

def weibo_text_list(train_path, predict_path, text_path):
    """Collect the text of every train and predict row and dump it to text_path.

    Train rows: columns 4-6 hold the repost/comment/like counts and columns
    7+ the weibo text.  Predict rows: columns 4+ hold the text.  Each text is
    stored as " tok1 tok2 ..." (note the leading space, kept for downstream
    compatibility), train texts first, then predict texts.

    Returns:
        (text_list, train_num, predict_num, y_repost, y_comment, y_like)
        where the y_* lists contain the raw count strings of the train rows.
    """
    text_list = []
    y_repost, y_comment, y_like = [], [], []
    train_num = 0
    predict_num = 0
    # BUG FIX: the original opened all three files and never closed any of
    # them; use context managers so they are closed even on error.
    with open(train_path) as trainf, open(predict_path) as predictf, \
            open(text_path, 'w') as textf:
        for line in trainf:
            train_num += 1
            fields = line.split()
            y_repost.append(fields[4])
            y_comment.append(fields[5])
            y_like.append(fields[6])
            # join instead of repeated +=; keeps the original leading " ".
            text_str = "".join(" " + t for t in fields[7:])
            text_list.append(text_str)
            textf.write(text_str + "\n")

        for line in predictf:
            predict_num += 1
            fields = line.split()
            text_str = "".join(" " + t for t in fields[4:])
            text_list.append(text_str)
            textf.write(text_str + "\n")

    return text_list, train_num, predict_num, y_repost, y_comment, y_like


# 计算训练集中每条微博中词的TF-IDF权值,并将其作为特征


def train(text, train_n, predict_n, y_re, y_com, y_lik):
    """Build TF-IDF features over all texts, fit one logistic-regression
    model per target (repost / comment / like) on the first train_n rows,
    predict the remaining rows and write the results to result.txt.

    Parameters:
        text      -- list of raw text strings: the train_n training texts
                     first, followed by the predict_n prediction texts
        train_n   -- number of training rows at the head of `text`
        predict_n -- number of prediction rows at the tail of `text`
        y_re, y_com, y_lik -- per-training-row label strings

    Side effects: reads weibo_predict_data.txt, writes result.txt.
    Returns 0.
    """
    # NOTE: the original also opened (and truncated) train_feature.txt and
    # predict_feature.txt without ever writing to them; that dead code and
    # the commented-out np.loadtxt calls have been removed.

    # --- tokenize every text with jieba, tab-separating the tokens ---
    print("segregate every line in train and predict data...")
    seg_list = []
    for t in text:
        s_line = ''
        for s in jieba.cut(t, cut_all=False):
            s = s.encode()  # Python 2: unicode -> utf-8 str (setdefaultencoding above)
            if s != "\t" and s != ',' and s != '/':
                s_line = s_line + "\t" + s
        seg_list.append(s_line)

    print("count TF-IDF of segregated words...")
    # CountVectorizer builds the term-frequency matrix (a[i][j] = frequency
    # of word j in document i); TfidfTransformer rescales it to TF-IDF.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tf_array = vectorizer.fit_transform(seg_list)
    tfidf = transformer.fit_transform(tf_array)
    # Dense TF-IDF matrix: row i = document i, column j = word j.
    weight = tfidf.toarray()

    print("logistic regression...")
    # BUG FIX: the original sliced with the *global* `train_num` instead of
    # the `train_n` parameter, so the function only worked when run as the
    # top-level script.
    train_feature = weight[:train_n, :]
    pred_feature = weight[train_n:, :]
    # Keep labels 1-D: sklearn expects y of shape (n_samples,), not (n, 1).
    y_rep = np.array(y_re)
    y_com = np.array(y_com)
    y_lik = np.array(y_lik)

    print("predict repost...")
    clf1 = LogisticRegression()
    clf1.fit(train_feature, y_rep)
    repost_pred = clf1.predict(pred_feature)
    print("predict comment...")
    clf2 = LogisticRegression()
    clf2.fit(train_feature, y_com)
    # BUG FIX: the original predicted with clf1 (the repost model) here.
    comment_pred = clf2.predict(pred_feature)
    print("predict like...")
    clf3 = LogisticRegression()
    clf3.fit(train_feature, y_lik)
    # BUG FIX: the original predicted with clf1 here as well.
    like_pred = clf3.predict(pred_feature)

    # BUG FIX: the original concatenated the whole prediction *arrays* into
    # the output string (a TypeError at runtime); write the i-th prediction
    # for the i-th predict row instead, and close both files.
    with open("weibo_predict_data.txt") as predf, open("result.txt", "w") as result:
        for i, line in enumerate(predf):
            fields = line.split()
            result.write(fields[0] + "\t" + fields[1] + "\t"
                         + str(repost_pred[i]) + ","
                         + str(comment_pred[i]) + ","
                         + str(like_pred[i]) + "\n")
    return 0

if __name__ == "__main__":
    # Input data files (competition format) and the dump of extracted texts.
    train_path = "weibo_train_data.txt"
    predict_path = "weibo_predict_data.txt"
    text_path = "weibo_text.txt"
    # NOTE(review): these two feature paths are never used here or passed
    # anywhere; train() hard-codes the same file names itself.
    train_feature_path = "train_feature.txt"
    test_feature_path = "predict_feature.txt"
    # NOTE: train() reads the *global* `train_num` bound by this unpacking
    # (instead of its own train_n parameter), so this name must not change.
    text_list, train_num, predict_num, y_repost, y_comment, y_like = weibo_text_list(train_path, predict_path, text_path)
    end = train(text_list, train_num, predict_num, y_repost, y_comment, y_like)

 

posted @ 2015-09-23 23:23  hanahana  阅读(320)  评论(0)    收藏  举报