Investor Sentiment Calculation Based on Text Mining Techniques

While writing my graduation thesis I computed investor sentiment from stock-related text. I'm posting the code I used at the time here as a record.

# Import the required libraries

from collections import defaultdict
from pandas import DataFrame, Series
from datetime import datetime
import pandas as pd
import os
import re
import jieba
import codecs
 
# Read the sentiment dictionary file
sen_file=open('C:\\Users\\Administrator\\Desktop\\dictionary3.txt','r+',encoding='utf-8')
sen_list = sen_file.read().splitlines()
sen_dict = defaultdict()
for s in sen_list:
  s_split = s.split(',')
  if len(s_split) == 2:
    sen_dict[s_split[0]] = s_split[1]
 
# Read the negation word file
not_word_file = open('C:\\Users\\Administrator\\Desktop\\not_word.txt', 'r+', encoding='utf-8')
not_word_list = not_word_file.read().splitlines()
not_word_list[0] = "不"  # overwrite the first entry (it may carry a BOM from the file) with "不"
# Read the degree adverb file
degree_file = open('C:\\Users\\Administrator\\Desktop\\degree.txt', 'r+', encoding='utf-8')
degree_list = degree_file.read().splitlines()
degree_dic = defaultdict()
for d in degree_list:
  d_split = d.split(' ')
  if len(d_split) == 2:
    degree_dic[d_split[0]] = d_split[1]
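# For reference, a minimal sketch of the file formats the loading code above assumes
# (the concrete words and scores below are purely illustrative, not the actual file contents):
#   dictionary3.txt  -- one "word,score" pair per line, comma-separated, e.g.  利好,1  /  亏损,-1
#   not_word.txt     -- one negation word per line, e.g.  不  /  没有
#   degree.txt       -- one "word weight" pair per line, space-separated, e.g.  非常 2  /  稍微 0.5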
 
# Segment a sentence into words
def seg_word(sentence):
  seg_list = jieba.cut(sentence)
  seg_result = []
  for w in seg_list:
    seg_result.append(w)
  stopwords = set()
  fr = codecs.open('C:\\Users\\Administrator\\Desktop\\stopwords.txt', 'r', 'utf8')
  for word in fr:
    stopwords.add(word.strip())
  fr.close()
  return list(filter(lambda x: x not in stopwords, seg_result))
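# A quick check of the segmentation step; the output is only indicative, since the
# actual result depends on the jieba version and on the contents of stopwords.txt:
# print(seg_word('今天大盘涨得非常好'))
# -> e.g. ['今天', '大盘', '涨', '非常', '好']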

# Classify the segmented words into sentiment words, negation words, and degree adverbs

def classify_words(word_list):
  sen_word = dict()
  not_word = dict()
  degree_word = dict()
  for i in range(len(word_list)):
    word = word_list[i]
    if word in sen_dict.keys() and word not in not_word_list and word not in degree_dic.keys():
      sen_word[i] = sen_dict[word]
    elif word in not_word_list:
      not_word[i] = -1
    elif word in degree_dic.keys():
      degree_word[i] = degree_dic[word]
  sen_file.close()
  degree_file.close()
  not_word_file.close()
  return sen_word, not_word, degree_word
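# A hypothetical illustration of what classify_words returns, assuming '好' is in the
# sentiment dictionary with score 1, '不' in the negation list, and '非常' in the degree
# file with weight 2 (all three dictionaries are keyed by word position):
#   word_list   = ['大盘', '不', '非常', '好']
#   sen_word    -> {3: '1'}   position of the sentiment word and its score
#   not_word    -> {1: -1}    position of the negation word
#   degree_word -> {2: '2'}   position of the degree adverb and its weight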
 
# Compute the sentiment score from the sentiment words
def score_sentiment(sen_word, not_word, degree_word, seg_result):
  score = 0
  sentiment_index_list = list(sen_word.keys())
  for i in sentiment_index_list:
    W = 1
    n = i - 1
    if n in not_word and n not in degree_word.keys():  # a negation word sits right before the sentiment word
      W = W * (-1)
      m = n - 1
      if m in degree_word.keys():  # degree adverb before the negation word
        s = W * float(degree_word[m]) * float(sen_word[i])
      else:
        s = W * float(sen_word[i])
    else:
      if n in degree_word.keys():  # degree adverb right before the sentiment word
        s = W * float(degree_word[n]) * float(sen_word[i])
      else:
        s = float(sen_word[i])
    score = s + score
  return score
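# The scoring rule, traced on the hypothetical example above: for each sentiment word,
# look one position back -- a negation word flips the sign, and a degree adverb (directly
# before the sentiment word, or before the negation word) scales the score.
#   ['大盘', '不', '非常', '好']: position 2 is a degree adverb, so s = 1 * 2 * 1 = 2
#     (note the negation two positions back is not picked up by this rule)
#   ['大盘', '非常', '不', '好']: negation at position 2, degree adverb at 1, so s = -1 * 2 * 1 = -2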
 
# Compute the sentiment score of a comment
def sentiment_score(sentence):
  seg_list = seg_word(sentence)
  sen_word, not_word, degree_word = classify_words(seg_list)
  score = score_sentiment(sen_word, not_word, degree_word, seg_list)
  return score
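# End-to-end check on a single comment (the sentence is hypothetical; the sign and
# magnitude of the result depend entirely on the dictionary files used):
# print(sentiment_score('今天大盘涨得非常好'))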
 
# Feed the comment file into the calculation
path = 'C:\\Users\\Administrator\\Desktop\\comment-CSV.csv'
comment = pd.read_csv(path,encoding='gbk').astype(str)
review = comment['内容 2']
score = []
for r in review:
  score.append(sentiment_score(r))
comment['score'] = score
# count all comments and positive comments per day (assumes the comment CSV has a 'time' column)
total_daily = comment.groupby("time")[['score']].count()
total_daily.columns = ['total']
pos_daily = comment[comment['score'] > 0]
pos_daily = pos_daily.groupby("time")[['score']].count()
pos_daily.columns = ['positive']
df_merge = pd.merge(total_daily, pos_daily, on='time')
df_merge['sentiment'] = df_merge['positive'] /df_merge['total']
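# Note: the daily 'sentiment' index above is simply the share of positive comments,
# e.g. 30 comments with score > 0 out of 100 comments on a given day gives sentiment = 0.3.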
path = 'C:\\Users\\Administrator\\Desktop\\000300.csv'
stock = pd.read_csv(path,encoding='gbk').astype(str)
stock['time'] = pd.to_datetime(stock['time'])
df_merge = df_merge.reset_index()
df_merge['time'] = pd.to_datetime(df_merge['time'])
df_final = pd.merge(df_merge, stock, on='time', how = 'inner')
df_final.to_csv('C:\\Users\\Administrator\\Desktop\\data.csv')
 
The comments used here were crawled by myself, and the sentiment dictionary file is a compilation I collected from several sources.
I'll look into how to upload the files; anyone who needs them is welcome to take them.
The code is not entirely original, and the rules for computing the sentiment values of the sentiment words have been modified considerably.
Discussion is welcome.