Investor Sentiment Calculation Based on Text Mining Techniques

While writing my graduation thesis I computed investor sentiment from stock-related text. I'm posting the code I used at the time here as a record.

# Import the required libraries

from collections import defaultdict
from pandas import DataFrame, Series
from datetime import datetime
import pandas as pd
import os
import re
import jieba
import codecs
 
# Read the sentiment dictionary file
sen_file=open('C:\\Users\\Administrator\\Desktop\\dictionary3.txt','r+',encoding='utf-8')
sen_list = sen_file.read().splitlines()
sen_dict = defaultdict()
for s in sen_list:
  s_split = s.split(',')
  if len(s_split) == 2:
    sen_dict[s_split[0]] = s_split[1]
 
# Read the negation word file
not_word_file = open('C:\\Users\\Administrator\\Desktop\\not_word.txt', 'r+', encoding='utf-8')
not_word_list = not_word_file.read().splitlines()
not_word_list[0] = "不"  # overwrite the first entry (it may carry a BOM from the file) with "不"
# Read the degree adverb file
degree_file = open('C:\\Users\\Administrator\\Desktop\\degree.txt', 'r+', encoding='utf-8')
degree_list = degree_file.read().splitlines()
degree_dic = defaultdict()
for d in degree_list:
  d_split = d.split(' ')
  if len(d_split) == 2:
    degree_dic[d_split[0]] = d_split[1]
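# For reference, a minimal sketch of the file formats the loading code above assumes
# (the concrete words and scores below are purely illustrative, not the actual file contents):
#   dictionary3.txt  -- one "word,score" pair per line, comma-separated, e.g.  利好,1  /  亏损,-1
#   not_word.txt     -- one negation word per line, e.g.  不  /  没有
#   degree.txt       -- one "word weight" pair per line, space-separated, e.g.  非常 2  /  稍微 0.5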
 
# Segment a sentence into words
def seg_word(sentence):
  seg_list = jieba.cut(sentence)
  seg_result = []
  for w in seg_list:
    seg_result.append(w)
  stopwords = set()
  fr = codecs.open('C:\\Users\\Administrator\\Desktop\\stopwords.txt', 'r', 'utf8')
  for word in fr:
    stopwords.add(word.strip())
  fr.close()
  return list(filter(lambda x: x not in stopwords, seg_result))
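# A quick check of the segmentation step; the output is only indicative, since the
# actual result depends on the jieba version and on the contents of stopwords.txt:
# print(seg_word('今天大盘涨得非常好'))
# -> e.g. ['今天', '大盘', '涨', '非常', '好']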

# Classify the segmented words into sentiment words, negation words, and degree adverbs

def classify_words(word_list):
  sen_word = dict()
  not_word = dict()
  degree_word = dict()
  for i in range(len(word_list)):
    word = word_list[i]
    if word in sen_dict.keys() and word not in not_word_list and word not in degree_dic.keys():
      sen_word[i] = sen_dict[word]
    elif word in not_word_list:
      not_word[i] = -1
    elif word in degree_dic.keys():
      degree_word[i] = degree_dic[word]
  sen_file.close()
  degree_file.close()
  not_word_file.close()
  return sen_word, not_word, degree_word
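# A hypothetical illustration of what classify_words returns, assuming '好' is in the
# sentiment dictionary with score 1, '不' in the negation list, and '非常' in the degree
# file with weight 2 (all three dictionaries are keyed by word position):
#   word_list   = ['大盘', '不', '非常', '好']
#   sen_word    -> {3: '1'}   position of the sentiment word and its score
#   not_word    -> {1: -1}    position of the negation word
#   degree_word -> {2: '2'}   position of the degree adverb and its weight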
 
# Compute the sentiment score from the sentiment words
def score_sentiment(sen_word, not_word, degree_word, seg_result):
  score = 0
  sentiment_index_list = list(sen_word.keys())
  for i in sentiment_index_list:
    W = 1
    n = i - 1
    if n in not_word and n not in degree_word.keys():  # a negation word sits right before the sentiment word
      W = W * (-1)
      m = n - 1
      if m in degree_word.keys():  # degree adverb before the negation word
        s = W * float(degree_word[m]) * float(sen_word[i])
      else:
        s = W * float(sen_word[i])
    else:
      if n in degree_word.keys():  # degree adverb right before the sentiment word
        s = W * float(degree_word[n]) * float(sen_word[i])
      else:
        s = float(sen_word[i])
    score = s + score
  return score
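# The scoring rule, traced on the hypothetical example above: for each sentiment word,
# look one position back -- a negation word flips the sign, and a degree adverb (directly
# before the sentiment word, or before the negation word) scales the score.
#   ['大盘', '不', '非常', '好']: position 2 is a degree adverb, so s = 1 * 2 * 1 = 2
#     (note the negation two positions back is not picked up by this rule)
#   ['大盘', '非常', '不', '好']: negation at position 2, degree adverb at 1, so s = -1 * 2 * 1 = -2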
 
# Compute the sentiment score of a comment
def sentiment_score(sentence):
  seg_list = seg_word(sentence)
  sen_word, not_word, degree_word = classify_words(seg_list)
  score = score_sentiment(sen_word, not_word, degree_word, seg_list)
  return score
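# End-to-end check on a single comment (the sentence is hypothetical; the sign and
# magnitude of the result depend entirely on the dictionary files used):
# print(sentiment_score('今天大盘涨得非常好'))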
 
# Feed the comment file into the calculation
path = 'C:\\Users\\Administrator\\Desktop\\comment-CSV.csv'
comment = pd.read_csv(path,encoding='gbk').astype(str)
review = comment['内容 2']
score = []
for r in review:
  score.append(sentiment_score(r))
comment['score'] = score
# count all comments and positive comments per day (assumes the comment CSV has a 'time' column)
total_daily = comment.groupby("time")[['score']].count()
total_daily.columns = ['total']
pos_daily = comment[comment['score'] > 0]
pos_daily = pos_daily.groupby("time")[['score']].count()
pos_daily.columns = ['positive']
df_merge = pd.merge(total_daily, pos_daily, on='time')
df_merge['sentiment'] = df_merge['positive'] /df_merge['total']
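# Note: the daily 'sentiment' index above is simply the share of positive comments,
# e.g. 30 comments with score > 0 out of 100 comments on a given day gives sentiment = 0.3.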
path = 'C:\\Users\\Administrator\\Desktop\\000300.csv'
stock = pd.read_csv(path,encoding='gbk').astype(str)
stock['time'] = pd.to_datetime(stock['time'])
df_merge = df_merge.reset_index()
df_merge['time'] = pd.to_datetime(df_merge['time'])
df_final = pd.merge(df_merge, stock, on='time', how = 'inner')
df_final.to_csv('C:\\Users\\Administrator\\Desktop\\data.csv')
 
The comments used here were crawled by myself, and the sentiment dictionary file is a compilation I collected from several sources.
I'll look into how to upload the files; anyone who needs them is welcome to take them.
The code is not entirely original, and the rules for computing the sentiment values of the sentiment words have been modified considerably.
Discussion is welcome.