Python实现读取多个excel;以及统计词频;使用词典

# -*- coding: utf8 -*-
import xlrd
import codecs
import os
def handExcel(path):
    """Dump two columns from "Sheet1" of every workbook in a directory.

    Every entry of ``path`` is opened with xlrd. For each row after row
    index 0 (presumably a header row -- confirm against the data), the cell
    in column index 3 is written as one line to ``7.8_userQue.txt`` and the
    cell in column index 4 as one line to ``7.8_StandQue.txt``. Both output
    files are UTF-8 encoded and created in the current working directory.

    Workbooks without a sheet named "Sheet1" are reported and skipped.
    The working directory is left untouched; workbooks are addressed through
    ``os.path.join(path, name)`` so relative directories work as well.

    Args:
        path: Directory that contains the Excel workbooks.
    """
    # Output files; the with-statement closes (and flushes) both of them even
    # if a workbook cannot be read.
    with codecs.open("7.8_userQue.txt", "w", "utf-8") as output1, \
            codecs.open("7.8_StandQue.txt", "w", "utf-8") as output2:
        # Input files
        for j, name in enumerate(os.listdir(path), 1):
            print("读取第%d个excel" % j)
            print(name)
            bk = xlrd.open_workbook(os.path.join(path, name))
            try:
                sh = bk.sheet_by_name("Sheet1")
            except xlrd.XLRDError:
                # Skip this workbook; otherwise `sh` would be undefined or
                # still point at the previous workbook's sheet.
                print("no sheet in %s named Sheet1" % name)
                continue
            # Number of rows
            nrows = sh.nrows
            # Number of columns
            ncols = sh.ncols
            print("nrows %d, ncols %d" % (nrows, ncols))
            for i in range(1, nrows):
                # "%s" formatting also accepts numeric cells, which xlrd
                # returns as floats and which cannot be concatenated to "\n".
                output1.write("%s\n" % sh.cell_value(i, 3))
                output2.write("%s\n" % sh.cell_value(i, 4))
# Script entry point: runs at module load and processes the workbooks found in
# this hard-coded directory.
handExcel("D:/Users/cassie.xiao/PycharmProjects/read_excel/three")



---------------------------------------统计词频----------------------------------
# -*- coding: utf8 -*-

import codecs
def getfreq(freqdict, path="xiaoi_userQue_seg_hanlp.txt"):
    """Count the space-separated tokens of a UTF-8 text file.

    Each line is split on single spaces and every non-empty token increments
    its entry in ``freqdict``. The line terminator is removed before
    splitting so the last token of a line is counted together with the same
    token elsewhere, and empty tokens (from consecutive spaces or blank
    lines) are ignored.

    Args:
        freqdict: Dict mapping token -> count. It is updated in place, so
            existing counts are extended rather than reset.
        path: File to read; defaults to the file name the script has always
            used.

    Returns:
        The same ``freqdict`` object that was passed in.
    """
    print("getfreq....")
    with codecs.open(path, "r", "utf-8") as segmented:
        for line in segmented:
            for word in line.rstrip("\r\n").split(" "):
                if not word:
                    continue
                freqdict[word] = freqdict.get(word, 0) + 1
    return freqdict

def sort_out(dic, outfilename):
    """Write the entries of a count dict to a file, highest count first.

    Each entry becomes one ``key:count`` line in a UTF-8 encoded file.
    Entries with equal counts keep the order in which ``dic`` yields them,
    because ``sorted`` is stable.

    Args:
        dic: Dict mapping a text key to a sortable count.
        outfilename: Path of the file to create or overwrite.
    """
    print("sort....")
    ranked = sorted(dic.items(), key=lambda item: item[1], reverse=True)
    print("out....")
    with codecs.open(outfilename, "w", "utf-8") as f:
        for word, count in ranked:
            f.write(word + ":" + str(count) + "\n")
def main():
    """Count token frequencies and write them out sorted by count.

    Calls ``getfreq`` with an empty dict to collect the counts, then passes
    the result to ``sort_out``, which writes
    ``xiaoi_userQue_seg_hanlp_freq.txt``.
    """
    freqdict = getfreq({})
    outfilename = "xiaoi_userQue_seg_hanlp_freq.txt"
    sort_out(freqdict, outfilename)


if __name__ == "__main__":
    main()

--------------------------------------使用词典操作-------------------------------------
# -*- coding: utf8 -*-

import codecs
def _load_freq(path):
    """Read ``word:count`` lines from ``path`` into a dict of word -> count text."""
    freq = {}
    with codecs.open(path, "r", "utf-8") as src:
        for line in src:
            # Lines starting with ":" carry no word and are skipped.
            if line.startswith(":"):
                continue
            # Split on the LAST colon so a word that itself contains ":" keeps
            # its full text; lines without any colon are ignored instead of
            # raising IndexError.
            word, sep, count = line.strip().rpartition(":")
            if not sep or not word:
                continue
            freq[word] = count
    return freq


def _write_noun_freq(nouns, freq, out_path):
    """Write ``noun:count`` for every noun that has an entry in ``freq``."""
    with codecs.open(out_path, "w", "utf-8") as out:
        for noun in nouns:
            if noun in freq:
                out.write(noun + ":" + freq[noun] + "\n")


def getfreq(userq_path=r"300W_xiaoi_jieba_UQ.txt",
            standq_path=r"300W_xiaoi_jieba_SQ.txt",
            noun_path="noun.txt",
            standq_out=r"freq_xiaoi&Noun_standQ.txt",
            userq_out=r"freq_xiaoi&Noun_userQ.txt"):
    """Extract the frequencies of dictionary nouns from two frequency files.

    The noun list is read once (one noun per line, surrounding whitespace
    stripped). For the user-question frequency file and then the
    standard-question frequency file, every noun that appears in that file
    is written as ``noun:count`` to the matching output file, in the order
    of the noun list. All files are UTF-8 encoded.

    Args:
        userq_path: ``word:count`` file for user questions.
        standq_path: ``word:count`` file for standard questions.
        noun_path: File holding the noun dictionary.
        standq_out: Output file for the standard-question matches.
        userq_out: Output file for the user-question matches.

    The defaults are the file names the script has always used, so calling
    ``getfreq()`` without arguments keeps working.
    """
    print("getfreq....")
    with codecs.open(noun_path, "r", "utf-8") as noun_file:
        nouns = [line.strip() for line in noun_file]

    # User questions first
    _write_noun_freq(nouns, _load_freq(userq_path), userq_out)

    # Then standard questions
    _write_noun_freq(nouns, _load_freq(standq_path), standq_out)
# Script entry point: runs the noun-frequency extraction at module load.
getfreq()

posted on 2017-08-24 09:47  毛无语666  阅读(1488)  评论(0)  编辑  收藏  举报

导航