阶段作业1：完整的中英文词频统计+补交上次作业 - C22C

# Make-up assignment: word-frequency count for the lyrics of "Counting Stars".

cc = ('''Counting stars Lately I've been, I've been losing sleep 　　
Dreaming 'bout the things that we could be 　　
But baby I've been, I've been prayin' hard 　 　　
Said no more counting dollars 　　We'll be counting stars 　　
Yeah, we'll be counting stars 　　I see this life Like a swinging vine 　
　Swing my heart across the line 　　In my face is flashing signs 　　Seek it out and ye shall find
　　Old, but I'm not that old 　　Young, but I'm not that bold 　　And I don't think the world is sold 　
　I'm just doing what we're told 　　I, feel something so right 　　But doing the wrong thing 　　
I, feel something so wrong 　　But doing the right thing 　　I could lie, could lie, could lie 　
　everything that kills me makes me feel alive 　　Lately I've been, I've been losing sleep 　
　Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard 　
　Said no more counting dollars 　　We'll be counting stars 　　Lately I've been, I've been losing sleep 　　
Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard 　　Said no more counting dollars 　
　We'll be, we'll be counting stars 　　I feel the love And I feel it burn 　　Down this river every turn 　
　Hope is a four letter word 　　Make that money 　　Watch it burn 　　Old, but I'm not that old 　
　Young, but I'm not that bold 　　And I don't think the world is sold 　　I'm just doing what we're told 　
　I, feel something so wrong 　　But doing the right thing 　　I could lie, could lie, could lie 　
　Everything that drowns me makes me wanna fly 　　Lately I've been, I've been losing sleep 　
　Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard
　　Said no more counting dollars 　　We'll be counting stars 　　Lately I've been, I've been losing sleep 　
　Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard 　
　Said no more counting dollars 　　We'll be, we'll be counting stars 　　Take that money And watch it burn 　　Sink in the river
''')

# Normalise before splitting: fold case so "Everything" and "everything" are
# one word, and turn sentence punctuation into spaces so "been," and "been"
# are counted together.  Apostrophes are kept on purpose because they belong
# to words such as "I've" and "prayin'".
cc = cc.lower()
for ch in '.,:;!?':
    cc = cc.replace(ch, ' ')

# split() with no argument separates on any Unicode whitespace, which also
# covers the full-width spaces (U+3000) scattered through the lyrics.
ccList = cc.split()
print(len(ccList), ccList)  # number of words, followed by the words

ccSet = set(ccList)  # the distinct words
print(ccSet)

# Count whole words in a single pass over the word list.  Counting with
# cc.count(word) would match substrings of the raw text instead (for example
# "i" would match every letter "i"), which inflates the numbers.
strDict = {}
for star in ccList:
    strDict[star] = strDict.get(star, 0) + 1
for key in strDict:
    print(key, strDict[key])

# (word, count) pairs in first-appearance order.
wclist = list(strDict.items())
print(wclist)

# Sort by frequency, most frequent first.
wcList = sorted(wclist, key=lambda x: x[1], reverse=True)
print(wcList)

# Print the top 20; slicing stays safe when there are fewer than 20 words.
for item in wcList[:20]:
    print(item)


# List traversal demo: build a list, add one element at the end, drop one
# element by index, then print every remaining element on its own line.

cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
print(cclist)

cclist += ['gegeheh']  # extend in place with a single new element
print(cclist)

del cclist[2]  # remove the element at index 2 ('Awd')
print(cclist)

for item in cclist:
    print(item)

# Tuple traversal demo: read one element by index, then print every element.

# Named `tpl` rather than `tuple` so the built-in `tuple` type is not
# shadowed for the rest of the module.
tpl = ('jtfjhrr', 'rqfw f2q', 800, 10)
print(tpl[2])
for i in tpl:
    print(i)

# Dictionary traversal demo: look up values by key, overwrite one entry,
# and walk over every key/value pair.

dic = {
    'fhehe': '4w6436',
    'jgdns': 7,
    '4w6436': 'First',
}

print('fhehe:', dic['fhehe'])
print('4w6436:', dic['4w6436'])

# The same key is overwritten twice in a row, so only the last value is
# ever observable afterwards.
for replacement in (8, "对接欧文机房的维护"):
    dic['4w6436'] = replacement

# The updated entry is printed twice.
for _ in range(2):
    print('4w6436:', dic['4w6436'])

for key, value in dic.items():
    print(key, ':', value)

# Set traversal demo: create a set, add two members, remove one, and then
# print each remaining member on its own line.

a = {1, 2, 3, 6, 5}
print(a)

# Add an int and a str, showing the set after each insertion.
for extra in (4, 'uteru'):
    a.add(extra)
    print(a)

a.remove(5)
print(a)

for member in a:
    print(member)

# Current assignment: English word-frequency count for a text file.

# Read the whole file and fold case.  The context manager closes the file
# even if read() raises.
with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
    strBig = fo.read().lower()
print(strBig)

# Pre-processing: case was folded above; each punctuation mark becomes a
# space so that words separated only by punctuation ("end.start") are not
# glued into one token.
sep = """.,:;!?"""
for ch in sep:
    strBig = strBig.replace(ch, ' ')
strlist = strBig.split()
print(len(strlist), strlist)

# Stop words to leave out of the statistics.  They must be lower-case
# because the text has already been lower-cased.
exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
strSet = set(strlist) - exclude
print(len(strSet), strSet)

# Count every non-excluded word in a single pass over the word list.
strDict = {}
for word in strlist:
    if word not in exclude:
        strDict[word] = strDict.get(word, 0) + 1
print(len(strDict), strDict)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the top 20; slicing stays safe when there are fewer than 20 words.
for item in wcList[:20]:
    print(item)




# Chinese version: word-frequency count using jieba word segmentation.

import jieba

# Read the text file.  The context manager closes the file even if read()
# raises.
with open('shengxu.txt', 'r', encoding='utf-8') as f:
    story = f.read()
print(story)

# Pre-processing: replace each punctuation mark with a space.
sep = '，。：“”？！'
for ch in sep:
    story = story.replace(ch, ' ')
# Show the cleaned text once, after all replacements are done.
print(story)

# Chinese word segmentation with jieba (default mode, which the original
# author labelled "accurate mode").  The text is segmented once, and tokens
# that are only whitespace -- the spaces that replaced punctuation, and line
# breaks -- are dropped so they are not counted as words.
cnStr = story
strList = [word for word in jieba.cut(cnStr) if word.strip()]
print(len(strList), strList)

# Distinct words.
strSet = set(strList)
print(len(strSet), strSet)

# Count every word in a single pass over the token list.
strDict = {}
for word in strList:
    strDict[word] = strDict.get(word, 0) + 1

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
wcList.sort(key=lambda x: x[1], reverse=True)

# Print the top 10; slicing stays safe when there are fewer than 10 words.
for item in wcList[:10]:
    print(item)

posted on 2018-10-15 11:03 C22C 阅读(227) 评论(0) 收藏举报

刷新页面返回顶部


博客园 © 2004-2026 浙公网安备 33010602011771号浙ICP备2021040463号-3

导航