# cc013 陈文朋  (post header; presumably student ID and name)

# Make-up homework (submitted late)

# Word-frequency count over a block of song lyrics.
cc = ('''Counting stars Lately I've been, I've been losing sleep   
Dreaming 'bout the things that we could be   
But baby I've been, I've been prayin' hard     
Said no more counting dollars   We'll be counting stars   
Yeah, we'll be counting stars   I see this life Like a swinging vine  
 Swing my heart across the line   In my face is flashing signs   Seek it out and ye shall find
  Old, but I'm not that old   Young, but I'm not that bold   And I don't think the world is sold  
 I'm just doing what we're told   I, feel something so right   But doing the wrong thing   
I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
 everything that kills me makes me feel alive   Lately I've been, I've been losing sleep  
 Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
 Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep   
Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard   Said no more counting dollars  
 We'll be, we'll be counting stars   I feel the love And I feel it burn   Down this river every turn  
 Hope is a four letter word   Make that money   Watch it burn   Old, but I'm not that old  
 Young, but I'm not that bold   And I don't think the world is sold   I'm just doing what we're told  
 I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
 Everything that drowns me makes me wanna fly   Lately I've been, I've been losing sleep  
 Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard
  Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep  
 Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
 Said no more counting dollars   We'll be, we'll be counting stars   Take that money And watch it burn   Sink in the river
''')

# Replace punctuation with spaces so that e.g. "been," and "been" are counted
# as the same word. Apostrophes are kept because they are part of words
# such as "I've".
for ch in '.,:;!?':
    cc = cc.replace(ch, ' ')

# Split into words and report the number of words (not characters).
ccList = cc.split()
print(len(ccList), ccList)

# The distinct words.
ccSet = set(ccList)
print(ccSet)

# Count whole words in one pass over the word list. Counting with
# cc.count(word) would match substrings ("be" inside "been") and
# over-report short words.
strDict = {}
for star in ccList:
    strDict[star] = strDict.get(star, 0) + 1
for key in ccSet:
    print(key, strDict[key])

# A set has no .items(); the (word, count) pairs live in the dictionary.
wclist = list(strDict.items())
print(wclist)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the top 20; slicing is safe even with fewer than 20 distinct words.
for i in wcList[:20]:
    print(i)


# List traversal: build a list, append one element, remove the element at
# index 2, then print each remaining element on its own line.

cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
print(cclist)

cclist += ['gegeheh']
print(cclist)

del cclist[2]
print(cclist)

for element in cclist:
    print(element)

# Tuple traversal: index one element, then print every element on its own line.
# NOTE: the variable name shadows the built-in `tuple` type from here on.

tuple = 'jtfjhrr', 'rqfw f2q', 800, 10
print(tuple[2])
print(*tuple, sep='\n')

# Dictionary traversal: look up values by key, overwrite one entry, then
# print every key/value pair.

dic = dict([('fhehe', '4w6436'), ('jgdns', 7), ('4w6436', 'First')])

print(f"fhehe: {dic['fhehe']}")
print(f"4w6436: {dic['4w6436']}")

# The entry is assigned twice in a row; only the second value survives.
dic.update({'4w6436': 8})
dic.update({'4w6436': "对接欧文机房的维护"})

# The updated entry is printed twice.
print(f"4w6436: {dic['4w6436']}")
print(f"4w6436: {dic['4w6436']}")

for key, value in dic.items():
    print(f"{key} : {value}")

# Set traversal: build a set, add two members, remove one, then print each
# remaining member. The set is printed after every change.

a = set((1, 2, 3, 6, 5))
print(a)

for addition in (4, 'uteru'):
    a.add(addition)
    print(a)

a.remove(5)
print(a)

for member in a:
    print(member)

  

 

 

# Current homework assignment

# English word-frequency count over the text file 'ccc1015.txt'.

# Read the whole file, lower-cased so that counting is case-insensitive.
# The `with` block closes the file even if reading fails.
with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
    strBig = fo.read().lower()
print(strBig)

# Preprocessing: remove punctuation marks.
sep = """.,:;!?"""
for ch in sep:
    strBig = strBig.replace(ch, '')
strlist = strBig.split()
print(len(strlist), strlist)

# Distinct words minus stop words. The stop words must be lower case because
# the text was lower-cased above (an upper-case 'I' could never match).
strSet = set(strlist)
exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
strSet = strSet - exclude
print(len(strSet), strSet)

# Count the occurrences of each kept word in a single pass over the word list.
strDict = {}
for word in strlist:
    if word in strSet:
        strDict[word] = strDict.get(word, 0) + 1
print(len(strDict), strDict)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the top 20; slicing is safe when there are fewer than 20 distinct words.
for i in wcList[:20]:
    print(i)




# Chinese-language version


# Read the text file; the `with` block closes it even if reading fails.
with open('shengxu.txt', 'r', encoding='utf-8') as f:
    story = f.read()
print(story)

# Preprocessing: replace punctuation marks with spaces.
# The full-width forms are listed next to the original marks because Chinese
# text normally uses them.
sep = ',。:“”?!' ',:?!'
for ch in sep:
    story = story.replace(ch, ' ')
# Print the cleaned text once, after all replacements, instead of once per
# punctuation mark.
print(story)

# Chinese word segmentation with jieba
import jieba
cnStr = story

# Segment once (jieba.cut's default is accurate mode) and reuse the result
# rather than running the segmenter a second time.
strList = list(jieba.cut(cnStr))
print(strList)

# Punctuation was replaced with spaces during preprocessing, and jieba emits
# those separators as tokens of their own. Drop whitespace-only tokens so
# they are not counted as words.
strList = [word for word in strList if word.strip()]
print(len(strList), strList)

# Distinct words.
strSet = set(strList)
print(len(strSet), strSet)

# Count the occurrences of each word in a single pass over the token list.
strDict = {}
for word in strList:
    strDict[word] = strDict.get(word, 0) + 1

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
wcList.sort(key=lambda x: x[1], reverse=True)

# Print the top 10; slicing is safe when there are fewer than 10 distinct words.
for i in wcList[:10]:
    print(i)

  

  

 

 

 

# posted on 2018-10-15 11:03  C22C  Views(208)  Comments(0)  Edit  Favorite  Report