python自然语言处理——2.4 词典资源

微信公众号:数据运营人
本系列为博主的读书学习笔记,如需转载请注明出处。

第二章 获取文本语料和词汇资源

2.4 词典资源:词汇列表语料库、发音的词典、比较词表、词汇工具(Toolbox 和 Shoebox)

2.4 词典资源

词汇列表语料库
 1def unusual_words(text):                                                               
2    text_vocab = set(w.lower() for w in text if w.isalpha())                           
3    english_vocab = set(w.lower() for w in nltk.corpus.words.words())                  
4    unusual = text_vocab.difference(english_vocab)                                     
5    return sorted(unusual)                                                             
import nltk
from nltk.corpus import stopwords

print(unusual_words(nltk.corpus.gutenberg.words("austen-sense.txt")))

# Stopwords corpus: high-frequency words such as "to" and "the" that are
# usually filtered out before further processing.
print(stopwords.words("english"))

def content_fraction(text, stop_words=None):
    """Return the fraction of tokens in *text* that are not stop words.

    Each token is lowercased before the lookup, so the stop words should be
    given in lowercase.

    Args:
        text: Iterable of word tokens. It is consumed in a single pass, so
            generators work as well as lists.
        stop_words: Optional iterable of stop words. When omitted, the
            English list from the NLTK ``stopwords`` corpus is used.

    Returns:
        A float between 0.0 and 1.0; 0.0 when *text* is empty.
    """
    if stop_words is None:
        # Imported lazily so callers that supply their own stop words do not
        # need NLTK or its corpus data.
        import nltk

        stop_words = nltk.corpus.stopwords.words("english")

    # Hash-based lookup: testing every token against a list would rescan the
    # whole stop-word list for each token.
    stop_set = frozenset(stop_words)

    total = 0
    content = 0
    for word in text:
        total += 1
        if word.lower() not in stop_set:
            content += 1

    if total == 0:
        # Avoid ZeroDivisionError on empty input.
        return 0.0
    return content / total
import nltk

print(content_fraction(nltk.corpus.reuters.words()))

# Find the names that appear in both the male and the female file
# (gender-ambiguous names).
names = nltk.corpus.names
print(names.fileids())
male_names = names.words("male.txt")
female_name = names.words("female.txt")
# Build a set once: testing each male name against the female list directly
# would rescan that whole list for every name.
female_lookup = set(female_name)
print([w for w in male_names if w in female_lookup])

# Count the final letter of every name, conditioned on the file it came from.
# The plot shows that names ending in "a" are almost all female.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
# plot() draws the chart itself; wrapping it in print() only echoed its
# return value, which carries no useful information.
cfd.plot()

返回结果:

发音的词典
import nltk

# Load the entries of the CMU Pronouncing Dictionary corpus and report how
# many there are.
entries = nltk.corpus.cmudict.entries()
print(len(entries))

# Print a sample slice of the entries (indices 39900 through 39950).
for entry in entries[39900:39951]:
    print(entry)

返回结果:

比较词表
 1from nltk.corpus import  swadesh                        
2print(swadesh.fileids())                                
3print(swadesh.words("en"))                              
4fr2en = swadesh.entries(['fr','en'])                    
5print(fr2en)                                            
6translate = dict(fr2en)   # 简单的翻译器                      
7print(translate["chien"])                               
8
9# 德语-英语 西班牙-英语                                            
10de2en = swadesh.entries(['de','en'])  #german-english   
11es2en = swadesh.entries(['es','en'])  #spanish-english  
12print(translate.update(dict(de2en)))                    
13print(translate.update(dict(es2en)))                    
14print(translate['Hund'])                                
15print(translate['perro'])          
词汇工具: Toolbox和Shoebox
from nltk.corpus import toolbox

# Print the entries read from the "rotokas.dic" Toolbox file.
print(toolbox.entries("rotokas.dic"))
posted @ 2018-12-07 14:50  ly803744  阅读(616)  评论(0)  编辑  收藏  举报