#-*-coding:gbk-*-                                                                      
  2 #########################################################################              
  3 #   Copyright (C) 2015 All rights reserved.                                            
  4 #                                                                                      
  5 #   文件名称:getnltksinica.py                                                         
  6 #   创 建 者:刘禹 finallyly liuyusi0121@sogou-inc.com(ext 3209)                       
  7 #   创建日期:2015年10月28日                                                           
  8 #   描    述:                                                                         
  9 #                                                                                      
 10 #   备    注:                                                                         
 11 #                                                                                      
 12 #########################################################################              
 13 #!/usr/bin/python                                                                      
 14 # please add your code here!                                                           
 15 import sys                                                                                                       
 16 reload(sys)                                                                            
 17 sys.setdefaultencoding('utf8')                                                         
 18 import nltk;                                                                           
 19 from nltk.corpus import sinica_treebank                                                
 20 sinica_fd=nltk.FreqDist(sinica_treebank.words())                                       
 21 print len(sinica_fd)                                                                   
 22 for m in sinica_fd:                                                                    
 23     sys.stdout.write("%s\n"%m);  

 

有一份文档是UTF-8编码,直接打印到标准输出没有问题,但是重定向的话就会出错,因为系统的默认编码是GBK的。加上 reload(sys) 和 sys.setdefaultencoding('utf8') 这两句就没错了。

posted on 2015-10-28 10:18  finallyly  阅读(200)  评论(0)  编辑  收藏  举报