1957

无聊蛋疼的1957写的低端博客
  博客园  :: 首页  :: 新随笔  :: 联系  :: 订阅  :: 管理

python解析网页

Posted on 2013-05-14 22:07  1957  阅读(852)  评论(0)  编辑  收藏  举报

果断使用BeautifulSoup!

- -不想写了,挺简单的,贴个代码

 1 import urllib2
 2 import chardet
 3 from BeautifulSoup import BeautifulSoup
 4 import pymongo
 5 import time
 6 db = pymongo.Connection().notice #user notice
 7 
 8 def load_work():
 9     return db.work.find()
def work_insert(data):
    """Insert *data* into the ``work`` collection.

    The value returned by the driver's ``insert`` call is discarded, so this
    function always returns None.
    """
    work_collection = db.work
    work_collection.insert(data)
def update_state(idx, data):
    """Upsert *data* into the ``state`` collection.

    idx  -- passed as the first argument of ``update`` (the document selector).
    data -- passed as the second argument of ``update`` (the new content).

    ``upsert=True`` asks the driver to create the document when nothing
    matches *idx*.
    """
    state_collection = db.state
    state_collection.update(idx, data, upsert=True)
def to_notice(item):
    """Report the outcome of one polling pass on standard output.

    item -- collection of newly discovered entries.

    Prints ``No news`` when *item* is empty (or None); otherwise prints the
    collection itself.  The single-argument call form of ``print`` is used
    so the function behaves the same on Python 2 and Python 3.
    """
    if not item:
        print('No news')
    else:
        print(item)
def main_work():
    """Poll every watched page once and report links not seen before.

    For each document returned by load_work() (its ``'url'`` key is read):

    1. download the page; a page that cannot be fetched is reported and
       skipped so that one bad URL does not stop the whole pass;
    2. parse it with BeautifulSoup, using the encoding guessed by chardet;
    3. collect the ``href`` of every ``<a>`` tag that has one;
    4. compare those links with the ``url_set`` stored for the page in the
       ``state`` collection;
    5. if there are unseen links, hand them to to_notice() as a list of
       ``(text, url)`` tuples and replace the stored ``url_set`` with the
       links currently on the page.

    Returns None.
    """
    print('=====begin======')
    print(time.ctime())
    for work_item in load_work():
        page_url = work_item['url']

        # On Python 2, urllib2.URLError/HTTPError and socket errors all
        # derive from IOError, so this single clause covers network failures.
        try:
            response = urllib2.urlopen(page_url)
            try:
                data = response.read()
            finally:
                response.close()
        except IOError as exc:
            print('fetch failed: %s (%s)' % (page_url, exc))
            continue

        # chardet reports None as the encoding when it cannot guess; fall
        # back to BeautifulSoup's own detection in that case.
        charset = (chardet.detect(data)['encoding'] or '').lower()
        if charset == 'gb2312':
            # Decode GB2312 pages as GBK so characters outside the smaller
            # charset do not break decoding.
            charset = 'GBK'
        soup = BeautifulSoup(data, fromEncoding=charset or None)

        # Load the stored link list once per page instead of querying the
        # database once per link.
        stored = db.state.find_one({'url': page_url}) or {}
        seen = set(stored.get('url_set') or ())

        new_item = []
        url_set = []
        # href=True keeps only anchors that carry an href attribute, so the
        # link target is read by name rather than by attribute position.
        for anchor in soup.findAll('a', href=True):
            url = anchor['href']
            url_set.append(url)
            if url not in seen:
                new_item.append((anchor.getText(), url))
                # Avoid reporting the same link twice within one page.
                seen.add(url)

        if new_item:
            # TODO notice
            to_notice(new_item)
            update_state({'url': page_url},
                         {'url': page_url, 'url_set': url_set})
    print('=====end======')
# Seconds to wait between two polling passes.
POLL_INTERVAL_SECONDS = 60

# Only start the endless polling loop when the file is executed as a script,
# so that importing it (e.g. to reuse the helpers) does not block forever.
if __name__ == '__main__':
    while True:
        main_work()
        time.sleep(POLL_INTERVAL_SECONDS)