黑板课Python爬虫第一关

 1 #coding=utf-8
 2 import requests
 3 from lxml import etree
 4 import re
 5 from multiprocessing import Process
 6 
 7 
 8 
 9 def ex00(ex00_html):
10     #构造元url
11     #获取respond
12     r = requests.get(ex00_html)
13     #获取网页二进制文本,并且从utf-8解码
14     html = r.content.decode('utf-8')
15     #用etree获取页面
16     page = etree.HTML(html)
17     #提取信息
18     info = page.xpath('/html/body/div/div/div/h3/text()')[0]
19     print 'info:',info
20     #获取数字
21     num = re_num.match(info).groups()[0]
22     print num
23 
24     while num:
25         url = ex00_html+num
26         r = requests.get(url)
27         #获取网页二进制文本,并且从utf-8解码
28         html = r.content.decode('utf-8')
29         #用etree获取页面
30         page = etree.HTML(html)
31         #提取信息
32         info = page.xpath('/html/body/div/div/div/h3/text()')[0]
33         #获取数字
34         try:
35             num = re_num.match(info).groups()[0]
36             print num
37         except AttributeError,e:
38             print 'AttributeError:',e
39             print info
40             return url
41 
42 
43 
44 
if __name__ == '__main__':
    # Regex that captures the number following a run of CJK characters.
    # u'...' with an escaped backslash replaces the Python-2-only ur'...'
    # prefix; the compiled pattern is identical.
    re_num = re.compile(u'[\u4e00-\u9fa5]+(\\d+)')
    # Alternative approach: use findall,
    # e.g. re.findall(r'\d+', u'你好123')
    u = 'http://www.heibanke.com/lesson/crawler_ex00/'
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(ex00(u))

 

posted on 2016-01-20 13:16  freetime  阅读(293)  评论(0)    收藏  举报

导航