1 #coding=utf-8
2 import requests
3 from lxml import etree
4 import re
5 from multiprocessing import Process
6
7
8
# Matches a leading run of CJK characters followed by digits; the digits are
# captured. Defined at module level so ex00() also works when this module is
# imported rather than run as a script.
_NUM_RE = re.compile(u'[\u4e00-\u9fa5]+(\\d+)')

# XPath of the heading that carries the hint text on every page.
_HEADING_XPATH = '/html/body/div/div/div/h3/text()'

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def _fetch_heading(url):
    """Download *url* and return the first text node matched by the heading XPath.

    Raises:
        IndexError: if the XPath query yields no result for the page.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Take the raw response bytes and decode them as UTF-8.
    html = response.content.decode('utf-8')
    page = etree.HTML(html)
    return page.xpath(_HEADING_XPATH)[0]


def ex00(ex00_html, fetch_heading=None):
    """Follow the chain of numbered pages starting at *ex00_html*.

    The heading of each page is matched against the number pattern; the
    captured digits are appended to *ex00_html* to form the next URL. The
    walk stops at the first page whose heading does not match.

    Args:
        ex00_html: Base URL; also the first page that is fetched.
        fetch_heading: Optional callable taking a URL and returning that
            page's heading text. Defaults to an HTTP fetch via ``requests``
            and ``lxml``.

    Returns:
        The URL of the page whose heading contained no number. This is
        *ex00_html* itself when the very first page has none.
    """
    if fetch_heading is None:
        fetch_heading = _fetch_heading

    url = ex00_html
    info = fetch_heading(url)
    print('info: %s' % info)

    while True:
        try:
            # match() returns None for a heading without a number, so the
            # attribute access raises AttributeError -- the stop signal.
            num = _NUM_RE.match(info).groups()[0]
        except AttributeError as e:
            print('AttributeError: %s' % e)
            print(info)
            return url
        print(num)
        url = ex00_html + num
        info = fetch_heading(url)
41
42
43
44
if __name__ == '__main__':
    # Regular expression that extracts the number: a run of CJK characters
    # followed by digits (captured). Written as a u'' literal with an escaped
    # backslash so it yields the same pattern on Python 2 and Python 3.
    re_num = re.compile(u'[\u4e00-\u9fa5]+(\\d+)')
    # Alternative approach: use findall,
    # e.g. re.findall(r'\d+', '你好123')
    start_url = 'http://www.heibanke.com/lesson/crawler_ex00/'
    print(ex00(start_url))