# encoding: utf-8
import urllib.request
import re
import os


seed = "http://idea.lanyus.com/"  # initial seed: the site the crawl starts from
depth = 3  # recurse at most `depth` levels to keep the recursion stack shallow
count = 5  # take at most `count` URLs from each page as new seeds
href_re = re.compile(r'href\s*=\s*"(https?://\S*?)"')  # regex that extracts URLs from page source
http_re = re.compile(r'\w+')  # regex that extracts the word chunks of a URL (used to build file names)
pages = set()  # URLs that have already been crawled
path_dir = "."  # directory where fetched pages are saved
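# Hedged sketch (not called by the crawler itself): a quick demonstration of
# what the two patterns above extract; the sample anchor tag is made up purely
# for illustration.
def _regex_demo():
    sample = '<a href="http://idea.lanyus.com/download/">download</a>'
    print(href_re.findall(sample))  # ['http://idea.lanyus.com/download/']
    print(http_re.findall(seed))    # ['http', 'idea', 'lanyus', 'com']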
def get_path(url):
    # Build a file name from the word chunks of the URL, capped at 30 characters.
    name = '_'.join(http_re.findall(url))[:30]
    return os.path.join(path_dir, "%s.txt" % name)
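# Hedged example of the mapping get_path performs (file name derived from the
# word-chunk regex above): the seed "http://idea.lanyus.com/" collapses to
# "http_idea_lanyus_com", so that page is written to "./http_idea_lanyus_com.txt",
# i.e. get_path(seed) -> "./http_idea_lanyus_com.txt".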
def fetch(que=[seed], dep=0):
    # Crawl every URL in the current level, then recurse on the URLs collected from it.
    nxt_que = []
    for url in que:
        print("depth:%d fetch:%s..." % (dep, url))
        try:
            html = urllib.request.urlopen(url).read().decode('utf-8', errors='ignore')
        except Exception as e:
            print("failed to fetch %s: %s" % (url, e))
            continue
        with open(get_path(url), 'w', encoding='utf-8') as f:
            f.write(html)  # save the page content
        cnt = 0
        for new_url in href_re.findall(html):
            if new_url in pages:  # skip URLs that have already been crawled
                continue
            pages.add(new_url)
            cnt += 1
            nxt_que.append(new_url)
            if cnt >= count:
                break
    if dep < depth:
        fetch(nxt_que, dep + 1)
if __name__ == "__main__":
    fetch()
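# Usage note (file name assumed): running `python crawler.py` starts a
# breadth-first crawl from the seed, following at most `count` new links per
# page for at most `depth` levels, and saves each fetched page as a .txt file
# under path_dir.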