import sys
import multiprocessing
import re
import os
import urllib.request as lib

def craw_links(url, depth, keyword, processed):
    ''' url: the url to crawl
    depth: the remaining depth to crawl
    keyword: the tuple of keywords to look for
    processed: the list of urls that have already been crawled
    '''

    contents = []
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return
        print('Crawling ' + url + '...')
        fp = lib.urlopen(url)
        # urlopen() returns bytes in Python 3, so decode before searching
        contents = fp.read()
        contents_decoded = contents.decode('UTF-8')
        fp.close()
        pattern = '|'.join(keyword)
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords to filter are given, save the current page
            flag = True
        if flag or searched:
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'wb') as fp:
                fp.write(contents)
        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl every link found in the current page
        for link in links:
            # resolve relative paths against the current url
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keyword, processed)

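# A minimal sketch (not part of the original) of how the imported multiprocessing
# module could be used to crawl several start pages in parallel: each url is
# dispatched to a worker process instead of being crawled serially. The name
# craw_parallel and the pool size are hypothetical.
def craw_parallel(urls, depth, keyword):
    with multiprocessing.Pool(processes=4) as pool:
        # each worker call gets its own `processed` list, so the same url may be
        # fetched by more than one worker in this simplified sketch
        pool.starmap(craw_links, [(url, depth, keyword, []) for url in urls])
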
if __name__ == '__main__':
    processed = []
    keywords = ('KeyWord1', 'KeyWord2')
    # create the output directory if it does not already exist
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links(r'http://docs.python.org/3/library/index.html', 1, keywords, processed)