#!/usr/bin/python
import urllib2
import re

# Download the resource at `url` and save it to a local file named `filename`.
def downURL(url, filename):
    try:
        fp = urllib2.urlopen(url)
    except Exception as e:
        print 'download exception:', url, e
        return False
    op = open(filename, 'wb')
    while True:
        s = fp.read(8192)   # read in fixed-size chunks rather than all at once
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return True

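# Usage sketch (the URL and filename below are illustrative, not from the
# original script):
#   downURL('http://www.example.com/', 'example.html')
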
# Collect the absolute http:// links that appear in the page at `url`.
def getURLs(url):
    try:
        fp = urllib2.urlopen(url)
    except Exception as e:
        print 'get url exception:', url, e
        return []
    # Only matches the scheme and host part of a link, e.g. http://www.example.com
    pattern = re.compile(r'http://[\w.]+')
    content = ''
    while True:
        s = fp.read(8192)
        if not s:
            break
        content += s
    fp.close()
    return pattern.findall(content)

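# Usage sketch (illustrative URL): returns the list of absolute http:// links
# found in that page's source.
#   links = getURLs('http://www.example.com/')
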
# Crawl one level deep: download the start page plus every page it links to.
def spider(startURL):
    urls = [startURL]
    for url in getURLs(startURL):
        print url
        if url not in urls:     # skip links that were already collected
            urls.append(url)
    i = 0
    while urls:
        url = urls.pop(0)
        i = i + 1
        downURL(url, str(i) + '.html')
    return True

# test
if __name__ == '__main__':
    spider('http://www.baidu.com')
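
# A possible extension (a sketch, not part of the original script): take the
# start URL from the command line instead of hard-coding it, e.g.
#
#   import sys
#   start = sys.argv[1] if len(sys.argv) > 1 else 'http://www.baidu.com'
#   spider(start)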