A Naive Website Crawler in Python


The script below fetches a start page, pulls every http:// link out of it with a regular expression, and then saves each linked page to a numbered .html file, i.e. a one-level crawl. It is written for Python 2 (urllib2 and the print statement).

#!/usr/bin/python
import urllib2
import re

# download the page at url and save it under the given filename
def downURL(url, filename):
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print 'download exception'
        return False
    op = open(filename, 'wb')
    while True:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return True

# extract the http:// links contained in the page at url
def getURLs(url):
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print 'get url exception'
        return []
    pattern = re.compile(r'http://[\w.]+')
    urls = []                      # accumulate matches across all reads
    while True:
        s = fp.read()
        if not s:
            break
        urls.extend(pattern.findall(s))
    fp.close()
    return urls

# one-level crawl: the start page plus every page it links to
def spider(startURL):
    urls = [startURL]
    urllist = getURLs(startURL)
    for url in urllist:
        print url
        if url not in urls:
            urls.append(url)
    i = 0
    while urls:
        url = urls.pop(0)
        i = i + 1
        downURL(url, str(i) + '.html')
    return True

# test
spider('http://www.baidu.com')
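The listing above depends on Python 2's urllib2 and print statement. A minimal sketch of the same one-level crawl on Python 3 might look like the following; it uses only the standard library (urllib.request and re), and the decode-as-UTF-8 step is an assumption, since real pages may declare other charsets.

#!/usr/bin/env python3
# Sketch of the same naive one-level crawl, ported to Python 3 (urllib2 -> urllib.request).
import re
import urllib.request

def down_url(url, filename):
    # Download url and save the raw bytes under filename.
    try:
        with urllib.request.urlopen(url) as fp, open(filename, 'wb') as op:
            op.write(fp.read())
        return True
    except Exception as e:
        print('download exception:', e)
        return False

def get_urls(url):
    # Extract http:// links from the page at url with the same naive regex.
    try:
        with urllib.request.urlopen(url) as fp:
            # Assumption: the page decodes as UTF-8; other charsets are ignored here.
            s = fp.read().decode('utf-8', errors='ignore')
    except Exception as e:
        print('get url exception:', e)
        return []
    return re.findall(r'http://[\w.]+', s)

def spider(start_url):
    # One-level crawl: the start page plus every page it links to, saved as 1.html, 2.html, ...
    urls = [start_url]
    for url in get_urls(start_url):
        if url not in urls:
            urls.append(url)
    for i, url in enumerate(urls, start=1):
        down_url(url, str(i) + '.html')

if __name__ == '__main__':
    spider('http://www.baidu.com')   # start URL taken from the original listing

Either version is still naive: the regex misses https:// links and anything with a path or query string, and there is no depth control, deduplication across runs, or politeness (robots.txt, delays).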


posted @ 2013-08-29 09:51  awarrior