I wrote a single-threaded crawler; a multi-threaded version is still in the works!
Without further ado, here's the code. Pointers are very welcome!!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: h3i_dan
# version: v1.0
#########################

import urllib
from sgmllib import SGMLParser


class Urllist(SGMLParser):
    """Collect the href of every <a> tag encountered while parsing."""

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)


def getUrl(url):
    """Fetch a page and return the absolute links found on it."""
    urls = []
    usock = urllib.urlopen(url)
    if usock.code == 200:
        parser = Urllist()
        parser.feed(usock.read())
        usock.close()
        parser.close()

        # Keep only absolute links; relative paths are dropped for now.
        for link in parser.urls:
            if link.startswith('http'):
                urls.append(link)
    return urls  # bug fix: the original never returned, so spider() got None


def spider(start_url, depth):
    if depth < 0:
        return False
    urls = getUrl(start_url)  # note: urls may contain duplicates
    global num
    if not urls:
        return False
    for url in urls:
        print url, num
        num += 1
        spider(url, depth - 1)
    print '^^^^^^^^^^^^^^^^^^^^^^^^^^^-'
    return True


if __name__ == '__main__':
    num = 0
    spider('http://www.baidu.com', 1)
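The comment in spider() notes that getUrl() can return the same URL more than once. One common fix is to track already-crawled URLs in a set and skip anything seen before. Below is a minimal sketch of that idea (the visited set and the spider_dedup name are mine, not part of the original code):

# Minimal de-duplication sketch. A set gives O(1) membership tests,
# so the "already crawled?" check stays cheap as the frontier grows.
visited = set()

def spider_dedup(start_url, depth):
    if depth < 0 or start_url in visited:
        return False
    visited.add(start_url)
    for url in getUrl(start_url):
        spider_dedup(url, depth - 1)
    return True

Besides avoiding wasted requests, this also protects against pages that link to each other in a cycle, which would otherwise recurse until the depth limit on every branch.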
That's basically the whole thing. I haven't yet found a good way to de-duplicate URLs, or to skip links that point straight at files, like http://www.xxx.com/file/xxx.doc. Multithreading still confuses me, so if any Python experts read this, please point me in the right direction.
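For the direct file links such as http://www.xxx.com/file/xxx.doc, one simple approach is to filter by the extension of the URL path. Here is a sketch using the standard urlparse module; the SKIP_EXTENSIONS tuple is my own guess at what should be excluded, not something from the original post:

import urlparse

# Extensions to skip; extend this tuple as needed.
SKIP_EXTENSIONS = ('.doc', '.docx', '.pdf', '.zip', '.rar', '.exe')

def is_html_link(url):
    # Check only the path component so query strings don't interfere.
    path = urlparse.urlparse(url).path.lower()
    return not path.endswith(SKIP_EXTENSIONS)

# usage inside spider():
#     urls = [u for u in getUrl(start_url) if is_html_link(u)]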
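On the multithreading question: a common pattern is a fixed pool of worker threads pulling (url, depth) tasks from a shared Queue, which does its own locking internally. A minimal sketch against the getUrl() above follows; WORKER_COUNT, worker, and threaded_spider are my own names, and this is an outline rather than production code:

import threading
from Queue import Queue

WORKER_COUNT = 4
task_queue = Queue()
visited = set()
visited_lock = threading.Lock()

def worker():
    while True:
        url, depth = task_queue.get()
        try:
            # The lock protects the shared visited set across workers.
            with visited_lock:
                if depth < 0 or url in visited:
                    continue
                visited.add(url)
            # Fetch outside the lock so slow pages don't block other workers.
            for link in getUrl(url):
                task_queue.put((link, depth - 1))
        finally:
            task_queue.task_done()

def threaded_spider(start_url, depth):
    for _ in range(WORKER_COUNT):
        t = threading.Thread(target=worker)
        t.setDaemon(True)  # let the process exit once the queue drains
        t.start()
    task_queue.put((start_url, depth))
    task_queue.join()  # block until every queued task has been processed

Note that output from multiple threads can interleave, and a real crawler would also rate-limit its requests; this sketch only shows the queue-and-workers structure.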