如何写python爬虫

0x00 为什么写python爬虫？
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1，用python写爬虫能很好地锻炼对python的理解，比如threading模块、os模块、re模块、urllib2模块等。
2，爬虫需要对捕获的url分析，这个对数据的转换是很好锻炼python能力的方式。
3，每写一次爬虫，自己都能有所提升，因为这个过程中能让自己产生各种独特的想法，不用去管是否合适，因为这个只是练习，能实现即可。

0x01 如何写？
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
好了，下面我开始分享如何写爬虫的一些经验。
1，理解什么是爬虫？百度百科有很好的解释。
http://baike.baidu.com/link?url=EEHS8WynW3EvDHwv-FeNKf1Tick5GauCy-wdrfWEf5NC88eEAsl2Hdsp0TVtglFi#2
2，爬虫的核心是什么？我认为是这么一个过程。
抓 - 分析 - 抓 - 分析 - 抓 - 分析 - 抓 - 分析 - 抓 - 分析 - 抓 - 分析 - 抓 - 分析 - 抓 - 分析 - 抓 - 分析
核心是基于 urllib2.urlopen(url) 的抓取，和基于 re.compile 的分析。
剩下的都是辅佐这个核心的。
但是我认为核心东西的难度只是占整个编写过程的20%。

0x02 代码（缩进问题使用者自行更改吧，正好也是阅读的一个过程）
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

  1 # -*- coding: utf-8 -*-
  2 '''
  3 Author: WK
  4 Write on December 2, 2013
  5 '''
  6 import re
  7 import urllib2
  8 import urlparse
  9 import threading
 10 import Queue
 11 import sys
 12 import time
 13 import os
 14 
 15 count = 0  # incremented by check_url() each time it is asked about a URL not yet in url_dict
 16 re_href = re.compile('href="([^"]*)"')  # captures the value of every double-quoted href attribute
 17 url_dict = {'http://www.baidu.com':'1'}  # URLs already queued (only keys are tested); the initial entry looks like a placeholder
 18 total_url_list = []  # every URL taken off the queue by a worker; written to the log by write_in_txt()
 19 
 20 
 21 def url_open(url): #get html页面
 22 
 23 headers={
 24 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36"
 25 #"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 26 #"Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
 27 #"Accept-Encoding": "gzip, deflate",
 28 #"If-Modified-Since": "Tue, 03 Dec 2010 08:25:11 GMT",
 29 #"Cache-Control": "max-age=0"
 30 }
 31 try:
 32 req = urllib2.Request(url,headers = headers)
 33 html= urllib2.urlopen(req,timeout=3).read()
 34 ret = '200'
 35 return html,ret
 36 except urllib2.HTTPError, e:
 37 #print u'error:', e.code
 38 return '',e.code
 39 except :
 40 pass
 41 return '','99999'
 42 
 43 
 44 
 45 def keyword_check(url,keyword): #检查关键字
 46 str1 = ".*"
 47 keyword = str1 + keyword
 48 re_keyword = re.compile(keyword)
 49 if re_keyword.match(url):
 50 return 'true'
 51 else:
 52 return 'false'
 53 
 54 
 55 def html_open(html,response_code): #抓取html页面内a标签内的href
 56 url_list = []
 57 if response_code == '200':
 58 list_1st = re.findall(re_href,html)
 59 list_2nd = list(set(list_1st)) 
 60 return list_2nd
 61 else:
 62 return url_list
 63 
 64 
 65 def check_url(url): #跟字典内比较，检查url是否重复
 66 if url in url_dict.keys():
 67 #print 'in the dictionary'
 68 return 'true'
 69 else:
 70 #print 'not in the dictionary'
 71 global count
 72 count = count + 1
 73 return 'false'
 74 
 75 def clean_list(root_url,url_list,key): #整理url列表，返回整理后的列表
 76 keyword = key
 77 #print keyword,len(url_list)
 78 new_list = []
 79 if len(url_list) == 0:
 80 return []
 81 else:
 82 for url in url_list:
 83 url_parse = urlparse.urlparse(url)
 84 if url_parse[0] == '' and url_parse[1] == '':
 85 temp_root_url = urlparse.urlparse(root_url)
 86 url = temp_root_url[0]+'://'+temp_root_url[1]+'/'+url_parse[2]
 87 url_parse = urlparse.urlparse(url)
 88 if url_parse[0] == 'http' : #the whitelist
 89 path = url_parse[2]
 90 if path[-3:] not in ['css','png','pdf','jsp','zip','rar','doc','jpg','JPG','DOC','xls']: #the blacklist
 91 t = urlparse.urlunparse(url_parse) 
 92 if check_url(t) == 'false': #检查url是否在字典内，若不是，进行下一个判断
 93 if keyword_check(t,keyword) == 'true':
 94 #print keyword_check(t,keyword)
 95 new_list.append(t) 
 96 return new_list
 97 
 98 def put_url_list_in_dict(url_list): #把队列转换成字典，并返回
 99 if len(url_list) == 0:
100 temp_dict = {}
101 return temp_dict
102 else:
103 temp_dict = {}.fromkeys(url_list,1)
104 return temp_dict
105 
106  
107 
108 
class SpiderThread(threading.Thread):
    """Crawler worker: takes ``(depth, url)`` tuples off a shared queue forever.

    Args:
        urlqueue: queue of ``(depth, url)`` tuples shared by all workers.
        readurls: list handed in by the caller; stored but not used here.
        key: keyword forwarded to ``clean_list``.
        deep: pages whose depth is >= this value are not fetched.
    """

    def __init__(self, urlqueue, readurls, key, deep):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.key = key
        self.readurls = readurls
        self.deep = deep
        self.urls = readurls

    def geturl(self, urltuple):
        """Fetch one page and enqueue the new links found on it."""
        deep_count, url = urltuple
        if deep_count >= self.deep:  # depth limit reached
            return
        html, response_code = url_open(url)
        url_list = html_open(html, response_code)
        cleaned_list = clean_list(url, url_list, self.key)
        if not cleaned_list:
            return
        # Update the shared dict in place: rebuilding and rebinding it copied
        # every entry on each page and could drop entries that another thread
        # added in between.
        url_dict.update(put_url_list_in_dict(cleaned_list))
        for link in cleaned_list:
            self.urlqueue.put((deep_count + 1, link))

    def run(self):
        """Process queue items until the (daemon) thread is torn down."""
        while True:
            urltuple = self.urlqueue.get()
            try:
                deep_count, url = urltuple
                total_url_list.append(url)
                self.geturl(urltuple)
            except Exception as exc:
                # Keep the worker alive; one bad page must not stop the crawl.
                sys.stderr.write('crawl failed for %r: %s\n' % (urltuple, exc))
            finally:
                # Must run for every get(), otherwise Queue.join() never returns.
                self.urlqueue.task_done()
144 
145 
def write_in_txt(url_list, url, start_time):
    """Write the crawl result to ``log_<host>.txt`` in the current directory.

    Args:
        url_list: every URL that was crawled (duplicates allowed).
        url: the root URL; its network location names the log file.
        start_time: ``time.time()`` value taken when the crawl started.

    The file holds the elapsed seconds, the number of distinct URLs and then
    the distinct URLs in sorted order, one per line. The raw and the distinct
    counts are also printed.
    """
    print(len(url_list))
    set_url_list = sorted(set(url_list))
    print(len(set_url_list))
    host = urlparse.urlparse(url)[1]
    # ':' (host:port) is not allowed in Windows file names.
    file_name = 'log_%s.txt' % host.replace(':', '_')
    # os.getcwd() gives the same directory the old ``os.popen('cd')`` call
    # returned on Windows, and also works on other platforms.
    log_path = os.path.join(os.getcwd(), file_name)
    spend_time = time.time() - start_time
    # ``with`` guarantees the file is flushed and closed.
    with open(log_path, 'w') as file_object:
        file_object.write(str(spend_time) + '\n')
        file_object.write(str(len(set_url_list)) + '\n')
        for i in set_url_list:
            file_object.write(i + '\n')
    print('write done')
161 
162 
def monitor_url_list(urlqueue, interval_time, start_time):
    """Print crawl progress roughly every ``interval_time`` seconds.

    Args:
        urlqueue: the shared URL queue.
        interval_time: seconds between two reports (at least 1 second is
            always waited).
        start_time: ``time.time()`` value taken when the crawl started.

    The loop ends at the first check that finds the queue empty.
    NOTE(review): an empty queue does not prove that the workers are idle,
    so the monitor may stop while pages are still being fetched.
    """
    # Never hand a negative value to time.sleep() when interval_time < 1.
    remaining_sleep = max(interval_time - 1, 0)
    while True:
        time.sleep(1)
        # The old ``if not urlqueue.qsize() < 0`` guard was always true.
        print('The crawler have analyzed  %s URL  Spent %.2f s!!  The total number of url :  %s'
              % (len(total_url_list), time.time() - start_time, urlqueue.qsize()))
        time.sleep(remaining_sleep)
        if urlqueue.qsize() == 0:
            break
173  
174 
175 
def work(url, deep, threads, key):
    """Crawl from ``url`` and write the result to a log file.

    Args:
        url: the root URL to start from.
        deep: how many levels of links to follow.
        threads: number of crawler worker threads to start.
        key: keyword that a URL must contain to be followed.
    """
    start_time = time.time()
    urlqueue = Queue.Queue(0)
    readurls = []
    # The root URL is enqueued at depth 1, hence the limit is deep + 1.
    depth_limit = deep + 1

    for _ in range(threads):
        worker = SpiderThread(urlqueue, readurls, key, depth_limit)
        worker.setDaemon(True)
        worker.start()

    # Progress reporter, started before the first URL is queued.
    monitor_thread = threading.Thread(
        target=monitor_url_list,
        args=(urlqueue, 10, start_time),
    )
    monitor_thread.start()

    urlqueue.put((1, url))
    # Blocks until every queued item has been marked done.
    urlqueue.join()

    print('----------------------------------------')
    write_in_txt(total_url_list, url, start_time)
    elapsed = time.time() - start_time
    print(' '.join(['spend time:', str(elapsed)]))
194 #a = {'1':'2','3':'4','5':'6','7':'8','9':'q'}
195 #b = {'1':'2','33':'43','54':'6a','d':'8e','9e':'q'}
196 #print a,b
197 #print dict(a.items()+b.items())
# Script entry point: crawl the sample site 5 levels deep with 10 threads,
# following only URLs that contain 'cits'.
if __name__ == "__main__":
    try:
        # Indented with ASCII spaces: the ideographic spaces (U+3000) used
        # before are not valid indentation. work() returns nothing, so its
        # result is no longer stored.
        work('http://www.cits.com.cn/', 5, 10, 'cits')
    except KeyboardInterrupt:
        print("User Press Ctrl+C,Exit")

0x03 总结
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

这个爬虫实现的功能
多线程
深度
关键词
记录存储

posted on 2013-12-07 09:08 WK23 阅读(1304) 评论(0) 收藏举报

刷新页面返回顶部

WK23

如何写python爬虫

导航

公告