1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Date : 2017-08-24 10:17:28
4 # @Author : EnderZhou (zptxwd@gmail.com)
5 # @Link : http://www.cnblogs.com/enderzhou/
6 # @Version : $Id$
7
8 import requests
9 from bs4 import BeautifulSoup as bs
10 import threading
11 import Queue
12 import urllib
13
class jiandan_ooxx(threading.Thread):
    """Worker thread that scrapes jandan.net ooxx pages and downloads images.

    Each worker repeatedly pulls a listing-page URL from the shared queue
    and saves every image found on that page into the current directory.
    """

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue  # shared Queue.Queue of page URLs to scrape

    def run(self):
        # empty()/get_nowait() is racy when several workers drain the same
        # queue: another thread may take the last item between the check and
        # the get. Catch Queue.Empty instead of trusting empty() alone.
        while not self._queue.empty():
            try:
                url = self._queue.get_nowait()
            except Queue.Empty:
                break
            self.spider(url)

    def spider(self, url):
        """Fetch one listing page and download every image it references.

        url -- absolute URL of a jandan.net ooxx listing page.
        """
        r = requests.get(url=url)
        soup = bs(r.content, 'html.parser')
        imges = soup.find_all(name='img', attrs={})
        lists = []
        for i in imges:
            tag = str(i)
            if 'border' in tag:
                # Site-decoration images carry a border attribute; skip them.
                continue
            elif 'onload' in tag:
                # Lazy-loaded images keep the real address in org_src.
                src = i['org_src']
            else:
                src = i['src']
            lists.append(src)
            print(src)
            # src is protocol-relative ("//..."); prepend a scheme.
            img = 'http:' + src
            # Derive the local filename from the last path component.
            name = img.split('/')[-1]
            # Download inside the loop so every image on the page is saved,
            # not just the one processed last.
            urllib.urlretrieve(img, filename=name)
42
def main(number, pages=10):
    """Crawl the newest `pages` listing pages with a pool of worker threads.

    number -- the newest (highest) page number currently on the site.
    pages  -- how many of the most recent pages to crawl (default 10).
              Pass pages=number to crawl every page on the site.
    """
    url = 'http://jandan.net/ooxx/page-'
    queue = Queue.Queue()

    # Enqueue page numbers newest-first. The original stop of number-11
    # actually yielded 11 pages despite the 10-page comment; number-pages
    # matches the documented default. max(..., 0) keeps page indices
    # positive when number is small.
    for i in xrange(number, max(number - pages, 0), -1):
        queue.put(url + str(i))

    thread_count = 10
    threads = [jiandan_ooxx(queue) for _ in range(thread_count)]

    for t in threads:
        t.start()
    for t in threads:
        t.join()
61
if __name__ == '__main__':
    # Fetch the front page to discover the newest page number, then crawl.
    response = requests.get('http://jandan.net/ooxx')
    page_soup = bs(response.content, 'html.parser')
    spans = page_soup.find_all(name='span', attrs={'class': 'current-comment-page'})
    # The span text looks like "[123]"; strip the surrounding brackets.
    latest = int(spans[1].string[1:-1])
    main(latest)