import re
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
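# Academy code -> how many student ids to generate for that academy (2014 intake)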
peoples = {'011': 71, '012': 66, '013': 54, '014': 50, '015': 66, '041': 61,
'042': 103, '044': 31, '061': 32, '062': 41, '063': 33, '073': 93, '074': 50, '077': 108, '081': 55,
'083': 55, '084': 92, '102': 56, '105': 29, '106': 27,
'107': 25, '108': 25, '141': 50, '143': 66, '144': 68, '161': 52, '162': 50, '163': 50, '164': 52, '167': 50,
'181': 133, '201': 166, '202': 10, '203': 8, '204': 99, '211': 18,
'212': 50, '213': 24, '214': 19, '215': 25, '216': 24, '217': 24, '221': 67, '222': 52, '224': 67,
'261': 67, '271': 8, '274': 31, '291': 82, '292': 62, '296': 8, '312': 104, '341': 52, '316': 52, '331': 47,
'332': 56, '333': 72, '335': 57, '351': 36, '352': 50, '371': 120, '372': 50,
'373': 56}
class AsySpider(object):
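    """A simple asynchronous spider built on Tornado coroutines and queues."""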
    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        urls.reverse()  # reverse in place so pop() consumes urls in their original order
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()
        self.results = results if results is not None else []
    def fetch(self, url, **kwargs):
        return httpclient.AsyncHTTPClient().fetch(url, raise_error=False, **kwargs)
def handle_html(self, url, html):
"""handle html page"""
print(url)
    def handle_response(self, url, response):
        """Inherit and override this method if necessary."""
        if isinstance(response, Exception):
            return  # get_page() hands back the exception object on failure
        if response.code == 200:
            self.handle_html(url, response.body)
        elif response.code == 599:  # connection/timeout error: put back and retry
            self._fetching.remove(url)
            self._q.put(url)
@gen.coroutine
def get_page(self, url):
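        """Fetch a single url, returning the response (or the exception on failure)."""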
try:
response = yield self.fetch(url)
# print('######fetched %s' % url)
except Exception as e:
print('Exception: %s %s' % (e, url))
raise gen.Return(e)
raise gen.Return(response)
@gen.coroutine
def _run(self):
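        """Seed the queue, start the workers, and wait until every url is done."""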
@gen.coroutine
def fetch_url():
current_url = yield self._q.get()
try:
if current_url in self._fetching:
return
# print('fetching****** %s' % current_url)
self._fetching.add(current_url)
response = yield self.get_page(current_url)
                self.handle_response(current_url, response)  # handle response
self._fetched.add(current_url)
for i in range(self.concurrency):
if self.urls:
yield self._q.put(self.urls.pop())
finally:
self._q.task_done()
@gen.coroutine
def worker():
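            # each worker loops forever; _run returns once the queue's join() resolves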
while True:
yield fetch_url()
self._q.put(self.urls.pop()) # add first url
# Start workers, then wait for the work queue to be empty.
for _ in range(self.concurrency):
worker()
yield self._q.join(timeout=timedelta(seconds=300000))
try:
assert self._fetching == self._fetched
except AssertionError:
print(self._fetching - self._fetched)
print(self._fetched - self._fetching)
def run(self):
io_loop = ioloop.IOLoop.current()
io_loop.run_sync(self._run)
class MySpider(AsySpider):
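    """Spider that downloads student photos, authenticating with a session cookie."""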
    def fetch(self, url, **kwargs):
        """Override the parent fetch method to send the login cookie and UA header."""
        cookies_str = 'JSESSIONID=0000n4jBi_dKg91XbtHHQHDeeDL:1b4e17j2v; iPlanetDire' \
                      'ctoryPro=AQIC5wM2LY4Sfcxu%' \
                      '2FWPIJWGHttZPiXafd%2B1gowyEoxTmyiY%3D%40AAJTSQACMDE%3D%23'
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str
        }
        return super(MySpider, self).fetch(url, headers=headers, **kwargs)
    def handle_html(self, url, html):
        # Append a sentinel so the greedy regex captures everything after 'ownerId='
        url += 'qwertyu'
        pattern = re.compile('userPhoto&ownerId=(.*)qwertyu')
        filename = re.findall(pattern, url)[0]
        # Note: change save_dir to wherever you want the photos stored, e.g. C:/picture/
        save_dir = '/home/innovation/文档/pic/'
        with open(save_dir + filename + '.jpg', 'wb') as f:
            f.write(html)
def main():
    urls = []
    url_pic = ('http://myportal.sxu.edu.cn/attachmentDownload.portal'
               '?notUseCache=true&type=userPhoto&ownerId=')
    for academy, count in peoples.items():
        # Student numbers within an academy run from 1 to count,
        # zero-padded to three digits, e.g. 2014011001.
        for i in range(1, count + 1):
            urls.append(url_pic + '2014' + academy + '%03d' % i)
    s = MySpider(urls)
    s.run()
if __name__ == '__main__':
main()