#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/7/11 18:57
# @Author : 李振华
# Fetch every image on a page quickly with multiple threads
import os
import queue
import threading
from random import Random
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser below also requires the lxml package
# Parse the page and push every image URL onto the queue
def parse(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:  # skip <img> tags without a src attribute
            continue
        q.put(urljoin(url, src))  # resolve relative src values against the page URL
    print('images found: %d' % q.qsize())
# Generate a random string (used as the image file name)
def random_str(randomlength=8):
    strs = ''
    chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789'
    length = len(chars) - 1
    random = Random()
    for i in range(randomlength):
        strs += chars[random.randint(0, length)]
    return strs
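# Alternative sketch (not part of the original script): uuid4 gives practically
# collision-free file names without a hand-rolled random string; the helper name
# is illustrative and nothing below calls it.
def unique_name(length=8):
    import uuid
    return uuid.uuid4().hex[:length]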
# Download one image into the image/ directory
def download_image(image_url):
    os.makedirs('image', exist_ok=True)  # safe even when several threads race to create it
    image_name = random_str()
    image = requests.get(image_url, stream=True)  # stream so iter_content reads in chunks
    with open('image/%s.jpg' % image_name, 'wb') as img:
        for b in image.iter_content(2048):
            img.write(b)
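# Alternative sketch (assumption, not used below): a bounded thread pool instead of
# one thread per image, so a page with hundreds of <img> tags does not spawn
# hundreds of threads; the pool size of 8 is an illustrative choice.
def download_all_pooled(image_queue, max_workers=8):
    from concurrent.futures import ThreadPoolExecutor
    urls = []
    while not image_queue.empty():
        urls.append(image_queue.get())
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pool.map(download_image, urls)  # exiting the with-block waits for all downloads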
# Main entry point: parse the page first, then download each image in its own thread
if __name__ == '__main__':
    q = queue.Queue()
    url = 'http://enrz.com/fhm/2017/05/11/90122.html'
    parse_thread = threading.Thread(target=parse, args=(url,))
    parse_thread.start()
    parse_thread.join()  # wait until every image URL is in the queue
    download_thread_pool = []
    while not q.empty():  # one download thread per image
        download_thread = threading.Thread(target=download_image, args=(q.get(),))
        download_thread.start()
        download_thread_pool.append(download_thread)
    for thread in download_thread_pool:
        thread.join()
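# Fully concurrent variant (sketch, not part of the script above): start download
# workers first and let them drain the queue while parse() is still filling it;
# the None sentinel is an illustrative shutdown convention.
#
#     def worker():
#         while True:
#             src = q.get()
#             if src is None:  # sentinel: no more images
#                 break
#             download_image(src)
#
#     workers = [threading.Thread(target=worker) for _ in range(4)]
#     for w in workers:
#         w.start()
#     parse(url)  # feed the queue from the main thread
#     for _ in workers:
#         q.put(None)  # one sentinel per worker
#     for w in workers:
#         w.join()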