Python 3 example: scraping images from a web page
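This is a small Python 3 script that walks a book's index page, collects the chapter links, and downloads every image in each chapter into a folder named after the book and chapter. It uses requests for HTTP and BeautifulSoup for parsing; the class names it matches on (list-unstyled bookAll-item-list, comic_img lazy, down-page) are specific to the target site.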

import os
import sys
import time
import requests
from requests.exceptions import RequestException, Timeout, ConnectionError, HTTPError
from bs4 import BeautifulSoup

url_head = 'https://xxxx.xxxx'  # base URL of the target site (placeholder)

# Spoof a browser User-Agent; some sites reject the default requests UA
request_headers = {'User-Agent': 'Mozilla/5.0'}

def get_url_list(url_index):
    # Fetch the index page, collect the chapter links, and read the book title
    response = requests.get(url_index, headers=request_headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    urls_recent = soup.find('ul', class_='list-unstyled bookAll-item-list')
    # The first matching list is the "recent" block; take the links from the next one
    urls = urls_recent.find_next('ul', class_='list-unstyled bookAll-item-list').find_all('a')
    # Walk forward through the page structure to the <h1> that holds the book title
    root_name = soup.find('div', class_='container').find_next('div', class_='mip-box mip-info book-top').find_next('div', class_='mip-box-body').find_next('div', class_='right').find_next('h1').text
    print('root_name: ' + root_name)
    os.makedirs(root_name, exist_ok=True)
    return urls, root_name
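The title lookup above leans on BeautifulSoup's find_next, which searches forward through the rest of the document from the current tag instead of only among its children. A minimal sketch of that behavior, with made-up HTML just for illustration:

from bs4 import BeautifulSoup

html = '<div class="container"><div class="right"><h1>Some Book</h1></div></div>'
doc = BeautifulSoup(html, 'html.parser')
# find() locates the container; find_next() then walks forward to the <h1>
print(doc.find('div', class_='container').find_next('h1').text)  # Some Book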

def get_url_resource(url_page, root):
    # url_page is an <a> tag from the index list; its title names the chapter
    print(url_page)
    url_title = url_page.get('title')
    url_addr = url_head + url_page.get('href')
    chapter_dir = os.path.join(root, url_title)
    get_url_resource_store(url_addr, chapter_dir)
    print(chapter_dir)

def get_url_resource_store(url_page_first, dir_path):
    next_url = url_page_first
    i = 0
    should_break = False
    os.makedirs(dir_path, exist_ok=True)
    while next_url:
        response = requests.get(next_url, headers=request_headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        image_list = soup.find_all('img', class_='comic_img lazy')
        for img in image_list:
            print(img)
            filename = str(i) + '.jpg'
            img_data_accessed = False
            img_data_retry = 0
            if img.get('data-original'):  # lazy-loaded images keep the real URL here
                filepath = os.path.join(dir_path, filename)
                if not os.path.exists(filepath):  # skip files already downloaded
                    while not img_data_accessed:
                        try:
                            img_response = requests.get(img.get('data-original'), headers=request_headers, timeout=30)
                            img_response.raise_for_status()
                            img_data = img_response.content
                            if img_data:
                                try:
                                    with open(filepath, 'wb') as f:
                                        f.write(img_data)
                                    img_data_accessed = True
                                except Exception as e:
                                    print(f"write failed {img.get('data-original')}: {str(e)}: {i}")
                                    break
                        except ConnectionError:
                            print("connection error")
                        except Timeout:
                            print("timeout")
                        except HTTPError as e:
                            print(f"http error: {e.response.status_code} - {e.response.reason}")
                        except RequestException as e:
                            print(f"request error: {str(e)}")
                        except Exception as e:
                            print(f"other error: {str(e)}")
                        if img_data_accessed:
                            break
                        img_data_retry += 1
                        time.sleep(img_data_retry)  # back off a little longer on each retry
                        if img_data_retry > 10:
                            print(f'retry:{img_data_retry} failed, break')
                            break
            i += 1
        # '本章结束' means "end of chapter": stop following pages for this chapter
        end_pages = soup.find_all('a', class_='chapter-list')
        for end_page in end_pages:
            if end_page and end_page.text == '本章结束':
                print('this chapter end, should_break=True')
                should_break = True
                break
        if should_break:
            break
        # Follow the '下一页' ("next page") link; if none is found, stop
        next_urls = soup.find_all('a', class_='down-page')
        should_break = True
        if next_urls:
            for url_t in next_urls:
                if url_t.text == '下一页':
                    next_url = url_head + url_t.get('href')
                    print('find next url: ' + next_url)
                    should_break = False
                    break
        if should_break:
            break
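The download loop above retries failed requests with a linearly growing sleep. The same idea could be pulled out into a small helper; this is only a sketch reusing the imports and request_headers defined above, not part of the original script:

def fetch_with_retry(url, max_retries=10):
    # GET with retry and a delay that grows by one second per attempt
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=request_headers, timeout=30)
            resp.raise_for_status()
            return resp.content
        except RequestException as e:
            print(f'attempt {attempt} failed: {e}')
            time.sleep(attempt)
    return None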

if len(sys.argv) < 2:
    print('usage: this.py url_index_text')
    sys.exit(1)

url_list, root_name = get_url_list(sys.argv[1])

for url in url_list:
    get_url_resource(url, root_name)
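To run it, pass the index page of a book as the only argument (placeholder URL, matching url_head above):

python3 this.py 'https://xxxx.xxxx/xxxx'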
