python3 example: scraping images from a web page
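The full script is below. It takes the index-page URL of a book on the target site (the real address is redacted as https://xxxx.xxxx), reads the book title and the chapter list, then walks each chapter page by page, saving every comic image into <book title>/<chapter title>/. All the CSS class names in the selectors are specific to that site's markup, so they will need adjusting for any other site.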

import os
import sys
import time
import requests
from requests.exceptions import RequestException, Timeout, ConnectionError, HTTPError
from bs4 import BeautifulSoup

url_head = 'https://xxxx.xxxx'   # base URL of the target site (redacted)

request_headers = {'User-Agent': 'Mozilla/5.0'}   # pose as a regular browser

def get_url_list(url_index):
    """Fetch the index page, create a directory named after the book, and return the chapter links."""
    response = requests.get(url_index, headers=request_headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # the page carries two 'bookAll-item-list' lists; skip the first (recent updates)
    # and take the chapter links from the second, full list
    urls_recent = soup.find('ul', class_='list-unstyled bookAll-item-list')
    urls = urls_recent.find_next('ul', class_='list-unstyled bookAll-item-list').find_all('a')
    # the book title sits deep inside the page header; use it as the root directory name
    root_name = soup.find('div', class_='container').find_next('div', class_='mip-box mip-info book-top') \
                    .find_next('div', class_='mip-box-body').find_next('div', class_='right') \
                    .find_next('h1').text.strip()
    print('root_name: ' + root_name)
    os.makedirs(root_name, exist_ok=True)
    return urls, root_name

def get_url_resource(url_page, root):
    """Resolve one chapter link and download its images into root/<chapter title>."""
    print(url_page)
    url_title = url_page.get('title')
    url_addr = url_head + url_page.get('href')
    chapter_dir = os.path.join(root, url_title)   # portable path join instead of a hard-coded '\\'
    get_url_resource_store(url_addr, chapter_dir)
    print(chapter_dir)

def get_url_resource_store(url_page_first, dir_path):
    """Walk a chapter starting at url_page_first, following '下一页' links, and save every image."""
    next_url = url_page_first
    i = 0                    # running image index, also used as the file name
    should_break = False
    os.makedirs(dir_path, exist_ok=True)
    while next_url:
        response = requests.get(next_url, headers=request_headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        image_list = soup.find_all('img', class_='comic_img lazy')
        for img in image_list:
            print(img)
            filename = str(i) + '.jpg'
            img_data_accessed = False
            img_data_retry = 0
            img_src = img.get('data-original')    # lazy-loaded images keep the real URL here
            if img_src:                           # skip <img> tags without a source URL
                filepath = os.path.join(dir_path, filename)
                if not os.path.exists(filepath):  # resume support: skip files saved on a previous run
                    while not img_data_accessed:
                        try:
                            img_response = requests.get(img_src, headers=request_headers, timeout=10)
                            img_response.raise_for_status()   # surface 4xx/5xx as HTTPError for the handler below
                            img_data = img_response.content
                            if img_data:
                                try:
                                    with open(filepath, 'wb') as f:
                                        f.write(img_data)
                                    img_data_accessed = True
                                except OSError as e:
                                    print(f'failed to write {filepath}: {e}, image {i}')
                                    break
                        except ConnectionError:
                            print('connection error')
                        except Timeout:
                            print('timeout')
                        except HTTPError as e:
                            print(f'http error: {e.response.status_code} - {e.response.reason}')
                        except RequestException as e:
                            print(f'request error: {e}')
                        except Exception as e:
                            print(f'other error: {e}')
                        if img_data_accessed:
                            break
                        # linear back-off: sleep 1 s, 2 s, ... and give up after 10 retries
                        img_data_retry += 1
                        time.sleep(img_data_retry)
                        if img_data_retry > 10:
                            print(f'retry {img_data_retry} failed, giving up on this image')
                            break
            i += 1  # count every image so file names stay sequential across pages
        # the site marks a chapter's last page with a '本章结束' ("chapter end") link
        end_pages = soup.find_all('a', class_='chapter-list')
        for end_page in end_pages:
            if end_page.text == '本章结束':
                print('chapter finished, stopping')
                should_break = True
                break
        if should_break:
            break
        # otherwise follow the '下一页' ("next page") link; stop when there is none
        next_urls = soup.find_all('a', class_='down-page')
        should_break = True
        for url_t in next_urls:
            if url_t.text == '下一页':
                next_url = url_head + url_t.get('href')
                print('found next url: ' + next_url)
                should_break = False
                break
        if should_break:
            break

if len(sys.argv) < 2:
    print('usage: this.py <index page url>')
    sys.exit(1)

url_list, root_name = get_url_list(sys.argv[1])

for url in url_list:
    get_url_resource(url, root_name)
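To run it, pass the book's index-page URL on the command line; the /book/12345 path below is a hypothetical placeholder, since the real site address is redacted:

    python3 this.py 'https://xxxx.xxxx/book/12345'

Images land in <book title>/<chapter title>/0.jpg, 1.jpg, ... under the current directory, and files that already exist are skipped, so an interrupted run can simply be restarted.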