Web scraper example: exporting the results to Excel

Install the dependencies

pip install requests beautifulsoup4 pandas fake-useragent openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple
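A quick sanity check after installation is to import each dependency (a minimal snippet, not part of the original post; an ImportError tells you which package is missing):

import requests, bs4, pandas, fake_useragent, openpyxl
print("All dependencies are available.")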

The script

import requests
import time
import random
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent

# Configuration
BASE_URL = "https://www.yhwdt.cn/payh/{}.html"
OUTPUT_FILE = "bank_info.xlsx"
MIN_DELAY = 0.5  # minimum delay between requests (seconds)
MAX_DELAY = 2.0  # maximum delay between requests (seconds)
MAX_RETRY = 3  # maximum retries per page
ERROR_PAGE_URL = "https://www.yhwdt.cn/template/404.htm"

# Build the UserAgent once at import time instead of on every request
_UA = UserAgent()


def get_random_ua():
    """Return a random User-Agent string."""
    return _UA.random


def crawl_bank_info(page_num):
    headers = {'User-Agent': get_random_ua()}
    url = BASE_URL.format(page_num)

    for attempt in range(MAX_RETRY):
        try:
            # 10-second timeout so a dead connection fails fast and triggers a retry
            response = requests.get(url, headers=headers, allow_redirects=True, timeout=10)

            # Detect soft 404s: a missing page redirects to the error template
            if response.url == ERROR_PAGE_URL:
                print(f"Page {page_num} does not exist (redirected to the 404 page)")
                return None

            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the bank name
            name_div = soup.find('div', class_='hh_left_1')
            bank_name = name_div.find('h1').text.strip() if name_div else "N/A"

            # Extract the bank code (行号)
            info_div = soup.find('div', class_='hh_left_2')
            bank_code = "N/A"
            if info_div:
                first_ul = info_div.find('ul')
                if first_ul:
                    first_li = first_ul.find('li')
                    if first_li:
                        # The label/value separator may be a full-width colon (：) on Chinese pages
                        text = first_li.text.strip().replace('：', ':')
                        bank_code = text.split(':')[-1].strip()

            # The dict keys become the Excel column headers
            return {'银行名称': bank_name, '行号': bank_code}

        except Exception as e:
            print(f"第{page_num}页第{attempt + 1}次尝试失败: {str(e)}")
            if attempt < MAX_RETRY - 1:
                time.sleep(random.uniform(3, 5))
            else:
                return None


def main():
    results = []
    page_num = 1

    while True:
        print(f"正在爬取第{page_num}页...")
        data = crawl_bank_info(page_num)

        # None means the page does not exist or all retries failed; stop either way
        if data is None:
            print(f"Page {page_num} is unavailable; crawl finished")
            break

        results.append(data)
        print(f"成功获取: {data['银行名称']} - {data['行号']}")

        # Random delay between requests to reduce the chance of being blocked
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        time.sleep(delay)
        page_num += 1

    # Save the collected rows to Excel
    if results:
        df = pd.DataFrame(results)
        df.to_excel(OUTPUT_FILE, index=False)
        print(f"数据已保存到 {OUTPUT_FILE},共 {len(results)} 条记录")


if __name__ == "__main__":
    main()
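Save the script as, say, crawler.py and run python crawler.py; the results land in bank_info.xlsx in the working directory.

Because the script only writes the Excel file once, at the very end, a crash during a long run loses everything collected so far. A minimal checkpointing variation (a sketch, not part of the original script; SAVE_EVERY is a name introduced here for illustration):

import pandas as pd

SAVE_EVERY = 50  # hypothetical checkpoint interval, in pages

def save_checkpoint(results, path):
    # Overwrite the output file with everything collected so far
    pd.DataFrame(results).to_excel(path, index=False)

# Inside main()'s while loop, after results.append(data):
#     if page_num % SAVE_EVERY == 0:
#         save_checkpoint(results, OUTPUT_FILE)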
