Install the dependencies
pip install requests beautifulsoup4 pandas fake-useragent openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple
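To confirm the install worked, a quick import check is enough (a minimal sketch; it just imports each package and prints a sample User-Agent):

import requests, bs4, pandas, openpyxl
from fake_useragent import UserAgent
print(requests.__version__, bs4.__version__, pandas.__version__, openpyxl.__version__)
print(UserAgent().random)  # should print a random browser User-Agent string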
The script
import requests
import time
import random
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
# Configuration
BASE_URL = "https://www.yhwdt.cn/payh/{}.html"
OUTPUT_FILE = "bank_info.xlsx"
MIN_DELAY = 0.5  # minimum delay between requests (seconds)
MAX_DELAY = 2.0  # maximum delay between requests (seconds)
MAX_RETRY = 3    # maximum retries per page
ERROR_PAGE_URL = "https://www.yhwdt.cn/template/404.htm"

# Build the UserAgent pool once; constructing it per request is slow because
# fake_useragent loads its browser data on instantiation.
ua = UserAgent()

def get_random_ua():
    return ua.random
def crawl_bank_info(page_num):
    headers = {'User-Agent': get_random_ua()}
    url = BASE_URL.format(page_num)
    for attempt in range(MAX_RETRY):
        try:
            # timeout is in seconds; the original value of 1000 effectively disabled it
            response = requests.get(url, headers=headers,
                                    allow_redirects=True, timeout=10)
            # A nonexistent page redirects to the site's 404 template
            if response.url == ERROR_PAGE_URL:
                print(f"Page {page_num} does not exist (redirected to the 404 page)")
                return None
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract the bank name
            name_div = soup.find('div', class_='hh_left_1')
            bank_name = name_div.find('h1').text.strip() if name_div else "N/A"
            # Extract the bank code from the first <li> of the first <ul>
            info_div = soup.find('div', class_='hh_left_2')
            bank_code = "N/A"
            if info_div:
                first_ul = info_div.find('ul')
                if first_ul:
                    first_li = first_ul.find('li')
                    if first_li:
                        # Normalize the full-width colon first, in case the page
                        # writes "行号：..." rather than using an ASCII colon
                        text = first_li.text.strip().replace('：', ':')
                        bank_code = text.split(':')[-1].strip()
            # Keys are kept in Chinese: 银行名称 = bank name, 行号 = bank code
            return {'银行名称': bank_name, '行号': bank_code}
        except Exception as e:
            print(f"Page {page_num}, attempt {attempt + 1} failed: {e}")
            if attempt < MAX_RETRY - 1:
                time.sleep(random.uniform(3, 5))  # back off before retrying
    return None
def main():
    results = []
    page_num = 1
    while True:
        print(f"Crawling page {page_num}...")
        data = crawl_bank_info(page_num)
        if data is None:
            print(f"Page {page_num} is missing or failed after retries; stopping")
            break
        results.append(data)
        print(f"Got: {data['银行名称']} - {data['行号']}")
        # Random delay between requests to avoid being blocked
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        time.sleep(delay)
        page_num += 1
    # Save the results to Excel (requires openpyxl)
    if results:
        df = pd.DataFrame(results)
        df.to_excel(OUTPUT_FILE, index=False)
        print(f"Saved {len(results)} records to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()
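To spot-check the results after a run, reading the spreadsheet back with pandas is the quickest route (a minimal sketch; it assumes the script above has already written bank_info.xlsx to the working directory):

import pandas as pd

df = pd.read_excel("bank_info.xlsx")  # uses openpyxl, installed above
print(df.shape)   # (record count, 2)
print(df.head())  # first few 银行名称 / 行号 rows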