Python experiment: scraping and charting Chinese university rankings

The script below scrapes the ShanghaiRanking Best Chinese Universities Ranking (https://www.shanghairanking.cn/rankings/bcur) for 2015-2019, prints the top universities of each year, and defines matplotlib helpers for plotting ranking trends and yearly top-10 charts.

import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os
import re
import time
import random
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Use a CJK-capable font so Chinese labels render correctly in charts

plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False  # Render minus signs correctly with CJK fonts

# Pool of User-Agent strings; a random one is chosen for each request

USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
]


def parse_ranking_data(html_content, year):
    """Parse the ranking table out of the page HTML and log progress along the way."""
    print(f"开始解析{year}年排名数据...")
    soup = BeautifulSoup(html_content, 'html.parser')
    ranking_data = []

    # Locate the ranking table: try the primary class first, then a fallback
    table = soup.find('table', class_='rk-table')
    if not table:
        table = soup.find('table', class_='ranking-table')
        if not table:
            print(f"未找到{year}年的排名表格")
            return []
        else:
            print("找到备用表格,类名: ranking-table")
    else:
        print("找到排名表格,类名: rk-table")

    # Extract the table rows
    rows = table.find_all('tr')
    print(f"找到{len(rows)}行数据")

    for i, row in enumerate(rows[1:], 1):  # skip the header row
        try:
            columns = row.find_all('td')
            if len(columns) < 5:
                continue

            rank = columns[0].text.strip()
            name_elem = columns[1].find('a')
            name = name_elem.text.strip() if name_elem else columns[1].text.strip()
            score = columns[4].text.strip()

            # Strip any non-digit characters from the rank
            rank = re.sub(r'[^0-9]', '', rank)
            if rank:
                ranking_data.append({
                    'year': year,
                    'rank': int(rank),
                    'name': name,
                    'score': float(score)
                })

                # Only print row details for the first 20 records
                if i <= 20:
                    print(f"解析第{i}行: 排名#{rank} - {name},分数: {score}")

        except (AttributeError, IndexError, ValueError) as e:
            print(f"解析第{i}行时出错: {e}")
            continue

    print(f"成功解析{year}年数据,共获取{len(ranking_data)}所大学排名")
    return ranking_data

# Fetch and parse the university ranking for a given year
def fetch_university_ranking(year):
    url = f"https://www.shanghairanking.cn/rankings/bcur/{year}"

    # Build a session that retries transient failures automatically
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504, 403, 429]
    )
    session.mount('https://', HTTPAdapter(max_retries=retries))

    # Random delay so requests are not fired in a tight loop
    wait_time = random.uniform(2, 5)
    print(f"等待{wait_time:.2f}秒后请求{year}年数据...")
    time.sleep(wait_time)

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}

        # Connect/read timeouts in seconds
        response = session.get(url, headers=headers, timeout=(20, 40))
        response.raise_for_status()

        # Force UTF-8 so Chinese text is decoded correctly
        response.encoding = 'utf-8'

        print(f"成功获取{year}年页面数据,状态码: {response.status_code}")
        return parse_ranking_data(response.text, year)

    except requests.exceptions.RequestException as e:
        print(f"请求异常,{year}年数据获取失败: {e}")
        return []
    except Exception as e:
        print(f"处理{year}年数据时出错: {e}")
        return []
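
# Hypothetical smoke test for a single year (example only, not part of the
# original script flow; uncomment to try the fetcher in isolation):
#
#     data_2018 = fetch_university_ranking(2018)
#     print(f"2018: parsed {len(data_2018)} records")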

# Collect ranking data for every year in [start_year, end_year]
def get_ranking_data(start_year, end_year):
    all_data = {}
    for year in range(start_year, end_year + 1):
        print(f"\n{'=' * 50}\n开始爬取{year}年中国大学排名数据\n{'=' * 50}")

        # Try up to max_attempts times before giving up on a year
        max_attempts = 3
        for attempt in range(max_attempts):
            print(f"\n尝试 {attempt + 1}/{max_attempts}")
            data = fetch_university_ranking(year)

            if data:
                all_data[year] = data
                break

            if attempt < max_attempts - 1:
                wait_time = random.uniform(5, 10)
                print(f"等待{wait_time:.2f}秒后进行下一次尝试...")
                time.sleep(wait_time)
        else:
            # for-else: reached only if no attempt succeeded (the loop never broke)
            print(f"多次尝试后仍无法获取{year}年数据")

    return all_data

# Print the top n universities for a given year
def print_top_universities(data, year, n=10):
    if year not in data:
        print(f"没有找到{year}年的数据")
        return

    top_universities = sorted(data[year], key=lambda x: x['rank'])[:n]

    print(f"\n{'=' * 30} {year}年中国大学排名前{n}名 {'=' * 30}")
    print(f"{'排名':<5}{'学校名称':<20}{'总分':<10}")
    print('-' * 60)

    for uni in top_universities:
        print(f"{uni['rank']:<5}{uni['name']:<20}{uni['score']:<10.2f}")

# Plot how the ranking of the given universities changes across years
def visualize_ranking_trend(data, university_names):
    plt.figure(figsize=(12, 8))

    years = sorted(data.keys())
    for name in university_names:
        ranks = []
        for year in years:
            uni_data = next((u for u in data[year] if u['name'] == name), None)
            if uni_data:
                ranks.append(uni_data['rank'])
            else:
                ranks.append(None)

        # Drop years with no data for this university, then plot the trend
        valid_years = [year for year, rank in zip(years, ranks) if rank is not None]
        valid_ranks = [rank for rank in ranks if rank is not None]

        if valid_ranks:
            plt.plot(valid_years, valid_ranks, 'o-', label=name)
            # Annotate each data point with its rank
            for year, rank in zip(valid_years, valid_ranks):
                plt.annotate(f'{rank}', (year, rank), textcoords="offset points",
                             xytext=(0, 10), ha='center')

    plt.title('中国大学排名趋势分析 (2015-2019)')
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.ylim(bottom=0)
    plt.gca().invert_yaxis()  # A smaller rank is better, so invert the Y axis

    # Save the chart to disk
    if not os.path.exists('charts'):
        os.makedirs('charts')
    chart_path = os.path.join('charts', 'university_ranking_trend.png')
    plt.savefig(chart_path, dpi=300, bbox_inches='tight')
    print(f"图表已保存至: {chart_path}")

    plt.show()

# Interactive lookup of a university's rank in a given year
def query_university_ranking(data):
    while True:
        year = input("\n请输入要查询的年份 (2015-2019,输入q退出): ")
        if year.lower() == 'q':
            break

        try:
            year = int(year)
            if year not in data:
                print(f"抱歉,没有找到{year}年的数据。")
                continue
        except ValueError:
            print("输入的年份格式不正确,请输入一个整数年份。")
            continue

        name = input("请输入要查询的大学名称: ")
        university = next((u for u in data[year] if u['name'] == name), None)

        if university:
            print(f"{year}年 {name} 的排名是: 第{university['rank']}名,得分: {university['score']:.2f}")
        else:
            print(f"抱歉,在{year}年的排名中未找到{name}。")

        choice = input("是否继续查询?(y/n): ").lower()
        if choice != 'y':
            break

# Plot the top n universities of each year as horizontal bar charts
def visualize_top_universities_by_year(data, n=10):
    years = sorted(data.keys())
    top_universities = {}

    # Collect the top n universities for each year
    for year in years:
        top_universities[year] = sorted(data[year], key=lambda x: x['rank'])[:n]

    plt.figure(figsize=(15, 10))

    for i, year in enumerate(years):
        plt.subplot(2, 3, i + 1)
        universities = top_universities[year]
        names = [u['name'] for u in universities]
        scores = [u['score'] for u in universities]

        plt.barh(names, scores, color='skyblue')
        plt.xlabel('总分')
        plt.title(f'{year}年前{n}名大学')
        plt.grid(axis='x', linestyle='--', alpha=0.7)

        # Label each bar with its score
        for j, score in enumerate(scores):
            plt.text(score + 1, j, f'{score:.1f}', va='center')

    plt.tight_layout()

    # Save the chart to disk
    if not os.path.exists('charts'):
        os.makedirs('charts')
    chart_path = os.path.join('charts', 'top_universities_by_year.png')
    plt.savefig(chart_path, dpi=300, bbox_inches='tight')
    print(f"图表已保存至: {chart_path}")

    plt.show()

def main():
    # Make sure console output can handle UTF-8 (Chinese) text
    try:
        import sys
        if sys.stdout.encoding and sys.stdout.encoding.lower() != 'utf-8':
            if hasattr(sys.stdout, 'reconfigure'):
                sys.stdout.reconfigure(encoding='utf-8')
                print("已将控制台输出编码设置为UTF-8")
    except Exception:
        print("无法自动设置控制台编码,请确保终端支持UTF-8")

    print("=" * 50)
    print(" 软科中国最好大学排名数据分析工具 ")
    print("=" * 50)

    # Fetch the data
    start_year = 2015
    end_year = 2019
    ranking_data = get_ranking_data(start_year, end_year)

    if not ranking_data:
        print("没有获取到任何排名数据,程序退出。")
        return

    # Save the raw data for later debugging/inspection
    if not os.path.exists('data'):
        os.makedirs('data')
    import json
    with open('data/university_ranking_data.json', 'w', encoding='utf-8') as f:
        json.dump(ranking_data, f, ensure_ascii=False, indent=2)
    print("\n数据已保存到 data/university_ranking_data.json")

    # Print the top 10 universities for each year
    for year in sorted(ranking_data.keys()):
        print_top_universities(ranking_data, year)
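
    # The plotting and query helpers are defined above but never called here.
    # A plausible continuation of main() (assumed; the university names below
    # are only examples) would be:
    #
    #     visualize_ranking_trend(ranking_data, ['清华大学', '北京大学', '浙江大学'])
    #     visualize_top_universities_by_year(ranking_data)
    #     query_university_ranking(ranking_data)


# Assumed entry point so the script runs when executed directly
if __name__ == '__main__':
    main()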
