Python experiment: crawling and visualizing the ShanghaiRanking (软科) Best Chinese Universities Ranking (2015-2019)
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os
import re
import time
import random
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Configure a CJK-capable font so Chinese labels render correctly
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False # 解决负号显示问题
# Pool of User-Agent strings; one is picked at random for each request
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
]
# Fetch the university ranking data for a given year
def fetch_university_ranking(year):
url = f"https://www.shanghairanking.cn/rankings/bcur/{year}"
    # Create a session with automatic retries
session = requests.Session()
retries = Retry(
total=5,
backoff_factor=1,
status_forcelist=[500, 502, 503, 504, 403, 429]
)
session.mount('https://', HTTPAdapter(max_retries=retries))
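    # With backoff_factor=1, urllib3 waits between retries with roughly exponentially
    # increasing delays (on the order of 1s, 2s, 4s, ...); the exact schedule depends on the
    # installed urllib3 version. 403/429 are included so temporary blocks are retried as well.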
    # Random delay before each request to avoid hammering the server
wait_time = random.uniform(2, 5)
print(f"等待{wait_time:.2f}秒后请求{year}年数据...")
time.sleep(wait_time)
try:
headers = {"User-Agent": random.choice(USER_AGENTS)}
# 设置超时
response = session.get(url, headers=headers, timeout=(20, 40))
        response.raise_for_status()
        # Force UTF-8 so Chinese text in the page decodes correctly
        response.encoding = 'utf-8'
print(f"成功获取{year}年页面数据,状态码: {response.status_code}")
return parse_ranking_data(response.text, year)
except requests.exceptions.RequestException as e:
print(f"请求异常,{year}年数据获取失败: {e}")
return []
except Exception as e:
print(f"处理{year}年数据时出错: {e}")
return []
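# Hypothetical usage (performs a live request, so it is not executed at import time):
#   universities_2019 = fetch_university_ranking(2019)
# On success this returns a list of dicts with the keys 'year', 'rank', 'name' and 'score';
# on failure it returns an empty list.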
def parse_ranking_data(html_content, year):
"""解析排名数据并输出解析过程"""
print(f"开始解析{year}年排名数据...")
soup = BeautifulSoup(html_content, 'html.parser')
ranking_data = []
    # Locate the ranking table
table = soup.find('table', class_='rk-table')
if not table:
table = soup.find('table', class_='ranking-table')
if not table:
print(f"未找到{year}年的排名表格")
return []
else:
print(f"找到备用表格,类名: ranking-table")
else:
print(f"找到排名表格,类名: rk-table")
    # Extract rows from the table
rows = table.find_all('tr')
print(f"找到{len(rows)}行数据")
for i, row in enumerate(rows[1:], 1): # 跳过表头
try:
columns = row.find_all('td')
if len(columns) < 5:
continue
rank = columns[0].text.strip()
name_elem = columns[1].find('a')
name = name_elem.text.strip() if name_elem else columns[1].text.strip()
score = columns[4].text.strip()
# 处理排名中的非数字字符
rank = re.sub(r'[^0-9]', '', rank)
if rank:
ranking_data.append({
'year': year,
'rank': int(rank),
'name': name,
'score': float(score)
})
# 只输出前20条记录的详细信息
if i <= 20:
print(f"解析第{i}行: 排名#{rank} - {name},分数: {score}")
except (AttributeError, IndexError, ValueError) as e:
print(f"解析第{i}行时出错: {e}")
continue
print(f"成功解析{year}年数据,共获取{len(ranking_data)}所大学排名")
return ranking_data
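# Note: the parser above assumes the column layout used by the shanghairanking.cn table at the
# time of writing: column 0 = rank, column 1 = school name (inside an <a> tag when present),
# column 4 = total score. If the site changes its table structure, these indices must be updated.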
# Collect ranking data for a range of years
def get_ranking_data(start_year, end_year):
all_data = {}
for year in range(start_year, end_year + 1):
print(f"\n{'=' * 50}\n开始爬取{year}年中国大学排名数据\n{'=' * 50}")
# 多次尝试获取数据
max_attempts = 3
for attempt in range(max_attempts):
print(f"\n尝试 {attempt + 1}/{max_attempts}")
data = fetch_university_ranking(year)
if data:
all_data[year] = data
break
if attempt < max_attempts - 1:
wait_time = random.uniform(5, 10)
print(f"等待{wait_time:.2f}秒后进行下一次尝试...")
time.sleep(wait_time)
else:
print(f"多次尝试后仍无法获取{year}年数据")
return all_data
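# get_ranking_data returns a dict keyed by year ({2015: [...], 2016: [...], ...});
# years that still fail after three attempts are simply left out of the result.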
# Print the top-n universities for a given year
def print_top_universities(data, year, n=10):
if year not in data:
print(f"没有找到{year}年的数据")
return
top_universities = sorted(data[year], key=lambda x: x['rank'])[:n]
print(f"\n{'=' * 30} {year}年中国大学排名前{n}名 {'=' * 30}")
print(f"{'排名':<5}{'学校名称':<20}{'总分':<10}")
print('-' * 60)
for uni in top_universities:
print(f"{uni['rank']:<5}{uni['name']:<20}{uni['score']:<10.2f}")
# Plot how the given universities' ranks change across the years
def visualize_ranking_trend(data, university_names):
plt.figure(figsize=(12, 8))
years = sorted(data.keys())
for name in university_names:
ranks = []
for year in years:
uni_data = next((u for u in data[year] if u['name'] == name), None)
if uni_data:
ranks.append(uni_data['rank'])
else:
ranks.append(None)
# 过滤掉None值并绘制排名趋势
valid_years = [year for year, rank in zip(years, ranks) if rank is not None]
valid_ranks = [rank for rank in ranks if rank is not None]
if valid_ranks:
plt.plot(valid_years, valid_ranks, 'o-', label=name)
# 在每个数据点上标注排名
for year, rank in zip(valid_years, valid_ranks):
plt.annotate(f'{rank}', (year, rank), textcoords="offset points",
xytext=(0, 10), ha='center')
    plt.title(f'中国大学排名趋势分析 ({years[0]}-{years[-1]})')
plt.xlabel('年份')
plt.ylabel('排名')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.ylim(bottom=0)
plt.gca().invert_yaxis() # 排名数值越小越靠前,所以反转Y轴
    # Save the chart to disk
if not os.path.exists('charts'):
os.makedirs('charts')
chart_path = os.path.join('charts', 'university_ranking_trend.png')
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
print(f"图表已保存至: {chart_path}")
plt.show()
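# Hypothetical usage, assuming ranking_data was produced by get_ranking_data:
#   visualize_ranking_trend(ranking_data, ['清华大学', '北京大学', '浙江大学'])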
# Interactively query a university's rank in a given year
def query_university_ranking(data):
while True:
year = input("\n请输入要查询的年份 (2015-2019,输入q退出): ")
if year.lower() == 'q':
break
try:
year = int(year)
if year not in data:
print(f"抱歉,没有找到{year}年的数据。")
continue
except ValueError:
print("输入的年份格式不正确,请输入一个整数年份。")
continue
name = input("请输入要查询的大学名称: ")
university = next((u for u in data[year] if u['name'] == name), None)
if university:
print(f"{year}年 {name} 的排名是: 第{university['rank']}名,得分: {university['score']:.2f}")
else:
print(f"抱歉,在{year}年的排名中未找到{name}。")
choice = input("是否继续查询?(y/n): ").lower()
if choice != 'y':
break
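# Note: the lookup above uses exact string equality on the school name, so the input must
# match the site's official name (e.g. "清华大学" rather than an abbreviation); fuzzy matching
# would require extra handling.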
# Visualize the top-n universities for each year as bar charts
def visualize_top_universities_by_year(data, n=10):
years = sorted(data.keys())
top_universities = {}
    # Collect the top-n universities for each year
for year in years:
top_universities[year] = sorted(data[year], key=lambda x: x['rank'])[:n]
plt.figure(figsize=(15, 10))
for i, year in enumerate(years):
plt.subplot(2, 3, i + 1)
universities = top_universities[year]
names = [u['name'] for u in universities]
scores = [u['score'] for u in universities]
plt.barh(names, scores, color='skyblue')
plt.xlabel('总分')
plt.title(f'{year}年前{n}名大学')
plt.grid(axis='x', linestyle='--', alpha=0.7)
# 在每个条形上标注分数
for j, score in enumerate(scores):
plt.text(score + 1, j, f'{score:.1f}', va='center')
plt.tight_layout()
    # Save the chart to disk
if not os.path.exists('charts'):
os.makedirs('charts')
chart_path = os.path.join('charts', 'top_universities_by_year.png')
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
print(f"图表已保存至: {chart_path}")
plt.show()
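# Note: the 2x3 subplot grid fits the 2015-2019 range (five years, one slot left empty);
# covering more than six years would require computing the grid size from len(years).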
def main():
# 检查并设置控制台编码
try:
import sys
# 尝试设置控制台输出编码为UTF-8
        if (sys.stdout.encoding or '').lower() != 'utf-8':
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
print("已将控制台输出编码设置为UTF-8")
    except Exception:
print("无法自动设置控制台编码,请确保终端支持UTF-8")
print("=" * 50)
print(" 软科中国最好大学排名数据分析工具 ")
print("=" * 50)
    # Fetch the data
start_year = 2015
end_year = 2019
ranking_data = get_ranking_data(start_year, end_year)
    # Save the raw data for debugging; bail out early if nothing was fetched
    if not ranking_data:
        print("没有获取到任何排名数据,程序退出。")
        return
    os.makedirs('data', exist_ok=True)
    import json
    with open('data/university_ranking_data.json', 'w', encoding='utf-8') as f:
        json.dump(ranking_data, f, ensure_ascii=False, indent=2)
    print("\n数据已保存到 data/university_ranking_data.json")
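    # Note: json.dump converts the integer year keys to strings, so reloading this file
    # would require converting the keys back with int() before reuse.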
    # Print the top-10 universities for each year
for year in sorted(ranking_data.keys()):
print_top_universities(ranking_data, year)