5.13
Python 爬虫
• 所花时间:3
• 代码行数:489
• 博客容量:1
• 代码如下:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
# Request headers sent with every page fetch; the User-Agent makes the
# request look like it comes from a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}
# Scrape one year's ranking page
def fetch_university_rankings(year):
    """Fetch the top-10 rows of the ShanghaiRanking BCUR page for one year.

    Downloads ``https://www.shanghairanking.cn/rankings/bcur/{year}`` and
    reads the rows of the ``<table class="rk-table">`` element in the
    returned HTML.

    Args:
        year: Ranking year; interpolated into the page URL.

    Returns:
        A list of ``(rank, name)`` tuples, in page order, for every row
        whose rank is 10 or lower. An empty list is returned when the page
        contains no ranking table.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    url = f'https://www.shanghairanking.cn/rankings/bcur/{year}'
    # Side effect kept from the original: selects the font matplotlib uses
    # later (presumably so the Chinese chart labels render).
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'  # decode the body as UTF-8
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'rk-table'})
    body = table.find('tbody') if table is not None else None
    if body is None:
        return []
    rankings = []
    for row in body.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) < 2:
            # Not a data row (no rank + name cells).
            continue
        try:
            rank = int(columns[0].text.strip())
        except ValueError:
            # Rank cell is not a plain integer (e.g. empty or a range).
            continue
        # The name cell holds several lines; the first one is the
        # Chinese university name.
        name = columns[1].text.strip().split('\n')[0].strip()
        if rank <= 10:
            rankings.append((rank, name))
    return rankings
# Print the top-10 list of every year
def print_top_10(rankings):
    """Print the ranked universities for every year in ``rankings``.

    For each year a heading line is printed, then one line per
    ``(rank, name)`` entry, followed by two blank lines that separate it
    from the next year's list. Years are printed in the mapping's own
    iteration order.

    Args:
        rankings: Mapping of year -> list of ``(rank, name)`` tuples.
    """
    for year, ranking in rankings.items():
        print(f"{year}年排名前10的大学:")
        for rank, name in ranking:
            print(f"排名 {rank}: {name}")
        # Separator after each year's block.
        print("\n")
# Collect the top-10 lists for 2015 through 2019 (inclusive), keyed by year,
# then print them.
rankings = {
    year: fetch_university_rankings(year)
    for year in range(2015, 2019 + 1)
}
print_top_10(rankings)
# Data visualisation
def plot_rankings(rankings):
    """Plot each listed university's rank across the years in ``rankings``.

    One line is drawn per university that appears in any year. A year in
    which a university is missing from the list is plotted as NaN, which
    leaves a gap in that university's line. The y axis is inverted so that
    rank 1 sits at the top, and the figure is displayed with ``plt.show()``.

    Args:
        rankings: Mapping of year -> list of ``(rank, name)`` tuples.

    Returns:
        None. Nothing is drawn when ``rankings`` is empty.
    """
    if not rankings:
        return
    # Select the font for the Chinese labels here as well, so plotting does
    # not rely on it having been configured elsewhere.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

    # Use the years actually present instead of a hard-coded range, so a
    # missing or extra year cannot raise KeyError.
    years = sorted(rankings)

    # year -> {name: rank}; the first occurrence of a name wins.
    rank_lookup = {}
    for year in years:
        per_year = rank_lookup.setdefault(year, {})
        for rank, name in rankings[year]:
            per_year.setdefault(name, rank)

    # First-seen order (not a set) keeps legend and line colours stable
    # from run to run.
    universities = list(
        dict.fromkeys(name for year in years for name in rank_lookup[year])
    )

    plt.figure(figsize=(14, 7))
    missing = float('nan')
    for university in universities:
        ranks = [rank_lookup[year].get(university, missing) for year in years]
        plt.plot(years, ranks, marker='o', label=university)

    plt.gca().invert_yaxis()
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    # Yields '2015-2019年前10位大学排名波动' for the 2015-2019 data set.
    plt.title(f'{years[0]}-{years[-1]}年前10位大学排名波动')
    plt.legend()
    plt.show()
# Draw the rank-fluctuation chart for the rankings collected above.
plot_rankings(rankings)
# Look up the rank of a specific university
def query_ranking(rankings):
    """Interactively look up a university's rank in a given year.

    Repeatedly prompts on stdin for a university name and a year and prints
    the matching rank, or a "not found" message. The loop ends when the
    user enters '退出' as the university name. A year that is not an
    integer is reported and the prompt starts over instead of crashing.

    Args:
        rankings: Mapping of year -> list of ``(rank, name)`` tuples.
    """
    while True:
        university = input("请输入大学名称(或输入'退出'结束查询):").strip()
        if university == '退出':
            break
        try:
            year = int(input("请输入查询年份(2015-2019):"))
        except ValueError:
            print("年份必须是整数,请重新输入。")
            continue
        if year not in rankings:
            print(f"未找到 {year} 年的排名信息。")
            continue
        # First entry whose name matches exactly, or None when absent.
        rank = next(
            (r for r, name in rankings[year] if name == university),
            None,
        )
        if rank is None:
            print(f"未找到 {university} 在 {year} 年的排名信息。")
        else:
            print(f"{year}年 {university} 的排名是: {rank}")
# Start the interactive lookup prompt; it runs until the user enters '退出'.
query_ranking(rankings)
浙公网安备 33010602011771号