#爬虫中国大学排名
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Shared module-level list of scraped rows: fillUnivList() stores one list of
# cell values per table row here, and printUnivList() reads from it.
allUniv = []
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails (connection
        error, timeout after 30 seconds, or a non-2xx HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they take the same
        # "return empty string" path as network failures.
        r.raise_for_status()
        # Force UTF-8 instead of relying on the encoding guessed from headers.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request-related failures are treated as "no content"; anything
        # else (KeyboardInterrupt, programming errors) is allowed to propagate.
        return ""
def fillUnivList(soup):
    """Extract the table rows from *soup* into the module-level ``allUniv``.

    Every ``<tr>`` that contains at least one ``<td>`` becomes one list of
    values. For each cell, up to two values are recorded, in this order:

    1. ``td.string`` when it is neither ``None`` nor the literal ``"[]"``
       (kept verbatim, surrounding whitespace included);
    2. all runs of CJK characters (U+4E00..U+9FFF) found in the cell's
       markup, concatenated into a single string.

    The contents of ``allUniv`` are replaced on every call, so parsing the
    same page twice does not store each row twice.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).

    Returns:
        The module-level ``allUniv`` list, now holding the extracted rows.
    """
    # Compile once instead of re-resolving the pattern for every cell.
    han_run = re.compile('[\u4e00-\u9fff]+')

    rows = []
    for tr in soup.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. header rows) carry no data.
            continue
        singleUniv = []
        for td in cells:
            text = td.string
            if text is not None and text != "[]":
                singleUniv.append(text)
            # The regex runs over the cell's full markup, so text nested in
            # child tags (where td.string is None) is still captured.
            han_parts = han_run.findall(str(td))
            if han_parts:
                singleUniv.append("".join(han_parts))
        rows.append(singleUniv)

    # Replace in place so existing references to allUniv stay valid.
    allUniv[:] = rows
    return allUniv
def printUnivList(num):
    """Print a header line followed by the first *num* rows of ``allUniv``.

    The rank, name and score columns are cut out of the stored values with
    fixed slices (``[29:31]``, ``[:4]`` and ``[25:31]``); the offsets are
    presumably tuned to the whitespace padding of the scraped cells — verify
    against the page markup if the output looks wrong.

    The stored rows are not modified, so the function can be called
    repeatedly and the data exported afterwards stays untouched.

    Args:
        num: Maximum number of rows to print. When fewer rows are available,
            only those are printed; a non-positive value prints the header
            only.

    Raises:
        IndexError: If one of the printed rows has fewer than five values.
    """
    print("{:^5}{:^4}{:^5}{:^10}{:^10}".format("排名", "学校名称", "省市", "类型", "总分"))
    row_format = "{:^5} {:^4}{:^5}{:^10}{:^10}"
    # Slicing (rather than indexing range(num)) tolerates a short or empty
    # list; max() keeps a negative num from selecting rows from the end.
    for u in allUniv[:max(num, 0)]:
        rank = u[0][29:31]
        name = u[1][:4]
        score = u[4][25:31]
        print(row_format.format(rank, name, u[2], u[3], score))
def main(flag, url='https://www.shanghairanking.cn/rankings/bcur/201611'):
    """Fetch the ranking page, parse it, and either print or return the rows.

    Args:
        flag: ``0`` prints the first ten rows via ``printUnivList``; any
            other value returns the parsed rows instead.
        url: Page to scrape. Defaults to the address that was previously
            hard-coded, so existing ``main(flag)`` calls behave as before.

    Returns:
        ``None`` when ``flag == 0``; otherwise the list returned by
        ``fillUnivList``.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    rows = fillUnivList(soup)
    if flag == 0:
        printUnivList(10)
        return None
    return rows
def combination(list1, count):
    """Return the element at index *count* of every row in *list1*.

    Args:
        list1: Iterable of indexable rows.
        count: Index to pick from each row.

    Returns:
        A new list with one element per row, in the original row order.

    Raises:
        IndexError: If a row has no element at index *count*.
    """
    return [row[count] for row in list1]
# Fetch and parse the ranking page once. Calling main() twice downloaded the
# page twice and left the shared row list holding every row twice.
list1 = main(1)
# Print the first ten rows from the shared list that main() just filled.
printUnivList(10)
def deal_data(list1):
    """Build a DataFrame from the first five values of every scraped row.

    Args:
        list1: Sequence of rows; each row must have at least five values.
            Values beyond the fifth are ignored.

    Returns:
        A ``pandas.DataFrame`` with the columns 排名, 学校名称, 省市, 类型 and
        总分 (in that order), one record per input row.

    Raises:
        IndexError: If a row has fewer than five values.
    """
    # Position i of each row feeds the i-th column.
    column_names = ("排名", "学校名称", "省市", "类型", "总分")
    return pd.DataFrame({
        name: [row[idx] for row in list1]
        for idx, name in enumerate(column_names)
    })
# Convert the scraped rows into a DataFrame and write it to
# 'University_grade.csv' in the current working directory, without the
# DataFrame index column.
data=deal_data(list1)
data.to_csv('University_grade.csv',index=False)
# Fixed identification line printed at the end of the run (appears to be the
# author's class, student number and name).
print("22信计2班—2022310143137—黄志涛")