爬虫作业

import requests

# Fetch the same page 20 times and report status plus payload sizes.
# NOTE(review): google.com is unreachable without a proxy from some
# networks, so requests.get may raise a ConnectTimeout here (the blog
# text below notes exactly this failure during testing).
for _ in range(20):
    response = requests.get('https://www.google.com', timeout=10)
    print('Status:', response.status_code)
    print('Text:', response.text)
    # len(text) counts decoded characters; len(content) counts raw bytes,
    # so the two lengths can differ for non-ASCII responses.
    print('Text Length:', len(response.text))
    print('Content Length:', len(response.content))
    print('---')


由于谷歌主页需要翻墙(代理)等手段才能访问,故在测试时出现连接超时的问题。



(3)
import requests
from bs4 import BeautifulSoup
import csv

all_univ = []  # rows of [rank, school name, province/city, total score]; filled by fill_univ_list()


def get_html_text(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Returns an empty string on any request failure or non-2xx status,
    so callers can proceed without their own exception handling.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception instead of parsing an error page
        r.encoding = 'utf-8'  # the ranking site serves UTF-8; avoid mis-detection
        return r.text
    except requests.RequestException:  # was a bare `except:` that hid every error, even KeyboardInterrupt
        return ""


def fill_univ_list(soup):
    """Extract [rank, name, province, score] from each table row into all_univ.

    Rows with fewer than 5 cells (headers/spacers) are skipped. Rows whose
    expected sub-elements are missing are also skipped instead of raising
    AttributeError on a None `.string`/`.find` result mid-scrape.
    """
    for tr in soup.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < 5:
            continue
        name_anchor = cells[1].find('a', 'name-cn')  # anchor holding the Chinese school name
        if (name_anchor is None or name_anchor.string is None
                or cells[0].string is None or cells[4].string is None):
            continue  # malformed row: skip rather than crash the whole scrape
        all_univ.append([
            cells[0].string.strip(),    # rank
            name_anchor.string.strip(),  # school name (Chinese)
            cells[2].text.strip(),       # province/city
            cells[4].string.strip(),     # total score
        ])


def print_univ_list(num):
    """Write the first *num* scraped rows to 大学排行.csv and echo them aligned.

    *num* is clamped to the number of rows actually collected, so asking
    for more rows than were scraped no longer raises IndexError.
    """
    file_name = "大学排行.csv"
    # chr(12288) is the full-width (ideographic) space, used as the fill
    # character so Chinese school names line up in fixed-width columns.
    row_fmt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
    print(row_fmt.format("排名", "学校名称", "省市", "总分", chr(12288)))
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "学校名称", "省市", "总分"])
        for u in all_univ[:num]:  # slicing clamps num to the available rows
            writer.writerow(u)
            print(row_fmt.format(u[0], u[1], u[2], u[3], chr(12288)))


def main(num):
    """Scrape the 2019 Shanghai Ranking page and report the top *num* rows.

    Pipeline: download HTML -> parse with BeautifulSoup -> collect rows
    into the module-level all_univ -> write CSV and print the table.
    """
    url = "https://www.shanghairanking.cn/rankings/bcur/201911.html"
    html = get_html_text(url)  # "" on failure; the soup below is then simply empty
    soup = BeautifulSoup(html, features="html.parser")
    fill_univ_list(soup)
    print_univ_list(num)


main(20)  # NOTE(review): runs at import time; consider an `if __name__ == "__main__":` guard
posted @ 2023-12-29 00:23  Antea  阅读(23)  评论(0)    收藏  举报