爬虫作业
(2)请用requests库的get()函数访问如下一个网站20次,打印返回状态和text属性内容,并计算text属性和content属性所返回网页内容的长度。(注意:text和content是Response对象的属性,不是方法。不同学号选做如下网页,必做及格)
# Exercise (2): access the site 20 times with requests.get(), printing the
# status code, the text content, and the lengths of the .text and .content
# attributes on every access.
import requests  # import the real package, not pip's vendored private copy

print('访问谷歌网站 获取Response对象')
for x in range(1, 21):
    # The request must be issued inside the loop: the exercise asks for
    # 20 separate accesses, not 20 prints of one cached Response.
    r = requests.get("http://www.google.cn")
    print('第' + str(x) + '次的返回状态打印:' + str(r.status_code))
    print('第' + str(x) + '次的text()打印:' + str(r.text))
    print('第' + str(x) + '次的text()属性长度打印:' + str(len(r.text)))
    print('第' + str(x) + '次的content属性长度打印:' + str(len(r.content)))
爬取结果

(3)html页面
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>作业1</h1>
<p id="first">作业2</p>

<!-- Fix: the table originally appeared AFTER </body>, which is invalid
     HTML — all flow content must live inside <body>. -->
<table border="1">
    <tr>
        <td>完成!</td>
        <td>赞!</td>
    </tr>
</table>
</body>
</html>
(4)爬中国大学排名网站内容
# Exercise (4): scrape a university-ranking page and print the first rows.
import requests
from bs4 import BeautifulSoup

# One entry per university row: [rank, name, province, score, ...] as strings.
allUniv = []

def getHTMLText(url):
    """Fetch *url* and return its body decoded as UTF-8, or '' on failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:`; the caller treats '' as
        # "nothing fetched", so network errors stay best-effort.
        return ""

def fillUnivList(soup):
    """Append every table row that contains <td> cells to allUniv."""
    for tr in soup.find_all('tr'):
        ltd = tr.find_all('td')
        if not ltd:
            continue  # header/separator rows carry no <td> cells
        allUniv.append([td.string for td in ltd])

def printUnivList(num):
    """Pretty-print the first *num* universities.

    chr(12288) is the full-width (CJK) space, used as the fill character
    so columns of Chinese text stay aligned.
    """
    print("{1:^2}{2:{0}^10}{3:{0}^6}{4:{0}^4}{5:{0}^10}".format(chr(12288), "排名", "学校名称", "省市", "总分", "年费"))
    # Guard: the page may have yielded fewer than *num* rows.
    for i in range(min(num, len(allUniv))):
        u = allUniv[i]
        # float() instead of eval(): never evaluate scraped text as code.
        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8.1f}{5:{0}^11}".format(chr(12288), u[0], u[1], u[2], float(u[3]), u[11]))

def main():
    # NOTE(review): this URL contains literal spaces and does not look like
    # a ranking page — confirm the intended target URL.
    url = 'https://www.gpnu.edu.cn/Guangdong polytechnic normal university2020.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(10)

if __name__ == "__main__":
    main()
浙公网安备 33010602011771号