# 爬虫作业

#请用requests库的get()函数访问如下网站20次,打印返回状态和text属性的内容,并计算text属性和content属性所返回网页内容的长度。
import requests
url = "https://cn.bing.com/?mkt=zh-CN&mkt=zh-CN"


def getHTMLText(url):
    """Fetch *url* once, report on the response, and return its text.

    On success this prints the HTTP status code, the body decoded as
    UTF-8, the length of ``r.text`` (characters) and the length of
    ``r.content`` (bytes).

    Args:
        url: Address to request with ``requests.get``.

    Returns:
        The decoded response body, or "" when the request fails or the
        server answers with an error status.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure instead of silently swallowing every exception.
        print("request failed:", exc)
        return ""
    r.encoding = "utf-8"
    print("status:", r.status_code)
    print("text:", r.text)
    print(len(r.text))
    print("content length:", len(r.content))
    return r.text


# Visit the page 20 times; getHTMLText does the printing itself, so the
# return value is not printed again here.
for _ in range(20):
    getHTMLText(url)

#这是一个简单的html页面,请保存为字符串,完成后面的计算要求
import requests
from bs4 import BeautifulSoup

# Sample HTML document held in a plain string. The code that follows parses
# it with BeautifulSoup and also scans the raw text character by character.
# Note that in this markup the <table> element appears after </body>.
r='''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
</body>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</html>'''
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed, and different parsers can build different trees
# from the same markup, so the printed output would vary between machines.
soup = BeautifulSoup(r, "html.parser")

# Prints the literal 40 immediately followed by the <head> element.
print("{}{}".format(40,soup.head))

print(soup.body)
print(soup.find(id='first'))

# Collect, in document order, every character of the raw markup that falls in
# the U+4E00..U+9FFF range, then print them as one string.
list_ = ''.join(ch for ch in r if '\u4e00' <= ch <= '\u9fff')
print(list_)

#爬取中国大学排名网站内容:a. 爬取大学排名(学号尾号9、0,爬取年份2019)
import requests
from bs4 import BeautifulSoup
import bs4
import csv

# Module-level list. Nothing in the visible code reads it: the `ulist1`
# parameter of printUnivList is a separate local name, and main() builds
# its own list.
ulist1=[]

def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with ``requests.get`` (30 second timeout).

    Returns:
        The body decoded with the encoding detected from its content
        (``r.apparent_encoding``), or "" when the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request/HTTP failures map to "". A bare ``except`` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return ""
    r.encoding = r.apparent_encoding
    return r.text

def fillUnivList(ulist, html):
    """Append one ``[first cell, link text, fifth cell]`` row per table row.

    The markup is parsed with ``html.parser`` and every element child of the
    first ``<tbody>`` is inspected. The rest of this file labels the three
    values as rank, school name and total score.

    Args:
        ulist: List that receives the extracted rows (mutated in place).
        html: Page markup. May be "" (what getHTMLText returns on failure),
            in which case nothing is appended.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body in the markup (e.g. empty html): nothing to extract.
        return
    for tr in tbody.children:
        if not isinstance(tr, bs4.element.Tag):
            # Skip the text nodes that sit between the row elements.
            continue
        links = tr.find_all('a')
        tds = tr.find_all('td')
        if not links or len(tds) < 5:
            # Row does not have the cells indexed below; skip it rather
            # than fail with IndexError.
            continue
        # get_text() still works when the anchor has several child nodes,
        # where ``.string`` would be None.
        ulist.append([
            tds[0].text.strip(),
            links[0].get_text().strip(),
            tds[4].text.strip(),
        ])

def printUnivList(ulist1,num):
    """Print a header line and up to *num* rows of *ulist1* as a table.

    Each row must provide at least three items, which are centred in
    tab-separated columns. A blank line is printed after the table.

    Args:
        ulist1: Sequence of rows, each indexable at 0, 1 and 2.
        num: Maximum number of rows to print. Fewer are printed when the
            list is shorter; zero or a negative value prints none.
    """
    # chr(12288) is U+3000, the full-width space. It is the fill character
    # for the middle column, presumably so CJK names line up.
    fill = chr(12288)
    tplt = "{0:^10}\t{1:{3}^12}\t{2:^10}"

    print(tplt.format("排名","学校名称","总分",fill))
    # Slicing instead of indexing range(num) avoids IndexError on short lists.
    for u in ulist1[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], fill))
    print()

def writeList(ulist,num):
    """Write a header row and up to *num* rows of *ulist* to ``rank1-10.csv``.

    The file is created in the current working directory, encoded as GBK,
    and overwritten if it already exists. Prints ``succeeded`` when done.

    Args:
        ulist: Sequence of rows, each indexable at 0, 1 and 2.
        num: Maximum number of data rows to write. Fewer are written when
            the list is shorter; zero or a negative value writes none.
    """
    # ``with`` closes the file even if a row fails to encode or write.
    with open('rank1-10.csv','w',encoding='gbk',newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['排名','名称','总分'])
        # Slicing instead of indexing range(num) avoids IndexError on
        # short lists.
        csv_writer.writerows([u[0], u[1], u[2]] for u in ulist[:max(num, 0)])
    print('succeeded')



def main():
    """Fetch the hard-coded ranking page, parse it, then print and save 10 rows."""
    ranking_url = "https://www.shanghairanking.cn/rankings/bcur/2019"
    top_n = 10
    records = []
    # Download, then let fillUnivList append the parsed rows to `records`.
    fillUnivList(records, getHTMLText(ranking_url))
    printUnivList(records, top_n)
    writeList(records, top_n)


main()
# posted @ 2023-12-12 13:08  还是一个人a  阅读(26)  评论(0)    收藏  举报