[定向爬虫] 网络爬虫实例1
程序结构设计:
1.获取网页内容
getHtmlText()
2.提取网页内容中的信息并存储到合适的数据结构中
fillUnivList()
3.利用数据结构展示并输出结果
printUnivList()
实现代码
import requests
from bs4 import BeautifulSoup
import bs4
def getHtmlText(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the program forever.

    Returns:
        The page text decoded as UTF-8, or an empty string when the request
        fails (connection error, timeout, invalid URL, HTTP error status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they take the failure path.
        r.raise_for_status()
        # Force UTF-8 decoding instead of the encoding guessed from headers.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no content"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return ''
def fillUnivList(ulist, html):
    """Parse the first ``<tbody>`` in *html* and append one entry per row.

    Each appended entry is a three-item list holding the ``.string`` of the
    row's ``<td>`` cells at positions 0, 1 and 3.

    Args:
        ulist: List that is extended in place.
        html: HTML source text; may be empty.

    Returns:
        None. When *html* contains no ``<tbody>`` the list is left unchanged.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page (for example a failed download).
        return
    for tr in tbody.children:
        # .children can also yield non-tag nodes; only tags are table rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for tr.find_all('td').
        tds = tr('td')
        if len(tds) < 4:
            # Row lacks the cell at index 3; skip it instead of raising.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
#chr(12288)表示采用中文字符空格填充
def printUnivList(ulist,num):
tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
print(tplt.format("排名","学校名称","总分",chr(12288)))
for i in range(num):
list = ulist[i]
print(tplt.format(list[0],list[1],list[2],chr(12288)))
def main():
    """Fetch the hard-coded ranking page and print up to its first 20 rows."""
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    ulist = []
    html = getHtmlText(url)
    if not html:
        # getHtmlText returns '' on failure; report it instead of handing an
        # empty document to the parser.
        print("无法获取网页内容: " + url)
        return
    fillUnivList(ulist, html)
    # Never request more rows than were actually parsed.
    printUnivList(ulist, min(20, len(ulist)))
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
浙公网安备 33010602011771号