[定向爬虫] 网络爬虫实例1

程序结构设计:

1.获取网页内容

getHtmlText()

2.提取网页内容中的信息并存储到合适的数据结构中

fillUnivList()

3.利用数据结构展示并输出结果

printUnivList()

实现代码

import requests
from bs4 import BeautifulSoup
import bs4

def getHtmlText(url, timeout=30):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body decoded as UTF-8, or an empty string when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no content"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return ''
    r.encoding = 'utf-8'
    return r.text
        
def fillUnivList(ulist, html):
    """Parse table rows out of ``html`` and append them to ``ulist``.

    For every tag that is a direct child of the first ``<tbody>``, the
    ``.string`` of its first, second and fourth ``<td>`` cells is appended
    to ``ulist`` as a three-item list.  A cell's ``.string`` can be ``None``
    when the cell contains nested markup rather than a single text node.

    Args:
        ulist: List that is extended in place with one entry per table row.
        html: HTML text of the page; may be empty (e.g. after a failed fetch).
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page: there is no table body to read, so leave
        # ulist untouched instead of failing on None.children.
        return
    for tr in tbody.children:
        # .children also yields text nodes (whitespace between rows); skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a Tag is shorthand for find_all()
        if len(tds) < 4:
            # Not a full data row; indexing tds[3] would raise IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])

# chr(12288) is the full-width (CJK) space; it is used as the fill character
# for the middle column so Chinese text stays aligned.
def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Each row's first three items are centred in tab-separated columns of
    width 10; the middle column is padded with the full-width space.

    Args:
        ulist: Sequence of rows, each holding at least three items.
        num: Maximum number of rows to print.  When ``ulist`` holds fewer
            rows, only the available ones are printed.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))
    # Slice instead of indexing range(num) so a short or empty list cannot
    # raise IndexError; max() keeps a negative num meaning "print no rows".
    for row in ulist[:max(num, 0)]:
        # None cannot be formatted with an alignment spec, so render a
        # missing cell as an empty string.
        cells = ["" if row[i] is None else row[i] for i in range(3)]
        print(tplt.format(cells[0], cells[1], cells[2], fill))

        
def main():
    """Fetch the page at the hard-coded URL, parse it, and print 20 rows."""
    universities = []
    page = getHtmlText('http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html')
    fillUnivList(universities, page)
    printUnivList(universities, 20)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

  

posted @ 2017-09-16 16:09  推杯问盏  阅读(298)  评论(0)  编辑  收藏  举报