大规模爬取网页（以新浪为例）：downloader、parser 的封装（涉及编码等细节）

import requests
import cchardet
import traceback
from lxml import etree

def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Fetch *url* and return ``(status, html, redirected_url)``.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its timeout.
        headers: Optional dict of HTTP headers. When omitted (or empty) a
            default browser-like ``User-Agent`` header is sent instead.
        debug: When true, print the traceback and a failure message if the
            download fails.
        binary: When true, return the raw response bytes; otherwise decode
            the body using the encoding detected by ``cchardet``.

    Returns:
        A ``(status, html, redirected_url)`` tuple. On success ``status`` is
        the HTTP status code, ``html`` is the body (``bytes`` if *binary*,
        else ``str``) and ``redirected_url`` is the final URL reported by the
        response. On any failure the tuple is ``(0, b"" or "", url)``.
    """
    default_headers = {
        'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                       'Windows NT 6.1; Win64; x64; Trident/5.0)')
    }
    # Caller-supplied headers win; otherwise fall back to the defaults.
    request_headers = headers if headers else default_headers

    # Failure defaults, defined up front so every path (binary or text,
    # success or error) has all three return values bound.
    status = 0
    html = b"" if binary else ""
    redirected_url = url

    try:
        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not headers.
        res = requests.get(url, headers=request_headers, timeout=timeout)
        if binary:
            body = res.content
        else:
            # The detector may report no encoding (None); fall back to UTF-8
            # rather than calling decode(None).
            encoding = cchardet.detect(res.content)["encoding"] or "utf-8"
            body = res.content.decode(encoding)
        # Assign together only after everything succeeded, so a decode error
        # cannot leave a real status paired with an empty body.
        status, html, redirected_url = res.status_code, body, res.url
    except Exception:
        # Deliberate best-effort boundary for crawling: any failure yields the
        # empty result, but KeyboardInterrupt/SystemExit still propagate.
        if debug:
            traceback.print_exc()
            msg = "failed download:{}".format(url)
            print(msg)
    return status, html, redirected_url


def parser(html):
    """Print the numbered ``(href, title)`` pair of every link in the page lists.

    Looks inside ``div.main > div`` elements whose class contains
    ``clearfix`` for ``<a>`` tags under a ``<ul>`` whose class contains
    ``list-a``. Anchors lacking an ``href`` attribute or a direct text node
    are skipped and do not consume a number.

    Args:
        html: Page markup as ``str`` or ``bytes``. An empty or ``None`` value
            (e.g. the result of a failed download) is ignored.

    Returns:
        None. Results are written to standard output.
    """
    # A failed download hands us "" / b""; there is nothing to parse.
    if not html:
        return
    tree = etree.HTML(html)
    if tree is None:
        return

    count = 0
    sections = tree.xpath(
        ".//div[@class = 'main']/div[contains(@class,'clearfix')]")
    for section in sections:
        for anchor in section.xpath(".//ul[contains(@class,'list-a')]//a"):
            hrefs = anchor.xpath("./@href")
            texts = anchor.xpath("./text()")
            if not hrefs or not texts:
                continue
            # Remove real newline/tab characters left inside the value after
            # stripping (the previous "\\n"/"\\t" literals only matched a
            # backslash followed by a letter).
            href = hrefs[0].strip().replace("\n", '').replace('\t', '')
            title = texts[0].strip().replace("\n", '').replace('\t', '')
            count += 1
            print(count, (href, title))


if __name__ == '__main__':
    # Demo run: download the Sina front page and print the links found in it.
    start_url = r"https://www.sina.com.cn/"
    status, html, redirected_url = downloader(start_url)
    # parser() only prints its findings and returns None, so the result is
    # not kept.
    parser(html)

posted @ 2021-10-06 00:57 山水无期 阅读(61) 评论(0) 收藏 举报

刷新页面返回顶部

山水无期

大规模爬取网页（以新浪为例）：downloader、parser 的封装（涉及编码等细节）

公告