# 爬取博客园的帖子 (crawl posts from cnblogs.com)

import requests
import sys
import io
from lxml import html
'''
需求分析
爬取博客园的帖子
url = https://www.cnblogs.com/
源码分析
代码实现
1.根据入口url请求源码
2.提前数据(每篇帖子的ur)
3.根据url进入到帖子详情,获取详细内容
4.保存数据
'''
# Rewrap stdout so Chinese text prints correctly on a GBK-family console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

url = 'https://www.cnblogs.com/'
nwo_url = url  # URL of the listing page currently being crawled
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
num = 1   # post counter within the current listing page
page = 1  # current listing page number

while True:
    # 1. Fetch the listing page source.
    # FIX: headers must be passed as a keyword argument — the second
    # positional parameter of requests.get() is `params`, so the original
    # call never sent the custom User-Agent at all.
    r = requests.get(nwo_url, headers=headers).text
    index = html.etree.HTML(r)

    # 2. Extract the URL of every post on this page, plus the last pager link
    #    (the "Next >" anchor when more pages exist).
    tz_url = index.xpath('//div[@class="post_item_body"]/h3/a/@href')
    next_url = index.xpath('//div[@class="pager"]/a[last()]')

    # 3. Visit each post and pull out its title and body text.
    for post_link in tz_url:
        detail_html = requests.get(post_link, headers=headers).text
        re_index = html.etree.HTML(detail_html)
        tz_title = re_index.xpath('//a[@id="cb_post_title_url"]/text()')  # list
        tz_content = re_index.xpath('string(//div[@id="cnblogs_post_body"])')  # str

        # FIX: guard against posts where the title xpath matches nothing —
        # the original tz_title[0] raised IndexError and killed the crawl.
        if not tz_title:
            continue

        # 4. Append the post (title, body, source URL, separator) to the file.
        with open('cn-blogs.csv', 'a+', encoding='utf-8') as file:
            file.write(tz_title[0] + '\n')
            file.write(tz_content + '\n')
            file.write(post_link + '\n')
            file.write('*' * 50 + '\n')
        print('{0}页第{1}篇帖子'.format(page, num))
        num += 1

    # Follow the "Next >" pager link; stop when the pager is missing or the
    # last anchor is not a next-page link.
    # FIX: check next_url is non-empty before indexing — the original
    # next_url[0] raised IndexError on pages without a pager.
    if next_url and next_url[0].xpath('text()')[0] == 'Next >':
        # Pager hrefs are site-relative ("/sitehome/p/2"), so join with the
        # site root (url minus its trailing slash).
        nwo_url = url[:-1] + next_url[0].xpath('@href')[0]
        page += 1
        num = 1
        print(page)
    else:
        break
# posted @ 2020-03-24 22:20  gz_wth  阅读(147)  评论(0)    收藏  举报