# 爬取博客园的帖子 (crawl posts from cnblogs.com)

import requests
import sys
import io
from lxml import html
'''
需求分析
爬取博客园的帖子
url = https://www.cnblogs.com/
源码分析
代码实现
1.根据入口url请求源码
2.提前数据(每篇帖子的ur)
3.根据url进入到帖子详情,获取详细内容
4.保存数据
'''
# Rewrap stdout so Chinese text prints correctly on a GBK-family console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

url = 'https://www.cnblogs.com/'
nwo_url = url  # URL of the listing page currently being crawled
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
num = 1   # post counter within the current listing page
page = 1  # current listing page number

while True:
    # 1. Fetch the listing page source.
    # FIX: headers must be passed as a keyword argument — the second
    # positional parameter of requests.get() is `params`, so the original
    # call never sent the custom User-Agent at all.
    r = requests.get(nwo_url, headers=headers).text
    index = html.etree.HTML(r)

    # 2. Extract the URL of every post on this page, plus the last pager link
    #    (the "Next >" anchor when more pages exist).
    tz_url = index.xpath('//div[@class="post_item_body"]/h3/a/@href')
    next_url = index.xpath('//div[@class="pager"]/a[last()]')

    # 3. Visit each post and pull out its title and body text.
    for post_link in tz_url:
        detail_html = requests.get(post_link, headers=headers).text
        re_index = html.etree.HTML(detail_html)
        tz_title = re_index.xpath('//a[@id="cb_post_title_url"]/text()')  # list
        tz_content = re_index.xpath('string(//div[@id="cnblogs_post_body"])')  # str

        # FIX: guard against posts where the title xpath matches nothing —
        # the original tz_title[0] raised IndexError and killed the crawl.
        if not tz_title:
            continue

        # 4. Append the post (title, body, source URL, separator) to the file.
        with open('cn-blogs.csv', 'a+', encoding='utf-8') as file:
            file.write(tz_title[0] + '\n')
            file.write(tz_content + '\n')
            file.write(post_link + '\n')
            file.write('*' * 50 + '\n')
        print('{0}页第{1}篇帖子'.format(page, num))
        num += 1

    # Follow the "Next >" pager link; stop when the pager is missing or the
    # last anchor is not a next-page link.
    # FIX: check next_url is non-empty before indexing — the original
    # next_url[0] raised IndexError on pages without a pager.
    if next_url and next_url[0].xpath('text()')[0] == 'Next >':
        # Pager hrefs are site-relative ("/sitehome/p/2"), so join with the
        # site root (url minus its trailing slash).
        nwo_url = url[:-1] + next_url[0].xpath('@href')[0]
        page += 1
        num = 1
        print(page)
    else:
        break
# posted @ 2020-03-24 22:20  gz_wth  阅读(147)  评论(0)    收藏  举报