import requests
from lxml import etree
import re
# Scrape the info from every recommendation-list page of Qiushibaike
class Qiushi():
    """Scrape every recommendation-list page of qiushibaike.com and print the items."""

    def __init__(self, base_url):
        # base_url is the site root used only to discover the total page count;
        # the crawl starts immediately on construction.
        self.base_url = base_url
        self.max_page = self.get_max_page()
        self.get_data()

    def get_max_page(self):
        """Fetch the front page and return the largest page number.

        Raises ValueError when the pagination bar cannot be found (layout change).
        """
        # timeout keeps a stalled connection from hanging the whole crawl
        response = requests.get(self.base_url, timeout=10)
        html = etree.HTML(response.text)
        # the second-to-last <li> of the pagination bar carries the last page number
        max_page = html.xpath('//ul[@class="pagination"]/li[last()-1]/a/span/text()')
        if not max_page:
            raise ValueError('pagination element not found; page layout may have changed')
        return int(max_page[0].strip())

    def get_data(self):
        """Request every list page, parse its items, and print them page by page."""
        for page in range(1, self.max_page + 1):
            # NOTE(review): the per-page URL is hard-coded rather than derived
            # from self.base_url -- confirm whether that is intentional.
            page_url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(page)
            response = requests.get(page_url, timeout=10)
            html = etree.HTML(response.text)
            items = html.xpath('//div[@class="recommend-article"]/ul/li')
            all_list = []
            for site in items:
                info = self._parse_item(site)
                if info is not None:
                    all_list.append(info)
            print('-------------------第{}页------------------------'.format(page))
            for entry in all_list:
                print(entry)

    @staticmethod
    def _parse_item(site):
        """Extract one item's fields from a list <li> element.

        Returns a dict, or None when any field is missing — advertisements lack
        some of these fields, so returning None also filters ads out.
        """
        funny_number = site.xpath('.//div[@class="recmd-num"]/span[1]/text()')    # laugh count
        comment_number = site.xpath('.//div[@class="recmd-num"]/span[4]/text()')  # comment count
        content = site.xpath('.//a[@class="recmd-content"]/text()')               # text content
        pic = site.xpath('.//a[contains(@class, "recmd-left")]/img/@src')         # thumbnail src
        username = site.xpath('.//span[@class="recmd-name"]/text()')              # author nickname
        # all() is true only when every field was found; ads are missing some
        if not all([funny_number, comment_number, content, pic, username]):
            return None
        # the src attribute is protocol-relative, so prepend the scheme
        pic_url = "https:" + pic[0]
        # drop the resize query string (if any) to get the original-size image;
        # split replaces the old non-raw regex '(.*?)\?' (invalid escape sequence)
        pic_url = pic_url.split("?", 1)[0]
        return {
            'funny_number': funny_number[0],
            'comment_number': comment_number[0],
            'content': content[0],
            'pic': pic_url,
            'username': username[0],
        }
if __name__ == "__main__":
base_url = 'https://www.qiushibaike.com/'
Qiushi(base_url)