大数据导论课程笔记

网络爬虫

点击查看代码

# -*- coding: utf-8 -*-
# 导入相关的包
# BeautifulSoup是网页解析的开源库
# request的作用是发送网络请求
from bs4 import BeautifulSoup
from urllib import request
# Crawl the Douban "Top 250 books" list and write one book title per line
# to a text file.

# Request headers are identical for every page, so build them once.
# The original code got an HTTP 418 error without them, probably because of an
# anti-crawler mechanism, so a browser-like User-Agent is sent.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57'
}

# Raw string: keeps the Windows backslashes literal without relying on
# invalid escape sequences such as "\D".
# Explicit UTF-8: the titles are Chinese text; the platform default codec may
# not be able to encode every character.
# `with` guarantees the file is closed even if a request or parse step fails.
with open(r"D:\Desktop\DouBanTop250.txt", "w", encoding="utf-8") as fp:
    # Pagination pattern seen in the URL: `start` advances 25 items per page.
    for page in range(0, 250, 25):
        url = "https://book.douban.com/top250?start={}".format(page)
        # Open the URL with the custom headers and read the HTML body;
        # the response is closed as soon as the body has been read.
        req = request.Request(url, headers=head)
        with request.urlopen(req) as res:
            html = res.read().decode("utf-8")
        # Parse the page and pick out the <div class="pl2"> elements, each of
        # which is expected to wrap one book entry.
        soup = BeautifulSoup(html, "html.parser")
        for headline in soup.find_all('div', class_='pl2'):
            # The title is the "title" attribute of the first <a> inside the
            # <div>. Skip entries that lack it instead of aborting the crawl.
            link = headline.a
            title = link.get("title") if link is not None else None
            if title is None:
                continue
            print(title, file=fp)

posted @ 2023-03-05 23:48 林汐岚阅读(60) 评论(0) 收藏举报

刷新页面返回顶部

林汐岚

大数据导论课程笔记

网络爬虫

公告