爬虫案例

爬虫案例:图片爬取

import os

import requests
from lxml import etree

# Download the image with id "s_lg_img" from the Baidu home page and save it
# to data/img.jpg.
# Common image formats: jpg, png
# Text content: txt
url = "https://www.baidu.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/120.0.0.0 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection.
req = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
req.raise_for_status()
html1 = etree.HTML(req.text)
# Use XPath to extract the image URL.
matches = html1.xpath('//img[@id="s_lg_img"]/@src')
if not matches:
    raise LookupError('no <img id="s_lg_img"> element found in the page')
img = matches[0]
# Only protocol-relative URLs ("//host/path") need a scheme prepended;
# an already-absolute URL must be left untouched.
if img.startswith('//'):
    img = 'http:' + img
# Request the image, sending the same headers as the page request.
img_b = requests.get(url=img, headers=headers, timeout=10)
img_b.raise_for_status()
# Make sure the output directory exists before writing into it.
os.makedirs('data', exist_ok=True)
with open('data/img.jpg', 'wb') as f:
    # .content is the response body as raw bytes.
    f.write(img_b.content)

爬虫案例:豆瓣Top250的爬取

import time

import requests
from lxml import etree

# Scrape title, rating and the fourth "star" span of every film on the
# Douban Top 250 list and append them to data.csv as comma-separated rows.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/120.0.0.0 Safari/537.36'}

# The list is paged 25 films at a time via the `start` query parameter, so
# 250 films need 10 pages (range(9) stopped at start=200 and skipped the
# last 25 films).
PAGE_SIZE = 25
PAGE_COUNT = 10

# Open the output file once for the whole run instead of once per film.
with open('data.csv', 'a', encoding='utf-8') as f:
    for n in range(PAGE_COUNT):
        start = n * PAGE_SIZE
        html = requests.get(
            'https://movie.douban.com/top250?start=' + str(start),
            headers=headers,
            timeout=10,  # do not hang forever on a stalled connection
        )
        # Fail loudly on an HTTP error instead of parsing an error page.
        html.raise_for_status()
        parse_html = etree.HTML(html.text)
        # Get the list of film entries on this page.
        result = parse_html.xpath("//ol[@class='grid_view']/li")
        for i in result:
            # Extract the required fields from the entry.
            name = i.xpath(".//span[@class='title'][1]/text()")[0]
            star = i.xpath(".//span[@class='rating_num']/text()")[0]
            # Presumably the vote-count text; confirm against the page markup.
            number = i.xpath(".//div[@class='star']/span[4]/text()")[0]
            # Write the extracted data to the file.
            f.write(f'{name},{star},{number}\n')
        # Short pause before requesting the next page.
        time.sleep(0.2)
        print(f'第{n}页')

posted @ 2024-01-24 09:45  low-reed  阅读(10)  评论(0)    收藏  举报