爬虫案例:图片爬取
import os
from urllib.parse import urljoin

import requests
from lxml import etree

# Fetch the page at `url`, locate the <img id="s_lg_img"> element, download
# the image it points to and save the raw bytes to data/img.jpg.
#
# Common image formats: jpg, png
# Text content: txt
url = "https://www.baidu.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/120.0.0.0 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection.
req = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page.
req.raise_for_status()
html1 = etree.HTML(req.text)
# Use XPath to extract the image's src attribute.
img_srcs = html1.xpath('//img[@id="s_lg_img"]/@src') if html1 is not None else []
if not img_srcs:
    raise RuntimeError('<img id="s_lg_img"> not found in the page at ' + req.url)
# Resolve against the final page URL so protocol-relative ("//host/a.png"),
# root-relative ("/a.png") and absolute src values all yield a usable URL.
img = urljoin(req.url, img_srcs[0])
# Request the image, sending the same headers as the page request.
img_b = requests.get(url=img, headers=headers, timeout=10)
img_b.raise_for_status()
# open() does not create missing directories, so make sure data/ exists.
os.makedirs('data', exist_ok=True)
with open('data/img.jpg', 'wb') as f:
    # .content is the response body as raw bytes.
    f.write(img_b.content)
爬虫案例:豆瓣Top250的爬取
import csv
import time

import requests
from lxml import etree

# Walk the paginated Douban Top 250 list and append one CSV row per movie
# (title, rating text, fourth <span> of the "star" div) to data.csv.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/120.0.0.0 Safari/537.36'}
# Open the output once instead of once per row. Append mode keeps rows from
# earlier runs; newline='' lets the csv module control line endings.
with open('data.csv', 'a', encoding='utf-8', newline='') as f:
    # csv.writer quotes fields that contain commas or quotes, so a title
    # with a comma no longer corrupts the row.
    writer = csv.writer(f, lineterminator='\n')
    # 250 entries at 25 per page -> 10 pages (start = 0, 25, ..., 225).
    for n in range(10):
        start = n * 25
        html = requests.get('https://movie.douban.com/top250?start=' + str(start),
                            headers=headers, timeout=10)
        # Stop on an HTTP error rather than silently writing nothing.
        html.raise_for_status()
        parse_html = etree.HTML(html.text)
        # Get the list of movie entries on this page.
        result = parse_html.xpath("//ol[@class='grid_view']/li")
        for i in result:
            # Extract the required fields from the entry.
            name = i.xpath(".//span[@class='title'][1]/text()")[0]
            star = i.xpath(".//span[@class='rating_num']/text()")[0]
            number = i.xpath(".//div[@class='star']/span[4]/text()")[0]
            # Write the extracted data to the file.
            writer.writerow([name, star, number])
        # Short pause between page requests so the site is not hit in a tight loop.
        time.sleep(0.2)
        # Report a 1-based page number.
        print(f'第{n + 1}页')