Python爬虫之爬取图片(五)

爬取“无聊哦”网站文章《姜文,你太皮了!》中的图片

import requests
from bs4 import BeautifulSoup
# Browser-like request headers, used to get past the site's anti-scraping check.
# The User-Agent is assembled from adjacent string literals: a backslash
# continuation inside a single literal would embed the next line's indentation
# in the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    ),
}

def open_url(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive
            host.

    Returns:
        The page text, decoded with the encoding detected from the body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail here rather than handing an error page to the parsing functions.
    r.raise_for_status()
    # Decode with the encoding detected from the content rather than the one
    # derived from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text

def title_name(html):
    """Return the text of the first ``<h1>`` element in *html*.

    Args:
        html: HTML source of the page.

    Returns:
        The text of the heading as reported by BeautifulSoup.

    Raises:
        ValueError: If the page contains no ``<h1>`` element.
    """
    soup = BeautifulSoup(html, 'lxml')
    heading = soup.find('h1')
    # find() returns None when nothing matches; report that explicitly instead
    # of failing with an AttributeError on None.
    if heading is None:
        raise ValueError('no <h1> element found in the page')
    return heading.text

def get_url(html):
    """Return the ``src`` of the image inside the second home-page link.

    The picture is looked up as the ``<img>`` nested in the second ``<a>``
    element whose ``href`` is exactly ``http://wuliaoo.com``.

    Args:
        html: HTML source of the page.

    Returns:
        The value of the image's ``src`` attribute.

    Raises:
        ValueError: If the page has fewer than two matching links, the second
            link contains no ``<img>``, or the image has no ``src``.
    """
    soup2 = BeautifulSoup(html, 'lxml')
    links = soup2.find_all('a', href="http://wuliaoo.com")
    # The wanted picture sits in the second matching link (index 1).
    if len(links) < 2:
        raise ValueError(
            'expected at least two <a href="http://wuliaoo.com"> elements, '
            'found %d' % len(links)
        )
    img = links[1].img
    if img is None:
        raise ValueError('the second matching link contains no <img> element')
    img_url = img.get('src')
    # get() yields None for a missing attribute; do not pass that on as a URL.
    if not img_url:
        raise ValueError('the <img> element has no src attribute')
    return img_url
    
def save_img(title, img_url, timeout=10):
    """Download the image at *img_url* and write it to ``<title>.jpg``.

    The file is created in the current working directory and is overwritten
    if it already exists.

    Args:
        title: Base name of the output file; ``.jpg`` is appended.
        img_url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the output file cannot be written.
    """
    filename = title + '.jpg'
    # Send the same browser-like headers as the page request so both requests
    # present themselves to the site in the same way.
    response = requests.get(img_url, headers=headers, timeout=timeout)
    # Do not write an error page to disk as if it were the picture.
    response.raise_for_status()
    # Image data is binary, so the file is opened in 'wb' mode.
    with open(filename, 'wb') as file:
        file.write(response.content)
    # NOTE: the message names a fixed article title regardless of *title*.
    print('《姜文,你太皮了!》保存完成!')

def main():
    """Fetch the article page, extract its title and image URL, and save the image."""
    page_url = 'http://wuliaoo.com/jiangwentaipi.html'
    page_html = open_url(page_url)
    # Arguments are evaluated left to right: title first, then the image URL.
    save_img(title_name(page_html), get_url(page_html))


# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

 

posted @ 2020-09-23 10:04  chchcharlie、  阅读(169)  评论(0)    收藏  举报