第一版，爬取缩略图

import os
import requests
from bs4 import BeautifulSoup   # pip install beautifulsoup4
from urllib.parse import urljoin

# Version 1: download the thumbnail images shown on the first listing page.

SITE_ROOT = "https://pic.netbian.com/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
fk = os.path.join(BASE_DIR, '4k')
# exist_ok avoids the check-then-create race of isdir() + mkdir().
os.makedirs(fk, exist_ok=True)

url = "https://pic.netbian.com/4kmeinv/"
response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
# Fail early with a clear HTTP error instead of parsing an error page.
response.raise_for_status()
bs = BeautifulSoup(response.text, "html.parser")   # parse the listing page HTML
ul = bs.find(name='ul', attrs={"class": "clearfix"})
if ul is None:
    raise RuntimeError('no <ul class="clearfix"> found on {}'.format(url))

img_list = ul.find_all(name='img')
for img in img_list:
    src = img.get("src")
    if not src:
        # <img> without a src attribute: nothing to download.
        continue
    # urljoin handles root-relative paths (no doubled slash) and absolute URLs.
    img_url = urljoin(SITE_ROOT, src)
    file_path = os.path.join(fk, img_url.rsplit('/', 1)[-1])
    # Download before opening the file so a failed request does not leave
    # an empty or truncated image on disk.
    img_response = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
    if img_response.status_code != 200:
        print(img_url, 'download failed, status', img_response.status_code)
        continue
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
    print(img_url, 'download done .....')

这一版爬取的是缩略图，效果不好，但能通过该页面了解该网站的相关规则。

第二版，爬取大图

import os
import requests
from bs4 import BeautifulSoup   # pip install beautifulsoup4
from urllib.parse import urljoin

# Version 2: follow every link on the first listing page to its detail page
# and download the larger image found there.

SITE_ROOT = "https://pic.netbian.com/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
fk = os.path.join(BASE_DIR, '4k')
# exist_ok avoids the check-then-create race of isdir() + mkdir().
os.makedirs(fk, exist_ok=True)

url = "https://pic.netbian.com/4kmeinv/"
response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
# Fail early with a clear HTTP error instead of parsing an error page.
response.raise_for_status()
bs = BeautifulSoup(response.text, "html.parser")   # parse the listing page HTML
ul = bs.find(name='ul', attrs={"class": "clearfix"})
if ul is None:
    raise RuntimeError('no <ul class="clearfix"> found on {}'.format(url))

a_list = ul.find_all(name='a')
for a in a_list:
    href = a.get("href")
    if not href:
        # Anchor without a target: nothing to follow.
        continue
    # urljoin handles root-relative paths (no doubled slash) and absolute URLs.
    a_url = urljoin(SITE_ROOT, href)
    a_response = requests.get(url=a_url, timeout=REQUEST_TIMEOUT)
    if a_response.status_code != 200:
        print(a_url, 'detail page failed, status', a_response.status_code)
        continue
    a_bs = BeautifulSoup(a_response.text, 'html.parser')

    # The detail page wraps the large image in <a id="img"><img src=...></a>;
    # check each step so one unexpected page does not abort the whole run.
    img_link = a_bs.find(name="a", attrs={"id": "img"})
    img_tag = img_link.find('img') if img_link is not None else None
    src = img_tag.get('src') if img_tag is not None else None
    if not src:
        print(a_url, 'no image found, skipped')
        continue

    img_url = urljoin(SITE_ROOT, src)
    file_path = os.path.join(fk, img_url.rsplit('/', 1)[-1])
    # Download before opening the file so a failed request does not leave
    # an empty or truncated image on disk.
    img_response = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
    if img_response.status_code != 200:
        print(img_url, 'download failed, status', img_response.status_code)
        continue
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
    print(img_url, 'download done .....')

第三版，爬取多页

import os
import requests
from bs4 import BeautifulSoup   # pip install beautifulsoup4
from urllib.parse import urljoin

# Version 3: same as version 2, but walks the first 10 listing pages.

SITE_ROOT = "https://pic.netbian.com/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
fk = os.path.join(BASE_DIR, '4k')
# exist_ok avoids the check-then-create race of isdir() + mkdir().
os.makedirs(fk, exist_ok=True)

for index in range(1, 11):  # crawl the first 10 pages
    # Page 1 is index.html; later pages are index_<n>.html.
    url = "https://pic.netbian.com/4kmeinv/index.html" if index == 1 else "https://pic.netbian.com/4kmeinv/index_{}.html".format(index)
    response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        # Skip a broken listing page rather than aborting the remaining pages.
        print(url, 'listing page failed, status', response.status_code)
        continue
    bs = BeautifulSoup(response.text, "html.parser")   # parse the listing page HTML
    ul = bs.find(name='ul', attrs={"class": "clearfix"})
    if ul is None:
        print(url, 'no image list found, skipped')
        continue

    a_list = ul.find_all(name='a')
    for a in a_list:
        href = a.get("href")
        if not href:
            # Anchor without a target: nothing to follow.
            continue
        # urljoin handles root-relative paths (no doubled slash) and absolute URLs.
        a_url = urljoin(SITE_ROOT, href)
        a_response = requests.get(url=a_url, timeout=REQUEST_TIMEOUT)
        if a_response.status_code != 200:
            print(a_url, 'detail page failed, status', a_response.status_code)
            continue
        a_bs = BeautifulSoup(a_response.text, 'html.parser')

        # The detail page wraps the large image in <a id="img"><img src=...></a>;
        # check each step so one unexpected page does not abort the whole run.
        img_link = a_bs.find(name="a", attrs={"id": "img"})
        img_tag = img_link.find('img') if img_link is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if not src:
            print(a_url, 'no image found, skipped')
            continue

        img_url = urljoin(SITE_ROOT, src)
        file_path = os.path.join(fk, img_url.rsplit('/', 1)[-1])
        # Download before opening the file so a failed request does not leave
        # an empty or truncated image on disk.
        img_response = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
        if img_response.status_code != 200:
            print(img_url, 'download failed, status', img_response.status_code)
            continue
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
        print(img_url, 'download done .....')
    print('第{}页爬取完毕...'.format(index))

截至 2021 年 6 月 22 日，以上代码运行无误。

posted @ 2018-01-31 18:41 听雨危楼 阅读(807) 评论(1) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

王战山的学习笔记

非淡泊无以明志，非宁静无以致远

1-Python - 爬取彼岸图网4k美女图片

第一版，爬取缩略图

第二版，爬取大图

第三版，爬取多页

公告