#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: woshinidaye

# HTML structure examples:
# <h1>i love you</h1>
# <h2 align="center">i love you</h2>
# <a href="http://www.cnblogs.com/woshinidaye123">woshinidaye</a>
# <tag attribute="value"></tag>

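# A minimal sketch (my own illustration, not from the original notes): feed the
# example tags above to BeautifulSoup and read their text and attributes.
from bs4 import BeautifulSoup

_demo_html = '''
<h1>i love you</h1>
<h2 align="center">i love you</h2>
<a href="http://www.cnblogs.com/woshinidaye123">woshinidaye</a>
'''
_demo = BeautifulSoup(_demo_html, 'html.parser')
print(_demo.find('h1').text)          # i love you
print(_demo.find('h2').get('align'))  # center
print(_demo.find('a').get('href'))    # http://www.cnblogs.com/woshinidaye123
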
# pip install bs4
from bs4 import BeautifulSoup
import requests
# Fetch the data with requests,
# then hand the page source to BeautifulSoup to build a bs4 object:
# test = BeautifulSoup('text', 'html.parser')  # specify the HTML parser
# Look up data in the bs4 object with:
# find(tag, attribute=value)      -> first match
# find_all(tag, attribute=value)  -> all matches
# page.find('table', class_='ssss')            # class is a reserved word in Python, so bs4 uses class_ with a trailing underscore
# page.find('table', attrs={'class': 'ssss'})  # the attrs form avoids the keyword clash; both lines mean exactly the same thing

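# A small runnable sketch (my own example) of find vs find_all, and of the two
# equivalent ways to match on class:
_table_html = '''
<table class="ssss"><tr><td>a</td></tr></table>
<table class="ssss"><tr><td>b</td></tr></table>
'''
_page = BeautifulSoup(_table_html, 'html.parser')
print(_page.find('table', class_='ssss').td.text)             # a  (first match only)
print(len(_page.find_all('table', attrs={'class': 'ssss'})))  # 2  (all matches)
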
# JSON-API example (commented out): this endpoint returns JSON directly,
# so the response can be consumed without any HTML parsing.
# url = 'http://www.xinfadi.com.cn/getCat.html'
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari"
# }
#
# req = requests.post(url, headers=headers)
# # print(req.json()['list'], type(req.json()['list']))
# for i in req.json()['list']:
#     print(i['id'], i['prodName'])

# Hands-on example
# Approach:
# 1. Get the main page's source code and extract each gallery link (href).
# 2. Follow the href to the detail page and find the image's download URL.
# 3. Download the image.

import time
import requests
from bs4 import BeautifulSoup

for i in range(2, 100):
    url = 'https://www.umei.cc/meinvtupian/index_{0}.htm'.format(i)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari"
    }

    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'  # fix mojibake in the response text
    # print(resp.text)
    # Hand the page source to bs4
    main_page = BeautifulSoup(resp.text, 'html.parser')
    a_list = main_page.find('div', class_='TypeList').find_all('a')
    for a in a_list:
        href = a.get('href')  # read an attribute with get()
        href = 'https://www.umei.cc' + href
        # Fetch the child page's source code
        child_page = requests.get(href, headers=headers)
        child_page.encoding = 'utf-8'
        child_page_text = child_page.text
        # Pull the image's download address out of the child page
        child_page_soup = BeautifulSoup(child_page_text, 'html.parser')
        img_tag = child_page_soup.find('div', class_='ImageBody').find('img')
        src = img_tag.get('src')
        # Download the image; img.content holds the raw bytes
        img = requests.get(src, headers=headers)
        img_name = src.split('/')[-1]
        with open(r'E:\工作资料\python\Reptile\pic\{0}'.format(img_name), 'wb') as f:
            f.write(img.content)
        print('===={0} over===='.format(img_name))
        time.sleep(2)  # be polite: pause between downloads
print('====all done!!====')
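
# A possible refinement (my own sketch, not part of the original script): the
# download step above assumes every child page has a div.ImageBody with an <img>
# and that the save directory already exists. The hypothetical download_image()
# helper below adds basic error handling; it is only defined here, not called
# by the loop above.
import os

def download_image(src, save_dir, headers):
    """Download one image into save_dir; return the saved path, or None on failure."""
    try:
        resp = requests.get(src, headers=headers, timeout=10)
        resp.raise_for_status()               # fail fast on HTTP errors
    except requests.RequestException as e:
        print('download failed for {0}: {1}'.format(src, e))
        return None
    os.makedirs(save_dir, exist_ok=True)      # create the folder if it is missing
    path = os.path.join(save_dir, src.split('/')[-1])
    with open(path, 'wb') as f:
        f.write(resp.content)                 # resp.content is the raw image bytes
    return path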