1 import urllib.request as ur
2 import os
3 import time
4
5
def url_open(url):
    """Fetch *url* over HTTP(S) and return the raw response body as bytes.

    Callers decode the bytes themselves (this site serves GBK pages and
    binary JPEG data through the same helper).
    """
    req = ur.Request(url)
    # Use a context manager so the connection is released even if read() raises;
    # the original left the response object unclosed.
    with ur.urlopen(req) as response:
        return response.read()
12
13
def save_img(down_url, name):
    """Download the image at *down_url* and write it to the hard-coded
    output directory as ``<name>.jpg``.

    Downloads first, then opens the file: the original opened the file
    before fetching, which leaked the handle and left an empty .jpg behind
    whenever the download raised.
    """
    html_download = url_open(down_url)  # raw JPEG bytes
    # 'with' guarantees the handle is closed even if write() raises.
    with open('C:\\Users\\路朝阳\\PycharmProjects\\pythonProject1\\my_img\\' + name + '.jpg', 'wb') as f:
        f.write(html_download)
19
20
def change_url(url, para_count_web):
    """Return the listing-page URL for page number *para_count_web*.

    Page 1 is *url* unchanged; for later pages everything after the first
    '_' is replaced with the page number, e.g.
    '.../hot_1.html' -> '.../hot_3.html'.

    NOTE(review): this assumes the first '_' in the URL is the page-number
    separator — true for the hot-list URL used in this script, but fragile
    for URLs containing an earlier underscore.
    """
    if para_count_web == 1:
        return url
    # The second half of the split was never used; keep only the prefix.
    prefix, _ = url.split('_', 1)
    url_changed = prefix + '_' + str(para_count_web) + '.html'
    print(url_changed)  # progress trace, kept from the original
    return url_changed
30
31
# ---- Crawler state (top-level script) ----
count = 0              # number of images saved so far; used in output filenames
count_web = 1          # 1-based index of the listing page currently being scraped
continue_location = 0  # offset into the listing HTML where the next search resumes
natual_url = 'https://desk.zol.com.cn/1920x1080/hot_1.html'  # ZOL desktop wallpapers - hot list

while True:
    # ============== Step 1: find the next wallpaper-set link on the listing page ==============
    next_natual_url = change_url(natual_url, count_web)
    html1 = url_open(next_natual_url).decode('gbk')  # sites use different encodings; this one serves GBK

    find_result = html1.find('<a class="pic" href="/bizhi/', continue_location)
    # Once every image link on this page has been consumed, advance to the next
    # listing page (bounded by count_web so the crawl eventually stops).
    if find_result == -1:
        count_web += 1
        if count_web > 5:
            break
        else:
            continue_location = 0  # must reset the search offset, or the new page would be scanned from the old end position
        continue

    a = find_result + 28  # skip past the matched anchor prefix to the start of the set id
    b = html1.find('.html', a, a + 100)
    continue_location = b
    img_series_id = html1[a:b]  # id of the next wallpaper set
    print(img_series_id)

    # ============== Step 2: open the set page and locate the 1920x1080 variant ==============
    url2 = 'https://desk.zol.com.cn/bizhi/' + img_series_id + '.html'

    html2 = url_open(url2).decode('gbk')

    resolution_judge = html2.find('id="1920x1080" href="/showpic/1920x1080_')
    # Skip this set entirely if it does not offer the resolution we want.
    if resolution_judge == -1:
        continue

    a_html2 = resolution_judge + 40  # skip the matched prefix to the start of the image id
    b_html2 = html2.find('.html', a_html2, a_html2 + 100)
    img_id = html2[a_html2:b_html2]
    print(img_id)  # id of the individual image at 1920*1080

    # ============== Step 3: extract the direct download URL from the showpic page ==============
    url3 = 'https://desk.zol.com.cn/showpic/1920x1080_' + img_id + '.html'
    html3 = url_open(url3).decode('gbk')
    a_html3 = html3.find('img src="https:') + 9  # start of the https:// URL inside the img tag
    # PNG images have no '.jpg' in the URL and would break the save step, so skip them.
    png_judge = html3.find('.jpg', a_html3, a_html3 + 255)
    if png_judge == -1:
        continue
    b_html3 = png_judge + 4  # include the '.jpg' extension itself
    img_download_url = html3[a_html3:b_html3]

    # ============== Step 4: download, save, and throttle ==============
    save_img(img_download_url, 'photo-pc' + str(count))
    count += 1
    if count_web > 5:  # NOTE(review): redundant safety stop — count_web only changes in step 1, where the loop already breaks
        break
    time.sleep(1)