1 import urllib.request
2 import re
3 import os
4 import random
5 import threading
6
def url_open(url):
    """Fetch *url* over HTTP and return the raw response body as bytes.

    A browser-like User-Agent header is added because jandan.net rejects
    the default urllib client string.

    Raises urllib.error.URLError (or HTTPError) on network failure.
    """
    # NOTE(review): removed dead, commented-out proxy-rotation code; the
    # original author noted the free proxy IPs never worked reliably.
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection object).
    with urllib.request.urlopen(req) as response:
        return response.read()
22
def find_page(html):
    """Return the first page marker of the form ``[dddd]`` found in *html*.

    The marker is a 4-digit page number in square brackets, e.g. ``[1333]``.

    Raises ValueError when no marker is present.  (The original code
    called ``m.group()`` on a possible None and would crash with an
    unhelpful AttributeError.)
    """
    m = re.search(r'\[\d{4}\]', html)
    if m is None:
        raise ValueError('no page marker like [1333] found in html')
    print("find_page")  # kept: original debug trace
    return m.group()
29
def find_page_link(html):
    """Return all sinaimg mw600 .jpg image URLs found in *html*, in order.

    The original pattern left the dots unescaped, so ``.`` matched any
    character (e.g. ``wwXsinaimgYcn`` would match); they are now escaped.
    """
    pattern = r'http://ww[0-9]\.sinaimg\.cn/mw600/\w+\.jpg'
    return re.findall(pattern, html)
34
35
def save_page(jpg, dir_name='E:\\作业\\j_d'):
    """Download every image URL in *jpg* and write each into *dir_name*.

    The file name is the last path segment of the URL.  *dir_name*
    defaults to the original hard-coded folder for backward
    compatibility, but can now be overridden by callers.

    Relies on url_open() for the actual HTTP fetch; network errors
    propagate to the caller.
    """
    for url in jpg:
        data = url_open(url)
        # os.path.join instead of manual backslash concatenation.
        name = os.path.join(dir_name, url.split('/')[-1])
        with open(name, 'wb') as f:
            f.write(data)
43
44
def down_jpg(dir_name=r'E:\作业\j_d', page=10, pages=10):
    """Download *pages* consecutive pages of images from jandan.net/ooxx.

    Fetching starts at page number ``page + 1``; images are saved into
    *dir_name* (which must already exist).

    NOTE(review): raw string used for the default path — the original
    non-raw literal relied on ``\\作`` / ``\\j`` not being recognized
    escapes, which is a DeprecationWarning in modern Python.
    NOTE(review): os.chdir is process-global; the two threads in main
    both pass the same dir_name so this is harmless here, but different
    directories per thread would race.
    """
    os.chdir(dir_name)
    for _ in range(pages):
        page += 1
        url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
        print(url)  # progress trace, kept from original
        html = url_open(url).decode('utf-8')
        image_urls = find_page_link(html)
        save_page(image_urls)
65
66
if __name__ == '__main__':
    # Two worker threads fetch disjoint page ranges (1556-1565 and
    # 1025-1034) into the same folder.  Raw string avoids the invalid
    # escape sequences (\作, \j) of the original literals.
    target_dir = r'E:\作业\j_d'
    workers = [
        threading.Thread(target=down_jpg, args=(target_dir, 1555, 10)),
        threading.Thread(target=down_jpg, args=(target_dir, 1024, 10)),
    ]
    for t in workers:
        t.start()
    for t in workers:
        t.join()