1 import urllib.request
2 import os
3
def open_url(url):
    """Fetch *url* and return the response body as raw bytes.

    The request is sent with a desktop-browser ``User-Agent`` header, and the
    URL is echoed to stdout once the body has been read.

    Args:
        url: URL to fetch.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when the request fails.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
    )
    # Open the Request object, not the bare URL: opening the URL string
    # builds a fresh request and silently drops the header added above.
    with urllib.request.urlopen(req) as response:
        html = response.read()

    print(url)
    return html
12
def get_page(url):
    """Return the current comment-page identifier found in the page at *url*.

    The page is fetched with ``open_url`` and decoded as UTF-8. The result is
    the text between a fixed offset after the ``current-comment-page`` marker
    and the next ``]``.

    Args:
        url: URL of the page to inspect.

    Returns:
        The extracted text as a ``str`` (it is not converted to ``int`` here).

    Raises:
        ValueError: If the marker or the closing ``]`` is missing, instead of
            silently returning an unrelated slice of the page.
    """
    html = open_url(url).decode('utf-8')

    marker = 'current-comment-page'
    marker_pos = html.find(marker)
    if marker_pos == -1:
        raise ValueError("'current-comment-page' marker not found in " + url)

    # 23 is the offset from the marker to the start of the value:
    # len(marker) + 3, presumably skipping the `">[` that follows the marker
    # -- TODO confirm against the live markup.
    start = marker_pos + 23
    # Search for the closing bracket from the start position onwards.
    end = html.find(']', start)
    if end == -1:
        raise ValueError("closing ']' after page marker not found in " + url)
    return html[start:end]
19
def find_imgs(url):
    """Collect the ``.jpg`` image addresses referenced by the page at *url*.

    The page is fetched with ``open_url`` and decoded as UTF-8. Each
    occurrence of ``img src=`` is examined; when a ``.jpg`` is found within
    the 255 characters starting at that occurrence, the text from 9
    characters past the occurrence through the end of ``.jpg`` is recorded.
    Every recorded address is printed, one per line, before returning.

    Args:
        url: URL of the page to scan.

    Returns:
        A list of address strings in page order (duplicates are kept).
    """
    page = open_url(url).decode('utf-8')
    tag = 'img src='
    found = []

    # Left and right boundaries of each image address.
    left = page.find(tag)
    while left >= 0:
        # 9 = len('img src=') + 1; presumably skips the opening quote.
        value_start = left + 9
        right = page.find('.jpg', left, left + 255)
        if right == -1:
            # No .jpg nearby: resume the scan just past this tag.
            resume_at = value_start
        else:
            found.append(page[value_start:right + 4])
            resume_at = right
        left = page.find(tag, resume_at)

    for address in found:
        print(address)
    return found
36
def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* into the current working directory.

    The file name is the last ``/``-separated component of the address.

    Args:
        folder: Accepted for interface compatibility but not used here; files
            are written relative to the current working directory.
        img_addrs: Iterable of image URLs to download.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Fetch before opening the file so a failed download does not leave
        # an empty file behind.
        img = open_url(each)
        with open(filename, 'wb') as f:
            f.write(img)
43
def download_mm(folder='OOXX', pages=10):
    """Download the images from the newest *pages* comment pages.

    Creates *folder* if it does not exist and makes it the process working
    directory (a process-wide side effect). It then walks backwards one page
    at a time from the current page number reported by ``get_page``, saving
    every image found on each page.

    Args:
        folder: Directory to download into; created when missing.
        pages: Maximum number of pages to process, newest first.

    Raises:
        ValueError: If the current page number cannot be parsed as an int.
    """
    # exist_ok so that re-running the script does not fail on the folder
    # left behind by a previous run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = "http://jandan.net/ooxx/"
    newest_page = int(get_page(url))

    for i in range(pages):
        # Subtract from the fixed starting page; decrementing a running
        # counter by i would visit N, N-1, N-3, N-6, ... and skip pages.
        page_num = newest_page - i
        if page_num < 1:
            # Assumes page numbering starts at 1 -- nothing older to fetch.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        # Prefix each address individually (str + list raises TypeError).
        # Only scheme-relative addresses ("//host/...") get a scheme;
        # presumably that is what the site emits -- verify against the markup.
        img_addrs = [
            'http:' + addr if addr.startswith('//') else addr
            for addr in find_imgs(page_url)
        ]
        save_imgs(folder, img_addrs)
56
# Script entry point: run the downloader with its default arguments.
if __name__ == '__main__':
    download_mm()