【Python】【小甲鱼】爬虫4-XXOO

 

 

 1 import urllib.request
 2 import os
 3 
 4 def open_url(url):
 5     req=urllib.request.Request(url)
 6     req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')
 7     response=urllib.request.urlopen(url)
 8     html=response.read()
 9 
10     print(url)
11     return html
12 
def get_page(url):
    """Return the current comment-page number found on *url*, as a string.

    The page is downloaded with ``open_url`` and decoded as UTF-8. The
    number is the text between the ``current-comment-page`` marker
    (plus a fixed offset) and the next ``]``.

    Args:
        url: Address of the listing page.

    Returns:
        The page number as a string (the caller converts it to ``int``).

    Raises:
        ValueError: If the marker or the closing ``]`` is not present.
    """
    html = open_url(url).decode('utf-8')

    marker = 'current-comment-page'
    pos = html.find(marker)
    if pos == -1:
        # Without this check, find() returning -1 would yield a garbage slice.
        raise ValueError("'current-comment-page' marker not found in " + url)

    # 23 = len(marker) (20) + 3 characters between the marker and the
    # number (presumably '">[' -- confirm against the live markup).
    a = pos + 23
    b = html.find(']', a)  # search for the closing bracket starting at a
    if b == -1:
        raise ValueError("closing ']' after page marker not found in " + url)
    return html[a:b]
19     
def find_imgs(url):
    """Collect the ``.jpg`` image addresses referenced on the page at *url*.

    The page is fetched with ``open_url`` and decoded as UTF-8. For every
    ``img src=`` occurrence, a ``.jpg`` suffix is searched for within the
    following 255 characters. If found, the text from 9 characters past
    the tag start up to and including ``.jpg`` is recorded. Each recorded
    address is printed before the list is returned.

    Args:
        url: Address of the page to scan.

    Returns:
        List of address strings, in page order.
    """
    page = open_url(url).decode('utf-8')
    tag = 'img src='
    found = []

    start = page.find(tag)
    while start != -1:
        # Bound the search so a .jpg belonging to a later tag is not matched.
        end = page.find('.jpg', start, start + 255)
        if end == -1:
            # Not a .jpg image: resume scanning just past this tag.
            resume = start + 9
        else:
            found.append(page[start + 9:end + 4])
            resume = end
        start = page.find(tag, resume)

    for addr in found:
        print(addr)
    return found
36 
def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* and write it to a local file.

    Files are written relative to the current working directory, named
    after the last path segment of the address.

    Args:
        folder: Unused; kept for interface compatibility. The caller in
            this file (``download_mm``) changes into the target directory
            before calling.
        img_addrs: Iterable of image URLs to download.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Fetch first so a failed download does not leave an empty file.
        img = open_url(each)
        with open(filename, 'wb') as f:
            f.write(img)
43 
def download_mm(folder='OOXX', pages=10):
    """Download images from the newest *pages* comment pages into *folder*.

    Creates *folder* if needed, makes it the current working directory,
    reads the newest page number from the site's front page, then walks
    backwards one page at a time, saving every image found.

    Args:
        folder: Directory to store the images in.
        pages: Maximum number of pages to process, counting back from
            the newest one.
    """
    # exist_ok avoids crashing when the script is run a second time.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = "http://jandan.net/ooxx/"
    newest_page = int(get_page(url))

    for i in range(pages):
        # Subtract from the fixed starting page. The previous
        # `page_num -= i` decremented cumulatively and skipped pages.
        page_num = newest_page - i
        if page_num < 1:
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        # Make protocol-relative addresses ('//host/path') absolute. The
        # previous code added a str to the returned list, which raises
        # TypeError.
        img_addrs = [
            'http:' + addr if addr.startswith('//') else addr
            for addr in find_imgs(page_url)
        ]
        save_imgs(folder, img_addrs)
56 
# Script entry point: run the downloader with its default settings.
if __name__ == "__main__":
    download_mm()

 

posted @ 2017-04-10 22:10  猪猪宝丫  阅读(906)  评论(0)    收藏  举报