1 import requests,json,urllib.parse
2 import threading
threading_lock=threading.BoundedSemaphore(value=10)# cap concurrent download threads at 10
4
def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Fully-qualified URL to request.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # requests.get() blocks indefinitely by default; a timeout keeps a
    # stalled server from hanging the whole scraper.
    response = requests.get(url, timeout=30)
    return response.content.decode('utf-8')
10
def pages_from_duitang(label, max_results=3000):
    """Download duitang search-result pages (raw JSON text) for *label*.

    Args:
        label: Search keyword; URL-quoted before being placed in the query.
        max_results: Upper bound on results to page through, 100 per request
            (default 3000, matching the original hard-coded limit).

    Returns:
        A list of response bodies, one ``str`` per fetched page.
    """
    pages = []
    url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&type=feed&start={}&limit=100'
    quoted_label = urllib.parse.quote(label)
    # The API pages by a 'start' offset; step matches the limit=100 above.
    for start in range(0, max_results, 100):
        page_url = url.format(quoted_label, start)
        print(page_url)
        pages.append(get_page(page_url))
    return pages
21
22
23
def findall_page(page, startpaet, endstart):
    """Return every substring of *page* enclosed between the two delimiters.

    Scans left to right: each occurrence of *startpaet* is paired with the
    next occurrence of *endstart* after it, and the text in between is
    collected. (Parameter names kept as-is for backward compatibility.)

    Args:
        page: Text to scan.
        startpaet: Opening delimiter.
        endstart: Closing delimiter.

    Returns:
        A list of the extracted substrings, in order of appearance.
    """
    matches = []
    end = 0
    while True:
        start = page.find(startpaet, end)
        if start == -1:
            break
        start += len(startpaet)
        end = page.find(endstart, start)
        if end == -1:
            # Opening delimiter with no closing one: stop rather than
            # appending a truncated fragment (the original sliced to -1,
            # silently dropping the last character).
            break
        matches.append(page[start:end])
    return matches
33
def pic_urls_from_pages(pages):
    """Collect every image URL embedded in the given search-result pages.

    Args:
        pages: Iterable of raw JSON response texts from duitang.

    Returns:
        A flat list of image URLs, in page order.
    """
    # Each page is raw JSON text; the image address sits in its "path" field.
    return [pic_url
            for page in pages
            for pic_url in findall_page(page, '"path":"', '"')]
41
def download_pics(url, name):
    """Download one picture to the fixed desktop folder, then free a slot.

    Runs in a worker thread started by main(); the caller acquires the
    semaphore before starting the thread, so it MUST be released here.

    Args:
        url: Direct URL of the image to download.
        name: Numeric id used as the output file name (``<name>.jpg``).
    """
    try:
        r = requests.get(url)
        path = r'C:\Users\Administrator\Desktop\爬取堆糖图片\pics\\'+str(name)+ '.jpg'
        with open(path, 'wb') as f:
            f.write(r.content)
    finally:
        # Release even when the download or the file write raises;
        # otherwise the slot leaks and main() eventually blocks forever.
        threading_lock.release()
def main(label):
    """Search duitang for *label* and download every picture found.

    Fetches the search-result pages, extracts the image URLs, then spawns
    one download thread per image, throttled by the module semaphore.

    Args:
        label: Search keyword (e.g. '表情包').
    """
    pages = pages_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    for name, url in enumerate(pic_urls, start=1):
        # Acquire before spawning; download_pics releases when done.
        threading_lock.acquire()
        print('正在下载第{}张图片'.format(name))
        threading.Thread(target=download_pics, args=(url, name)).start()
59
60
61
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not on import.
    main('表情包')