import requests
from bs4 import BeautifulSoup as bs
import re
import os
import threading


def url_open(url):
    """Fetch a URL with a browser-like User-Agent and a 20 s timeout."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    # Pass the timeout explicitly to requests instead of relying on
    # socket.setdefaulttimeout(), which is the idiomatic way with requests.
    res = requests.get(url, headers=headers, timeout=20)
    return res
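
# A possible hardening, not part of the original script: retry transient
# network failures a couple of times before giving up (a sketch; it would
# also need `import time`):
#   for attempt in range(3):
#       try:
#           return requests.get(url, headers=headers, timeout=20)
#       except requests.RequestException:
#           time.sleep(2)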


def save(url):  # takes the link of one sub-page (forum topic)
    res = url_open(url)
    res.encoding = 'gbk'
    soup = bs(res.text, 'lxml')
    title = soup.find('title').text.split('-')[0]  # topic title
    # os.mkdir(title)
    # os.chdir(title)
    temp = soup.find_all('tr', class_='tr3')
    # Pull every lazy-loaded image URL (data-src="...jpg") out of the rows.
    imglist = re.findall(r'data-src="(.*?jpg)" type', str(temp))

    for each in imglist:
        filename = each.split('/')[-1]
        img = url_open(each)
        print('saving... %s' % filename)
        with open(title + filename, 'wb') as f:
            f.write(img.content)
    # os.chdir('..')
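
# Note: `title` is taken straight from the page and may contain characters
# that are illegal in filenames on some systems. A minimal sanitizing sketch
# (not part of the original script):
#   safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)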


if __name__ == '__main__':
    os.makedirs('1024', exist_ok=True)
    os.chdir('1024')
    # Only the first page is crawled by default; take it easy. If you need
    # several pages, adding a for loop around this is not hard.
    url = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=1'
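    # A minimal multi-page sketch (commented out; assumes the page parameter
    # simply increments):
    #   for page in range(1, 6):
    #       url = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=%d' % page
    #       # ...then run the same parsing/downloading steps below per page.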
    # Links parsed from the page are relative, so this prefix has to be
    # prepended before they can be opened. Experience over the years says
    # the prefix changes; if the script stops working one day, check whether
    # the prefix has changed.
    urlhead = 'https://cl.e7s.win/'
    res = url_open(url)
    res.encoding = 'gbk'

    # Find all sub-page (topic) links on the index page.
    soup = bs(res.text, 'lxml')
    temp = soup.find_all('td', class_="tal")
    link = []
    for each in temp:
        link.append(urlhead + each.h3.a.get('href'))
    # del link[0:10]
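
    # Note: the loop below starts one thread per topic with no upper bound.
    # A bounded alternative (a sketch, not part of the original script):
    #   from concurrent.futures import ThreadPoolExecutor
    #   with ThreadPoolExecutor(max_workers=8) as pool:
    #       pool.map(save, link)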

    # Download each topic in its own thread, then wait for all of them.
    downloads = []
    for each in link:
        print(each)
        down = threading.Thread(target=save, args=(each,))
        downloads.append(down)
        down.start()
    for each in downloads:
        each.join()
    print('Done')