import os

import requests
from bs4 import BeautifulSoup

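# Scraper for the emoticon packs listed on sc.chinaz.com/biaoqing: it walks
# the paginated listing, follows each pack's detail page, and downloads every
# image into a per-pack folder on the local disk.
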
# Fetch a page: return decoded text for HTML, raw bytes for image data.
def getHtmlText(url, s='text'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        if s == 'text':
            return r.text
        elif s == 'content':
            return r.content
        else:
            return ''
    except requests.RequestException:
        # Return an empty value of the matching type so callers can test it.
        return b'' if s == 'content' else ''


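# Listing-page markup this scraper expects: each pack sits in a
# <div class="up">, and the <a> inside its <div class="num_1"> carries the
# pack title ('title' attribute) and the detail-page URL ('href' attribute).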
# Get each emoticon pack's name and the link to its detail page.
def getEmotionInfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    emo_divs = soup.find_all('div', attrs={'class': 'up'})
    for div in emo_divs:
        a = div.find('div', attrs={'class': 'num_1'}).find('a')
        title = a.attrs['title']
        href = a.attrs['href']
        getEmotionImgInfo(title, href)


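# On a pack's detail page, the container holding the <img> tags is the second
# sibling of the <div class="img_text"> node; .next_sibling is called twice
# presumably because the first sibling is the whitespace text node between
# the two tags.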
# Get the URL of every image in an emoticon pack.
def getEmotionImgInfo(title, href):
    html = getHtmlText(href)
    soup = BeautifulSoup(html, 'html.parser')
    img_div = soup.find('div', attrs={'class': 'img_text'}).next_sibling.next_sibling
    imgs = img_div.find_all('img')
    url_list = []
    for img in imgs:
        src = img.attrs['src']
        url_list.append(src)
    getImg(title, url_list)


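# Images are saved under D:\pics\<pack title>\, each named after the last
# path segment of its URL; files that already exist locally are skipped.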
# Download an emoticon pack and save it locally.
def getImg(title, url_list):
    root = os.path.join('D:\\pics', title)
    if not os.path.exists(root):
        os.makedirs(root)
    count_small = 0
    for key in url_list:
        path = os.path.join(root, key.split('/')[-1])
        if not os.path.exists(path):
            img_content = getHtmlText(key, 'content')
            if img_content:  # skip images whose download failed
                with open(path, 'wb') as f:
                    f.write(img_content)
        count_small = count_small + 1
        print('\r{} download progress: {:.2f}%'.format(title, count_small * 100 / len(url_list)), end='')
    print()


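# Pagination on the site: page 1 is index.html, while later pages follow the
# pattern index_<n>.html (index_2.html, index_3.html, ...).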
if __name__ == '__main__':
    first_url = 'http://sc.chinaz.com/biaoqing/index.html'
    root_url = 'http://sc.chinaz.com/biaoqing/index_'

    pages = 20
    for i in range(1, pages + 1):  # crawl the listing pages one by one
        if i == 1:
            html = getHtmlText(first_url)
        else:
            url = root_url + str(i) + '.html'
            html = getHtmlText(url)
        getEmotionInfo(html)