import os

import requests
from bs4 import BeautifulSoup

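# Scraper for the emoticon packs listed on sc.chinaz.com/biaoqing: it walks
# the paginated listing, follows each pack's detail page, and downloads every
# image into a per-pack folder on the local disk.
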
# Fetch a page: return decoded text for HTML, raw bytes for image data.
def getHtmlText(url, s='text'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        if s == 'text':
            return r.text
        elif s == 'content':
            return r.content
        else:
            return ''
    except requests.RequestException:
        # Return an empty value of the matching type so callers can test it.
        return b'' if s == 'content' else ''


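# Listing-page markup this scraper expects: each pack sits in a
# <div class="up">, and the <a> inside its <div class="num_1"> carries the
# pack title ('title' attribute) and the detail-page URL ('href' attribute).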
# Get each emoticon pack's name and the link to its detail page.
def getEmotionInfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    emo_divs = soup.find_all('div', attrs={'class': 'up'})
    for div in emo_divs:
        a = div.find('div', attrs={'class': 'num_1'}).find('a')
        title = a.attrs['title']
        href = a.attrs['href']
        getEmotionImgInfo(title, href)


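# On a pack's detail page, the container holding the <img> tags is the second
# sibling of the <div class="img_text"> node; .next_sibling is called twice
# presumably because the first sibling is the whitespace text node between
# the two tags.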
# Get the URL of every image in an emoticon pack.
def getEmotionImgInfo(title, href):
    html = getHtmlText(href)
    soup = BeautifulSoup(html, 'html.parser')
    img_div = soup.find('div', attrs={'class': 'img_text'}).next_sibling.next_sibling
    imgs = img_div.find_all('img')
    url_list = []
    for img in imgs:
        src = img.attrs['src']
        url_list.append(src)
    getImg(title, url_list)


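# Images are saved under D:\pics\<pack title>\, each named after the last
# path segment of its URL; files that already exist locally are skipped.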
# Download an emoticon pack and save it locally.
def getImg(title, url_list):
    root = os.path.join('D:\\pics', title)
    if not os.path.exists(root):
        os.makedirs(root)
    count_small = 0
    for key in url_list:
        path = os.path.join(root, key.split('/')[-1])
        if not os.path.exists(path):
            img_content = getHtmlText(key, 'content')
            if img_content:  # skip images whose download failed
                with open(path, 'wb') as f:
                    f.write(img_content)
        count_small = count_small + 1
        print('\r{} download progress: {:.2f}%'.format(title, count_small * 100 / len(url_list)), end='')
    print()


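# Pagination on the site: page 1 is index.html, while later pages follow the
# pattern index_<n>.html (index_2.html, index_3.html, ...).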
if __name__ == '__main__':
    first_url = 'http://sc.chinaz.com/biaoqing/index.html'
    root_url = 'http://sc.chinaz.com/biaoqing/index_'

    pages = 20
    for i in range(1, pages + 1):  # crawl the listing pages one by one
        if i == 1:
            html = getHtmlText(first_url)
        else:
            url = root_url + str(i) + '.html'
            html = getHtmlText(url)
        getEmotionInfo(html)