1 # -*- coding:utf-8 -*-
2 """
3 下载煎蛋妹子到本地,通过selenium、正则表达式、phantomjs、Beautifulsoup实现
4 """
5
6 import re
7 import os
8
9 from selenium import webdriver
10 from selenium.webdriver.support.wait import WebDriverWait
11 from selenium.webdriver.support import expected_conditions as EC
12 from selenium.webdriver.common.by import By
13 from selenium.common.exceptions import TimeoutException
14 from bs4 import BeautifulSoup
15 from urllib import urlretrieve
16
# Workaround for the Chrome "browser is being controlled by automated test
# software" infobar (kept for reference; PhantomJS is used below instead).
# options = webdriver.ChromeOptions()
# options.add_argument('disable-infobars')

url = 'http://jandan.net/ooxx'
# driver = webdriver.Chrome(chrome_options=options)
driver = webdriver.PhantomJS()
wait = WebDriverWait(driver, 30)

# Folder the downloaded images are saved into
img_save_file = 'images'
28
# Get the total page count: the jandan.net ooxx landing page exposes the
# current/total page indicator in a '.current-comment-page' element.
def get_default_page_num():
    """Open the default gallery page and return the page-indicator text.

    Retries recursively on a load timeout.
    """
    try:
        driver.get(url)
        page_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.current-comment-page')))
        return page_element.text
    except TimeoutException:
        # Bug fix: the retry's result must be returned — previously the
        # recursive call's value was dropped and the caller received None
        # even when the retry succeeded.
        return get_default_page_num()
37
# Collect the image URLs found on one gallery page
def get_img_url(page_number):
    """Load gallery page *page_number* and return a list of image URLs.

    Retries (recursively) when the page fails to render within the wait
    timeout.
    """
    page_url = r'http://jandan.net/ooxx/page-' + str(page_number) + r'#comments'
    print(page_url)
    try:
        # Bug fix: the original called driver.get(url) twice, loading the
        # page redundantly; a single load inside the try suffices.
        driver.get(page_url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#comments > ol img')))
    except TimeoutException:
        print("打开页面失败,重新加载该页面")
        # Bug fix: return the retry's result. Previously execution fell
        # through and parsed the page whose load had just timed out.
        return get_img_url(page_number)

    # Parse the rendered page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    img_url_list = []
    # GIF entries carry the full image in the 'org_src' attribute;
    # plain images only have 'src'.
    for img in soup.find_all('img'):
        if img.has_attr('org_src'):
            img_url_list.append(img.attrs['org_src'])
        else:
            img_url_list.append(img.attrs['src'])
    return img_url_list
66
# Download one image via urllib's urlretrieve
def download_img(img_url):
    """Save *img_url* into the img_save_file folder, named after its URL basename."""
    img_name = img_url.split('/')[-1]
    # Bug fix: img_save_path was computed but never used — the path was
    # rebuilt inline in the urlretrieve call. Build it once with os.path.join.
    img_save_path = os.path.join(img_save_file, img_name)
    urlretrieve(img_url, img_save_path)
72
# Create the folder the downloaded images are stored in
def add_img_save_file(img_save_file):
    """Create directory *img_save_file* (including parents) if it does not exist."""
    # Idiom fix: guard clause instead of `if exists: pass / else: makedirs`.
    if not os.path.exists(img_save_file):
        os.makedirs(img_save_file)
79
80 def main():
81 add_img_save_file(img_save_file)
82 #通过正则表达式提取当前的页数
83 partner = re.compile(r'(\d+)')
84 content = get_default_page_num()
85 total_pages = partner.search(content).group()
86
87 for i in range(1, int(total_pages) + 1):
88 print "正在下载第" + str(i) + '的图片,url为:',
89 img_url_list = get_img_url(str(i))
90 for img_url in img_url_list:
91 download_img(img_url)
92
# Run the scraper only when executed as a script, not on import
if __name__ == '__main__':
    main()