import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class DownCartoon:
    def __init__(self):
        self.content_url = 'https://www.manhuabei.com/manhua/jinjidejuren/'
        self.base_url = 'https://www.manhuabei.com'
        # Mimic a desktop Chrome browser so the site serves its normal pages.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
        self.html_path = r'd:\进击的巨人.txt'
        self.file_path = r'D:\OneDrive\漫画\进击的巨人'

    def get_url(self, url):
        '''
        Generic GET request; returns the page text, or "" on failure.
        '''
        # timeout keeps the request from hanging forever on a dead connection
        r = requests.get(url, headers=self.header, timeout=10)
        if r.status_code == 200:
            return r.text
        else:
            return ""
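
    # Note: requests performs no retries by default. One possible hardening
    # (a sketch, not wired into this class) is to mount an HTTPAdapter with
    # a urllib3 Retry policy on a shared Session:
    #
    #     from requests.adapters import HTTPAdapter
    #     from urllib3.util.retry import Retry
    #     session = requests.Session()
    #     session.mount('https://', HTTPAdapter(
    #         max_retries=Retry(total=3, backoff_factor=1)))
    #     r = session.get(url, headers=self.header, timeout=10)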

    def parse_html(self, html_content):
        '''
        Parse the table-of-contents page with BeautifulSoup.
        Returns the list of chapter first-page URLs and the list of chapter titles.
        '''
        soup = BeautifulSoup(html_content, 'lxml')
        # self.save_webxml(self.html_path, soup.prettify())
        main = soup.find('ul', class_="list_con_li autoHeight")
        content = main.find_all('a')
        print("Total chapters:", len(content))
        chapter_url = []
        title_name = []
        for p in content:
            title_name.append(p['title'])
            chapter_url.append(p['href'])
        return chapter_url, title_name
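
    # parse_html relies on the site's current markup: a
    # <ul class="list_con_li autoHeight"> whose <a> tags carry the chapter
    # title in their 'title' attribute and a relative link in 'href'. If the
    # site renames that class, soup.find returns None and find_all raises
    # AttributeError, so this is the first place to check when scraping breaks.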

    def save_webxml(self, file_path, xml_content):
        '''
        Save the HTML to a local file (debugging helper).
        '''
        with open(file_path, 'w', encoding='UTF-8', errors='ignore') as write_blog:
            write_blog.write(xml_content)

    def download_one_page(self, href, dir_path, num):
        '''
        Download a single image and save it as <num+1>.jpg.
        '''
        strpic = str(num + 1) + '.jpg'
        full_path = os.path.join(dir_path, strpic)
        if not os.path.exists(full_path):
            try:
                r = requests.get(href, headers=self.header, timeout=10)
                if r.status_code == 200:
                    with open(full_path, 'wb') as img:
                        img.write(r.content)
                    print(strpic, "success")
                else:
                    print(full_path, 'download failed', href)
            except requests.RequestException:
                print('download failed', href)
        else:
            print(strpic, 'image already exists, skipping download')
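
    # For large images, a streamed download is gentler on memory than reading
    # r.content in one piece. A minimal sketch of the same save step, assuming
    # the same href/full_path as above:
    #
    #     with requests.get(href, headers=self.header, stream=True, timeout=10) as r:
    #         r.raise_for_status()
    #         with open(full_path, 'wb') as img:
    #             for chunk in r.iter_content(chunk_size=8192):
    #                 img.write(chunk)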

    def mkdir(self, own_dir_name):
        '''Create the chapter directory if needed and return its full path.'''
        own_dir_name = own_dir_name.strip()
        full_path = os.path.join(self.file_path, own_dir_name)
        if not os.path.exists(full_path):
            # print("creating folder", own_dir_name)
            os.makedirs(full_path)
            os.chdir(full_path)  # note: also switches the working directory
        # else the folder already exists; either way, return its path
        return full_path

    def run(self):
        content_list, title_list = self.parse_html(self.get_url(self.content_url))
        browser = webdriver.Chrome()
        self.download_content(browser, content_list, title_list)
        browser.quit()
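
    # webdriver.Chrome() opens a visible browser window. To run unattended,
    # a headless driver can be swapped in (a sketch, assuming chromedriver
    # is available on PATH):
    #
    #     from selenium.webdriver.chrome.options import Options
    #     options = Options()
    #     options.add_argument('--headless')
    #     browser = webdriver.Chrome(options=options)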

    def download_content(self, browser, content_list, title_list):
        '''
        Download the comic: open each chapter in the browser, read the page
        counter, then fetch every image in the chapter.
        '''
        cartoon_href_list = []
        for i, title in enumerate(title_list):
            chapter_name = title.split(" ")[0]
            print("Downloading %s (%s chapters in total)" % (chapter_name, len(title_list)))
            dir_path = self.mkdir(chapter_name)
            full_url = self.base_url + content_list[i]
            browser.get(full_url)
            img_url_list = []
            chapter_info = {}
            try:
                img_info = browser.find_element(By.CLASS_NAME, "img_info")
            except NoSuchElementException:
                print("Scrape failed!")
                continue
            tag_string = img_info.text
            try:
                init_page = browser.find_element(By.CSS_SELECTOR, "img[style='display: inline;']").get_attribute('src')
            except NoSuchElementException:
                print("Scrape failed!")
                continue
            img_url_list.append(init_page)
            # The counter reads like "1/24"; this assumes at most a two-digit total.
            num = int(tag_string.split('/')[1][0:2])
            print("dir_path:", dir_path)
            # print(num + 1, len(os.listdir(dir_path)))
            if num + 1 == len(os.listdir(dir_path)):
                print("%s already downloaded" % chapter_name)
                continue
            self.download_one_page(init_page, dir_path, 0)
            chapter_href = self.download_chapter(browser, dir_path, num)
            img_url_list.extend(chapter_href)
            chapter_info['href'] = img_url_list
            chapter_info['chapter_name'] = chapter_name
            cartoon_href_list.append(chapter_info)
        return cartoon_href_list
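
    # For illustration, cartoon_href_list ends up shaped like (URLs invented):
    #     [{'chapter_name': '第1话',
    #       'href': ['https://.../1.jpg', 'https://.../2.jpg', ...]}, ...]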

    def download_chapter(self, browser, dir_path, max_num):
        '''
        Download the rest of one chapter by clicking "next page" max_num times.
        '''
        img_url = []
        for x in range(0, max_num):
            browser.find_element(By.CLASS_NAME, "img_land_next").click()
            wait = WebDriverWait(browser, 10)
            try:
                wait_element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "img[style='display: inline;']")))
                href = wait_element.get_attribute('src')
                print("Downloading image:", x + 2)
                self.download_one_page(href, dir_path, x + 1)
                img_url.append(href)
            except TimeoutException:
                print("Wait failed!")
                continue
        return img_url
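
    # The explicit wait above polls every 0.5s (WebDriverWait's default
    # poll_frequency) for up to 10s until the newly flipped page's <img> is
    # clickable; that is what paces the click loop against lazy image loading.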


if __name__ == '__main__':
    down_load = DownCartoon()
    down_load.run()