bookSpyder
[!NOTE]
Highlights information that users should take into account, even when skimming.
[!TIP]
Optional information to help a user be more successful.
[!IMPORTANT]
Crucial information necessary for users to succeed.
[!WARNING]
Critical content demanding immediate user attention due to potential risks.
[!CAUTION]
Negative potential consequences of an action.
import re
import time
import requests
from bs4 import BeautifulSoup
import os
# Root URL of the GitBook being mirrored; relative resource and page paths
# are appended to it when building request URLs.
index_url = "https://wizardforcel.gitbooks.io/ios-sec-wiki/content/"

# Request headers carrying a desktop-browser User-Agent string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36(KHTML, like Gecko) "
        "Chrome/77.0.3865.75 Safari/537.36"
    ),
}

# Site-relative paths of the stylesheet and icon font that get_books()
# downloads before crawling the pages.
css_url = "gitbook/style.css"
font_url = "gitbook/fonts/fontawesome/fontawesome-webfont.woff"
# Download the resources referenced by a page
def get_source(soup, tag, attr):
    """Download every relative, non-HTML resource referenced by matching tags.

    Args:
        soup: Parsed document (BeautifulSoup object) to search.
        tag: Tag name, or list of tag names, passed to ``soup.find_all``.
        attr: Name of the attribute holding the resource link
            (for example ``'href'`` or ``'src'``).
    """
    # Local import keeps this function self-contained; the module's import
    # block does not bring in urllib.
    from urllib.parse import urlparse

    for element in soup.find_all(tag):
        url = element.get(attr)
        # Missing or empty attributes have nothing to download, and
        # multi-valued attributes come back as lists, not strings.
        if not url or not isinstance(url, str):
            continue

        parsed = urlparse(url)
        # Anything with a scheme (http:, https:, data:, ...) or a host
        # (protocol-relative //cdn/...) does not live under index_url.
        # Checking the parsed parts instead of a bare substring also keeps
        # relative paths that merely contain the text "http".
        if parsed.scheme or parsed.netloc:
            continue

        # HTML pages are fetched by the page crawler, not as resources.
        if ".html" in url:
            continue

        # Links are written relative to the page; dropping '../' makes them
        # relative to the book root, which is what download() expects.
        download(url.replace('../', ''))
# Download a resource given its site-relative link
def download(url):
    """Fetch ``index_url + url`` and save it under the same relative path.

    A file that already exists locally is not downloaded again. Responses
    with an HTTP error status are reported and not written to disk.

    Args:
        url: Path of the resource relative to ``index_url``; the same string
            is used as the local output path.
    """
    if os.path.exists(url):
        return

    response = requests.get(index_url + url, headers=headers)
    # Saving the body of an error response would leave a bogus file that the
    # existence check above then treats as already downloaded.
    if not response.ok:
        print('skip %s (HTTP %s)' % (url, response.status_code))
        return

    directory = os.path.dirname(url)
    # A bare file name has no directory part, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(url, 'wb') as f:
        f.write(response.content)
# Get the table of contents
def get_content_url():
    """Return the page links listed in the book's table of contents.

    Fetches the index page, takes the first ``<ul class="summary">`` and
    reads the first anchor inside every ``<li>`` after the first three.

    Returns:
        list: ``href`` values with every ``./`` removed, in document order.
        Entries without an anchor, without an ``href``, or whose path is
        empty after the removal are left out.

    Raises:
        IndexError: If the index page contains no ``<ul class="summary">``.
    """
    res = requests.get(index_url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')
    items = soup.find_all('ul', class_="summary")[0]

    list1 = []
    # The first three <li> entries are skipped.
    # NOTE(review): presumably these are non-chapter entries - confirm.
    for url_li in items.find_all('li')[3:]:
        anchor = url_li.find('a')
        # Entries without a link are expected; skip them explicitly rather
        # than hiding every possible error behind a blanket except.
        if anchor is None:
            continue
        url_href = anchor.get('href')
        if not url_href:
            continue
        url_content = url_href.replace('./', '')
        # An href of just './' collapses to an empty path, which cannot be
        # saved as a local file.
        if url_content:
            list1.append(url_content)
    return list1
# Process a single page
def deal_page(page_url):
    """Download one page of the book and the resources it references.

    Prints the page title, passes the page's ``<link href>`` and
    ``<script>``/``<img>`` ``src`` attributes to get_source(), then saves the
    page itself with download().

    Args:
        page_url: Path of the page relative to ``index_url``.
    """
    # The whole body is read immediately through .text, so a streamed
    # response would gain nothing here.
    res = requests.get(index_url + page_url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')

    title = soup.find('title')
    # A page without a <title> must not abort the crawl; show its path instead.
    print(title.string if title is not None else page_url)

    get_source(soup, 'link', 'href')
    get_source(soup, ['script', 'img'], 'src')
    download(page_url)
# Download the whole book
def get_books():
    """Mirror the book: shared assets first, then every page in the contents.

    The stylesheet and the icon font are downloaded before the table of
    contents is fetched; each listed page is then processed in order.
    """
    for shared_asset in (css_url, font_url):
        download(shared_asset)

    for page_path in get_content_url():
        deal_page(page_path)
# Start the download only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    get_books()
浙公网安备 33010602011771号