bookSpyder

> [!NOTE]
> Highlights information that users should take into account, even when skimming.

> [!TIP]
> Optional information to help a user be more successful.

> [!IMPORTANT]
> Crucial information necessary for users to succeed.

> [!WARNING]
> Critical content demanding immediate user attention due to potential risks.

> [!CAUTION]
> Negative potential consequences of an action.

import re 
import time
import requests
from bs4 import BeautifulSoup
import os


# Root URL of the GitBook; every relative resource path is resolved
# against this when downloading.
index_url = 'https://wizardforcel.gitbooks.io/ios-sec-wiki/content/'
# Browser-like User-Agent so the site serves normal pages.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
# Site-relative paths of shared assets fetched up front by get_books().
css_url = 'gitbook/style.css'
font_url = 'gitbook/fonts/fontawesome/fontawesome-webfont.woff'

# Download the static resources referenced by a parsed page.
def get_source(soup, tag, attr):
    """Find every *tag* element in *soup* and download the resource its
    *attr* attribute points to.

    soup: BeautifulSoup document of one page.
    tag:  tag name or list of names to search (e.g. 'link', ['script','img']).
    attr: attribute holding the resource URL ('href' or 'src').

    Only site-relative, non-HTML resources are fetched; absolute
    external URLs and page links are skipped.
    """
    for element in soup.find_all(tag):
        if not element.has_attr(attr):
            continue
        url = element.get(attr)
        # Prefix test instead of substring test: a relative path that
        # merely *contains* "http" (e.g. "img/http_icon.png") is a local
        # resource, not an external link.
        if not url.startswith('http') and '.html' not in url:
            # Strip '../' so the asset is saved relative to the mirror root.
            download(url.replace('../', ''))
                
# Download one resource given its site-relative URL.
def download(url):
    """Fetch ``index_url + url`` and save it to the local path *url*.

    Skips files that already exist and creates intermediate directories
    as needed. Fixes vs. original: no crash when *url* has no directory
    component (``os.makedirs('')`` raises), no exists/makedirs race
    (``exist_ok=True``), misspelled local name, and the request now
    sends the same browser User-Agent as the rest of the script.
    """
    if os.path.exists(url):
        return
    dirpath = os.path.dirname(url)
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    download_url = index_url + url
    r = requests.get(download_url, headers=headers)
    with open(url, 'wb') as f:
        f.write(r.content)

# Fetch the book's table of contents.
def get_content_url():
    """Return the list of chapter page paths from the summary sidebar.

    Skips the first three <li> entries (header/intro items) and any
    <li> that has no <a href> — previously hidden behind a broad
    ``except Exception: pass`` that also swallowed real bugs.
    """
    res = requests.get(index_url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')
    summary = soup.find_all('ul', class_="summary")[0]
    pages = []
    for li in summary.find_all('li')[3:]:
        anchor = li.find('a')
        # Divider <li> items have no anchor; find() returns None there.
        if anchor is None or anchor.get('href') is None:
            continue
        pages.append(anchor.get('href').replace('./', ''))
    return pages

# Mirror a single chapter page together with its assets.
def deal_page(page_url):
    """Fetch one chapter page, download the CSS/JS/image resources it
    references, then save the page itself under *page_url*."""
    html = requests.get(index_url + page_url, headers=headers, stream=True).text
    page = BeautifulSoup(html, 'html5lib')
    # Progress indicator: show which chapter is being processed.
    print(page.find('title').string)
    get_source(page, 'link', 'href')
    get_source(page, ['script', 'img'], 'src')
    download(page_url)

# Mirror the entire book to the current directory.
def get_books():
    """Download the shared stylesheet and web font, then walk the table
    of contents and mirror every chapter page."""
    for asset in (css_url, font_url):
        download(asset)
    for page in get_content_url():
        deal_page(page)


if __name__ == "__main__":
    # Entry point: mirror the whole GitBook into the working directory.
    get_books()
posted @ 2022-01-28 21:50  GShang  阅读(32)  评论(0)    收藏  举报