
Octopus solution: 原创力文档 (book118)

This script saves a document's pages as images. It only works on documents with a free preview; paid documents cannot be processed. Please support the official content [doge].
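Assuming the script below is saved as get_doc.py, a typical invocation looks like this (the document URL is a hypothetical placeholder; substitute the preview page you want to fetch):

python3 get_doc.py "https://max.book118.com/html/2023/0528/xxxxxxxx.shtm"  # hypothetical URL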

import datetime
import json
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup

class MaxDoc:
    def __init__(self, url):
        self.url = url

    def print_msg(self, msg):
        # Print a message prefixed with a timestamp (microseconds trimmed off)
        print(f"{str(datetime.datetime.now())[0:-7]}\t{msg}")

    def run(self):
        # Images are stored in an img directory next to the script
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "img")
        if os.path.exists(path) and os.listdir(path):
            self.print_msg("An img directory with files already exists; delete or move it to continue")
            sys.exit()
        elif not os.path.exists(path):
            self.print_msg("No img directory found, creating it")
            os.mkdir(path)
        doc_url = self.url  # the document page to scrape
        session = requests.session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.68"
        })
        html = session.get(doc_url).text
        soup = BeautifulSoup(html, "html.parser")
        # The preview parameters live in the sixth inline <script>; the index is
        # site-specific and may break if book118 changes its page layout
        script = str(soup.select("script")[5].next)
        t = re.search(r"(?:senddate:)(.*),", script).group(1).replace("'", "").strip()
        # The trailing Chinese comment in the pattern is part of the page's own JavaScript source
        view_token = re.search(r"(?:view_token:)(.*)//预览的token", script).group(1).replace("'", "").strip()
        project_id = 1
        aid = re.findall(r"(?:aid:)(.*),", script)[-1].strip()
        actual_page = re.search(r"(?:actual_page:)(.*),", script).group(1).strip()
        doc_format = re.search(r"(?:format:)(.*),", script).group(1).strip().strip("'")
        if doc_format == "ppt":
            # PPT previews have no per-page images; point the user at the online viewer instead
            self.print_msg(f"Format is ppt and cannot be downloaded; view it here instead\nhttps:{view_token}")
        else:
            doc_url_dict = {}  # page number -> image URL
            for i in range(int(actual_page)):
                # getPreview returns a JSONP payload with image URLs for a batch of pages
                url_list = session.get("https://openapi.book118.com/getPreview.html", params={
                    "t": t,
                    "view_token": view_token,
                    "project_id": project_id,
                    "aid": aid,
                    "page": i + 1
                }).text
                time.sleep(1)  # throttle requests to avoid hammering the API
                if url_list[0] != "j":  # not a jsonpReturn(...) response, so it is a plain JSON error
                    self.print_msg(json.loads(url_list)["message"])
                    return
                # Strip the JSONP wrapper (assumed shape: jsonpReturn({...},1)) before parsing
                now = json.loads(url_list.strip("jsonpReturn(").strip(")")[:-2])
                if now["status"] != 200:
                    self.print_msg("Unknown error")
                    os.rmdir(path)
                    return
                # Merge this batch into the full map, skipping empty and already-seen pages
                for j in now["data"]:
                    if now["data"][j] != "" and doc_url_dict.get(j) is None:
                        doc_url_dict[j] = now["data"][j]
            for i in doc_url_dict:
                with open(os.path.join(path, f"{i}.png"), "wb") as f:
                    print(f"{str(datetime.datetime.now())[0:-7]}\tDownloading image {i}...", end="")
                    f.write(session.get(f"https:{doc_url_dict[i]}").content)
                    print("done")
            self.print_msg("Scraping complete; images are in the img directory next to the script")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("用法:\npython3 get_doc.py url")
        sys.exit()
    get_doc = MaxDoc(sys.argv[1])  # pass the URL of the document you want to scrape
    get_doc.run()
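The fragile part of the script is peeling the jsonpReturn(...) wrapper off the getPreview response with chained strip() calls, which relies on one exact response shape. A more defensive sketch of that step, assuming the payload is a JSON object wrapped as jsonpReturn({...},1) (the helper name unwrap_jsonp is hypothetical):

import json
import re

def unwrap_jsonp(text):
    # Hypothetical helper: pull the JSON object out of a JSONP wrapper such as
    # jsonpReturn({...},1); the exact wrapper shape is an assumption
    m = re.search(r"\(\s*(\{.*\})", text, re.S)
    return json.loads(m.group(1)) if m else None

# e.g. unwrap_jsonp('jsonpReturn({"status":200,"data":{}},1)') -> {'status': 200, 'data': {}}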
