不为别的,只为做一个连自己都羡慕的人

Python 实现网页转为 PDF

简单记录一下,避免以后将代码丢失

import base64
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# pdf下载
def save_webpage_as_pdf(url, output_path="webpage.pdf"):
    """Render a web page in headless Chrome and save it as one long PDF page.

    The page is opened, lazy-loaded images are forced to load (by copying
    ``data-src``/``data-srcset`` into ``src``/``srcset`` and by scrolling
    through the document), and the result is printed with the Chrome
    DevTools ``Page.printToPDF`` command using a paper height derived from
    the document's scroll height, so the whole page lands on a single sheet.

    Args:
        url: Address of the page to render.
        output_path: File the PDF bytes are written to.

    Returns:
        None. The PDF is written to ``output_path`` and progress messages
        are printed to stdout.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)
    # Everything after the browser is started runs under try/finally so the
    # Chrome process is shut down even when a step below raises.
    try:
        driver.get(url)

        # Best effort: wait up to 15 s for the element with id "js_content";
        # on timeout (or any other error) report it and carry on with
        # whatever has been rendered so far.
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.ID, "js_content"))
            )
        except Exception as e:
            print("加载内容超时或发生错误:", e)

        # 1. Undo lazy loading: promote data-src / data-srcset to the real
        #    attributes and ask the browser to load every image eagerly.
        driver.execute_script("""
            const images = document.querySelectorAll('img');
            images.forEach(img => {
                if(img.hasAttribute('data-src')) {
                    img.src = img.getAttribute('data-src');
                }
                if(img.hasAttribute('data-srcset')) {
                    img.srcset = img.getAttribute('data-srcset');
                }
                img.loading = 'eager';
            });
        """)

        time.sleep(2)  # give the rewritten image URLs a moment to start loading

        # 2. Scroll through the page in 300 px steps to trigger any remaining
        #    scroll-driven lazy loaders.
        scroll_height = driver.execute_script("return document.body.scrollHeight")
        for i in range(0, scroll_height, 300):
            driver.execute_script(f"window.scrollTo(0, {i});")
            time.sleep(0.2)  # let images near this offset load

        # Finish at the very bottom and wait for the last images.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        # 3. Count images that are incomplete or have no intrinsic height.
        #    If any remain, wait once more; the count is not re-checked.
        images_not_loaded = driver.execute_script("""
            const imgs = Array.from(document.images);
            return imgs.filter(img => !img.complete || img.naturalHeight === 0).length;
        """)
        if images_not_loaded > 0:
            print(f"有 {images_not_loaded} 张图片仍未加载,等待3秒重试...")
            time.sleep(3)
        else:
            print("所有图片已加载完毕。")

        # 4. Paper height: the scroll height in CSS pixels divided by 96
        #    (CSS pixels per inch), since printToPDF takes paper sizes in inches.
        paper_height = driver.execute_script("return document.body.scrollHeight / 96")

        result = driver.execute_cdp_cmd("Page.printToPDF", {
            "printBackground": True,
            "paperWidth": 8.27,
            "paperHeight": paper_height,
            "marginTop": 0,
            "marginBottom": 0,
            "marginLeft": 0,
            "marginRight": 0,
        })

        # The command returns the PDF as base64 text under the "data" key.
        pdf_data = base64.b64decode(result['data'])
        with open(output_path, "wb") as f:
            f.write(pdf_data)
        # Report success only after the file has been flushed and closed.
        print(f"网页已保存为 PDF:{output_path}")
    finally:
        driver.quit()

 

posted @ 2025-06-30 14:32  升级打怪  阅读(73)  评论(0)    收藏  举报