Python 实现网页转为 PDF
简单记录一下，避免以后代码丢失。
import base64
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


# PDF export
def save_webpage_as_pdf(url: str, output_path: str = "webpage.pdf") -> None:
    """Render a web page in headless Chrome and save it as a single-page PDF.

    The page is opened, lazily loaded images are forced to load (by copying
    ``data-src``/``data-srcset`` into ``src``/``srcset`` and by scrolling
    through the document), and the result is printed through the Chrome
    DevTools ``Page.printToPDF`` command with a paper height matching the
    full document height, so the output is one tall page.

    Args:
        url: Address of the page to render.
        output_path: File the PDF bytes are written to (overwritten if it
            already exists).

    Raises:
        Exception: Any WebDriver or file-system error other than a failure of
            the initial content wait, which is reported and tolerated.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)
    # try/finally guarantees the browser process is shut down even when a
    # step below raises; otherwise a headless Chrome instance would leak.
    try:
        driver.get(url)

        # Best-effort wait for the element with id "js_content" (presumably
        # the article body of the target site -- confirm for other sites).
        # A failure here is reported but does not abort the export.
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.ID, "js_content"))
            )
        except Exception as e:
            print("加载内容超时或发生错误:", e)

        # 1. Disable lazy loading: promote data-src / data-srcset to the real
        #    attributes and ask the browser to load every image eagerly.
        driver.execute_script("""
            const images = document.querySelectorAll('img');
            images.forEach(img => {
                if(img.hasAttribute('data-src')) {
                    img.src = img.getAttribute('data-src');
                }
                if(img.hasAttribute('data-srcset')) {
                    img.srcset = img.getAttribute('data-srcset');
                }
                img.loading = 'eager';  // 强制立即加载
            });
        """)
        time.sleep(2)  # give the newly assigned sources time to start loading

        # 2. Scroll through the page in 300px steps to trigger any remaining
        #    scroll-based lazy-loading logic.
        scroll_height = driver.execute_script("return document.body.scrollHeight")
        for i in range(0, scroll_height, 300):
            driver.execute_script(f"window.scrollTo(0, {i});")
            time.sleep(0.2)  # let images near the viewport load

        # Finish at the very bottom of the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # wait for the last images

        # 3. Count images that are still incomplete or have no rendered height.
        images_not_loaded = driver.execute_script("""
            const imgs = Array.from(document.images);
            return imgs.filter(img => !img.complete || img.naturalHeight === 0).length;
        """)
        if images_not_loaded > 0:
            print(f"有 {images_not_loaded} 张图片仍未加载,等待3秒重试...")
            time.sleep(3)  # one extra grace period; the count is not re-checked
        else:
            print("所有图片已加载完毕。")

        # 4. Compute the PDF height: printToPDF takes inches, and CSS defines
        #    96 px per inch, so the whole document fits on one page.
        paper_height = driver.execute_script("return document.body.scrollHeight / 96")
        result = driver.execute_cdp_cmd("Page.printToPDF", {
            "printBackground": True,
            "paperWidth": 8.27,  # A4 width in inches
            "paperHeight": paper_height,
            "marginTop": 0,
            "marginBottom": 0,
            "marginLeft": 0,
            "marginRight": 0,
        })

        # The CDP response carries the PDF as a base64-encoded string.
        pdf_data = base64.b64decode(result['data'])
        with open(output_path, "wb") as f:
            f.write(pdf_data)
        print(f"网页已保存为 PDF:{output_path}")
    finally:
        driver.quit()