在 Python 里使用 Playwright
Playwright 是由微软开发的一款开源的 Web 自动化测试框架，主要用于自动化测试和浏览器操作。
它是一个跨浏览器的自动化工具，支持 Python、JavaScript 等多种语言。
安装
pip install playwright
安装 Playwright 支持的浏览器
playwright install
从 HTML 中提取文字、标题、摘要和关键字
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup  # used to parse the rendered HTML


def _meta_content(soup, name, default):
    """Return the ``content`` of ``<meta name=...>``, or *default* if unavailable."""
    tag = soup.find("meta", attrs={"name": name})
    if tag is None:
        return default
    # A <meta> tag may exist without a ``content`` attribute; indexing with
    # tag["content"] would raise KeyError in that case, so fall back instead.
    return tag.get("content", default)


def extract_page_content(url: str) -> dict:
    """Load *url* in headless Chromium and extract basic page information.

    Args:
        url: Address of the page to open.

    Returns:
        A dict with the keys ``"title"``, ``"description"``, ``"keywords"``
        and ``"text"``. Missing title / description / keywords are replaced
        by the placeholder strings ``"No title found"``,
        ``"No description found"`` and ``"No keywords found"``.

    Raises:
        Whatever Playwright raises when launching the browser, navigating,
        or waiting for the page fails; the browser is closed before the
        exception propagates.
    """
    with sync_playwright() as p:
        # Set headless=False to watch the browser while debugging.
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            # Wait for the network to go idle before snapshotting the HTML.
            page.wait_for_load_state("networkidle")
            html_content = page.content()
        finally:
            # Close the browser even when navigation or waiting fails.
            browser.close()

    soup = BeautifulSoup(html_content, "html.parser")

    # Page title.
    title_tag = soup.find("title")
    title = title_tag.text if title_tag else "No title found"

    # Summary (meta description) and keywords (meta keywords).
    description = _meta_content(soup, "description", "No description found")
    keywords = _meta_content(soup, "keywords", "No keywords found")

    # Body text with the HTML tags stripped, one text fragment per line.
    text = soup.get_text(separator="\n", strip=True)

    return {
        "title": title,
        "description": description,
        "keywords": keywords,
        "text": text,
    }


if __name__ == "__main__":
    # Demo run; guarded so importing this module does not launch a browser.
    url = "https://www.cnblogs.com/baby123/p/18772196"
    result = extract_page_content(url)
    print("Title:", result["title"])
    print("Description:", result["description"])
    print("Keywords:", result["keywords"])
    print("Content:", result["text"])