Python scraper for Taobao and Tmall
Scraping product images and titles from Taobao and Tmall with Python
This walkthrough uses Playwright + Python.
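If Playwright is not installed yet, the package and a Chromium build can be set up once with:

pip install playwright
playwright install chromium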
Obtaining cookies via QR-code login (recommended: automate the login in a browser on the server)
Since a Linux server has no display, launch with headless=False and use Xvfb (a virtual display) to emulate a graphical environment; this is the recommended way to do the QR scan on a server.
Install Xvfb, save the QR code to a file, and scp it to your local machine:
sudo apt update && sudo apt install -y xvfb
xvfb-run -a python your_script.py
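Once the script below has written qr.png on the server, copy it to a machine with a screen and scan it with the Taobao app (the hostname and path here are placeholders):

scp user@your-server:/path/to/project/qr.png .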
Saving the cookies and UA. The key step is writing the QR code to qr.png on the server; once the scan completes, the cookies and User-Agent are saved:
import json

from playwright.sync_api import sync_playwright

def save_cookies_and_ua():
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()

        # Step 1: log in to Taobao
        print(">>> Opening the Taobao login page, please scan the QR code")
        page.goto("https://login.taobao.com/")
        page.wait_for_load_state("load")  # page fully loaded, including all DOM nodes and resources
        page.screenshot(path="qr.png")  # save the QR code
        input(">>> Press Enter once login is complete...")
        # page.wait_for_timeout(60000)  # alternatively, just wait 60 seconds

        cookies = context.cookies()
        ua = page.evaluate("() => navigator.userAgent")

        with open("taobao_cookies.json", "w", encoding="utf-8") as f:
            json.dump(cookies, f, indent=2, ensure_ascii=False)
        with open("taobao_user_agent.txt", "w", encoding="utf-8") as f:
            f.write(ua)

        print("✅ cookies + User-Agent saved")
        browser.close()
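The input() prompt still works through the terminal that launched xvf b-run, but for an unattended session it can be replaced with polling. Below is a minimal sketch, assuming a successful scan navigates the browser away from login.taobao.com; wait_for_login is a hypothetical helper, not part of the script above:

def wait_for_login(page, timeout_s=180):
    # Assumption: leaving login.taobao.com means the QR scan succeeded
    for _ in range(timeout_s):
        if "login.taobao.com" not in page.url:
            return True
        page.wait_for_timeout(1000)  # poll once per second
    return False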
Fetching the product images and title:
from playwright.sync_api import sync_playwright

# load_user_agent, load_cookies and is_slide_verification are defined in the complete code below
def fetch_taobao_title(url_list, cookies_file, ua_file):
    user_agent = load_user_agent(ua_file)
    cookies = load_cookies(cookies_file)
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context(user_agent=user_agent)
        context.add_cookies(cookies)
        page = context.new_page()
        try:
            for url in url_list:
                print("🌐 Visiting page:", url)
                page.goto(url, timeout=20000)
                page.wait_for_timeout(3000)  # buffer so resources can finish loading
                if is_slide_verification(page):
                    print("⚠️ Hit the slider verification page, cannot continue scraping!")
                    return
                # Grab the title (h1)
                title_element = page.query_selector("div.QJEEHAN8H5--summaryInfoWrap--_0f026d1 h1")
                title_text = title_element.inner_text() if title_element else "title not found"
                img_elements = page.query_selector_all("div.QJEEHAN8H5--thumbnailsWrap--_9878a4b img")
                img_urls = [img.get_attribute("src") for img in img_elements if img.get_attribute("src")]
                for i, img_url in enumerate(img_urls, 1):  # img_url, not url, to avoid shadowing the outer loop variable
                    print(f"Image {i}: {img_url}")
                print("title_text:", title_text)
        except Exception as e:
            print("❌ Error:", e)
            with open("error.html", "w", encoding="utf-8") as f:
                f.write(page.content())
            print("📄 Saved the page HTML as error.html; check whether it is a verification page")
        finally:
            browser.close()
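fetch_taobao_title only prints the image URLs. To actually save the files, a small downloader like the sketch below could be called with img_urls. download_images is a hypothetical helper; it assumes the CDN links are fetchable without cookies and that protocol-relative URLs need an https: prefix.

import requests  # assumption: requests is installed (pip install requests)

def download_images(img_urls, prefix="item"):
    # Save each image as prefix_1.jpg, prefix_2.jpg, ...
    for i, img_url in enumerate(img_urls, 1):
        if img_url.startswith("//"):  # Taobao often returns protocol-relative URLs
            img_url = "https:" + img_url
        resp = requests.get(img_url, timeout=15)
        resp.raise_for_status()
        with open(f"{prefix}_{i}.jpg", "wb") as f:
            f.write(resp.content)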
The complete code is as follows:
""" 爬虫获取淘宝和天猫的商品图和标题 """ from playwright.sync_api import sync_playwright import json def load_user_agent(path): with open(path, "r", encoding="utf-8") as f: return f.read().strip() def load_cookies(path): with open(path, "r", encoding="utf-8") as f: return json.load(f) def is_slide_verification(page): html = page.content() return "请完成安全验证" in html or "滑动验证" in html def fetch_taobao_title(url_list, cookies_file, ua_file): user_agent = load_user_agent(ua_file) cookies = load_cookies(cookies_file) with sync_playwright() as p: browser = p.chromium.launch(headless=False) context = browser.new_context(user_agent=user_agent) context.add_cookies(cookies) page = context.new_page() try: for url in url_list: print("🌐 正在访问页面:", url) page.goto(url, timeout=20000) page.wait_for_timeout(3000) # 缓冲等待资源加载 if is_slide_verification(page): print("⚠️ 进入滑块验证页,无法继续抓取!") return # 获取标题(h1) title_element = page.query_selector("div.QJEEHAN8H5--summaryInfoWrap--_0f026d1 h1") title_text = title_element.inner_text() if title_element else "未找到标题" img_elements = page.query_selector_all("div.QJEEHAN8H5--thumbnailsWrap--_9878a4b img") img_urls = [img.get_attribute("src") for img in img_elements if img.get_attribute("src")] for i, url in enumerate(img_urls, 1): print(f"第{i}张图: {url}") print("title_text:", title_text) except Exception as e: print("❌ 出错:", e) with open("error.html", "w", encoding="utf-8") as f: f.write(page.content()) print("📄 已将页面 HTML 保存为 error.html,请检查是否为验证页") finally: browser.close() def save_cookies_and_ua(): with sync_playwright() as p: browser = p.chromium.launch(headless=False) context = browser.new_context() page = context.new_page() # 步骤1:登录淘宝 print(">>> 正在打开淘宝首页,请扫码登录") page.goto("https://login.taobao.com/") page.wait_for_load_state("load") # 页面完全加载,包括所有 DOM 和资源。 page.screenshot(path="qr.png") # 保存二维码 input(">>> 登录完成后请按回车继续...") # page.wait_for_timeout(60000) # 等待60秒 cookies = context.cookies() ua = page.evaluate("() => navigator.userAgent") with open("taobao_cookies.json", "w", encoding="utf-8") as f: json.dump(cookies, f, indent=2, ensure_ascii=False) with open("taobao_user_agent.txt", "w", encoding="utf-8") as f: f.write(ua) print("✅ cookies + User-Agent 已保存") browser.close() if __name__ == "__main__": save_cookies_and_ua() fetch_taobao_title( ["https://detail.tmall.com/item.htm?id=912583964692", "https://item.taobao.com/item.htm?id=768911298165"], cookies_file="taobao_cookies.json", ua_file="taobao_user_agent.txt" )