Scraping Taobao and Tmall with a Python crawler

A Python crawler that fetches product images and titles from Taobao and Tmall.

Example using Playwright + Python:

Obtain cookies via QR-code login (this requires a non-headless browser, since the QR code has to be rendered and scanned).

Since a Linux server has no display, run the browser with headless=False under Xvfb (a virtual display) to simulate a graphical environment; this makes QR-code login possible on a server.

Install Xvfb, save the QR code as an image, and scp it to your local machine to scan:

sudo apt update && sudo apt install -y xvfb


xvfb-run -a python your_script.py
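To actually scan the code, copy the screenshot to a machine with a display (the hostname and path below are placeholders):

scp user@your-server:/path/to/project/qr.png .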


Save the cookies and User-Agent. The key step is saving the QR code on the server as qr.png; once the scan completes, persist the cookies and UA:

def save_cookies_and_ua():
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()

        # Step 1: log in to Taobao
        print(">>> Opening the Taobao login page, please scan the QR code")
        page.goto("https://login.taobao.com/")
        page.wait_for_load_state("load")  # wait for the full load event (DOM and resources)
        # if the QR code has not rendered yet at this point, add a short
        # page.wait_for_timeout() before taking the screenshot
        page.screenshot(path="qr.png")  # save the QR code
        input(">>> Press Enter once the login is complete...")
        # page.wait_for_timeout(60000)  # alternatively, wait 60 seconds
        cookies = context.cookies()
        ua = page.evaluate("() => navigator.userAgent")
        with open("taobao_cookies.json", "w", encoding="utf-8") as f:
            json.dump(cookies, f, indent=2, ensure_ascii=False)
        with open("taobao_user_agent.txt", "w", encoding="utf-8") as f:
            f.write(ua)
        print("✅ cookies + User-Agent saved")
        browser.close()
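Before crawling, it can be worth a quick check that the saved session is still valid, since Taobao cookies expire after a while. A minimal sketch, assuming the logged-out homepage still shows the literal "亲,请登录" login prompt (a markup-dependent heuristic, not a guarantee):

from playwright.sync_api import sync_playwright
import json


def check_login(cookies_file="taobao_cookies.json"):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        with open(cookies_file, "r", encoding="utf-8") as f:
            context.add_cookies(json.load(f))
        page = context.new_page()
        page.goto("https://www.taobao.com/")
        page.wait_for_timeout(3000)  # let the personalized header render
        # logged-out pages greet visitors with "亲,请登录" (assumed marker)
        logged_in = "亲,请登录" not in page.content()
        print("✅ session valid" if logged_in
              else "❌ cookies invalid or expired, re-run save_cookies_and_ua()")
        browser.close()
        return logged_in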


Fetch the product images and the title:

def fetch_taobao_title(url_list, cookies_file, ua_file):
    user_agent = load_user_agent(ua_file)
    cookies = load_cookies(cookies_file)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context(user_agent=user_agent)
        context.add_cookies(cookies)
        page = context.new_page()

        try:
            for url in url_list:
                print("🌐 Visiting page:", url)
                page.goto(url, timeout=20000)
                page.wait_for_timeout(3000)  # buffer so resources can finish loading

                if is_slide_verification(page):
                    print("⚠️ Hit the slide-verification page, cannot continue!")
                    return

                # Grab the title (h1); the hashed class names are build-generated
                # and may change when Taobao redeploys the page
                title_element = page.query_selector("div.QJEEHAN8H5--summaryInfoWrap--_0f026d1 h1")
                title_text = title_element.inner_text() if title_element else "title not found"

                img_elements = page.query_selector_all("div.QJEEHAN8H5--thumbnailsWrap--_9878a4b img")
                img_urls = [img.get_attribute("src") for img in img_elements if img.get_attribute("src")]

                # use a distinct name so the outer loop's url is not shadowed
                for i, img_url in enumerate(img_urls, 1):
                    print(f"Image {i}: {img_url}")

                print("title_text:", title_text)

        except Exception as e:
            print("❌ Error:", e)
            with open("error.html", "w", encoding="utf-8") as f:
                f.write(page.content())
            print("📄 Page HTML saved as error.html; check whether it is a verification page")

        finally:
            browser.close()
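The function above only prints the image URLs. To save the files themselves, a small stdlib helper is enough; note that Taobao img src values are often protocol-relative (they start with //), so a scheme has to be prepended first. A minimal sketch (the output directory name is arbitrary):

import os
import urllib.request


def download_images(img_urls, user_agent, out_dir="images"):
    os.makedirs(out_dir, exist_ok=True)
    for i, src in enumerate(img_urls, 1):
        # protocol-relative URLs need an explicit scheme
        full_url = "https:" + src if src.startswith("//") else src
        # send the same User-Agent as the browser session for consistency
        req = urllib.request.Request(full_url, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = resp.read()
        ext = os.path.splitext(full_url.split("?")[0])[1] or ".jpg"
        path = os.path.join(out_dir, f"{i:02d}{ext}")
        with open(path, "wb") as f:
            f.write(data)
        print(f"saved {path}")

Call it right after img_urls is built inside the loop, passing the user_agent that was loaded from taobao_user_agent.txt.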


The complete code:

"""
爬虫获取淘宝和天猫的商品图和标题
"""
from playwright.sync_api import sync_playwright
import json


def load_user_agent(path):
    with open(path, "r", encoding="utf-8") as f:
        return f.read().strip()


def load_cookies(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def is_slide_verification(page):
    html = page.content()
    return "请完成安全验证" in html or "滑动验证" in html


def fetch_taobao_title(url_list, cookies_file, ua_file):
    user_agent = load_user_agent(ua_file)
    cookies = load_cookies(cookies_file)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context(user_agent=user_agent)
        context.add_cookies(cookies)
        page = context.new_page()

        try:
            for url in url_list:
                print("🌐 正在访问页面:", url)
                page.goto(url, timeout=20000)
                page.wait_for_timeout(3000)  # 缓冲等待资源加载

                if is_slide_verification(page):
                    print("⚠️ 进入滑块验证页,无法继续抓取!")
                    return

                # 获取标题(h1)
                title_element = page.query_selector("div.QJEEHAN8H5--summaryInfoWrap--_0f026d1 h1")
                title_text = title_element.inner_text() if title_element else "未找到标题"

                img_elements = page.query_selector_all("div.QJEEHAN8H5--thumbnailsWrap--_9878a4b img")
                img_urls = [img.get_attribute("src") for img in img_elements if img.get_attribute("src")]

                for i, url in enumerate(img_urls, 1):
                    print(f"第{i}张图: {url}")

                print("title_text:", title_text)

        except Exception as e:
            print("❌ 出错:", e)
            with open("error.html", "w", encoding="utf-8") as f:
                f.write(page.content())
            print("📄 已将页面 HTML 保存为 error.html,请检查是否为验证页")

        finally:
            browser.close()


def save_cookies_and_ua():
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()

        # 步骤1:登录淘宝
        print(">>> 正在打开淘宝首页,请扫码登录")
        page.goto("https://login.taobao.com/")
        page.wait_for_load_state("load")  # 页面完全加载,包括所有 DOM 和资源。
        page.screenshot(path="qr.png")  # 保存二维码
        input(">>> 登录完成后请按回车继续...")
        # page.wait_for_timeout(60000) # 等待60秒
        cookies = context.cookies()
        ua = page.evaluate("() => navigator.userAgent")
        with open("taobao_cookies.json", "w", encoding="utf-8") as f:
            json.dump(cookies, f, indent=2, ensure_ascii=False)
        with open("taobao_user_agent.txt", "w", encoding="utf-8") as f:
            f.write(ua)
        print("✅ cookies + User-Agent 已保存")
        browser.close()


if __name__ == "__main__":
    save_cookies_and_ua()
    fetch_taobao_title(
        ["https://detail.tmall.com/item.htm?id=912583964692", "https://item.taobao.com/item.htm?id=768911298165"],
        cookies_file="taobao_cookies.json",
        ua_file="taobao_user_agent.txt"
    )
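One caveat about the selectors: class names like QJEEHAN8H5--summaryInfoWrap--_0f026d1 are build-generated hashes that change whenever Taobao redeploys the detail page, at which point the query_selector calls above silently return None. A more durable fallback is to read the document title, which normally embeds the product name. A minimal sketch (the suffix list is an assumption about current page titles):

def fetch_title_fallback(page):
    # Product pages usually title themselves "<product name>-淘宝网" or
    # "<product name>-tmall.com天猫"; stripping the suffix is a heuristic,
    # not a guarantee.
    raw = page.title()
    for suffix in ("-tmall.com天猫", "-淘宝网"):
        if raw.endswith(suffix):
            return raw[: -len(suffix)]
    return raw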

