采用DrissionPage批量采集抖音视频

参考视频:https://www.bilibili.com/video/BV1nSRPYtEAU/?spm_id_from=333.337.search-card.all.click

效果展示(只实现了爬取视频,没有爬取图文):

[图片:效果展示截图]

具体代码:

# 导入自动化模块
from DrissionPage import ChromiumPage
import requests
import time
import os

# HTTP headers passed to requests.get() when downloading a video file
headers = {}
# Desktop Chrome user-agent string
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'
# Referer set to the user profile page that is being scraped
headers['referer'] = 'https://www.douyin.com/user/MS4wLjABAAAAczLgM1eXmTLadiZ_T7_VyrbKP0O79wR-OOMl-meVqgE?from_tab_name=main'


# Download and save every video described in one page of post-list results
def process_videos(video_list):
    """Download each entry of *video_list* into the ``video/`` directory.

    Each entry is expected to be a dict carrying ``desc`` (title),
    ``aweme_id`` and ``video.play_addr.url_list``. Entries without a
    usable play address are skipped, and a failed download is reported
    and skipped, so one bad entry does not abort the whole batch.

    Args:
        video_list: Iterable of post dicts; ``None`` or an empty list is
            treated as "nothing to do".

    Returns:
        None. Files are written as ``video/<title>-<aweme_id>.mp4``; the
        ``video`` directory must already exist.
    """
    illegal_chars = set(r'\/:*?"<>|')
    # Keep the title short so "<title>-<id>.mp4" stays under common
    # 255-byte file-name limits even when every character is multi-byte.
    max_title_len = 50

    for item in video_list or []:
        video_id = item.get('aweme_id', '')

        # Resolve the download URL first: without it there is nothing to save.
        try:
            video_url = item['video']['play_addr']['url_list'][0]
        except (KeyError, IndexError, TypeError):
            print(f"跳过: 未找到视频地址 (ID: {video_id})")
            continue

        # Strip characters that are illegal in file names, plus control
        # characters such as newlines that descriptions may contain.
        title = item.get('desc') or ''
        clean_title = ''.join(
            c for c in title if c not in illegal_chars and c.isprintable()
        )
        # Trailing dots/spaces are also rejected by some file systems.
        clean_title = clean_title[:max_title_len].strip().rstrip('. ')
        if not clean_title:
            clean_title = "无标题"

        print(f"正在下载: {clean_title[:30]}... (ID: {video_id})")

        # Fetch the video; the timeout prevents a stalled connection from
        # hanging the run, and the status check avoids saving an error page.
        try:
            response = requests.get(url=video_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"✗ 下载失败: {exc} (ID: {video_id})")
            continue

        # Save the video
        file_name = f'video/{clean_title}-{video_id}.mp4'
        with open(file_name, 'wb') as f:
            f.write(response.content)

        print(f"✓ 已保存: {file_name}")


# Main program: open the profile page, capture the '/aweme/post/' responses
# and download every page of videos they describe.
dp = ChromiumPage()
# Start listening before navigating so the first matching response is captured
dp.listen.start('/aweme/post/')
dp.get('https://www.douyin.com/user/MS4wLjABAAAAczLgM1eXmTLadiZ_T7_VyrbKP0O79wR-OOMl-meVqgE?from_tab_name=main')

# Create the directory the videos are saved into
os.makedirs('video', exist_ok=True)

# Get the first page of data
resp = dp.listen.wait()

# The body is used directly as a dict; anything else (e.g. an unparsed
# string) is treated as an empty page instead of being indexed.
json_data = resp.response.body
if not isinstance(json_data, dict):
    json_data = {}

if 'aweme_list' in json_data:
    process_videos(json_data['aweme_list'])

    # Give up after this many consecutive waits without a new response;
    # otherwise a stale has_more == 1 would keep the loop running forever.
    max_empty_waits = 3
    empty_waits = 0

    # Keep scrolling to load more data while the API reports further pages
    while json_data.get('has_more', 0) == 1:
        # Scroll the element with this class into view to trigger the next request
        tab = dp.ele('css:.Rcc71LyU')
        dp.scroll.to_see(tab)
        time.sleep(2)

        # Wait for the next captured response
        resp = dp.listen.wait(timeout=5)
        if not resp:
            empty_waits += 1
            if empty_waits >= max_empty_waits:
                print("未获取到新数据,停止加载")
                break
            continue
        empty_waits = 0

        body = resp.response.body
        json_data = body if isinstance(body, dict) else {}
        if 'aweme_list' in json_data:
            process_videos(json_data['aweme_list'])

print("爬取完成!")

 

posted @ 2025-12-20 16:55  雨花阁  阅读(2)  评论(0)    收藏  举报