A fairly complete Python script for batch-downloading GitHub release files

I. Original version for reference:

https://www.xjoker.us/其他/批量下载Github指定仓库的所有Release/

II. Added features:

1. Creates a subdirectory for each tag

2. Supports filtering out unwanted tags

3. Supports specifying the number of concurrent download threads

4. Supports a GitHub proxy; the proxy URL format looks like https://ghproxy.com/https://github.com/xxxx/yyyy (see the parsing sketch below)
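
The way the proxy format is recognized follows directly from how urlparse splits such a URL; a minimal illustration (the URL is a placeholder):

from urllib.parse import urlparse

url = "https://ghproxy.com/https://github.com/xxxx/yyyy"
parsed = urlparse(url)
print(parsed.netloc)  # ghproxy.com -- not github.com, so the URL is proxied
print(parsed.path)    # /https://github.com/xxxx/yyyy -- the embedded GitHub URL
print(parsed.path.strip('/').split('/'))
# ['https:', '', 'github.com', 'xxxx', 'yyyy'] -- user and repo at indexes 3 and 4

This is exactly the check the script performs: a host other than github.com means every asset URL must be re-prefixed with the proxy.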

III. Full v1 code:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import concurrent.futures
from pathlib import Path
import re
import argparse

def create_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Create and configure a requests session for automatic retries."""
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_user_repo(github_clone_url):
    """Extract the username and repository name from a GitHub URL, supporting proxy-prefixed URLs and URLs without a .git suffix."""
    parsed = urlparse(github_clone_url)

    # Proxy format is detected when the host is not github.com
    # (e.g. ghproxy.com/https://github.com/user/repo)
    use_proxy = parsed.netloc and 'github.com' not in parsed.netloc

    # urlparse already separates the query and fragment, so the path is clean
    path_parts = parsed.path.strip('/').split('/')

    # Direct GitHub URL: github.com/user/repo
    if not use_proxy:
        if len(path_parts) < 2:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user = path_parts[0]
        repo = path_parts[1].removesuffix('.git')  # drop a trailing .git (removesuffix: Python 3.9+)
        return user, repo

    # Proxied URL: the embedded GitHub URL sits in the path, which splits as
    # ['https:', '', 'github.com', user, repo]
    if len(path_parts) < 5:
        raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
    user = path_parts[3]
    repo = path_parts[4].removesuffix('.git')
    return user, repo

def download_asset(session, asset_url, file_path):
    """Download a single Release asset."""
    print(f"Downloading {file_path.name}...")
    try:
        response = session.get(asset_url, stream=True)
        response.raise_for_status()
        with file_path.open('wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded {file_path.name}")
    except requests.RequestException as e:
        print(f"Failed to download {file_path.name}: {e}")

def download_releases(github_clone_url, filter_tag_regex=None, threads=2):
    """Download all Releases for a specified repository and organize them by tag."""
    session = create_session()
    user, repo = extract_user_repo(github_clone_url)
    print(f"Downloading releases for {user}/{repo}...")

    # API requests always go to the official GitHub endpoint
    api_url = f"https://api.github.com/repos/{user}/{repo}/releases"

    # Decide whether downloads should go through the proxy (i.e. whether the input URL is proxy-prefixed)
    parsed_input = urlparse(github_clone_url)
    use_proxy = parsed_input.netloc and 'github.com' not in parsed_input.netloc

    directory = Path.cwd() / repo
    directory.mkdir(exist_ok=True)

    page = 1
    total_assets = 0
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while True:
            params = {'per_page': 100, 'page': page}
            response = session.get(api_url, params=params)
            response.raise_for_status()
            releases = response.json()

            if not releases:
                break

            for release in releases:
                tag_name = release['tag_name']
                if filter_tag_regex and not re.match(filter_tag_regex, tag_name):
                    print(f"Skipping tag: {tag_name} (does not match regex: {filter_tag_regex})")
                    continue

                tag_directory = directory / tag_name
                tag_directory.mkdir(exist_ok=True)

                for asset in release['assets']:
                    original_asset_url = asset['browser_download_url']

                    # Use the use_proxy flag determined above
                    if use_proxy:
                        asset_url = f"{parsed_input.scheme}://{parsed_input.netloc}/{original_asset_url}"
                    else:
                        asset_url = original_asset_url

                    total_assets += 1
                    futures.append(
                        executor.submit(download_asset, session, asset_url, tag_directory / asset['name'])
                    )

            page += 1

        concurrent.futures.wait(futures)

    print(f"Total assets downloaded: {total_assets}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download GitHub Release assets filtered by tag.")
    parser.add_argument("repo_url", help="GitHub repository clone URL (HTTPS or SSH)")
    parser.add_argument("-f", "--filter-tag", help="Regex pattern to filter tags (e.g., '^v\\d+\\.\\d+\\.\\d+$')")
    parser.add_argument("-t", "--threads", type=int, default=2, help="Number of threads to use for downloading (default: 2)")

    args = parser.parse_args()

    download_releases(args.repo_url, args.filter_tag, args.threads)

Example invocation (quote the regex so the shell leaves the backslashes alone):

python github_release.py https://ghproxy.com/https://github.com/xxx/yyyy --filter-tag='^v\d+\.\d+\.\d+$' --threads=4

IV. Features added in v2:

1. Downloaded files are verified against their expected size, and failed downloads are retried automatically (the expected size comes from the GitHub API, as sketched below)

2. The maximum retry count can be passed as a parameter
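
The size check is possible because the Releases API reports each asset's exact byte count. A minimal sketch of reading those fields (the repo path is a placeholder, and it assumes the repo has at least one release with assets):

import requests

resp = requests.get("https://api.github.com/repos/xxxx/yyyy/releases", params={"per_page": 1})
resp.raise_for_status()
for asset in resp.json()[0]["assets"]:
    # 'size' is the asset size in bytes as recorded by GitHub;
    # v2 compares it against the number of bytes actually written to disk.
    print(asset["name"], asset["size"], asset["browser_download_url"])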

V. Corresponding code:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import concurrent.futures
from pathlib import Path
import re
import argparse

def create_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Create and configure a requests session for automatic retries."""
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_user_repo(github_clone_url):
    """Extract the username and repository name from a GitHub URL, supporting proxy-prefixed URLs and URLs without a .git suffix."""
    parsed = urlparse(github_clone_url)

    # Proxy format is detected when the host is not github.com
    # (e.g. ghproxy.com/https://github.com/user/repo)
    use_proxy = parsed.netloc and 'github.com' not in parsed.netloc

    # urlparse already separates the query and fragment, so the path is clean
    path_parts = parsed.path.strip('/').split('/')

    # Direct GitHub URL: github.com/user/repo
    if not use_proxy:
        if len(path_parts) < 2:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user = path_parts[0]
        repo = path_parts[1].removesuffix('.git')  # drop a trailing .git (removesuffix: Python 3.9+)
        return user, repo

    # Proxied URL: the embedded GitHub URL sits in the path, which splits as
    # ['https:', '', 'github.com', user, repo]
    if len(path_parts) < 5:
        raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
    user = path_parts[3]
    repo = path_parts[4].removesuffix('.git')
    return user, repo

def download_asset(session, asset_url, file_path, expected_size, max_retries):
    """Download a single Release asset with retry on failure or size mismatch."""
    attempt = 0
    while True:
        attempt += 1
        if max_retries != 0 and attempt > max_retries:
            print(f"Failed to download {file_path.name} after {max_retries} attempts.")
            return False

        print(f"Attempt {attempt} for downloading: {file_path.name}")
        try:
            with file_path.open('wb') as file:
                response = session.get(asset_url, stream=True)
                response.raise_for_status()
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
                    downloaded_size += len(chunk)

                # Verify the downloaded size against the size reported by the API
                if downloaded_size == expected_size:
                    print(f"Downloaded {file_path.name}")
                    return True
                else:
                    print(f"Size mismatch (expected {expected_size}, got {downloaded_size}), retrying...")
        except (requests.RequestException, IOError) as e:
            print(f"Attempt {attempt} failed: {e}")

        # Remove the corrupted/partial file before retrying
        if file_path.exists():
            file_path.unlink()

def download_releases(github_clone_url, filter_tag_regex=None, threads=2, max_retries=3):
    """Download all Releases for a specified repository and organize them by tag."""
    session = create_session()
    user, repo = extract_user_repo(github_clone_url)
    print(f"Downloading releases for {user}/{repo}...")

    # API requests always go to the official GitHub endpoint
    api_url = f"https://api.github.com/repos/{user}/{repo}/releases"

    # Decide whether downloads should go through the proxy (i.e. whether the input URL is proxy-prefixed)
    parsed_input = urlparse(github_clone_url)
    use_proxy = parsed_input.netloc and 'github.com' not in parsed_input.netloc

    directory = Path.cwd() / repo
    directory.mkdir(exist_ok=True)

    page = 1
    total_assets = 0
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while True:
            params = {'per_page': 100, 'page': page}
            response = session.get(api_url, params=params)
            response.raise_for_status()
            releases = response.json()

            if not releases:
                break

            for release in releases:
                tag_name = release['tag_name']
                if filter_tag_regex and not re.match(filter_tag_regex, tag_name):
                    print(f"Skipping tag: {tag_name} (does not match regex: {filter_tag_regex})")
                    continue

                tag_directory = directory / tag_name
                tag_directory.mkdir(exist_ok=True)

                for asset in release['assets']:
                    original_asset_url = asset['browser_download_url']
                    expected_size = asset['size']

                    # Use the use_proxy flag determined above
                    if use_proxy:
                        asset_url = f"{parsed_input.scheme}://{parsed_input.netloc}/{original_asset_url}"
                    else:
                        asset_url = original_asset_url

                    total_assets += 1
                    futures.append(
                        executor.submit(download_asset, session, asset_url, tag_directory / asset['name'], expected_size, max_retries)
                    )

            page += 1

        concurrent.futures.wait(futures)

    print(f"Total assets downloaded: {total_assets}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download GitHub Release assets filtered by tag.")
    parser.add_argument("repo_url", help="GitHub repository clone URL (HTTPS or SSH)")
    parser.add_argument("-f", "--filter-tag", help="Regex pattern to filter tags (e.g., '^v\\d+\\.\\d+\\.\\d+$')")
    parser.add_argument("-t", "--threads", type=int, default=2, help="Number of threads to use for downloading (default: 2)")
    parser.add_argument("-r", "--retries", type=int, default=3, help="Maximum number of retries for each file (0 means infinite).")

    args = parser.parse_args()

    download_releases(args.repo_url, args.filter_tag, args.threads, args.retries)

VI. Feature added in v3

1. Asset filenames can be filtered with a regular expression; when a filter is given, files that do not match are skipped (see the matching example below)
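
Note that the script applies the filter with re.match, which anchors the pattern at the start of the filename, so extension filters generally need a leading .*. A quick illustration with hypothetical filenames:

import re

pattern = r'.*\.(7z|dmg|zip)$'
for name in ['tool-v1.0.zip', 'tool-v1.0.tar.gz', 'setup.dmg']:
    # Assets whose names fail the match are skipped, mirroring the v3 filter
    action = 'download' if re.match(pattern, name) else 'skip'
    print(f"{name} -> {action}")
# tool-v1.0.zip -> download
# tool-v1.0.tar.gz -> skip
# setup.dmg -> download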

The corresponding v3 code:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import concurrent.futures
from pathlib import Path
import re
import argparse

def create_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Create and configure a requests session for automatic retries."""
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_user_repo(github_clone_url):
    """Extract the username and repository name from a GitHub URL, supporting proxy-prefixed URLs and URLs without a .git suffix."""
    parsed = urlparse(github_clone_url)

    # Proxy format is detected when the host is not github.com
    # (e.g. ghproxy.com/https://github.com/user/repo)
    use_proxy = parsed.netloc and 'github.com' not in parsed.netloc

    # urlparse already separates the query and fragment, so the path is clean
    path_parts = parsed.path.strip('/').split('/')

    # Direct GitHub URL: github.com/user/repo
    if not use_proxy:
        if len(path_parts) < 2:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user = path_parts[0]
        repo = path_parts[1].removesuffix('.git')  # drop a trailing .git (removesuffix: Python 3.9+)
        return user, repo

    # Proxied URL: the embedded GitHub URL sits in the path, which splits as
    # ['https:', '', 'github.com', user, repo]
    if len(path_parts) < 5:
        raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
    user = path_parts[3]
    repo = path_parts[4].removesuffix('.git')
    return user, repo

def download_asset(session, asset_url, file_path, expected_size, max_retries):
    """Download a single Release asset with retry on failure or size mismatch."""
    attempt = 0
    while True:
        attempt += 1
        if max_retries != 0 and attempt > max_retries:
            print(f"Failed to download {file_path.name} after {max_retries} attempts.")
            return False

        print(f"Attempt {attempt} for downloading: {file_path.name}")
        try:
            with file_path.open('wb') as file:
                response = session.get(asset_url, stream=True)
                response.raise_for_status()
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
                    downloaded_size += len(chunk)

                # Verify the downloaded size against the size reported by the API
                if downloaded_size == expected_size:
                    print(f"Downloaded {file_path.name}")
                    return True
                else:
                    print(f"Size mismatch (expected {expected_size}, got {downloaded_size}), retrying...")
        except (requests.RequestException, IOError) as e:
            print(f"Attempt {attempt} failed: {e}")

        # Remove the corrupted/partial file before retrying
        if file_path.exists():
            file_path.unlink()

def download_releases(github_clone_url, filter_tag_regex=None, threads=2, max_retries=3, filter_name=None):
    """Download all Releases for a specified repository and organize them by tag."""
    session = create_session()
    user, repo = extract_user_repo(github_clone_url)
    print(f"Downloading releases for {user}/{repo}...")

    # API requests always go to the official GitHub endpoint
    api_url = f"https://api.github.com/repos/{user}/{repo}/releases"

    # Decide whether downloads should go through the proxy (i.e. whether the input URL is proxy-prefixed)
    parsed_input = urlparse(github_clone_url)
    use_proxy = parsed_input.netloc and 'github.com' not in parsed_input.netloc

    directory = Path.cwd() / repo
    directory.mkdir(exist_ok=True)

    page = 1
    total_assets = 0
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while True:
            params = {'per_page': 100, 'page': page}
            response = session.get(api_url, params=params)
            response.raise_for_status()
            releases = response.json()

            if not releases:
                break

            for release in releases:
                tag_name = release['tag_name']
                # Filter tags against the regex
                if filter_tag_regex and not re.match(filter_tag_regex, tag_name):
                    print(f"Skipping tag: {tag_name} (does not match regex: {filter_tag_regex})")
                    continue

                tag_directory = directory / tag_name
                tag_directory.mkdir(exist_ok=True)

                for asset in release['assets']:
                    original_asset_url = asset['browser_download_url']
                    expected_size = asset['size']
                    asset_name = asset['name']

                    # New in v3: filter assets by filename regex
                    if filter_name and not re.match(filter_name, asset_name):
                        print(f"Skipping asset: {asset_name} (does not match regex: {filter_name})")
                        continue

                    # Use the use_proxy flag determined above
                    if use_proxy:
                        asset_url = f"{parsed_input.scheme}://{parsed_input.netloc}/{original_asset_url}"
                    else:
                        asset_url = original_asset_url

                    total_assets += 1
                    futures.append(
                        executor.submit(download_asset, session, asset_url, tag_directory / asset_name, expected_size, max_retries)
                    )

            page += 1

        concurrent.futures.wait(futures)

    print(f"Total assets downloaded: {total_assets}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download GitHub Release assets filtered by tag.")
    parser.add_argument("repo_url", help="GitHub repository clone URL (HTTPS or SSH)")
    parser.add_argument("-f", "--filter-tag", help="Regex pattern to filter tags (e.g., '^v\\d+\\.\\d+\\.\\d+$')")
    parser.add_argument("-t", "--threads", type=int, default=2, help="Number of threads to use for downloading (default: 2)")
    parser.add_argument("-r", "--retries", type=int, default=3, help="Maximum number of retries for each file (0 means infinite).")
    parser.add_argument("-n", "--filter-name", help="Regex pattern to filter asset filenames (e.g., '.*\\.zip$')")

    args = parser.parse_args()

    download_releases(args.repo_url, args.filter_tag, args.threads, args.retries, args.filter_name)

Example command-line invocation (quote the regex arguments so the shell leaves the backslashes alone):

python github_release.py https://ghproxy.com/https://github.com/xxx/yyyy --filter-tag='^v\d+\.\d+\.\d+$' --threads=4 --filter-name='.*\.(7z|dmg|zip|deb|msi)$'