A fairly complete Python script for batch-downloading GitHub Release files
一、Reference for the original version:
https://www.xjoker.us/其他/批量下载Github指定仓库的所有Release/
二、Features added on top of it:
1. Create a subdirectory for each tag
2. Support filtering out unwanted tags
3. Support specifying the number of concurrent download threads
4. Support a GitHub proxy, with URLs of the form https://ghproxy.com/https://github.com/xxxx/yyyy (see the sketch below)
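When a proxy-prefixed URL is given, the script still queries the official api.github.com for release metadata and only rewrites each asset's browser_download_url through the proxy host. A minimal sketch of that rewrite, using placeholder user/repo/asset names:

from urllib.parse import urlparse

proxy_input = "https://ghproxy.com/https://github.com/xxxx/yyyy"
parsed = urlparse(proxy_input)
# Any netloc other than github.com is treated as a proxy host
use_proxy = parsed.netloc and 'github.com' not in parsed.netloc
# Hypothetical asset URL as returned by the Releases API
original = "https://github.com/xxxx/yyyy/releases/download/v1.0.0/app.zip"
asset_url = f"{parsed.scheme}://{parsed.netloc}/{original}" if use_proxy else original
print(asset_url)
# -> https://ghproxy.com/https://github.com/xxxx/yyyy/releases/download/v1.0.0/app.zip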
三、Full v1 code:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import concurrent.futures
from pathlib import Path
import re
import argparse


def create_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Create and configure a requests session with automatic retries."""
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_user_repo(github_clone_url):
    """Extract the username and repository name from a GitHub URL.

    Supports proxy-prefixed URLs and URLs without a .git suffix.
    """
    parsed = urlparse(github_clone_url)
    # A proxy URL looks like ghproxy.com/https://github.com/user/repo
    use_proxy = parsed.netloc and 'github.com' not in parsed.netloc
    path_parts = parsed.path.strip('/').split('/')
    if not use_proxy:
        # Plain GitHub URL: github.com/user/repo
        if len(path_parts) < 2:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user, repo = path_parts[0], path_parts[1]
    else:
        # Proxy format: the path itself is "https://github.com/user/repo",
        # which splits into ['https:', '', 'github.com', 'user', 'repo']
        if len(path_parts) < 5:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user, repo = path_parts[3], path_parts[4]
    repo = repo.split('#')[0].split('?')[0]  # drop any query or fragment
    if repo.endswith('.git'):  # drop the .git suffix if present
        repo = repo[:-4]
    return user, repo


def download_asset(session, asset_url, file_path):
    """Download a single Release asset."""
    print(f"Downloading {file_path.name}...")
    try:
        response = session.get(asset_url, stream=True)
        response.raise_for_status()
        with file_path.open('wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded {file_path.name}")
    except requests.RequestException as e:
        print(f"Failed to download {file_path.name}: {e}")


def download_releases(github_clone_url, filter_tag_regex=None, threads=2):
    """Download all Releases of a repository, organized into one directory per tag."""
    session = create_session()
    user, repo = extract_user_repo(github_clone_url)
    print(f"Downloading releases for {user}/{repo}...")
    # API requests always go to the official endpoint
    api_url = f"https://api.github.com/repos/{user}/{repo}/releases"
    # Decide up front whether the input URL is in proxy format
    parsed_input = urlparse(github_clone_url)
    use_proxy = parsed_input.netloc and 'github.com' not in parsed_input.netloc
    directory = Path.cwd() / repo
    directory.mkdir(exist_ok=True)
    page = 1
    total_assets = 0
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while True:
            params = {'per_page': 100, 'page': page}
            response = session.get(api_url, params=params)
            response.raise_for_status()
            releases = response.json()
            if not releases:
                break
            for release in releases:
                tag_name = release['tag_name']
                if filter_tag_regex and not re.match(filter_tag_regex, tag_name):
                    print(f"Skipping tag: {tag_name} (does not match regex: {filter_tag_regex})")
                    continue
                tag_directory = directory / tag_name
                tag_directory.mkdir(exist_ok=True)
                for asset in release['assets']:
                    original_asset_url = asset['browser_download_url']
                    # Rewrite the download URL through the proxy if one was given
                    if use_proxy:
                        asset_url = f"{parsed_input.scheme}://{parsed_input.netloc}/{original_asset_url}"
                    else:
                        asset_url = original_asset_url
                    total_assets += 1
                    futures.append(
                        executor.submit(download_asset, session, asset_url, tag_directory / asset['name'])
                    )
            page += 1
        concurrent.futures.wait(futures)
    print(f"Total assets queued: {total_assets}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download GitHub Release assets filtered by tag.")
    parser.add_argument("repo_url", help="GitHub repository HTTPS URL, optionally prefixed with a proxy")
    parser.add_argument("-f", "--filter-tag", help="Regex pattern to filter tags (e.g., '^v\\d+\\.\\d+\\.\\d+$')")
    parser.add_argument("-t", "--threads", type=int, default=2, help="Number of threads to use for downloading (default: 2)")
    args = parser.parse_args()
    download_releases(args.repo_url, args.filter_tag, args.threads)
Example invocation:
python github_release.py https://ghproxy.com/https://github.com/xxx/yyyy --filter-tag='^v\d+\.\d+\.\d+$' --threads=4
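If no proxy is needed, the plain GitHub URL is accepted as well (same placeholder repository):
python github_release.py https://github.com/xxx/yyyy --threads=4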
四、Features added in v2:
1. Verify the size of each downloaded file, and retry automatically on download errors (the sketch after this list shows where the expected size comes from)
2. Accept the maximum retry count as a command-line parameter
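The expected size comes straight from the Releases API: each asset object carries a size field (in bytes) alongside its name and browser_download_url. A minimal sketch for inspecting those fields; OWNER/REPO are placeholders, and the snippet assumes the repository has at least one release:

import requests

resp = requests.get("https://api.github.com/repos/OWNER/REPO/releases", params={"per_page": 1})
resp.raise_for_status()
for asset in resp.json()[0]["assets"]:
    # 'size' is the exact byte count the v2 script compares against
    print(asset["name"], asset["size"], asset["browser_download_url"])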
五、Corresponding code:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import concurrent.futures
from pathlib import Path
import re
import argparse


def create_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Create and configure a requests session with automatic retries."""
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_user_repo(github_clone_url):
    """Extract the username and repository name from a GitHub URL.

    Supports proxy-prefixed URLs and URLs without a .git suffix.
    """
    parsed = urlparse(github_clone_url)
    # A proxy URL looks like ghproxy.com/https://github.com/user/repo
    use_proxy = parsed.netloc and 'github.com' not in parsed.netloc
    path_parts = parsed.path.strip('/').split('/')
    if not use_proxy:
        # Plain GitHub URL: github.com/user/repo
        if len(path_parts) < 2:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user, repo = path_parts[0], path_parts[1]
    else:
        # Proxy format: the path itself is "https://github.com/user/repo",
        # which splits into ['https:', '', 'github.com', 'user', 'repo']
        if len(path_parts) < 5:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user, repo = path_parts[3], path_parts[4]
    repo = repo.split('#')[0].split('?')[0]  # drop any query or fragment
    if repo.endswith('.git'):  # drop the .git suffix if present
        repo = repo[:-4]
    return user, repo


def download_asset(session, asset_url, file_path, expected_size, max_retries):
    """Download a single Release asset, retrying on failure or size mismatch."""
    attempt = 0
    while True:
        attempt += 1
        if max_retries != 0 and attempt > max_retries:
            print(f"Failed to download {file_path.name} after {max_retries} attempts.")
            return False
        print(f"Attempt {attempt} for downloading: {file_path.name}")
        try:
            with file_path.open('wb') as file:
                response = session.get(asset_url, stream=True)
                response.raise_for_status()
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
                    downloaded_size += len(chunk)
            # Verify the byte count against the size reported by the API
            if downloaded_size == expected_size:
                print(f"Downloaded {file_path.name}")
                return True
            print(f"Size mismatch (expected {expected_size}, got {downloaded_size}), retrying...")
        except (requests.RequestException, IOError) as e:
            print(f"Attempt {attempt} failed: {e}")
        # Remove the incomplete file before retrying
        if file_path.exists():
            file_path.unlink()


def download_releases(github_clone_url, filter_tag_regex=None, threads=2, max_retries=3):
    """Download all Releases of a repository, organized into one directory per tag."""
    session = create_session()
    user, repo = extract_user_repo(github_clone_url)
    print(f"Downloading releases for {user}/{repo}...")
    # API requests always go to the official endpoint
    api_url = f"https://api.github.com/repos/{user}/{repo}/releases"
    # Decide up front whether the input URL is in proxy format
    parsed_input = urlparse(github_clone_url)
    use_proxy = parsed_input.netloc and 'github.com' not in parsed_input.netloc
    directory = Path.cwd() / repo
    directory.mkdir(exist_ok=True)
    page = 1
    total_assets = 0
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while True:
            params = {'per_page': 100, 'page': page}
            response = session.get(api_url, params=params)
            response.raise_for_status()
            releases = response.json()
            if not releases:
                break
            for release in releases:
                tag_name = release['tag_name']
                if filter_tag_regex and not re.match(filter_tag_regex, tag_name):
                    print(f"Skipping tag: {tag_name} (does not match regex: {filter_tag_regex})")
                    continue
                tag_directory = directory / tag_name
                tag_directory.mkdir(exist_ok=True)
                for asset in release['assets']:
                    original_asset_url = asset['browser_download_url']
                    expected_size = asset['size']
                    # Rewrite the download URL through the proxy if one was given
                    if use_proxy:
                        asset_url = f"{parsed_input.scheme}://{parsed_input.netloc}/{original_asset_url}"
                    else:
                        asset_url = original_asset_url
                    total_assets += 1
                    futures.append(
                        executor.submit(download_asset, session, asset_url,
                                        tag_directory / asset['name'], expected_size, max_retries)
                    )
            page += 1
        concurrent.futures.wait(futures)
    print(f"Total assets queued: {total_assets}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download GitHub Release assets filtered by tag.")
    parser.add_argument("repo_url", help="GitHub repository HTTPS URL, optionally prefixed with a proxy")
    parser.add_argument("-f", "--filter-tag", help="Regex pattern to filter tags (e.g., '^v\\d+\\.\\d+\\.\\d+$')")
    parser.add_argument("-t", "--threads", type=int, default=2, help="Number of threads to use for downloading (default: 2)")
    parser.add_argument("-r", "--retries", type=int, default=3, help="Maximum number of retries for each file (0 means infinite).")
    args = parser.parse_args()
    download_releases(args.repo_url, args.filter_tag, args.threads, args.retries)
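An example invocation for v2, with the retry limit raised (placeholder repository path):
python github_release.py https://ghproxy.com/https://github.com/xxx/yyyy --filter-tag='^v\d+\.\d+\.\d+$' --threads=4 --retries=5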
六、Features added in v3:
1. Support filtering asset filenames with a regular expression; when a filter is given, non-matching files are skipped (see the note below)
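Note that both filters use re.match, which anchors at the beginning of the string, so an extension filter needs a leading .* to reach the end of the name. A quick way to sanity-check a pattern before running the script (asset names are hypothetical):

import re

pattern = r'.*\.(7z|dmg|zip|deb|msi)$'
for name in ['app-1.0.zip', 'checksums.txt']:
    print(name, bool(re.match(pattern, name)))
# app-1.0.zip True
# checksums.txt False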
The corresponding v3 code:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import concurrent.futures
from pathlib import Path
import re
import argparse


def create_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Create and configure a requests session with automatic retries."""
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_user_repo(github_clone_url):
    """Extract the username and repository name from a GitHub URL.

    Supports proxy-prefixed URLs and URLs without a .git suffix.
    """
    parsed = urlparse(github_clone_url)
    # A proxy URL looks like ghproxy.com/https://github.com/user/repo
    use_proxy = parsed.netloc and 'github.com' not in parsed.netloc
    path_parts = parsed.path.strip('/').split('/')
    if not use_proxy:
        # Plain GitHub URL: github.com/user/repo
        if len(path_parts) < 2:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user, repo = path_parts[0], path_parts[1]
    else:
        # Proxy format: the path itself is "https://github.com/user/repo",
        # which splits into ['https:', '', 'github.com', 'user', 'repo']
        if len(path_parts) < 5:
            raise ValueError(f"Invalid GitHub clone URL: {github_clone_url}")
        user, repo = path_parts[3], path_parts[4]
    repo = repo.split('#')[0].split('?')[0]  # drop any query or fragment
    if repo.endswith('.git'):  # drop the .git suffix if present
        repo = repo[:-4]
    return user, repo


def download_asset(session, asset_url, file_path, expected_size, max_retries):
    """Download a single Release asset, retrying on failure or size mismatch."""
    attempt = 0
    while True:
        attempt += 1
        if max_retries != 0 and attempt > max_retries:
            print(f"Failed to download {file_path.name} after {max_retries} attempts.")
            return False
        print(f"Attempt {attempt} for downloading: {file_path.name}")
        try:
            with file_path.open('wb') as file:
                response = session.get(asset_url, stream=True)
                response.raise_for_status()
                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
                    downloaded_size += len(chunk)
            # Verify the byte count against the size reported by the API
            if downloaded_size == expected_size:
                print(f"Downloaded {file_path.name}")
                return True
            print(f"Size mismatch (expected {expected_size}, got {downloaded_size}), retrying...")
        except (requests.RequestException, IOError) as e:
            print(f"Attempt {attempt} failed: {e}")
        # Remove the incomplete file before retrying
        if file_path.exists():
            file_path.unlink()


def download_releases(github_clone_url, filter_tag_regex=None, threads=2, max_retries=3, filter_name=None):
    """Download all Releases of a repository, organized into one directory per tag."""
    session = create_session()
    user, repo = extract_user_repo(github_clone_url)
    print(f"Downloading releases for {user}/{repo}...")
    # API requests always go to the official endpoint
    api_url = f"https://api.github.com/repos/{user}/{repo}/releases"
    # Decide up front whether the input URL is in proxy format
    parsed_input = urlparse(github_clone_url)
    use_proxy = parsed_input.netloc and 'github.com' not in parsed_input.netloc
    directory = Path.cwd() / repo
    directory.mkdir(exist_ok=True)
    page = 1
    total_assets = 0
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while True:
            params = {'per_page': 100, 'page': page}
            response = session.get(api_url, params=params)
            response.raise_for_status()
            releases = response.json()
            if not releases:
                break
            for release in releases:
                tag_name = release['tag_name']
                # Filter by tag
                if filter_tag_regex and not re.match(filter_tag_regex, tag_name):
                    print(f"Skipping tag: {tag_name} (does not match regex: {filter_tag_regex})")
                    continue
                tag_directory = directory / tag_name
                tag_directory.mkdir(exist_ok=True)
                for asset in release['assets']:
                    original_asset_url = asset['browser_download_url']
                    expected_size = asset['size']
                    asset_name = asset['name']
                    # New in v3: filter asset filenames by regex
                    if filter_name and not re.match(filter_name, asset_name):
                        print(f"Skipping asset: {asset_name} (does not match regex: {filter_name})")
                        continue
                    # Rewrite the download URL through the proxy if one was given
                    if use_proxy:
                        asset_url = f"{parsed_input.scheme}://{parsed_input.netloc}/{original_asset_url}"
                    else:
                        asset_url = original_asset_url
                    total_assets += 1
                    futures.append(
                        executor.submit(download_asset, session, asset_url,
                                        tag_directory / asset_name, expected_size, max_retries)
                    )
            page += 1
        concurrent.futures.wait(futures)
    print(f"Total assets queued: {total_assets}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download GitHub Release assets filtered by tag.")
    parser.add_argument("repo_url", help="GitHub repository HTTPS URL, optionally prefixed with a proxy")
    parser.add_argument("-f", "--filter-tag", help="Regex pattern to filter tags (e.g., '^v\\d+\\.\\d+\\.\\d+$')")
    parser.add_argument("-t", "--threads", type=int, default=2, help="Number of threads to use for downloading (default: 2)")
    parser.add_argument("-r", "--retries", type=int, default=3, help="Maximum number of retries for each file (0 means infinite).")
    parser.add_argument("-n", "--filter-name", help="Regex pattern to filter asset filenames (e.g., '.*\\.zip$')")
    args = parser.parse_args()
    download_releases(args.repo_url, args.filter_tag, args.threads, args.retries, args.filter_name)
Example invocation:
python github_release.py https://ghproxy.com/https://github.com/xxx/yyyy --filter-tag='^v\d+\.\d+\.\d+$' --threads=4 --filter-name='.*\.(7z|dmg|zip|deb|msi)$'