#!/usr/bin/env python3
"""
阿里云 OSS 目录上传工具(优化分片上传)
使用 Python 3.11 的新特性
"""
import os
import sys
import hashlib
import base64
import hmac
import requests
from urllib.parse import quote_plus
from datetime import datetime
import time
import mimetypes
import json
import signal
import logging
from pathlib import Path
from typing import List, Tuple, Dict, Optional
import subprocess
import argparse
import threading
import getpass
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
# ============================ 配置区 ============================
CONFIG = {
# OSS 配置
"ACCESS_KEY_ID": "你的AccessKey ID",
"ACCESS_KEY_SECRET": "你的AccessKey Secret",
"ENDPOINT": "oss-cn-hangzhou.aliyuncs.com",
"BUCKET_NAME": "BUCKET",
# 上传配置
"LOCAL_DIR": "/data/backup",
"OSS_PREFIX": "2026",
# 分片配置 - 优化分片大小
"CHUNK_SIZE": 20 * 1024 * 1024, # 20MB 分片大小
"MAX_CONCURRENT_PARTS": 3, # 同时上传的分片数
"MAX_RETRIES": 5,
"TIMEOUT": 300, # 5分钟超时
# 断点续传配置
"CHECKPOINT_DIR": ".oss_checkpoints",
"CHECKPOINT_INTERVAL": 30,
# 过滤配置
"EXCLUDE_EXTENSIONS": [".tmp", ".log", ".DS_Store"],
"EXCLUDE_DIRS": [".git", "__pycache__", "node_modules"],
"MAX_FILE_SIZE": 500 * 1024 * 1024 * 1024, # 500GB
}
# ============================ 配置结束 ============================
# 确保使用 Python 3.11
if sys.version_info < (3, 7):
print("警告:建议使用 Python 3.7+ 以获得更好的性能")
print(f"当前版本: {sys.version}")
class CheckpointManager:
"""断点续传管理器"""
def __init__(self, checkpoint_dir: str):
self.checkpoint_dir = Path(checkpoint_dir)
self.checkpoint_dir.mkdir(exist_ok=True)
self.task_id = datetime.now().strftime('%Y%m%d_%H%M%S')
def get_checkpoint_file(self, suffix: str = "") -> Path:
if suffix:
return self.checkpoint_dir / f"checkpoint_{self.task_id}_{suffix}.json"
return self.checkpoint_dir / f"checkpoint_{self.task_id}.json"
def save_checkpoint(self, data: Dict, suffix: str = "") -> bool:
try:
checkpoint_file = self.get_checkpoint_file(suffix)
temp_file = checkpoint_file.with_suffix('.tmp')
with open(temp_file, 'w', encoding='utf-8') as f:
json.dump({
'timestamp': datetime.now().isoformat(),
'data': data
}, f, ensure_ascii=False, indent=2)
temp_file.rename(checkpoint_file)
return True
except Exception as e:
logging.error(f"保存检查点失败: {e}")
return False
def load_checkpoint(self, suffix: str = "") -> Optional[Dict]:
try:
checkpoint_file = self.get_checkpoint_file(suffix)
if checkpoint_file.exists():
with open(checkpoint_file, 'r', encoding='utf-8') as f:
checkpoint_data = json.load(f)
return checkpoint_data['data']
except Exception as e:
logging.error(f"加载检查点失败: {e}")
return None
def delete_checkpoint(self, suffix: str = "") -> bool:
try:
checkpoint_file = self.get_checkpoint_file(suffix)
if checkpoint_file.exists():
checkpoint_file.unlink()
return True
except:
return False
class OSSUploader:
"""OSS 上传器 - 优化分片上传"""
def __init__(self, config: Dict, daemon_mode: bool = False):
self.config = config
self.daemon_mode = daemon_mode
self.host = f"{config['BUCKET_NAME']}.{config['ENDPOINT']}"
self.access_key_secret = config['ACCESS_KEY_SECRET'].encode('utf-8')
# 初始化检查点管理器
self.checkpoint_manager = CheckpointManager(config['CHECKPOINT_DIR'])
# 上传统计
self.stats = {
'total_files': 0,
'total_size': 0,
'uploaded_files': 0,
'uploaded_size': 0,
'failed_files': 0,
'skipped_files': 0,
'start_time': time.time(),
'task_id': self.checkpoint_manager.task_id,
'local_dir': config['LOCAL_DIR'],
'oss_prefix': config['OSS_PREFIX'],
'user': getpass.getuser()
}
# 上传状态
self.is_running = True
self.current_file = None
self.current_file_size = 0
self.current_bytes_uploaded = 0
# 文件哈希缓存
self.file_hashes = {}
# 会话保持
self.session = requests.Session()
# 设置日志
self._setup_logging()
# 注册信号处理
self._setup_signal_handlers()
def _setup_logging(self):
"""设置日志系统"""
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = f"{log_dir}/oss_upload_{self.checkpoint_manager.task_id}.log"
# 配置 logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
handlers=[
logging.FileHandler(log_file, encoding='utf-8'),
logging.StreamHandler() if not self.daemon_mode else logging.NullHandler()
]
)
self.logger = logging.getLogger(__name__)
self.log_file = log_file
self.stats['log_file'] = log_file
# 添加控制台输出(后台模式也输出)
if self.daemon_mode:
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(message)s', '%H:%M:%S')
console.setFormatter(formatter)
self.logger.addHandler(console)
def _setup_signal_handlers(self):
"""设置信号处理"""
def signal_handler(signum, frame):
self.logger.info(f"收到停止信号,正在保存状态...")
self.is_running = False
self._save_checkpoint_now()
time.sleep(2)
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
def _save_checkpoint_now(self):
"""立即保存检查点"""
if self.current_file:
checkpoint_data = {
'stats': self.stats,
'current_file': self.current_file,
'current_file_size': self.current_file_size,
'current_bytes_uploaded': self.current_bytes_uploaded,
'file_hashes': self.file_hashes
}
if self.checkpoint_manager.save_checkpoint(checkpoint_data):
self.logger.debug("检查点已保存")
def _calculate_file_hash(self, file_path: str) -> str:
"""计算文件哈希"""
try:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
# 只读取文件头尾来计算快速哈希
f.seek(0)
head = f.read(65536)
hasher.update(head)
if os.path.getsize(file_path) > 131072:
f.seek(-65536, 2)
tail = f.read(65536)
hasher.update(tail)
stat = os.stat(file_path)
hasher.update(str(stat.st_size).encode())
hasher.update(str(stat.st_mtime).encode())
return hasher.hexdigest()[:16]
except Exception as e:
self.logger.warning(f"计算文件哈希失败 {file_path}: {e}")
return ""
def _sign_request(self, method: str, key: str, headers: Optional[Dict] = None) -> Tuple[str, str]:
"""生成 OSS 请求签名"""
if headers is None:
headers = {}
date = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
canonicalized_resource = f"/{self.config['BUCKET_NAME']}/{key}"
canonicalized_headers = ""
sorted_keys = sorted([k for k in headers.keys() if k.lower().startswith('x-oss-')])
for k in sorted_keys:
canonicalized_headers += f"{k.lower()}:{headers[k]}\n"
content_type = headers.get('Content-Type', '')
string_to_sign = f"{method}\n\n{content_type}\n{date}\n{canonicalized_headers}{canonicalized_resource}"
signature = base64.b64encode(
hmac.new(self.access_key_secret, string_to_sign.encode('utf-8'), hashlib.sha1).digest()
).decode('utf-8')
return date, signature
def _get_files_to_upload(self, local_dir: str) -> List[Tuple[str, str, int, str]]:
"""获取要上传的所有文件列表"""
files = []
local_path = Path(local_dir).resolve()
if not local_path.exists():
self.logger.error(f"本地目录不存在: {local_dir}")
return files
if not local_path.is_dir():
# 单个文件
try:
file_size = local_path.stat().st_size
if file_size > self.config['MAX_FILE_SIZE']:
self.logger.warning(f"文件过大,跳过: {local_dir}")
return files
file_hash = self._calculate_file_hash(str(local_path))
filename = local_path.name
oss_key = f"{self.config['OSS_PREFIX']}/{filename}"
files.append((str(local_path), oss_key, file_size, file_hash))
return files
except Exception as e:
self.logger.error(f"处理文件失败 {local_dir}: {e}")
return files
self.logger.info(f"开始扫描目录: {local_dir}")
for root, dirs, filenames in os.walk(local_dir):
dirs[:] = [d for d in dirs if d not in self.config['EXCLUDE_DIRS']]
for filename in filenames:
if any(filename.endswith(ext) for ext in self.config['EXCLUDE_EXTENSIONS']):
continue
local_file = Path(root) / filename
try:
file_size = local_file.stat().st_size
except Exception as e:
self.logger.warning(f"无法获取文件大小 {local_file}: {e}")
continue
if file_size > self.config['MAX_FILE_SIZE']:
size_gb = self.config['MAX_FILE_SIZE'] // 1024 // 1024 // 1024
self.logger.warning(f"跳过超大文件(>{size_gb}GB): {local_file}")
continue
try:
relative_path = local_file.relative_to(local_path)
oss_key = f"{self.config['OSS_PREFIX']}/{relative_path}".replace("\\", "/")
if oss_key.startswith('/'):
oss_key = oss_key[1:]
file_hash = self._calculate_file_hash(str(local_file))
files.append((str(local_file), oss_key, file_size, file_hash))
except Exception as e:
self.logger.warning(f"处理文件失败 {local_file}: {e}")
self.logger.info(f"扫描完成,发现 {len(files)} 个文件")
return files
def _format_size(self, size_bytes: int) -> str:
"""格式化文件大小"""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if size_bytes < 1024.0:
return f"{size_bytes:.2f} {unit}"
size_bytes /= 1024.0
return f"{size_bytes:.2f} PB"
def _format_time(self, seconds: float) -> str:
"""格式化时间"""
if seconds < 60:
return f"{seconds:.1f}秒"
elif seconds < 3600:
return f"{seconds/60:.1f}分钟"
else:
return f"{seconds/3600:.1f}小时"
def _parse_xml_response(self, xml_text: str, tag_name: str) -> Optional[str]:
"""解析 XML 响应"""
try:
root = ET.fromstring(xml_text)
element = root.find(tag_name)
return element.text if element is not None else None
except Exception as e:
self.logger.warning(f"解析 XML 失败: {e}")
return None
def _test_bucket_capabilities(self) -> Dict[str, bool]:
"""测试 Bucket 支持的功能"""
capabilities = {
'multipart_upload': True, # 默认假设支持
'standard_storage': True
}
# 这里可以添加更详细的测试逻辑
return capabilities
def _upload_small_file(self, local_path: str, oss_key: str, file_size: int) -> Tuple[bool, str]:
"""上传小文件(直接上传)"""
for retry in range(self.config['MAX_RETRIES'] + 1):
if not self.is_running:
return False, "上传被停止"
try:
with open(local_path, 'rb') as f:
file_data = f.read()
content_type, _ = mimetypes.guess_type(local_path)
if not content_type:
content_type = 'application/octet-stream'
headers = {
'Content-Type': content_type,
'Content-Length': str(file_size),
'x-oss-object-acl': 'private'
}
date, signature = self._sign_request('PUT', oss_key, headers)
headers['Date'] = date
headers['Authorization'] = f"OSS {self.config['ACCESS_KEY_ID']}:{signature}"
url = f"https://{self.host}/{quote_plus(oss_key)}"
self.logger.debug(f"小文件上传URL: {url}")
start_time = time.time()
response = self.session.put(
url,
data=file_data,
headers=headers,
timeout=self.config['TIMEOUT']
)
elapsed = time.time() - start_time
if response.status_code == 200:
speed = file_size / elapsed / 1024 / 1024 if elapsed > 0 else 0
return True, f"用时: {elapsed:.1f}s, 速度: {speed:.2f} MB/s"
else:
error_msg = f"HTTP {response.status_code}"
if response.text:
error_msg += f": {response.text[:200]}"
except Exception as e:
error_msg = f"错误: {str(e)}"
if retry < self.config['MAX_RETRIES']:
wait_time = 2 ** retry
self.logger.warning(f"上传失败,{wait_time}秒后重试({retry+1}/{self.config['MAX_RETRIES']}): {error_msg}")
time.sleep(wait_time)
else:
break
return False, error_msg
def _init_multipart_upload(self, oss_key: str) -> Tuple[Optional[str], str]:
"""初始化分片上传"""
for retry in range(self.config['MAX_RETRIES'] + 1):
if not self.is_running:
return None, "上传被停止"
try:
init_url = f"https://{self.host}/{quote_plus(oss_key)}?uploads"
init_headers = {}
init_date, init_signature = self._sign_request('POST', f"{oss_key}?uploads", init_headers)
init_headers['Date'] = init_date
init_headers['Authorization'] = f"OSS {self.config['ACCESS_KEY_ID']}:{init_signature}"
response = self.session.post(init_url, headers=init_headers, timeout=self.config['TIMEOUT'])
if response.status_code == 200:
# 解析 XML 响应
upload_id = self._parse_xml_response(response.text, 'UploadId')
if upload_id:
self.logger.info(f"分片上传初始化成功,UploadId: {upload_id[:20]}...")
return upload_id, ""
else:
error_msg = f"无法从XML获取UploadId"
self.logger.debug(f"XML响应: {response.text[:500]}")
else:
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
except Exception as e:
error_msg = f"错误: {str(e)}"
if retry < self.config['MAX_RETRIES']:
wait_time = 2 ** retry
self.logger.warning(f"初始化分片上传失败,{wait_time}秒后重试({retry+1}/{self.config['MAX_RETRIES']}): {error_msg}")
time.sleep(wait_time)
else:
break
return None, error_msg
def _upload_part_with_retry(self, oss_key: str, upload_id: str, part_number: int,
chunk: bytes, offset: int) -> Tuple[Optional[str], str]:
"""上传单个分片(带重试)"""
for retry in range(self.config['MAX_RETRIES'] + 1):
if not self.is_running:
return None, "上传被停止"
try:
part_url = f"https://{self.host}/{quote_plus(oss_key)}?partNumber={part_number}&uploadId={upload_id}"
part_headers = {
'Content-Length': str(len(chunk))
}
part_date, part_signature = self._sign_request(
'PUT',
f"{oss_key}?partNumber={part_number}&uploadId={upload_id}",
part_headers
)
part_headers['Date'] = part_date
part_headers['Authorization'] = f"OSS {self.config['ACCESS_KEY_ID']}:{part_signature}"
start_time = time.time()
response = self.session.put(
part_url,
data=chunk,
headers=part_headers,
timeout=self.config['TIMEOUT']
)
elapsed = time.time() - start_time
if response.status_code == 200:
etag = response.headers.get('ETag', '').strip('"')
if etag:
speed = len(chunk) / elapsed / 1024 / 1024 if elapsed > 0 else 0
self.logger.debug(f"分片 {part_number} 上传成功,速度: {speed:.2f} MB/s")
return etag, ""
else:
error_msg = f"未返回 ETag"
else:
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
except Exception as e:
error_msg = f"错误: {str(e)}"
if retry < self.config['MAX_RETRIES']:
wait_time = 2 ** retry
self.logger.warning(f"分片 {part_number} 上传失败,{wait_time}秒后重试({retry+1}/{self.config['MAX_RETRIES']}): {error_msg}")
time.sleep(wait_time)
else:
break
return None, error_msg
def _complete_multipart_upload(self, oss_key: str, upload_id: str, parts: List[Dict]) -> Tuple[bool, str]:
"""完成分片上传"""
# 按分片号排序
sorted_parts = sorted(parts, key=lambda x: x['PartNumber'])
# 构建 XML
complete_xml = '<?xml version="1.0" encoding="UTF-8"?><CompleteMultipartUpload>'
for part in sorted_parts:
complete_xml += f'<Part><PartNumber>{part["PartNumber"]}</PartNumber><ETag>{part["ETag"]}</ETag></Part>'
complete_xml += '</CompleteMultipartUpload>'
for retry in range(self.config['MAX_RETRIES'] + 1):
if not self.is_running:
return False, "上传被停止"
try:
complete_url = f"https://{self.host}/{quote_plus(oss_key)}?uploadId={upload_id}"
complete_headers = {
'Content-Type': 'application/xml',
'Content-Length': str(len(complete_xml))
}
complete_date, complete_signature = self._sign_request(
'POST',
f"{oss_key}?uploadId={upload_id}",
complete_headers
)
complete_headers['Date'] = complete_date
complete_headers['Authorization'] = f"OSS {self.config['ACCESS_KEY_ID']}:{complete_signature}"
response = self.session.post(
complete_url,
data=complete_xml,
headers=complete_headers,
timeout=self.config['TIMEOUT']
)
if response.status_code == 200:
self.logger.info("分片上传合并成功")
return True, "合并成功"
else:
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
except Exception as e:
error_msg = f"错误: {str(e)}"
if retry < self.config['MAX_RETRIES']:
wait_time = 2 ** retry
self.logger.warning(f"合并上传失败,{wait_time}秒后重试({retry+1}/{self.config['MAX_RETRIES']}): {error_msg}")
time.sleep(wait_time)
else:
break
return False, error_msg
def _upload_large_file_parallel(self, local_path: str, oss_key: str, file_size: int) -> Tuple[bool, str]:
"""并行分片上传大文件"""
try:
# 1. 初始化分片上传
self.logger.info("正在初始化分片上传...")
upload_id, error_msg = self._init_multipart_upload(oss_key)
if not upload_id:
return False, f"初始化失败: {error_msg}"
# 2. 计算分片信息
chunk_size = self.config['CHUNK_SIZE']
total_parts = (file_size + chunk_size - 1) // chunk_size
self.logger.info(f"文件大小: {self._format_size(file_size)}")
self.logger.info(f"分片大小: {self._format_size(chunk_size)}")
self.logger.info(f"总片数: {total_parts}")
self.logger.info(f"并行上传数: {self.config['MAX_CONCURRENT_PARTS']}")
# 3. 读取文件并上传分片
parts = []
completed_parts = 0
start_time = time.time()
last_progress_time = time.time()
def upload_part_task(part_num: int, chunk_data: bytes, chunk_offset: int):
"""上传单个分片的任务"""
if not self.is_running:
return None, "上传被停止"
etag, error_msg = self._upload_part_with_retry(
oss_key, upload_id, part_num, chunk_data, chunk_offset
)
if etag:
return {
'PartNumber': part_num,
'ETag': etag,
'Size': len(chunk_data)
}, None
else:
return None, error_msg
# 使用线程池并行上传
with ThreadPoolExecutor(max_workers=self.config['MAX_CONCURRENT_PARTS']) as executor:
future_to_part = {}
with open(local_path, 'rb') as f:
part_number = 1
offset = 0
while offset < file_size and self.is_running:
# 计算当前分片大小
remaining = file_size - offset
current_chunk_size = min(chunk_size, remaining)
# 读取分片数据
f.seek(offset)
chunk_data = f.read(current_chunk_size)
if not chunk_data:
break
# 提交上传任务
future = executor.submit(upload_part_task, part_number, chunk_data, offset)
future_to_part[future] = part_number
part_number += 1
offset += current_chunk_size
# 控制提交速度,避免内存占用过高
if len(future_to_part) >= self.config['MAX_CONCURRENT_PARTS'] * 2:
time.sleep(0.1)
# 处理已完成的任务
for future in as_completed(future_to_part):
if not self.is_running:
break
part_num = future_to_part[future]
try:
result, error_msg = future.result(timeout=self.config['TIMEOUT'])
if result:
parts.append(result)
completed_parts += 1
# 显示进度
current_time = time.time()
if current_time - last_progress_time > 10: # 每10秒显示一次进度
progress = completed_parts / total_parts * 100
elapsed = current_time - start_time
uploaded_size = sum(p['Size'] for p in parts)
speed = uploaded_size / elapsed / 1024 / 1024 if elapsed > 0 else 0
self.logger.info(f"进度: {progress:.1f}% "
f"({completed_parts}/{total_parts}个分片), "
f"已上传: {self._format_size(uploaded_size)}, "
f"速度: {speed:.2f} MB/s")
last_progress_time = current_time
# 定期保存检查点
if completed_parts % 10 == 0:
self._save_checkpoint_now()
else:
self.logger.error(f"分片 {part_num} 上传失败: {error_msg}")
# 停止所有任务
self.is_running = False
executor.shutdown(wait=False)
return False, f"分片 {part_num} 上传失败: {error_msg}"
except Exception as e:
self.logger.error(f"分片 {part_num} 任务异常: {str(e)}")
self.is_running = False
executor.shutdown(wait=False)
return False, f"分片 {part_num} 异常: {str(e)}"
if not self.is_running:
return False, "上传被停止"
# 4. 检查是否所有分片都上传完成
if len(parts) != total_parts:
return False, f"上传不完整: {len(parts)}/{total_parts}个分片"
# 5. 完成上传
self.logger.info("所有分片上传完成,正在合并文件...")
success, error_msg = self._complete_multipart_upload(oss_key, upload_id, parts)
if success:
total_time = time.time() - start_time
avg_speed = file_size / total_time / 1024 / 1024 if total_time > 0 else 0
self.logger.info(f"大文件上传完成!总用时: {total_time:.1f}s, 平均速度: {avg_speed:.2f} MB/s")
return True, f"上传完成 - {len(parts)}个分片,用时: {total_time:.1f}s"
else:
return False, f"合并失败: {error_msg}"
except Exception as e:
self.logger.error(f"分片上传异常: {str(e)}")
import traceback
self.logger.error(f"异常详情:\n{traceback.format_exc()}")
return False, f"异常: {str(e)}"
def upload_directory(self, resume: bool = False) -> Dict:
"""上传整个目录到 OSS"""
self.logger.info("=" * 70)
self.logger.info("OSS 目录上传工具启动 (优化分片版)")
self.logger.info(f"任务ID: {self.stats['task_id']}")
self.logger.info(f"Python版本: {sys.version}")
self.logger.info(f"模式: {'后台' if self.daemon_mode else '前台'}")
self.logger.info(f"断点续传: {'启用' if resume else '禁用'}")
self.logger.info("=" * 70)
# 获取要上传的文件
all_files = self._get_files_to_upload(self.config['LOCAL_DIR'])
if not all_files:
self.logger.warning("没有找到要上传的文件")
return self.stats
# 更新总文件数
self.stats['total_files'] = len(all_files)
self.stats['total_size'] = sum(f[2] for f in all_files)
self.logger.info(f"开始上传 {len(all_files)} 个文件,总大小: {self._format_size(self.stats['total_size'])}")
# 开始上传
for i, (local_path, oss_key, file_size, file_hash) in enumerate(all_files, 1):
if not self.is_running:
self.logger.info("上传被停止")
break
# 更新当前文件状态
self.current_file = local_path
self.current_file_size = file_size
self.current_bytes_uploaded = 0
# 显示进度
overall_progress = (self.stats['uploaded_files'] + i - 1) / len(all_files) * 100
filename = os.path.basename(local_path)
self.logger.info("-" * 70)
self.logger.info(f"文件 {i}/{len(all_files)} - 总体进度: {overall_progress:.1f}%")
self.logger.info(f"正在上传: {filename}")
self.logger.info(f"文件大小: {self._format_size(file_size)}")
self.logger.info(f"OSS路径: {oss_key}")
# 根据文件大小选择上传方式
success = False
message = ""
# 简单判断:大于1GB的使用分片上传
if file_size > 1 * 1024 * 1024 * 1024: # 1GB
self.logger.info("使用分片上传(大文件)")
success, message = self._upload_large_file_parallel(local_path, oss_key, file_size)
else:
self.logger.info("使用简单上传(小文件)")
success, message = self._upload_small_file(local_path, oss_key, file_size)
# 更新统计
if success:
self.stats['uploaded_files'] += 1
self.stats['uploaded_size'] += file_size
self.logger.info(f"✓ 上传成功: {filename}")
else:
self.stats['failed_files'] += 1
self.logger.error(f"✗ 上传失败: {filename} - {message}")
# 保存检查点
self._save_checkpoint_now()
# 重置当前文件状态
self.current_file = None
self.current_file_size = 0
self.current_bytes_uploaded = 0
# 文件间暂停
time.sleep(1)
# 生成报告
self._generate_report()
# 清理检查点
if self.stats['failed_files'] == 0:
self.checkpoint_manager.delete_checkpoint()
return self.stats
def _generate_report(self) -> None:
"""生成上传报告"""
elapsed_time = time.time() - self.stats['start_time']
self.logger.info("=" * 70)
self.logger.info("上传任务完成")
self.logger.info("=" * 70)
self.logger.info("统计信息:")
self.logger.info(f" 任务ID: {self.stats['task_id']}")
self.logger.info(f" 总文件数: {self.stats['total_files']}")
self.logger.info(f" 总大小: {self._format_size(self.stats['total_size'])}")
self.logger.info(f" 上传成功: {self.stats['uploaded_files']}")
self.logger.info(f" 上传失败: {self.stats['failed_files']}")
self.logger.info(f" 总用时: {self._format_time(elapsed_time)}")
if elapsed_time > 0:
avg_speed = self.stats['uploaded_size'] / elapsed_time / 1024 / 1024 # MB/s
self.logger.info(f" 平均速度: {avg_speed:.2f} MB/s")
self.logger.info(f"日志文件: {self.log_file}")
def get_running_pids():
"""获取正在运行的进程ID"""
try:
result = subprocess.check_output(
['pgrep', '-f', 'oss_uploader_v2.py'],
stderr=subprocess.STDOUT
)
pids = [int(pid) for pid in result.decode().strip().split('\n') if pid]
return pids
except subprocess.CalledProcessError:
return []
except Exception as e:
print(f"获取进程ID失败: {e}")
return []
def test_oss_multipart(config: Dict) -> bool:
"""测试 OSS 分片上传功能"""
try:
import hashlib
import hmac
from datetime import datetime
host = f"{config['BUCKET_NAME']}.{config['ENDPOINT']}"
access_key_secret = config['ACCESS_KEY_SECRET'].encode('utf-8')
# 创建一个测试文件路径
test_key = f"{config['OSS_PREFIX']}/_test_multipart.txt"
# 测试初始化分片上传
date = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
canonicalized_resource = f"/{config['BUCKET_NAME']}/{test_key}?uploads"
string_to_sign = f"POST\n\n\n{date}\n{canonicalized_resource}"
signature = base64.b64encode(
hmac.new(access_key_secret, string_to_sign.encode('utf-8'), hashlib.sha1).digest()
).decode('utf-8')
headers = {
'Date': date,
'Authorization': f"OSS {config['ACCESS_KEY_ID']}:{signature}"
}
url = f"https://{host}/{quote_plus(test_key)}?uploads"
response = requests.post(url, headers=headers, timeout=30)
if response.status_code == 200:
print(f"✓ 分片上传功能测试成功")
# 解析 UploadId 并立即中止上传
import xml.etree.ElementTree as ET
root = ET.fromstring(response.text)
upload_id_elem = root.find('UploadId')
if upload_id_elem is not None:
upload_id = upload_id_elem.text
# 中止上传
abort_date = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
abort_resource = f"/{config['BUCKET_NAME']}/{test_key}?uploadId={upload_id}"
abort_string = f"DELETE\n\n\n{abort_date}\n{abort_resource}"
abort_signature = base64.b64encode(
hmac.new(access_key_secret, abort_string.encode('utf-8'), hashlib.sha1).digest()
).decode('utf-8')
abort_headers = {
'Date': abort_date,
'Authorization': f"OSS {config['ACCESS_KEY_ID']}:{abort_signature}"
}
abort_url = f"https://{host}/{quote_plus(test_key)}?uploadId={upload_id}"
requests.delete(abort_url, headers=abort_headers, timeout=10)
print(f" 已清理测试上传任务")
return True
elif response.status_code == 400:
error_text = response.text[:500]
if "OperationNotSupported" in error_text:
print(f"✗ Bucket 不支持分片上传")
print(f" 响应: {error_text}")
return False
else:
print(f"✗ 分片上传测试失败: HTTP {response.status_code}")
print(f" 响应: {error_text}")
return False
else:
print(f"✗ 分片上传测试失败: HTTP {response.status_code}")
print(f" 响应: {response.text[:500]}")
return False
except Exception as e:
print(f"✗ 分片上传测试异常: {str(e)}")
return False
def main():
"""主函数"""
# 检查 Python 版本
python_version = sys.version_info
if python_version < (3, 7):
print("警告:当前 Python 版本较低,建议使用 Python 3.7+")
print(f"当前版本: {sys.version}")
print("可以使用: python3.11 oss_uploader_v2.py ...")
parser = argparse.ArgumentParser(
description='阿里云 OSS 目录上传工具(优化分片版)',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
使用示例:
# 使用 Python 3.11 运行(推荐)
python3.11 oss_uploader_v2.py start
# 后台上传
python3.11 oss_uploader_v2.py daemon start
# 查看状态
python3.11 oss_uploader_v2.py daemon status
# 查看日志
python3.11 oss_uploader_v2.py daemon logs
# 停止后台任务
python3.11 oss_uploader_v2.py daemon stop
'''
)
subparsers = parser.add_subparsers(dest='command', help='命令')
# start 命令
start_parser = subparsers.add_parser('start', help='前台上传')
start_parser.add_argument('--resume', action='store_true', help='断点续传')
start_parser.add_argument('--local-dir', help='本地目录路径')
start_parser.add_argument('--oss-prefix', help='OSS目标前缀')
# daemon 命令
daemon_parser = subparsers.add_parser('daemon', help='后台进程管理')
daemon_parser.add_argument('action', choices=['start', 'stop', 'status', 'logs'],
help='start:启动后台, stop:停止, status:状态, logs:日志')
daemon_parser.add_argument('--resume', action='store_true', help='断点续传')
daemon_parser.add_argument('--local-dir', help='本地目录路径')
daemon_parser.add_argument('--oss-prefix', help='OSS目标前缀')
args = parser.parse_args()
if not args.command:
parser.print_help()
return
# 更新配置
config = CONFIG.copy()
if hasattr(args, 'local_dir') and args.local_dir:
config['LOCAL_DIR'] = args.local_dir
if hasattr(args, 'oss_prefix') and args.oss_prefix:
config['OSS_PREFIX'] = args.oss_prefix
# 检查配置
required_keys = ['ACCESS_KEY_ID', 'ACCESS_KEY_SECRET', 'ENDPOINT', 'BUCKET_NAME']
missing_keys = []
for key in required_keys:
value = config.get(key)
if not value or '你的' in str(value) or 'your-' in str(value) or 'xxxx' in str(value):
missing_keys.append(key)
if missing_keys:
print(f"错误:以下配置项未正确填写:{', '.join(missing_keys)}")
print("请编辑脚本开头的 CONFIG 部分")
sys.exit(1)
# 检查本地目录
if hasattr(args, 'local_dir') and args.local_dir:
if not os.path.exists(config['LOCAL_DIR']):
print(f"错误:本地目录不存在: {config['LOCAL_DIR']}")
sys.exit(1)
else:
if not os.path.exists(config['LOCAL_DIR']):
print(f"错误:本地目录不存在: {config['LOCAL_DIR']}")
sys.exit(1)
# 执行命令
if args.command == 'start':
# 测试分片上传功能
print("正在测试 OSS 分片上传功能...")
if not test_oss_multipart(config):
print("警告:Bucket 可能不支持分片上传")
print("将使用备用上传策略...")
time.sleep(2)
print(f"开始上传: {config['LOCAL_DIR']} -> oss://{config['BUCKET_NAME']}/{config['OSS_PREFIX']}")
print(f"Python版本: {sys.version}")
print(f"日志文件: logs/oss_upload_*.log")
print("按 Ctrl+C 停止上传")
print("-" * 70)
try:
uploader = OSSUploader(config, daemon_mode=False)
stats = uploader.upload_directory(resume=args.resume)
if stats['failed_files'] == 0:
print("上传完成!")
sys.exit(0)
else:
print(f"上传完成,但有 {stats['failed_files']} 个文件失败")
sys.exit(1)
except KeyboardInterrupt:
print("\n上传被用户中断")
sys.exit(130)
except Exception as e:
print(f"上传失败: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
elif args.command == 'daemon':
if args.action == 'start':
# 检查是否已在运行
pids = get_running_pids()
if len(pids) > 1:
print(f"上传任务已在运行 (PID: {pids[0]})")
print("查看日志: tail -f logs/oss_upload_*.log")
return
print(f"启动后台上传任务...")
print(f"源目录: {config['LOCAL_DIR']}")
print(f"目标: oss://{config['BUCKET_NAME']}/{config['OSS_PREFIX']}")
print(f"Python版本: {sys.version}")
# 构建命令 - 使用当前 Python 解释器
cmd = [sys.executable, __file__, 'start']
if args.resume:
cmd.append('--resume')
if hasattr(args, 'local_dir') and args.local_dir:
cmd.extend(['--local-dir', config['LOCAL_DIR']])
if hasattr(args, 'oss_prefix') and args.oss_prefix:
cmd.extend(['--oss-prefix', config['OSS_PREFIX']])
# 创建日志目录
os.makedirs('logs', exist_ok=True)
# 启动进程
with open('nohup.out', 'a') as f:
process = subprocess.Popen(
cmd,
stdout=f,
stderr=subprocess.STDOUT,
preexec_fn=os.setpgrp
)
print(f"后台任务已启动 (PID: {process.pid})")
print(f"查看日志: tail -f logs/oss_upload_*.log")
print(f"查看输出: tail -f nohup.out")
print("使用 'python3 oss_uploader_v2.py daemon status' 查看状态")
elif args.action == 'stop':
pids = get_running_pids()
if pids:
print(f"正在停止进程: {pids}")
for pid in pids:
try:
os.kill(pid, signal.SIGTERM)
print(f"已发送停止信号到进程 {pid}")
except Exception as e:
print(f"停止进程 {pid} 失败: {e}")
time.sleep(2)
remaining = get_running_pids()
if remaining:
print("强制终止剩余进程...")
for pid in remaining:
try:
os.kill(pid, signal.SIGKILL)
except:
pass
print("所有上传任务已停止")
else:
print("没有找到运行中的上传任务")
elif args.action == 'status':
pids = get_running_pids()
if pids:
print(f"上传任务正在运行 (PID: {pids})")
log_dir = Path('logs')
if log_dir.exists():
log_files = list(log_dir.glob('oss_upload_*.log'))
if log_files:
log_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
latest_log = log_files[0]
print(f"最新日志: {latest_log}")
print(f"查看实时日志: tail -f {latest_log}")
else:
print("上传任务未运行")
elif args.action == 'logs':
log_dir = Path('logs')
if log_dir.exists():
log_files = list(log_dir.glob('oss_upload_*.log'))
if log_files:
log_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
latest_log = log_files[0]
print(f"显示日志: {latest_log}")
print("-" * 70)
try:
with open(latest_log, 'r', encoding='utf-8') as f:
lines = f.readlines()
start = max(0, len(lines) - 100)
for line in lines[start:]:
print(line.rstrip())
except Exception as e:
print(f"读取日志失败: {e}")
else:
print("没有找到日志文件")
else:
print("日志目录不存在")
if __name__ == "__main__":
# 检查依赖
try:
import requests
except ImportError:
print("错误:requests 库未安装")
print("请运行: pip install requests")
sys.exit(1)
# 创建必要的目录
os.makedirs('logs', exist_ok=True)
os.makedirs(CONFIG['CHECKPOINT_DIR'], exist_ok=True)
# 运行主程序
main()
浙公网安备 33010602011771号