ZYNQ UltraScale+系列部署YOLO V10（暂定，若过于艰难则考虑降级或FQ）

YOLO V10模型分析与优化

2.1 YOLO V10模型获取与环境准备

步骤1：创建工作目录结构

# Open a terminal and create the project root directory
mkdir -p ~/yolo_v10_fpga_project
cd ~/yolo_v10_fpga_project

# Create the detailed directory layout (all paths are relative to the project root)
mkdir -p models/original           # original (pretrained) models
mkdir -p models/onnx               # models exported to ONNX
mkdir -p models/quantized          # quantized models
mkdir -p models/analysis           # model analysis results
mkdir -p datasets/calibration      # calibration dataset
mkdir -p datasets/validation       # validation dataset
mkdir -p scripts/python            # Python scripts
mkdir -p scripts/tcl               # TCL scripts
mkdir -p tools                     # tooling
mkdir -p logs                      # log files
mkdir -p config                    # configuration files

# Verify the directory layout (needs the `tree` utility to be installed)
tree -L 2

步骤2：安装Python环境和依赖

# Create a Python virtual environment (Python 3.9 recommended)
python3.9 -m venv venv_yolo

# Activate the virtual environment
source venv_yolo/bin/activate

# Upgrade pip to the latest version
pip install --upgrade pip

# Write requirements.txt
cat > requirements.txt << EOF
# 基础深度学习框架
torch==2.0.1
torchvision==0.15.2
onnx==1.14.0
onnxruntime==1.15.1

# YOLO相关
ultralytics==8.0.200
opencv-python==4.8.1.78
pillow==10.0.1

# 模型分析和可视化
netron==7.1.9
tensorboard==2.14.0
matplotlib==3.7.2
seaborn==0.12.2
# thop: FLOPs/params profiler imported by scripts/python/model_analyzer.py
thop>=0.1.1

# 量化工具
pytorch-quantization==2.1.2
onnx-simplifier==0.4.33
onnxoptimizer==0.3.13

# 数据处理
numpy==1.24.3
pandas==2.0.3
tqdm==4.66.1
pyyaml==6.0.1

# FPGA相关（如果有Xilinx工具的Python接口）
# pynq==3.0.1  # 如果使用PYNQ框架
EOF

# Install every dependency
pip install -r requirements.txt

# Verify the installation
python -c "import torch; print(f'PyTorch版本: {torch.__version__}')"
python -c "import ultralytics; print(f'Ultralytics版本: {ultralytics.__version__}')"

步骤3：下载YOLO V10预训练模型

# 创建下载脚本：scripts/python/download_models.py
cat > scripts/python/download_models.py << 'EOF'
#!/usr/bin/env python3
"""
YOLO V10模型下载脚本
详细下载所有变体的预训练模型
"""

import os
import sys
import urllib.request
import hashlib
from pathlib import Path
from tqdm import tqdm

class ModelDownloader:
    def __init__(self, base_path="models/original"):
        """Create the download directory and register the known model variants.

        Args:
            base_path: Directory (created if missing) where the ``.pt`` files
                are stored.
        """
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
        
        # Download table keyed by model name. The original author marks these
        # URLs as examples that must be replaced with the real ones.
        # 'size', 'params' and 'flops' are display-only strings; the 'md5'
        # values below are placeholders, not real digests.
        self.model_urls = {
            'yolov10n': {
                'url': 'https://github.com/THU-MIG/yolov10/releases/download/v1.1/yolov10n.pt',
                'size': '5.5MB',
                'params': '2.3M',
                'flops': '6.7G',
                'md5': 'abc123...'  # placeholder for the real MD5 value
            },
            'yolov10s': {
                'url': 'https://github.com/THU-MIG/yolov10/releases/download/v1.1/yolov10s.pt',
                'size': '16.6MB',
                'params': '7.2M',
                'flops': '21.6G',
                'md5': 'def456...'
            },
            'yolov10m': {
                'url': 'https://github.com/THU-MIG/yolov10/releases/download/v1.1/yolov10m.pt',
                'size': '37.2MB',
                'params': '15.4M',
                'flops': '59.1G',
                'md5': 'ghi789...'
            }
        }
    
    def download_with_progress(self, url, filepath):
        """Download ``url`` to ``filepath`` while showing a tqdm progress bar.

        Args:
            url: Source URL passed to ``urllib.request.urlretrieve``.
            filepath: Destination ``pathlib.Path``; its ``name`` labels the bar.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on failure.
        """
        with tqdm(unit='B', unit_scale=True, desc=filepath.name) as progress_bar:

            def download_hook(block_num, block_size, total_size):
                # urlretrieve calls the hook once before any data arrived
                # (block_num == 0) and reports total_size <= 0 when the
                # size is unknown, so neither case may be used to advance
                # the bar blindly or as a divisor.
                downloaded = block_num * block_size
                if total_size > 0:
                    progress_bar.total = total_size
                    # The final block is usually short: never overshoot.
                    downloaded = min(downloaded, total_size)
                # Advance by the delta so the bar never moves backwards.
                progress_bar.update(max(downloaded - progress_bar.n, 0))

            urllib.request.urlretrieve(url, filepath, reporthook=download_hook)
    
    def verify_md5(self, filepath, expected_md5):
        """验证文件MD5"""
        md5_hash = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5_hash.update(chunk)
        return md5_hash.hexdigest() == expected_md5
    
    def download_model(self, model_name):
        """下载指定模型"""
        if model_name not in self.model_urls:
            print(f"错误：未知的模型名称 {model_name}")
            return False
        
        model_info = self.model_urls[model_name]
        filepath = self.base_path / f"{model_name}.pt"
        
        # 检查文件是否已存在
        if filepath.exists():
            print(f"模型 {model_name} 已存在，跳过下载")
            return True
        
        print(f"\n开始下载 {model_name}:")
        print(f"  - 文件大小: {model_info['size']}")
        print(f"  - 参数量: {model_info['params']}")
        print(f"  - FLOPs: {model_info['flops']}")
        
        try:
            self.download_with_progress(model_info['url'], filepath)
            print(f"✓ 下载完成: {filepath}")
            
            # 验证MD5（如果提供）
            # if self.verify_md5(filepath, model_info['md5']):
            #     print("✓ MD5验证通过")
            # else:
            #     print("✗ MD5验证失败")
            #     os.remove(filepath)
            #     return False
            
            return True
            
        except Exception as e:
            print(f"✗ 下载失败: {e}")
            if filepath.exists():
                os.remove(filepath)
            return False
    
    def download_all(self):
        """下载所有模型"""
        print("="*50)
        print("开始下载所有YOLO V10模型")
        print("="*50)
        
        for model_name in self.model_urls.keys():
            success = self.download_model(model_name)
            if not success:
                print(f"警告：模型 {model_name} 下载失败")
        
        print("\n所有模型下载完成！")
        self.list_downloaded_models()
    
    def list_downloaded_models(self):
        """列出已下载的模型"""
        print("\n已下载的模型：")
        for model_file in self.base_path.glob("*.pt"):
            size_mb = model_file.stat().st_size / (1024 * 1024)
            print(f"  - {model_file.name}: {size_mb:.2f} MB")

if __name__ == "__main__":
    downloader = ModelDownloader()
    
    # 下载所有模型
    downloader.download_all()
    
    # 或者只下载特定模型（推荐用于FPGA的轻量级模型）
    # downloader.download_model('yolov10n')
    # downloader.download_model('yolov10s')
EOF

# 执行下载脚本
python scripts/python/download_models.py

2.2 模型架构深度分析

步骤4：创建模型分析工具

# 创建模型分析脚本：scripts/python/model_analyzer.py
cat > scripts/python/model_analyzer.py << 'EOF'
#!/usr/bin/env python3
"""
YOLO V10模型深度分析工具
分析模型架构、参数分布、计算量等关键指标
"""

import torch
import torch.nn as nn
from pathlib import Path
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
from typing import Dict, List, Tuple
import pandas as pd

class YOLOv10Analyzer:
    def __init__(self, model_path: str):
        """初始化分析器"""
        self.model_path = Path(model_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"使用设备: {self.device}")
        
        # 加载模型
        print(f"加载模型: {self.model_path}")
        self.model = torch.load(self.model_path, map_location=self.device)
        
        # 如果是完整的checkpoint，提取model部分
        if isinstance(self.model, dict) and 'model' in self.model:
            self.model = self.model['model']
        
        # 设置为评估模式
        if hasattr(self.model, 'eval'):
            self.model.eval()
        
        # 分析结果存储
        self.analysis_results = {}
        
    def analyze_architecture(self):
        """分析模型架构"""
        print("\n" + "="*60)
        print("模型架构分析")
        print("="*60)
        
        architecture_info = {
            'total_layers': 0,
            'layer_types': {},
            'layer_details': []
        }
        
        # 遍历所有模块
        for name, module in self.model.named_modules():
            if len(list(module.children())) == 0:  # 只统计叶子节点
                architecture_info['total_layers'] += 1
                
                # 统计层类型
                layer_type = module.__class__.__name__
                if layer_type not in architecture_info['layer_types']:
                    architecture_info['layer_types'][layer_type] = 0
                architecture_info['layer_types'][layer_type] += 1
                
                # 记录详细信息
                layer_detail = {
                    'name': name,
                    'type': layer_type,
                    'params': sum(p.numel() for p in module.parameters()),
                    'trainable_params': sum(p.numel() for p in module.parameters() if p.requires_grad)
                }
                
                # 特殊层的额外信息
                if isinstance(module, nn.Conv2d):
                    layer_detail.update({
                        'in_channels': module.in_channels,
                        'out_channels': module.out_channels,
                        'kernel_size': module.kernel_size,
                        'stride': module.stride,
                        'padding': module.padding,
                        'groups': module.groups
                    })
                elif isinstance(module, nn.BatchNorm2d):
                    layer_detail.update({
                        'num_features': module.num_features,
                        'eps': module.eps,
                        'momentum': module.momentum
                    })
                
                architecture_info['layer_details'].append(layer_detail)
        
        # 打印统计信息
        print(f"总层数: {architecture_info['total_layers']}")
        print("\n层类型分布:")
        for layer_type, count in sorted(architecture_info['layer_types'].items(), 
                                       key=lambda x: x[1], reverse=True):
            print(f"  {layer_type:20s}: {count:4d} 层")
        
        self.analysis_results['architecture'] = architecture_info
        return architecture_info
    
    def analyze_parameters(self):
        """分析参数分布"""
        print("\n" + "="*60)
        print("参数分析")
        print("="*60)
        
        param_info = {
            'total_params': 0,
            'trainable_params': 0,
            'non_trainable_params': 0,
            'param_distribution': [],
            'layer_params': {}
        }
        
        # 统计总参数
        for name, param in self.model.named_parameters():
            num_params = param.numel()
            param_info['total_params'] += num_params
            
            if param.requires_grad:
                param_info['trainable_params'] += num_params
            else:
                param_info['non_trainable_params'] += num_params
            
            # 记录每层参数
            param_info['layer_params'][name] = {
                'shape': list(param.shape),
                'numel': num_params,
                'dtype': str(param.dtype),
                'requires_grad': param.requires_grad,
                'mean': float(param.mean()),
                'std': float(param.std()),
                'min': float(param.min()),
                'max': float(param.max())
            }
            
            # 参数分布
            param_info['param_distribution'].extend(param.flatten().cpu().numpy())
        
        # 打印统计
        print(f"总参数量: {param_info['total_params']:,}")
        print(f"可训练参数: {param_info['trainable_params']:,}")
        print(f"不可训练参数: {param_info['non_trainable_params']:,}")
        print(f"模型大小估计: {param_info['total_params'] * 4 / (1024**2):.2f} MB (FP32)")
        print(f"模型大小估计: {param_info['total_params'] * 2 / (1024**2):.2f} MB (FP16)")
        print(f"模型大小估计: {param_info['total_params'] / (1024**2):.2f} MB (INT8)")
        
        # 找出参数最多的层
        print("\n参数量最多的前10层:")
        sorted_layers = sorted(param_info['layer_params'].items(), 
                             key=lambda x: x[1]['numel'], reverse=True)[:10]
        for layer_name, layer_data in sorted_layers:
            print(f"  {layer_name:40s}: {layer_data['numel']:10,} 参数")
        
        self.analysis_results['parameters'] = param_info
        return param_info
    
    def analyze_computation(self, input_size=(1, 3, 640, 640)):
        """Profile the model with ``thop`` for one random input.

        Args:
            input_size: Shape of the random input tensor fed to the model.

        Returns:
            dict with ``input_size``, ``total_flops`` and ``total_params``
            (both human-readable strings from ``clever_format``) and
            ``flops_per_param`` (raw operation count divided by raw
            parameter count, 0 when the model has no parameters). The same
            dict is stored under ``self.analysis_results['computation']``.
        """
        print("\n" + "="*60)
        print("计算复杂度分析")
        print("="*60)

        # Imported lazily so the other analyses work without thop.
        from thop import profile, clever_format

        dummy_input = torch.randn(input_size).to(self.device)

        # NOTE(review): thop's first return value is commonly described as
        # MACs rather than FLOPs - confirm which convention is wanted.
        with torch.no_grad():
            raw_flops, raw_params = profile(self.model, inputs=(dummy_input,), verbose=False)

        # Must be computed before clever_format turns the numbers into strings.
        flops_per_param = raw_flops / raw_params if raw_params else 0

        flops, params = clever_format([raw_flops, raw_params], "%.3f")

        computation_info = {
            'input_size': input_size,
            'total_flops': flops,
            'total_params': params,
            'flops_per_param': flops_per_param
        }

        print(f"输入尺寸: {input_size}")
        print(f"总FLOPs: {flops}")
        print(f"总参数: {params}")

        # Per-layer breakdown; the helper prints its own top-10 table.
        print("\n逐层计算量分析:")
        self.analyze_layer_flops(dummy_input)

        self.analysis_results['computation'] = computation_info
        return computation_info
    
    def analyze_layer_flops(self, input_tensor):
        """分析每层的FLOPs"""
        layer_flops = {}
        
        def hook_fn(module, input, output):
            # 计算Conv2d层的FLOPs
            if isinstance(module, nn.Conv2d):
                batch_size = input[0].shape[0]
                output_height = output.shape[2]
                output_width = output.shape[3]
                
                kernel_ops = module.kernel_size[0] * module.kernel_size[1] * (module.in_channels // module.groups)
                output_size = batch_size * output_height * output_width * module.out_channels
                
                flops = kernel_ops * output_size
                layer_flops[module] = flops
        
        # 注册hook
        hooks = []
        for module in self.model.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                hooks.append(module.register_forward_hook(hook_fn))
        
        # 前向传播
        with torch.no_grad():
            _ = self.model(input_tensor)
        
        # 移除hooks
        for hook in hooks:
            hook.remove()
        
        # 打印前10个计算量最大的层
        sorted_flops = sorted(layer_flops.items(), key=lambda x: x[1], reverse=True)[:10]
        for i, (layer, flops) in enumerate(sorted_flops):
            print(f"  层 {i+1}: {flops/1e9:.3f} GFLOPs")
        
        return layer_flops
    
    def analyze_memory_footprint(self, batch_size=1):
        """分析内存占用"""
        print("\n" + "="*60)
        print("内存占用分析")
        print("="*60)
        
        memory_info = {
            'weights_memory': 0,
            'activation_memory': 0,
            'gradient_memory': 0,
            'total_memory': 0
        }
        
        # 权重内存
        for param in self.model.parameters():
            memory_info['weights_memory'] += param.numel() * param.element_size()
        
        # 激活内存（估算）
        input_size = (batch_size, 3, 640, 640)
        input_memory = np.prod(input_size) * 4  # FP32
        memory_info['activation_memory'] = input_memory * 10  # 假设10倍输入大小
        
        # 梯度内存（训练时）
        memory_info['gradient_memory'] = memory_info['weights_memory']
        
        # 总内存
        memory_info['total_memory'] = (memory_info['weights_memory'] + 
                                      memory_info['activation_memory'])
        
        print(f"权重内存: {memory_info['weights_memory'] / (1024**2):.2f} MB")
        print(f"激活内存（估算）: {memory_info['activation_memory'] / (1024**2):.2f} MB")
        print(f"梯度内存（训练时）: {memory_info['gradient_memory'] / (1024**2):.2f} MB")
        print(f"总内存占用: {memory_info['total_memory'] / (1024**2):.2f} MB")
        
        self.analysis_results['memory'] = memory_info
        return memory_info
    
    def visualize_architecture(self):
        """Plot a 2x2 overview figure and save it as a PNG.

        Reads ``self.analysis_results['architecture']`` and
        ``self.analysis_results['parameters']``, so ``analyze_architecture``
        and ``analyze_parameters`` must have run before this method.
        The figure is written to
        ``models/analysis/architecture_visualization.png`` (relative to the
        current working directory) and then shown with ``plt.show()``.
        """
        print("\n生成架构可视化...")
        
        # One figure with four panels
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        # 1. Pie chart of the layer-type distribution
        ax = axes[0, 0]
        layer_types = self.analysis_results['architecture']['layer_types']
        ax.pie(layer_types.values(), labels=layer_types.keys(), autopct='%1.1f%%')
        ax.set_title('层类型分布')
        
        # 2. Histogram of all parameter values (log-scaled counts)
        ax = axes[0, 1]
        param_dist = self.analysis_results['parameters']['param_distribution']
        ax.hist(param_dist, bins=100, edgecolor='black')
        ax.set_xlabel('参数值')
        ax.set_ylabel('频数')
        ax.set_title('参数值分布')
        ax.set_yscale('log')
        
        # 3. Horizontal bar chart of the 20 largest parameter tensors
        ax = axes[1, 0]
        layer_params = self.analysis_results['parameters']['layer_params']
        sorted_layers = sorted(layer_params.items(), 
                             key=lambda x: x[1]['numel'], reverse=True)[:20]
        # Only the last dotted component (e.g. 'weight') is used as the label
        layer_names = [name.split('.')[-1] for name, _ in sorted_layers]
        param_counts = [data['numel'] for _, data in sorted_layers]
        
        ax.barh(range(len(layer_names)), param_counts)
        ax.set_yticks(range(len(layer_names)))
        ax.set_yticklabels(layer_names, fontsize=8)
        ax.set_xlabel('参数数量')
        ax.set_title('各层参数量（Top 20）')
        
        # 4. Output-channel count of each Conv2d layer, in traversal order
        ax = axes[1, 1]
        layer_details = self.analysis_results['architecture']['layer_details']
        conv_layers = [l for l in layer_details if l['type'] == 'Conv2d']
        if conv_layers:
            depths = [l['out_channels'] for l in conv_layers]
            ax.plot(depths, marker='o')
            ax.set_xlabel('Conv层索引')
            ax.set_ylabel('输出通道数')
            ax.set_title('网络深度变化')
            ax.grid(True)
        
        plt.tight_layout()
        # NOTE(review): the target directory is not created here - confirm
        # it exists before calling this method.
        plt.savefig('models/analysis/architecture_visualization.png', dpi=150)
        print(f"架构可视化已保存至: models/analysis/architecture_visualization.png")
        plt.show()
    
    def generate_report(self):
        """生成完整的分析报告"""
        print("\n" + "="*60)
        print("生成分析报告")
        print("="*60)
        
        report = {
            'model_path': str(self.model_path),
            'analysis_results': self.analysis_results,
            'recommendations': self.generate_fpga_recommendations()
        }
        
        # 保存为JSON
        report_path = Path('models/analysis') / f"{self.model_path.stem}_analysis.json"
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2, default=str)
        
        print(f"分析报告已保存至: {report_path}")
        
        # 生成Markdown报告
        self.generate_markdown_report(report_path.with_suffix('.md'))
        
        return report
    
    def generate_fpga_recommendations(self):
        """生成FPGA部署建议"""
        recommendations = {
            'quantization': 'INT8推荐用于大部分层，INT4可用于非关键层',
            'pruning': '建议剪枝30-40%的参数以减少DSP使用',
            'tiling': '推荐使用26x26的空间tile和32通道的深度tile',
            'parallelism': '建议8-16个并行PE单元',
            'memory': '需要至少32MB的片上存储用于权重缓存'
        }
        
        # 基于分析结果的具体建议
        total_params = self.analysis_results['parameters']['total_params']
        if total_params < 5000000:
            recommendations['model_variant'] = 'YOLOv10n - 最适合FPGA部署'
        elif total_params < 10000000:
            recommendations['model_variant'] = 'YOLOv10s - 平衡性能与资源'
        else:
            recommendations['model_variant'] = 'YOLOv10m/l - 需要高端FPGA'
        
        return recommendations
    
    def generate_markdown_report(self, output_path):
        """生成Markdown格式报告"""
        with open(output_path, 'w') as f:
            f.write(f"# YOLO V10 模型分析报告\n\n")
            f.write(f"模型文件: `{self.model_path}`\n\n")
            
            f.write("## 1. 架构概览\n\n")
            arch = self.analysis_results['architecture']
            f.write(f"- 总层数: {arch['total_layers']}\n")
            f.write("- 层类型分布:\n")
            for layer_type, count in arch['layer_types'].items():
                f.write(f"  - {layer_type}: {count}\n")
            
            f.write("\n## 2. 参数统计\n\n")
            params = self.analysis_results['parameters']
            f.write(f"- 总参数量: {params['total_params']:,}\n")
            f.write(f"- 可训练参数: {params['trainable_params']:,}\n")
            f.write(f"- 模型大小(FP32): {params['total_params'] * 4 / (1024**2):.2f} MB\n")
            f.write(f"- 模型大小(INT8): {params['total_params'] / (1024**2):.2f} MB\n")
            
            f.write("\n## 3. FPGA部署建议\n\n")
            for key, value in self.generate_fpga_recommendations().items():
                f.write(f"- **{key}**: {value}\n")
        
        print(f"Markdown报告已保存至: {output_path}")

def main():
    """Run the full analysis pipeline on the YOLOv10s checkpoint."""
    # YOLOv10s is the variant this guide recommends for the FPGA target.
    analyzer = YOLOv10Analyzer("models/original/yolov10s.pt")

    # Order matters: the analyses fill analysis_results, which the
    # visualisation and the report read afterwards.
    pipeline = (
        analyzer.analyze_architecture,
        analyzer.analyze_parameters,
        analyzer.analyze_computation,
        analyzer.analyze_memory_footprint,
        analyzer.visualize_architecture,
        analyzer.generate_report,
    )
    for step in pipeline:
        step()

    print("\n分析完成！")

if __name__ == "__main__":
    main()
EOF

# 运行模型分析
python scripts/python/model_analyzer.py

2.3 模型量化准备

步骤5：准备量化校准数据集

# 创建数据集准备脚本：scripts/python/prepare_calibration_dataset.py
cat > scripts/python/prepare_calibration_dataset.py << 'EOF'
#!/usr/bin/env python3
"""
准备YOLO V10量化校准数据集
从COCO数据集中选择代表性图像用于量化校准
"""

import os
import cv2
import json
import random
import shutil
import numpy as np
from pathlib import Path
from tqdm import tqdm
import urllib.request
import zipfile

class CalibrationDatasetPreparer:
    def __init__(self, output_dir="datasets/calibration"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # 创建子目录
        self.images_dir = self.output_dir / "images"
        self.images_dir.mkdir(exist_ok=True)
        
        self.annotations_dir = self.output_dir / "annotations"
        self.annotations_dir.mkdir(exist_ok=True)
        
        # COCO类别（80类）
        self.coco_classes = [
            'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 
            'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 
            'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 
            'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 
            'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 
            'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 
            'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 
            'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 
            'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 
            'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 
            'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 
            'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 
            'scissors', 'teddy bear', 'hair drier', 'toothbrush'
        ]
    
    def download_sample_dataset(self):
        """Populate the calibration set with 500 generated sample images.

        Despite its name and the first status message, this method does not
        download anything: the two COCO URLs below are only defined, never
        fetched. It delegates to ``create_sample_images``.
        """
        print("下载COCO验证集样本...")
        
        # COCO 2017 validation images and annotations (currently unused)
        val_images_url = "http://images.cocodataset.org/zips/val2017.zip"
        annotations_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
        
        # For demonstration purposes, sample images are generated instead
        print("创建示例校准图像...")
        self.create_sample_images(num_images=500)
    
    def create_sample_images(self, num_images=500):
        """Generate ``num_images`` synthetic images with matching label files.

        Images cycle through five scene generators, are saved as
        ``calib_NNNNNN.jpg`` in ``self.images_dir`` and get a label file via
        ``create_annotation``. Basic statistics are written to
        ``calibration_stats.json`` in ``self.output_dir``.

        Args:
            num_images: Number of images to generate.
        """
        print(f"生成 {num_images} 张校准图像...")

        image_stats = {
            'total_images': 0,
            'size_distribution': {},
            'brightness_distribution': [],
            'complexity_scores': []
        }

        # Scene generators in rotation order (index modulo 5): outdoor,
        # indoor, low light, high contrast, complex texture.
        scene_builders = (
            self.create_natural_scene,
            self.create_indoor_scene,
            self.create_low_light_scene,
            self.create_high_contrast_scene,
            self.create_complex_texture_scene,
        )

        for index in tqdm(range(num_images), desc="生成图像"):
            img = scene_builders[index % 5]()

            target = self.images_dir / f"calib_{index:06d}.jpg"
            cv2.imwrite(str(target), img)

            # Running statistics: image count and mean pixel value.
            image_stats['total_images'] += 1
            image_stats['brightness_distribution'].append(np.mean(img))

            # Label file in YOLO format for this image.
            self.create_annotation(index, img.shape)

        with open(self.output_dir / "calibration_stats.json", 'w') as f:
            json.dump(image_stats, f, indent=2)

        print(f"校准数据集准备完成！")
        print(f"  - 图像数量: {image_stats['total_images']}")
        print(f"  - 平均亮度: {np.mean(image_stats['brightness_distribution']):.2f}")
    
    def create_natural_scene(self):
        """Return a 640x640 synthetic outdoor image (BGR, uint8).

        The image has a sky-blue upper half, a green lower half, 3-8 random
        filled rectangles and additive Gaussian noise (sigma 10).
        """
        img = np.zeros((640, 640, 3), dtype=np.uint8)

        # OpenCV stores channels as BGR, so colours are given in that order.
        sky_color = (235, 206, 135)  # sky blue (RGB 135, 206, 235)
        img[:320, :] = sky_color

        ground_color = (34, 139, 34)  # forest green (same in RGB and BGR)
        img[320:, :] = ground_color

        # A few random objects.
        num_objects = random.randint(3, 8)
        for _ in range(num_objects):
            x = random.randint(50, 590)
            y = random.randint(200, 500)
            w = random.randint(30, 100)
            h = random.randint(30, 100)
            color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
            cv2.rectangle(img, (x, y), (x+w, y+h), color, -1)

        # Add the signed noise in floating point and clip afterwards.
        # Casting the noise itself to uint8 would wrap negative samples to
        # values near 255 and blow out about half of the pixels to white.
        noise = np.random.normal(0, 10, img.shape)
        img = np.clip(img.astype(np.float32) + noise, 0, 255).astype(np.uint8)

        return img
    
    def create_indoor_scene(self):
        """Return a 640x640 synthetic indoor image (BGR, uint8).

        Simple filled rectangles stand in for a table, two chairs and a
        window on a grey background; an elliptical shadow is subtracted.
        """
        img = np.ones((640, 640, 3), dtype=np.uint8) * 200  # grey background

        # OpenCV uses BGR channel order; the RGB value is noted per colour.
        table_color = (19, 69, 139)      # saddle brown (RGB 139, 69, 19)
        chair_color = (45, 82, 160)      # sienna (RGB 160, 82, 45)
        frame_color = (255, 255, 255)    # white window frame
        glass_color = (235, 206, 135)    # sky blue (RGB 135, 206, 235)

        # Table
        cv2.rectangle(img, (200, 400), (440, 450), table_color, -1)

        # Chairs
        cv2.rectangle(img, (100, 380), (180, 500), chair_color, -1)
        cv2.rectangle(img, (460, 380), (540, 500), chair_color, -1)

        # Window: frame first, glass on top
        cv2.rectangle(img, (50, 50), (200, 200), frame_color, -1)
        cv2.rectangle(img, (60, 60), (190, 190), glass_color, -1)

        # Darken a half-ellipse below the table (saturating subtraction).
        shadow = np.zeros((640, 640), dtype=np.uint8)
        cv2.ellipse(shadow, (320, 500), (150, 50), 0, 0, 180, 100, -1)
        img = cv2.subtract(img, cv2.cvtColor(shadow, cv2.COLOR_GRAY2BGR))

        return img
    
    def create_low_light_scene(self):
        """Return a 640x640 synthetic low-light image (uint8, 3 channels).

        A dark base (value 30) is lit by one radial light source whose
        intensity is ``max(0, 200 - 0.5 * distance)``; 2-5 dark discs are
        drawn on top.
        """
        # Random position of the light source.
        center_x = random.randint(100, 540)
        center_y = random.randint(100, 540)

        # Vectorised radial falloff: one NumPy expression instead of
        # 640 * 640 Python-level iterations, with the same pixel values.
        cols = np.arange(640, dtype=np.float64)[np.newaxis, :]
        rows = np.arange(640, dtype=np.float64)[:, np.newaxis]
        dist = np.sqrt((cols - center_x) ** 2 + (rows - center_y) ** 2)
        intensity = np.maximum(0, 200 - dist * 0.5)
        lit = np.clip(30 + intensity, 0, 255).astype(np.uint8)

        # Same value in all three channels; np.repeat returns a fresh
        # contiguous array that OpenCV can draw into.
        img = np.repeat(lit[:, :, np.newaxis], 3, axis=2)

        # A few dark objects.
        num_objects = random.randint(2, 5)
        for _ in range(num_objects):
            x = random.randint(50, 590)
            y = random.randint(50, 590)
            radius = random.randint(20, 60)
            cv2.circle(img, (x, y), radius, (10, 10, 10), -1)

        return img
    
    def create_high_contrast_scene(self):
        """Return a 640x640 checkerboard image with four coloured discs."""
        img = np.zeros((640, 640, 3), dtype=np.uint8)

        # 8x8 checkerboard of 80-pixel squares.
        square_size = 80
        white, black = (255, 255, 255), (0, 0, 0)
        for col in range(8):
            left, right = col * square_size, (col + 1) * square_size
            for row in range(8):
                top, bottom = row * square_size, (row + 1) * square_size
                fill = black if (col + row) % 2 else white
                cv2.rectangle(img, (left, top), (right, bottom), fill, -1)

        # One disc of radius 30 per colour at a random position.
        for disc_color in ((255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)):
            x = random.randint(50, 590)
            y = random.randint(50, 590)
            cv2.circle(img, (x, y), 30, disc_color, -1)

        return img
    
    def create_complex_texture_scene(self):
        """Return a 640x640 blurred-noise image with a cross pattern on random tiles."""
        # Uniform random noise, smoothed with a 15x15 Gaussian kernel.
        noise = np.random.randint(0, 256, (640, 640, 3), dtype=np.uint8)
        img = cv2.GaussianBlur(noise, (15, 15), 0)

        # 40x40 tile showing a white diagonal cross.
        tile = 40
        pattern = np.zeros((tile, tile, 3), dtype=np.uint8)
        cv2.line(pattern, (0, 0), (39, 39), (255, 255, 255), 2)
        cv2.line(pattern, (0, 39), (39, 0), (255, 255, 255), 2)

        # Blend the cross into each tile when the random draw exceeds 0.5.
        for top in range(0, 640, tile):
            for left in range(0, 640, tile):
                if not random.random() > 0.5:
                    continue
                patch = img[top:top+tile, left:left+tile]
                img[top:top+tile, left:left+tile] = cv2.addWeighted(
                    patch, 0.7, pattern, 0.3, 0)

        return img
    
    def create_annotation(self, image_id, image_shape):
        """创建YOLO格式的标注文件"""
        h, w = image_shape[:2]
        
        # 随机生成一些边界框
        num_boxes = random.randint(1, 10)
        annotations = []
        
        for _ in range(num_boxes):
            # 随机类别
            class_id = random.randint(0, 79)
            
            # 随机边界框（YOLO格式：x_center, y_center, width, height）
            # 值都归一化到[0, 1]
            x_center = random.uniform(0.1, 0.9)
            y_center = random.uniform(0.1, 0.9)
            box_width = random.uniform(0.05, 0.3)
            box_height = random.uniform(0.05, 0.3)
            
            # 确保边界框不超出图像
            x_center = max(box_width/2, min(x_center, 1 - box_width/2))
            y_center = max(box_height/2, min(y_center, 1 - box_height/2))
            
            annotations.append(f"{class_id} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}")
        
        # 保存标注文件
        ann_path = self.annotations_dir / f"calib_{image_id:06d}.txt"
        with open(ann_path, 'w') as f:
            f.write('\n'.join(annotations))
    
    def create_data_yaml(self):
        """创建数据集配置文件"""
        yaml_content = f"""# YOLO V10 校准数据集配置
path: {self.output_dir.absolute()}
train: images
val: images

# 类别数量
nc: 80

# 类别名称
names: {self.coco_classes}
"""
        
        yaml_path = self.output_dir / "calibration.yaml"
        with open(yaml_path, 'w') as f:
            f.write(yaml_content)
        
        print(f"数据集配置文件已创建: {yaml_path}")
    
    def verify_dataset(self):
        """验证数据集完整性"""
        print("\n验证数据集...")
        
        # 统计图像和标注
        images = list(self.images_dir.glob("*.jpg"))
        annotations = list(self.annotations_dir.glob("*.txt"))
        
        print(f"  图像文件: {len(images)}")
        print(f"  标注文件: {len(annotations)}")
        
        # 检查配对
        missing_annotations = []
        for img_path in images:
            ann_path = self.annotations_dir / f"{img_path.stem}.txt"
            if not ann_path.exists():
                missing_annotations.append(img_path.name)
        
        if missing_annotations:
            print(f"  警告: {len(missing_annotations)} 张图像缺少标注")
        else:
            print("  ✓ 所有图像都有对应的标注")
        
        # 检查图像质量
        print("\n检查图像质量...")
        sample_images = random.sample(images, min(10, len(images)))
        for img_path in sample_images:
            img = cv2.imread(str(img_path))
            if img is None:
                print(f"  ✗ 无法读取: {img_path.name}")
            else:
                h, w = img.shape[:2]
                if h != 640 or w != 640:
                    print(f"  ⚠ 尺寸不标准: {img_path.name} ({w}x{h})")
        
        print("\n数据集验证完成！")

def main():
    """Build, describe and verify the calibration dataset."""
    dataset_builder = CalibrationDatasetPreparer()

    # The three stages run strictly in order: populate the dataset, write
    # its YAML description, then sanity-check the result.
    for stage in (
        dataset_builder.download_sample_dataset,
        dataset_builder.create_data_yaml,
        dataset_builder.verify_dataset,
    ):
        stage()

    print("\n校准数据集准备完成！")
    print(f"位置: {dataset_builder.output_dir}")


if __name__ == "__main__":
    main()
EOF

# 运行数据集准备脚本
python scripts/python/prepare_calibration_dataset.py

步骤6：执行模型量化

# 创建量化脚本：scripts/python/quantize_model.py
cat > scripts/python/quantize_model.py << 'EOF'
#!/usr/bin/env python3
"""
YOLO V10模型量化工具
支持多种量化方法：PTQ、QAT、混合精度量化
"""

import torch
import torch.nn as nn
import torch.quantization as quantization
from pathlib import Path
import numpy as np
import json
import time
from tqdm import tqdm
import onnx
import onnxruntime as ort

class YOLOv10Quantizer:
    def __init__(self, model_path, calibration_dataset_path):
        """初始化量化器"""
        self.model_path = Path(model_path)
        self.dataset_path = Path(calibration_dataset_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        print(f"加载模型: {self.model_path}")
        self.model = torch.load(self.model_path, map_location=self.device)
        
        # 输出目录
        self.output_dir = Path("models/quantized")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # 量化配置
        self.quantization_configs = {
            'int8_symmetric': {
                'qconfig': torch.quantization.get_default_qconfig('fbgemm'),
                'backend': 'fbgemm',
                'bits': 8,
                'symmetric': True
            },
            'int8_asymmetric': {
                'qconfig': torch.quantization.get_default_qconfig('qnnpack'),
                'backend': 'qnnpack',
                'bits': 8,
                'symmetric': False
            },
            'int4': {
                'bits': 4,
                'custom': True  # 需要自定义实现
            }
        }
    
    def prepare_calibration_data(self, num_samples=100):
        """Return a list of calibration input tensors.

        One tensor of shape (1, 3, 640, 640) is produced for each of the
        first ``num_samples`` ``*.jpg`` files under ``<dataset_path>/images``.

        NOTE: the image files are only counted, never decoded. Every tensor
        is random noise from ``torch.randn``; real preprocessing still has
        to be plugged in for the calibration statistics to be meaningful.
        """
        print(f"准备校准数据 ({num_samples} 样本)...")

        images_root = self.dataset_path / "images"
        selected = list(images_root.glob("*.jpg"))[:num_samples]

        # Placeholder: one random tensor per selected image, moved to the
        # quantizer's device.
        return [
            torch.randn(1, 3, 640, 640).to(self.device)
            for _ in tqdm(selected, desc="加载校准图像")
        ]
    
    def quantize_post_training(self, quantization_type='int8_symmetric'):
        """训练后量化(PTQ)"""
        print(f"\n开始训练后量化 (PTQ) - {quantization_type}")
        
        config = self.quantization_configs[quantization_type]
        
        # 准备模型
        model_fp32 = self.model.eval()
        
        # 设置量化配置
        if quantization_type == 'int4':
            # INT4需要特殊处理
            quantized_model = self.quantize_to_int4(model_fp32)
        else:
            # INT8量化
            quantized_model = self.quantize_to_int8(model_fp32, config)
        
        # 保存量化模型
        output_path = self.output_dir / f"yolov10_{quantization_type}_ptq.pt"
        torch.save(quantized_model, output_path)
        print(f"量化模型已保存: {output_path}")
        
        return quantized_model
    
    def quantize_to_int8(self, model, config):
        """Statically quantize ``model`` to INT8 (eager mode).

        Args:
            model: FP32 ``nn.Module``. It is deep-copied, so the caller's
                module is left untouched.
            config: Preset dict providing ``'qconfig'`` and, optionally,
                ``'backend'`` (the quantized engine name).

        Returns:
            The converted model, located on the CPU.

        NOTE(review): eager-mode conversion expects the network to contain
        quant/dequant stubs around the quantized region - confirm that the
        loaded model provides them, otherwise inference on the result fails.
        """
        import copy  # local import: only this method needs it

        # Select the kernels matching the qconfig, when this build has them.
        backend = config.get('backend')
        if backend in torch.backends.quantized.supported_engines:
            torch.backends.quantized.engine = backend

        # The fbgemm/qnnpack quantized kernels are CPU implementations, so
        # observation and conversion are done on a CPU copy.
        work_model = copy.deepcopy(model).to('cpu').eval()
        work_model.qconfig = config['qconfig']

        # Insert observers.
        torch.quantization.prepare(work_model, inplace=True)

        # Calibration: feed data so the observers record activation ranges.
        print("执行校准...")
        calibration_data = self.prepare_calibration_data(100)

        with torch.no_grad():
            for data in tqdm(calibration_data, desc="校准"):
                work_model(data.to('cpu'))

        # Replace the observed modules with their quantized counterparts.
        print("转换为量化模型...")
        quantized_model = torch.quantization.convert(work_model, inplace=True)

        return quantized_model
    
    def quantize_to_int4(self, model):
        """Simulate INT4 weight quantization (quantize, then dequantize).

        Every parameter whose name contains ``'weight'`` and that has at
        least two dimensions is mapped per tensor onto 16 levels and written
        back as float, so the module structure is unchanged.

        Args:
            model: Module whose weights are overwritten **in place**.

        Returns:
            The same ``model`` object.
        """
        print("执行INT4量化...")

        class Int4Quantizer:
            """Per-tensor affine quantizer with a signed ``bits``-wide range."""

            def __init__(self, bits=4):
                self.bits = bits
                self.qmin = -(2**(bits-1))
                self.qmax = 2**(bits-1) - 1

            def quantize_tensor(self, tensor):
                """Return ``(int8 codes, scale, zero_point)`` for ``tensor``."""
                t_min = tensor.min()
                scale = (tensor.max() - t_min) / (self.qmax - self.qmin)
                if scale == 0:
                    # Constant tensor: a zero scale would divide by zero and
                    # turn the weights into NaN/inf. Any non-zero scale
                    # represents a single value exactly.
                    scale = torch.ones_like(scale)
                zero_point = self.qmin - t_min / scale

                quantized = torch.round(tensor / scale + zero_point)
                quantized = torch.clamp(quantized, self.qmin, self.qmax)

                # INT4 codes are stored in int8, the narrowest integer
                # dtype used here.
                return quantized.to(torch.int8), scale, zero_point

            def dequantize_tensor(self, quantized, scale, zero_point):
                """Map integer codes back to floating point."""
                return (quantized.float() - zero_point) * scale

        quantizer = Int4Quantizer()

        for name, param in model.named_parameters():
            if 'weight' in name and len(param.shape) >= 2 and param.numel() > 0:
                quantized, scale, zp = quantizer.quantize_tensor(param.data)
                # Simplification: store the dequantized values; a real INT4
                # deployment would need modules that keep the integer codes.
                param.data = quantizer.dequantize_tensor(quantized, scale, zp)

        return model
    
    def quantize_aware_training(self, train_loader, epochs=10):
        """量化感知训练(QAT)"""
        print("\n开始量化感知训练 (QAT)")
        
        model = self.model.train()
        
        # 准备QAT
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        torch.quantization.prepare_qat(model, inplace=True)
        
        # 设置优化器
        optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        criterion = nn.CrossEntropyLoss()
        
        # 训练循环
        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(self.device), target.to(self.device)
                
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
                
                if batch_idx % 100 == 0:
                    print(f"  Batch {batch_idx}: Loss = {loss.item():.4f}")
        
        # 转换为量化模型
        model.eval()
        quantized_model = torch.quantization.convert(model, inplace=False)
        
        # 保存
        output_path = self.output_dir / "yolov10_int8_qat.pt"
        torch.save(quantized_model, output_path)
        print(f"QAT模型已保存: {output_path}")
        
        return quantized_model
    
    def mixed_precision_quantization(self):
        """Quantize selected sub-networks to INT8 and keep the rest in float.

        The strategy table below maps a substring of the module name to a
        preset. Modules matched by an INT8 preset receive its qconfig; those
        matched by ``'fp16'`` are excluded (``qconfig = None``). Modules that
        match nothing inherit from their parent, or stay in float.

        NOTE(review): this relies on module names containing 'backbone',
        'neck', 'head', ... - confirm against ``named_modules()`` of the
        loaded network.

        Returns:
            The converted model (on the CPU). It is also saved to
            ``<output_dir>/yolov10_mixed_precision.pt``.
        """
        import copy  # local import: only this method needs it

        print("\n执行混合精度量化")

        # Per-region strategy. Later entries win when a name matches more
        # than one key, so first/last conv override their enclosing region.
        layer_configs = {
            'backbone': 'int8_symmetric',      # backbone in INT8
            'neck': 'int8_symmetric',           # neck in INT8
            'head': 'fp16',                     # detection head stays float
            'first_conv': 'fp16',               # first layer keeps precision
            'last_conv': 'fp16'                 # last layer keeps precision
        }

        # Use the engine that belongs to the INT8 preset, when available.
        backend = self.quantization_configs['int8_symmetric'].get('backend')
        if backend in torch.backends.quantized.supported_engines:
            torch.backends.quantized.engine = backend

        # Work on a CPU copy: self.model must stay a clean FP32 baseline,
        # and the quantized kernels are CPU implementations.
        model = copy.deepcopy(self.model).to('cpu').eval()

        for name, module in model.named_modules():
            for key, strategy in layer_configs.items():
                if key in name:
                    preset = self.quantization_configs.get(strategy)
                    # Strategies without a preset (e.g. 'fp16') mean
                    # "do not quantize this module".
                    module.qconfig = preset['qconfig'] if preset else None

        # Insert observers where a qconfig is set.
        torch.quantization.prepare(model, inplace=True)

        # Calibration pass.
        calibration_data = self.prepare_calibration_data(50)
        with torch.no_grad():
            for data in calibration_data:
                model(data.to('cpu'))

        quantized_model = torch.quantization.convert(model, inplace=True)

        output_path = self.output_dir / "yolov10_mixed_precision.pt"
        torch.save(quantized_model, output_path)
        print(f"混合精度模型已保存: {output_path}")

        return quantized_model
    
    def export_to_onnx(self, model, quantized=True):
        """Export ``model`` to ``models/onnx/yolov10[_quantized].onnx``.

        Args:
            model: Module to export; it is switched to eval mode.
            quantized: Selects the ``_quantized`` file-name suffix.

        Returns:
            ``pathlib.Path`` of the written ONNX file.
        """
        print("\n导出ONNX模型...")

        model.eval()
        # Trace on the device the model actually lives on. A model without
        # any nn.Parameter is assumed to be on the CPU.
        first_param = next(model.parameters(), None)
        input_device = first_param.device if first_param is not None else torch.device('cpu')
        dummy_input = torch.randn(1, 3, 640, 640).to(input_device)

        suffix = "_quantized" if quantized else ""
        output_path = self.output_dir.parent / "onnx" / f"yolov10{suffix}.onnx"
        output_path.parent.mkdir(parents=True, exist_ok=True)

        torch.onnx.export(
            model,
            dummy_input,
            str(output_path),  # plain string, as verify_onnx_model uses too
            export_params=True,
            opset_version=13,
            do_constant_folding=True,
            input_names=['input'],
            output_names=['output'],
            dynamic_axes={
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )

        print(f"ONNX模型已导出: {output_path}")

        # Structural check plus one test inference.
        self.verify_onnx_model(output_path)

        return output_path
    
    def verify_onnx_model(self, onnx_path):
        """Check an ONNX file structurally and run one test inference.

        Args:
            onnx_path: Path of the ``.onnx`` file.

        Raises:
            Whatever ``onnx.checker.check_model`` or ONNX Runtime raise for
            an invalid or non-runnable model.
        """
        print("验证ONNX模型...")

        # Structural validation.
        onnx_model = onnx.load(str(onnx_path))
        onnx.checker.check_model(onnx_model)

        # Name the execution provider explicitly: ONNX Runtime builds that
        # ship more than the CPU provider require it.
        ort_session = ort.InferenceSession(
            str(onnx_path), providers=['CPUExecutionProvider'])

        # Read the input name from the model instead of assuming 'input',
        # so files exported elsewhere can be verified as well.
        input_name = ort_session.get_inputs()[0].name
        dummy_input = np.random.randn(1, 3, 640, 640).astype(np.float32)
        outputs = ort_session.run(None, {input_name: dummy_input})

        print("  ✓ ONNX模型验证通过")
        print(f"  输出形状: {outputs[0].shape}")
    
    def compare_models(self, original_model, quantized_model, num_runs=100):
        """Print and return a size / latency / output comparison.

        Args:
            original_model: Reference ``nn.Module``.
            quantized_model: Its quantized counterpart.
            num_runs: Number of timed forward passes per model.

        Returns:
            dict with ``original_size_bytes``, ``quantized_size_bytes``,
            ``compression_ratio``, ``original_time``, ``quantized_time``,
            ``speedup`` and ``output_mse`` (``None`` unless both outputs are
            tensors).
        """
        import io  # local import: only needed for the in-memory size probe

        def serialized_size(model):
            """Bytes needed to ``torch.save`` the model's ``state_dict``."""
            buffer = io.BytesIO()
            torch.save(model.state_dict(), buffer)
            return buffer.getbuffer().nbytes

        def input_for(model, sample):
            """Move ``sample`` to the device of the model's first parameter."""
            first_param = next(model.parameters(), None)
            return sample if first_param is None else sample.to(first_param.device)

        def timed(model, sample):
            """Seconds spent on ``num_runs`` forward passes in eval mode."""
            model.eval()
            started = time.perf_counter()
            with torch.no_grad():
                for _ in range(num_runs):
                    _ = model(sample)
            return time.perf_counter() - started

        print("\n模型比较分析")

        # Size comparison. Summing parameters() yields 0 for a model without
        # nn.Parameter (typical of converted quantized layers), which made
        # the ratio below divide by zero. Serialized size works for both.
        original_size = serialized_size(original_model)
        quantized_size = serialized_size(quantized_model)
        compression_ratio = original_size / quantized_size

        print(f"原始模型大小: {original_size / (1024**2):.2f} MB")
        print(f"量化模型大小: {quantized_size / (1024**2):.2f} MB")
        print(f"压缩率: {compression_ratio:.2f}x")

        # Latency comparison: same data, each model fed on its own device.
        sample = torch.randn(1, 3, 640, 640)
        original_input = input_for(original_model, sample)
        quantized_input = input_for(quantized_model, sample)

        original_time = timed(original_model, original_input)
        quantized_time = timed(quantized_model, quantized_input)
        speedup = original_time / quantized_time if quantized_time > 0 else float('inf')

        print(f"\n推理时间（{num_runs}次）:")
        print(f"原始模型: {original_time:.2f}秒")
        print(f"量化模型: {quantized_time:.2f}秒")
        print(f"加速比: {speedup:.2f}x")

        # Output agreement on one random input (simplified; a real accuracy
        # figure needs evaluation on a validation set).
        print("\n精度分析:")
        output_mse = None
        with torch.no_grad():
            original_output = original_model(original_input)
            quantized_output = quantized_model(quantized_input)

        if isinstance(original_output, torch.Tensor) and isinstance(quantized_output, torch.Tensor):
            if quantized_output.is_quantized:
                quantized_output = quantized_output.dequantize()
            diff = original_output.float().cpu() - quantized_output.float().cpu()
            output_mse = torch.mean(diff ** 2).item()
            print(f"输出MSE: {output_mse:.6f}")

        return {
            'original_size_bytes': original_size,
            'quantized_size_bytes': quantized_size,
            'compression_ratio': compression_ratio,
            'original_time': original_time,
            'quantized_time': quantized_time,
            'speedup': speedup,
            'output_mse': output_mse,
        }
    
    def generate_quantization_report(self):
        """生成量化报告"""
        report = {
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'model': str(self.model_path),
            'quantization_methods': list(self.quantization_configs.keys()),
            'calibration_dataset': str(self.dataset_path),
            'results': {}
        }
        
        # 执行各种量化方法并记录结果
        for method in ['int8_symmetric', 'int8_asymmetric']:
            print(f"\n测试量化方法: {method}")
            try:
                quantized_model = self.quantize_post_training(method)
                
                # 记录结果
                report['results'][method] = {
                    'success': True,
                    'model_path': str(self.output_dir / f"yolov10_{method}_ptq.pt"),
                    'compression_ratio': self.calculate_compression_ratio(
                        self.model, quantized_model)
                }
            except Exception as e:
                report['results'][method] = {
                    'success': False,
                    'error': str(e)
                }
        
        # 保存报告
        report_path = self.output_dir / "quantization_report.json"
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2)
        
        print(f"\n量化报告已保存: {report_path}")
        
        return report
    
    def calculate_compression_ratio(self, original_model, quantized_model):
        """Return the measured size ratio ``original / quantized``.

        Both models are measured by serializing their ``state_dict`` into
        memory. The previous implementation ignored ``quantized_model`` and
        always returned 4.0.

        Args:
            original_model: Reference ``nn.Module``.
            quantized_model: Quantized ``nn.Module``.

        Returns:
            float: serialized bytes of the original divided by those of the
            quantized model.
        """
        import io  # local import: only needed for the in-memory buffer

        def serialized_bytes(model):
            """Bytes needed to ``torch.save`` the model's ``state_dict``."""
            buffer = io.BytesIO()
            torch.save(model.state_dict(), buffer)
            return buffer.getbuffer().nbytes

        original_size = serialized_bytes(original_model)
        quantized_size = serialized_bytes(quantized_model)

        return original_size / quantized_size

def main():
    """Run the complete quantization workflow for ``yolov10s``.

    Stages: INT8 PTQ (two presets), mixed precision, ONNX export of the
    first INT8 model, a model comparison and the JSON report.
    """
    rule = "=" * 60

    def banner(title, first=False):
        """Print a section header; all but the first start with a blank line."""
        print(rule if first else "\n" + rule)
        print(title)
        print(rule)

    quantizer = YOLOv10Quantizer(
        "models/original/yolov10s.pt",
        "datasets/calibration",
    )

    # 1. Post-training quantization: the first result is reused below, the
    #    second run only produces its saved file.
    banner("执行训练后量化(PTQ)", first=True)
    int8_model = quantizer.quantize_post_training('int8_symmetric')
    quantizer.quantize_post_training('int8_asymmetric')

    # 2. Mixed precision.
    banner("执行混合精度量化")
    quantizer.mixed_precision_quantization()

    # 3. ONNX export.
    banner("导出ONNX模型")
    quantizer.export_to_onnx(int8_model, quantized=True)

    # 4. Comparison against the loaded model.
    banner("模型性能比较")
    quantizer.compare_models(quantizer.model, int8_model)

    # 5. Report.
    banner("生成量化报告")
    quantizer.generate_quantization_report()

    print("\n量化流程完成！")


if __name__ == "__main__":
    main()
EOF

# 运行量化脚本
python scripts/python/quantize_model.py

2.4 为FPGA优化模型结构

步骤7：模型结构优化

# 创建FPGA优化脚本：scripts/python/optimize_for_fpga.py
cat > scripts/python/optimize_for_fpga.py << 'EOF'
#!/usr/bin/env python3
"""
YOLO V10 FPGA优化工具
针对FPGA硬件特性优化模型结构
"""

import torch
import torch.nn as nn
import numpy as np
from pathlib import Path
import json

class FPGAOptimizer:
    def __init__(self, model_path):
        """初始化FPGA优化器"""
        self.model_path = Path(model_path)
        self.model = torch.load(model_path, map_location='cpu')
        
        # FPGA约束
        self.fpga_constraints = {
            'max_kernel_size': 7,          # 最大卷积核尺寸
            'preferred_kernel_sizes': [1, 3, 5, 7],  # 推荐的卷积核尺寸
            'max_channels': 512,            # 最大通道数
            'tile_size': 26,               # 瓦片大小
            'dsp_blocks': 2520,            # DSP块数量（ZCU102）
            'bram_blocks': 912,            # BRAM块数量
            'uram_blocks': 96,             # URAM块数量
            'preferred_bitwidth': 8,       # 推荐位宽
            'max_parallel_ops': 16         # 最大并行操作数
        }
        
        self.optimizations_applied = []
    
    def analyze_conv_layers(self):
        """分析卷积层以识别优化机会"""
        print("\n分析卷积层...")
        
        conv_layers = []
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Conv2d):
                layer_info = {
                    'name': name,
                    'in_channels': module.in_channels,
                    'out_channels': module.out_channels,
                    'kernel_size': module.kernel_size,
                    'stride': module.stride,
                    'padding': module.padding,
                    'groups': module.groups,
                    'params': module.in_channels * module.out_channels * 
                             module.kernel_size[0] * module.kernel_size[1] // module.groups
                }
                
                # 计算该层的DSP使用量（估算）
                layer_info['estimated_dsps'] = self.estimate_dsp_usage(module)
                
                # 检查是否适合FPGA
                layer_info['fpga_friendly'] = self.check_fpga_compatibility(module)
                
                conv_layers.append(layer_info)
        
        print(f"发现 {len(conv_layers)} 个卷积层")
        
        # 识别问题层
        problematic_layers = [l for l in conv_layers if not l['fpga_friendly']]
        if problematic_layers:
            print(f"发现 {len(problematic_layers)} 个需要优化的层:")
            for layer in problematic_layers[:5]:  # 显示前5个
                print(f"  - {layer['name']}: "
                     f"kernel={layer['kernel_size']}, "
                     f"channels={layer['in_channels']}->{layer['out_channels']}")
        
        return conv_layers
    
    def estimate_dsp_usage(self, conv_layer):
        """估算卷积层的DSP使用量"""
        # 简化估算：每个MAC操作需要1个DSP（INT8）
        kernel_size = conv_layer.kernel_size[0] * conv_layer.kernel_size[1]
        macs_per_output = kernel_size * conv_layer.in_channels // conv_layer.groups
        
        # 考虑并行度
        parallel_factor = min(self.fpga_constraints['max_parallel_ops'], 
                            conv_layer.out_channels)
        
        dsps_needed = macs_per_output * parallel_factor
        
        return dsps_needed
    
    def check_fpga_compatibility(self, conv_layer):
        """检查卷积层是否适合FPGA实现"""
        # 检查卷积核大小
        if conv_layer.kernel_size[0] > self.fpga_constraints['max_kernel_size']:
            return False
        
        # 检查通道数
        if conv_layer.out_channels > self.fpga_constraints['max_channels']:
            return False
        
        # 检查DSP使用量
        if self.estimate_dsp_usage(conv_layer) > self.fpga_constraints['dsp_blocks']:
            return False
        
        return True
    
    def optimize_large_kernels(self):
        """将大卷积核分解为小卷积核"""
        print("\n优化大卷积核...")
        
        optimized_count = 0
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Conv2d):
                if module.kernel_size[0] > 5:
                    print(f"  分解 {name}: {module.kernel_size[0]}x{module.kernel_size[1]} "
                         f"-> 多个3x3卷积")
                    
                    # 这里应该替换为多个小卷积的序列
                    # 例如：7x7 -> 3x3 + 3x3 + 3x3
                    
                    optimized_count += 1
        
        if optimized_count > 0:
            self.optimizations_applied.append(f"分解了 {optimized_count} 个大卷积核")
        
        return optimized_count
    
    def apply_channel_pruning(self, pruning_ratio=0.3):
        """应用通道剪枝"""
        print(f"\n应用通道剪枝 (剪枝率: {pruning_ratio*100}%)...")
        
        pruned_channels = 0
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Conv2d):
                # 计算要剪枝的通道数
                num_channels = module.out_channels
                channels_to_prune = int(num_channels * pruning_ratio)
                
                if channels_to_prune > 0:
                    # 计算通道重要性（基于权重L1范数）
                    importance = torch.sum(torch.abs(module.weight), dim=(1, 2, 3))
                    
                    # 找出最不重要的通道
                    _, indices = torch.sort(importance)
                    channels_to_keep = indices[channels_to_prune:]
                    
                    # 更新权重（实际实现需要修改模型结构）
                    # module.weight.data = module.weight.data[channels_to_keep]
                    
                    pruned_channels += channels_to_prune
        
        print(f"  剪枝了 {pruned_channels} 个通道")
        self.optimizations_applied.append(f"剪枝了 {pruned_channels} 个通道")
        
        return pruned_channels
    
    def optimize_depthwise_separable(self):
        """将标准卷积转换为深度可分离卷积"""
        print("\n优化为深度可分离卷积...")
        
        converted_count = 0
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Conv2d) and module.groups == 1:
                # 检查是否适合转换
                if module.kernel_size[0] >= 3 and module.in_channels >= 32:
                    print(f"  转换 {name} 为深度可分离卷积")
                    
                    # 创建深度卷积和逐点卷积
                    # depthwise = nn.Conv2d(in_channels, in_channels, kernel_size, 
                    #                       groups=in_channels)
                    # pointwise = nn.Conv2d(in_channels, out_channels, 1)
                    
                    converted_count += 1
        
        if converted_count > 0:
            self.optimizations_applied.append(
                f"转换了 {converted_count} 个卷积为深度可分离卷积")
        
        return converted_count
    
    def optimize_memory_layout(self):
        """优化内存布局以适应FPGA"""
        print("\n优化内存布局...")
        
        memory_optimizations = {
            'weight_reordering': False,
            'activation_tiling': False,
            'double_buffering': False
        }
        
        # 权重重排序（适应FPGA的并行访问模式）
        print("  应用权重重排序...")
        # 将权重从NCHW重排为适合FPGA的格式
        memory_optimizations['weight_reordering'] = True
        
        # 激活值分块
        print("  配置激活值分块...")
        tile_config = {
            'spatial_tile': self.fpga_constraints['tile_size'],
            'channel_tile': 32
        }
        memory_optimizations['activation_tiling'] = True
        
        # 双缓冲
        print("  启用双缓冲...")
        memory_optimizations['double_buffering'] = True
        
        self.optimizations_applied.append("优化了内存布局")
        
        return memory_optimizations
    
    def generate_fpga_config(self):
        """生成FPGA实现配置"""
        print("\n生成FPGA配置...")
        
        config = {
            'model': str(self.model_path),
            'target_device': 'ZCU102',
            'optimizations': self.optimizations_applied,
            'hardware_config': {
                'systolic_array_size': 8,
                'parallel_engines': 4,
                'pipeline_depth': 5,
                'clock_frequency': 200,  # MHz
                'precision': 'INT8'
            },
            'memory_config': {
                'weight_buffer_size': 32,  # MB
                'activation_buffer_size': 16,  # MB
                'use_uram': True,
                'use_bram': True,
                'ddr_bandwidth': 19.2  # GB/s
            },
            'layer_config': []
        }
        
        # 为每层生成配置
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Conv2d):
                layer_cfg = {
                    'name': name,
                    'type': 'CONV2D',
                    'parallelism': min(16, module.out_channels),
                    'tiling': {
                        'spatial': self.fpga_constraints['tile_size'],
                        'input_channel': min(32, module.in_channels),
                        'output_channel': min(32, module.out_channels)
                    },
                    'precision': 'INT8' if module.out_channels <= 256 else 'INT4'
                }
                config['layer_config'].append(layer_cfg)
        
        # 保存配置
        config_path = Path("config") / "fpga_implementation.json"
        config_path.parent.mkdir(exist_ok=True)
        
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)
        
        print(f"FPGA配置已保存: {config_path}")
        
        return config
    
    def estimate_fpga_performance(self):
        """估算FPGA性能"""
        print("\n估算FPGA性能...")
        
        total_ops = 0
        total_memory = 0
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Conv2d):
                # 计算操作数
                ops = (module.in_channels * module.out_channels * 
                       module.kernel_size[0] * module.kernel_size[1])
                total_ops += ops
                
                # 计算内存需求
                weight_memory = ops * 1  # INT8
                total_memory += weight_memory
        
        # 性能估算
        clock_freq = 200e6  # 200 MHz
        parallel_ops = 16   # 并行操作数
        
        # 理论峰值性能
        peak_performance = clock_freq * parallel_ops * 2  # GOPS
        
        # 考虑利用率（通常70-80%）
        actual_performance = peak_performance * 0.75
        
        # 推理时间估算
        inference_time = total_ops / actual_performance
        fps = 1 / inference_time
        
        performance_report = {
            'total_operations': f"{total_ops/1e9:.2f} GOPs",
            'memory_requirement': f"{total_memory/1e6:.2f} MB",
            'peak_performance': f"{peak_performance/1e9:.2f} GOPS",
            'estimated_performance': f"{actual_performance/1e9:.2f} GOPS",
            'estimated_latency': f"{inference_time*1000:.2f} ms",
            'estimated_fps': f"{fps:.1f} FPS"
        }
        
        print("性能估算结果:")
        for key, value in performance_report.items():
            print(f"  {key}: {value}")
        
        return performance_report
    
    def save_optimized_model(self):
        """保存优化后的模型"""
        output_path = Path("models/optimized") / f"{self.model_path.stem}_fpga_optimized.pt"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        torch.save(self.model, output_path)
        print(f"\n优化模型已保存: {output_path}")
        
        return output_path

def main():
    """Run the FPGA optimization workflow on the INT8 PTQ model."""
    rule = "=" * 60

    # Input: the symmetric INT8 model written by the quantization script.
    fpga_optimizer = FPGAOptimizer("models/quantized/yolov10_int8_symmetric_ptq.pt")

    print(rule)
    print("FPGA优化流程开始")
    print(rule)

    # 1. Inspect the convolution layers.
    fpga_optimizer.analyze_conv_layers()

    # 2. Apply the individual optimization passes.
    fpga_optimizer.optimize_large_kernels()
    fpga_optimizer.apply_channel_pruning(0.3)
    fpga_optimizer.optimize_depthwise_separable()
    fpga_optimizer.optimize_memory_layout()

    # 3. Write the implementation config, 4. estimate the performance.
    fpga_optimizer.generate_fpga_config()
    fpga_optimizer.estimate_fpga_performance()

    # 5. Save the resulting model.
    saved_to = fpga_optimizer.save_optimized_model()

    print("\n" + rule)
    print("优化完成！")
    print(rule)
    print(f"优化后的模型: {saved_to}")
    print(f"应用的优化: {', '.join(fpga_optimizer.optimizations_applied)}")


if __name__ == "__main__":
    main()
EOF

# 运行FPGA优化脚本
python scripts/python/optimize_for_fpga.py

posted @ 2025-09-13 16:32 李白的白 阅读(156) 评论(0) 收藏 举报

刷新页面 返回顶部

LiamJacob

ZYNQ Ultrascale+系列部署yolo v10（暂定，若过于艰难则考虑降级或FQ）

YOLO V10模型分析与优化

2.1 YOLO V10模型获取与环境准备

步骤1：创建工作目录结构

步骤2：安装Python环境和依赖

步骤3：下载YOLO V10预训练模型

2.2 模型架构深度分析

步骤4：创建模型分析工具

2.3 模型量化准备

步骤5：准备量化校准数据集

步骤6：执行模型量化

2.4 为FPGA优化模型结构

步骤7：模型结构优化

公告