综合与实现流程【p3】--(DSP-存储)优化&PS系统集成

(一)资源优化

1 DSP优化

创建优化的DSP映射

创建文件 dsp_optimized_pe.v

`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// 优化的PE模块 - 直接使用DSP48E2原语
//////////////////////////////////////////////////////////////////////////////////

// dsp_optimized_pe
//   Processing element intended to produce a0*b0 + a1*b1 + c_in on p_out.
//   USE_DSP_PACKING selects between a hand-instantiated DSP48E2 with packed
//   operands (non-zero) and a behavioural three-stage pipeline (zero).
//   Reset is active-low and synchronous in both branches; ce gates updates.
module dsp_optimized_pe #(
    parameter USE_DSP_PACKING = 1  // non-zero: packed DSP48E2 branch, zero: inferred branch
)(
    input wire clk,
    input wire rst_n,
    input wire ce,
    
    // Two INT8 operand pairs
    input wire signed [7:0] a0, a1,  // two activations
    input wire signed [7:0] b0, b1,  // two weights
    input wire signed [31:0] c_in,   // accumulator input
    
    output reg signed [31:0] p_out   // result output
);

generate
    if (USE_DSP_PACKING) begin : gen_packed_dsp
        // Goal stated by the author: two INT8 MACs in a single DSP48E2.
        
        // Operands packed into the wider multiplier ports
        wire signed [26:0] a_packed;
        wire signed [17:0] b_packed;
        
        // A port layout: [3 spare bits][a1 -> bits 23:16][8 guard zeros][a0 -> bits 7:0]
        // NOTE(review): a0 is concatenated without sign extension, so a negative
        // a0 is seen by the multiplier as a positive 8-bit value.
        assign a_packed = {3'b0, a1, 8'b0, a0};
        
        // B port layout: [1 spare bit][b1 -> bits 16:9][1 guard zero][b0 -> bits 7:0]
        // NOTE(review): a_packed * b_packed expands to four partial products
        // (a0*b0, a0*b1<<9, a1*b0<<16, a1*b1<<25); with only 1 and 8 guard bits
        // they overlap. Verify this packing can really isolate a0*b0 and a1*b1.
        assign b_packed = {1'b0, b1, 1'b0, b0};
        
        // DSP48E2 primitive instance
        wire [47:0] dsp_p;
        
        DSP48E2 #(
            // Feature configuration
            .A_INPUT("DIRECT"),
            .B_INPUT("DIRECT"),
            .USE_MULT("MULTIPLY"),
            .USE_PATTERN_DETECT("NO_PATDET"),
            // NOTE(review): the DSP48E2 user guide documents ONE48/TWO24/FOUR12
            // for USE_SIMD and requires USE_MULT="NONE" for the SIMD modes --
            // confirm "TWO12" together with "MULTIPLY" is accepted by the tools.
            .USE_SIMD("TWO12"),  // author's intent: SIMD mode for parallel computation
            
            // Pipeline register configuration (every stage enabled)
            .ACASCREG(1),
            .ADREG(1),
            .ALUMODEREG(1),
            .AREG(1),
            .AUTORESET_PATDET("NO_RESET"),
            .BCASCREG(1),
            .BREG(1),
            .CARRYINREG(1),
            .CARRYINSELREG(1),
            .CREG(1),
            .DREG(1),
            .INMODEREG(1),
            .MREG(1),
            .OPMODEREG(1),
            .PREG(1)
        ) DSP48E2_inst (
            // Clock and control
            .CLK(clk),
            // NOTE(review): the documented primitive exposes per-register enables
            // (CEA1/CEA2/CEB1/CEB2/CEC/CEM/CEP/...); verify that a port named CE
            // exists, and that CEC/CEM/CEP/CECTRL are not left floating.
            .CE(ce),
            .RSTA(~rst_n),
            .RSTB(~rst_n),
            .RSTC(~rst_n),
            .RSTM(~rst_n),
            .RSTP(~rst_n),
            
            // Data inputs
            .A({3'b0, a_packed}),  // zero-padded to the 30-bit A port
            .B(b_packed),           // 18-bit B port
            // NOTE(review): c_in is signed but is zero-extended here, not sign-extended.
            .C({16'b0, c_in}),     // 48-bit C port (accumulate input)
            
            // Control inputs
            .OPMODE(9'b000110101),    // intended operation: C + A*B
            .ALUMODE(4'b0000),        // add
            .INMODE(5'b00000),
            .CARRYINSEL(3'b000),
            
            // Cascade inputs (unused, tied off)
            .ACIN(30'b0),
            .BCIN(18'b0),
            .PCIN(48'b0),
            .CARRYIN(1'b0),
            
            // Pre-adder input and A/B stage enables (first stage disabled, second stage always enabled)
            .D(27'b0),
            .CEA1(1'b0),
            .CEA2(1'b1),
            .CEB1(1'b0),
            .CEB2(1'b1),
            
            // Result
            .P(dsp_p),
            
            // Unused outputs
            .ACOUT(),
            .BCOUT(),
            .PCOUT(),
            .CARRYOUT(),
            .PATTERNDETECT(),
            .PATTERNBDETECT(),
            .OVERFLOW(),
            .UNDERFLOW()
        );
        
        // Extract and combine the result fields, registered on clk.
        always @(posedge clk) begin
            if (!rst_n) begin
                p_out <= 32'b0;
            end else if (ce) begin
                // Sums the two 16-bit slices of P and adds c_in.
                // NOTE(review): c_in is also fed to the DSP C port above, so it
                // appears to be accumulated twice; the slices are also unsigned
                // part-selects. Confirm against a reference model.
                p_out <= dsp_p[15:0] + dsp_p[31:16] + c_in;
            end
        end
        
    end else begin : gen_standard_dsp
        // Behavioural implementation (multipliers inferred by the synthesis tool)
        
        reg signed [15:0] prod0, prod1;
        reg signed [31:0] sum;
        
        // Three register stages: products -> sum -> p_out.
        always @(posedge clk) begin
            if (!rst_n) begin
                prod0 <= 16'b0;
                prod1 <= 16'b0;
                sum <= 32'b0;
                p_out <= 32'b0;
            end else if (ce) begin
                // Two independent signed 8x8 multiplications
                prod0 <= a0 * b0;
                prod1 <= a1 * b1;
                
                // Accumulate. c_in is sampled in this cycle, i.e. one enabled
                // cycle after the a/b values that produced prod0/prod1.
                sum <= prod0 + prod1 + c_in;
                p_out <= sum;
            end
        end
    end
endgenerate

endmodule

创建DSP资源监控模块

创建文件 dsp_monitor.v

`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// DSP资源使用监控器
//////////////////////////////////////////////////////////////////////////////////

// dsp_monitor
//   Counts, per DSP and in aggregate, how many enabled clock cycles each bit
//   of dsp_active was high, and reports average / peak / percentage
//   utilisation once per MONITOR_PERIOD enabled cycles.
//
//   Parameters
//     NUM_DSPS        number of monitored activity bits
//     MONITOR_PERIOD  length of one averaging window, in enabled clock cycles
//   Outputs
//     total_cycles         enabled cycles since reset
//     active_cycles[i]     enabled cycles in which dsp_active[i] was 1
//     average_usage        mean number of active DSPs over the last full window
//     utilization_percent  100 * (active DSP-cycles) / (NUM_DSPS * MONITOR_PERIOD)
//     peak_usage           largest number of simultaneously active DSPs seen
//
//   Note: the unpacked-array port active_cycles requires SystemVerilog
//   compilation; it is kept as-is so existing instantiations still bind.
module dsp_monitor #(
    parameter NUM_DSPS = 64,
    parameter MONITOR_PERIOD = 1000  // averaging window (clock cycles)
)(
    input wire clk,
    input wire rst_n,
    input wire enable,
    
    // DSP activity flags, one bit per DSP
    input wire [NUM_DSPS-1:0] dsp_active,
    
    // Monitor outputs
    output reg [31:0] total_cycles,
    output reg [31:0] active_cycles [0:NUM_DSPS-1],
    output reg [7:0] utilization_percent,
    output reg [31:0] peak_usage,
    output reg [31:0] average_usage
);

    // Internal state
    reg [31:0] period_counter;      // position inside the current window
    reg [31:0] active_count;        // combinational popcount of dsp_active
    reg [31:0] usage_accumulator;   // active DSP-cycles accumulated in this window

    // Window total including the current cycle's contribution.
    wire [31:0] window_total = usage_accumulator + active_count;

    integer i;
    integer k;

    // Population count of dsp_active. This must be combinational: the original
    // used non-blocking increments inside the loop followed by an unconditional
    // clear, which left the count permanently at zero.
    always @(*) begin
        active_count = 32'd0;
        for (k = 0; k < NUM_DSPS; k = k + 1) begin
            active_count = active_count + dsp_active[k];
        end
    end

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            total_cycles <= 0;
            utilization_percent <= 0;
            peak_usage <= 0;
            average_usage <= 0;
            period_counter <= 0;
            usage_accumulator <= 0;

            for (i = 0; i < NUM_DSPS; i = i + 1) begin
                active_cycles[i] <= 0;
            end

        end else if (enable) begin
            // Total enabled cycles
            total_cycles <= total_cycles + 1;

            // Per-DSP active cycle counters
            for (i = 0; i < NUM_DSPS; i = i + 1) begin
                if (dsp_active[i]) begin
                    active_cycles[i] <= active_cycles[i] + 1;
                end
            end

            // Peak number of simultaneously active DSPs
            if (active_count > peak_usage) begin
                peak_usage <= active_count;
            end

            if (period_counter >= MONITOR_PERIOD - 1) begin
                // Last cycle of the window: publish statistics computed from the
                // full window total (not from the previous window's average).
                // NOTE: window_total * 100 is evaluated in 32 bits; keep
                // NUM_DSPS * MONITOR_PERIOD * 100 below 2^32.
                average_usage <= window_total / MONITOR_PERIOD;
                utilization_percent <= (window_total * 100) / (MONITOR_PERIOD * NUM_DSPS);

                // Start a new window
                period_counter <= 0;
                usage_accumulator <= 0;
            end else begin
                period_counter <= period_counter + 1;
                usage_accumulator <= window_total;
            end
        end
    end

    // Utilisation report (simulation only). Printed on the first enabled cycle
    // of a new window, when the registers hold the just-published values.
    `ifdef SIMULATION
    always @(posedge clk) begin
        if (rst_n && enable && period_counter == 0 && total_cycles != 0) begin
            $display("DSP利用率报告 @%0t:", $time);
            $display("  平均使用: %0d/%0d DSPs", average_usage, NUM_DSPS);
            $display("  利用率: %0d%%", utilization_percent);
            $display("  峰值使用: %0d DSPs", peak_usage);
        end
    end
    `endif

endmodule

2 存储优化

创建高效的存储管理器

创建文件 memory_manager.v

`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// 智能存储管理器 - 优化BRAM/URAM使用
//////////////////////////////////////////////////////////////////////////////////

// memory_manager
//   Single-clock simple dual-port memory with an optional second bank and
//   sequential-access statistics.
//
//   Parameters
//     DATA_WIDTH     word width in bits
//     ADDR_WIDTH     address width; depth is 2**ADDR_WIDTH words per bank
//     USE_URAM       non-zero requests UltraRAM inference, zero requests block RAM
//     DOUBLE_BUFFER  non-zero enables the second bank, selected by buffer_swap
//     PREFETCH       non-zero enables the sequential-access hit/miss counters
//   Behaviour
//     - wr_en writes wr_data at wr_addr; rd_en reads rd_addr with one cycle of
//       latency, and rd_valid is rd_en delayed by one cycle.
//     - buffer_swap = 1 (with DOUBLE_BUFFER) steers BOTH the write and the read
//       to bank B, otherwise to bank A. This matches the original code.
//       NOTE(review): a ping-pong buffer normally writes one bank while reading
//       the other -- confirm the intended semantics with the caller.
//     - rd_hits counts reads whose address equals the previous read address + 1;
//       rd_misses counts all other reads. No data is actually prefetched: the
//       original prefetch buffer was never filled, and returning it on a "hit"
//       created a second driver on rd_data, so that path is removed.
module memory_manager #(
    parameter DATA_WIDTH = 8,
    parameter ADDR_WIDTH = 10,
    parameter USE_URAM = 1,      // 1=URAM, 0=BRAM
    parameter DOUBLE_BUFFER = 1,  // enable second bank
    parameter PREFETCH = 1        // enable sequential-access statistics
)(
    input wire clk,
    input wire rst_n,
    
    // Write port
    input wire wr_en,
    input wire [ADDR_WIDTH-1:0] wr_addr,
    input wire [DATA_WIDTH-1:0] wr_data,
    
    // Read port
    input wire rd_en,
    input wire [ADDR_WIDTH-1:0] rd_addr,
    output reg [DATA_WIDTH-1:0] rd_data,
    output reg rd_valid,
    
    // Control
    input wire buffer_swap,      // bank select (see header)
    output reg buffer_ready,     // high once out of reset
    
    // Performance monitoring
    output reg [31:0] rd_hits,   // sequential reads
    output reg [31:0] rd_misses  // non-sequential reads
);

    localparam MEM_DEPTH = 2**ADDR_WIDTH;

    // Bank B is used only when double buffering is enabled and swap is asserted.
    wire use_bank_b = (DOUBLE_BUFFER != 0) && buffer_swap;

    // Storage. Both variants use parameterised arrays so any DATA_WIDTH /
    // ADDR_WIDTH works; the synthesis attribute is placed on each array because
    // an attribute instance applies only to the declaration that follows it.
    generate
        if (USE_URAM) begin : gen_uram
            (* ram_style = "ultra" *)
            reg [DATA_WIDTH-1:0] uram_array_a [0:MEM_DEPTH-1];
            (* ram_style = "ultra" *)
            reg [DATA_WIDTH-1:0] uram_array_b [0:MEM_DEPTH-1];

            always @(posedge clk) begin
                if (wr_en) begin
                    if (use_bank_b) begin
                        uram_array_b[wr_addr] <= wr_data;
                    end else begin
                        uram_array_a[wr_addr] <= wr_data;
                    end
                end

                if (rd_en) begin
                    if (use_bank_b) begin
                        rd_data <= uram_array_b[rd_addr];
                    end else begin
                        rd_data <= uram_array_a[rd_addr];
                    end
                end
            end

        end else begin : gen_bram
            (* ram_style = "block" *)
            reg [DATA_WIDTH-1:0] bram_array_a [0:MEM_DEPTH-1];
            (* ram_style = "block" *)
            reg [DATA_WIDTH-1:0] bram_array_b [0:MEM_DEPTH-1];

            always @(posedge clk) begin
                if (wr_en) begin
                    if (use_bank_b) begin
                        bram_array_b[wr_addr] <= wr_data;
                    end else begin
                        bram_array_a[wr_addr] <= wr_data;
                    end
                end

                if (rd_en) begin
                    if (use_bank_b) begin
                        rd_data <= bram_array_b[rd_addr];
                    end else begin
                        rd_data <= bram_array_a[rd_addr];
                    end
                end
            end
        end
    endgenerate

    // Sequential-access statistics. rd_hits / rd_misses are driven from exactly
    // one always block per configuration (the original reset them in a separate
    // block, which is a multiple-driver error).
    generate
        if (PREFETCH) begin : gen_prefetch
            reg [ADDR_WIDTH-1:0] prefetch_addr;   // address expected for the next read
            reg prefetch_valid;                   // set after the first read

            always @(posedge clk or negedge rst_n) begin
                if (!rst_n) begin
                    prefetch_addr <= 0;
                    prefetch_valid <= 1'b0;
                    rd_hits <= 0;
                    rd_misses <= 0;
                end else if (rd_en) begin
                    // Predict the next sequential address
                    prefetch_addr <= rd_addr + 1'b1;
                    prefetch_valid <= 1'b1;

                    if (prefetch_valid && (rd_addr == prefetch_addr)) begin
                        rd_hits <= rd_hits + 1;
                    end else begin
                        rd_misses <= rd_misses + 1;
                    end
                end
            end
        end else begin : gen_no_prefetch
            // Statistics disabled: hold the counters at zero.
            always @(posedge clk or negedge rst_n) begin
                if (!rst_n) begin
                    rd_hits <= 0;
                    rd_misses <= 0;
                end
            end
        end
    endgenerate

    // rd_valid follows rd_en by one cycle, matching the memory read latency.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            rd_valid <= 1'b0;
        end else begin
            rd_valid <= rd_en;
        end
    end

    // Buffer status (simplified: ready whenever out of reset).
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            buffer_ready <= 1'b0;
        end else begin
            buffer_ready <= 1'b1;
        end
    end

endmodule

(二)PS系统集成

创建Block Design

  1. 将卷积引擎打包为IP

创建文件 package_conv_engine.tcl

###############################################
# Package the convolution engine as a Vivado IP core.
#
# Creates a scratch project, adds the RTL sources, packages them as
# user.org:user:conv_engine:1.0, declares the bus interfaces and the
# AXI-Lite address block, then validates, saves and archives the core.
###############################################

# Location of the IP packaging project / IP repository entry
set ip_proj_dir "./ip_repo/conv_engine_ip"

# Create the packaging project
create_project conv_engine_ip $ip_proj_dir -part xczu9eg-ffvb1156-2-e -force
set_property board_part xilinx.com:zcu102:part0:3.4 [current_project]

# Add the source files
add_files -norecurse {
    ./src/hdl/conv_engine/conv_engine_top.v
    ./src/hdl/conv_engine/systolic_array.v
    ./src/hdl/conv_engine/systolic_controller.v
    ./src/hdl/primitives/processing_element.v
    ./src/hdl/primitives/dsp_optimized_pe.v
    ./src/hdl/utilities/memory_manager.v
}

# Select the top-level module
set_property top conv_engine_top [current_fileset]

# Package the project as an IP core
ipx::package_project -root_dir $ip_proj_dir -vendor user.org \
    -library user -name conv_engine -taxonomy /UserIP

# Core identification properties
set_property vendor_display_name {YOLO V10 Conv Engine} [ipx::current_core]
set_property display_name {Convolution Engine for YOLO V10} [ipx::current_core]
set_property description {High-performance systolic array based convolution engine optimized for YOLO V10} [ipx::current_core]
set_property company_url {http://www.example.com} [ipx::current_core]
set_property supported_families {zynquplus Production} [ipx::current_core]
set_property version 1.0 [ipx::current_core]

# Clock and reset interfaces
ipx::infer_bus_interface clk xilinx.com:signal:clock_rtl:1.0 [ipx::current_core]
ipx::infer_bus_interface rst_n xilinx.com:signal:reset_rtl:1.0 [ipx::current_core]

# AXI interfaces
# NOTE(review): each call names a single port of the interface. The packager
# usually infers s_axi / s_axis / m_axis automatically from the port naming
# convention during ipx::package_project -- confirm these calls are still needed
# and that the resulting interfaces are named s_axi, s_axis and m_axis.
ipx::infer_bus_interface s_axi_awaddr xilinx.com:interface:aximm_rtl:1.0 [ipx::current_core]
ipx::infer_bus_interface s_axis_tdata xilinx.com:interface:axis_rtl:1.0 [ipx::current_core]
ipx::infer_bus_interface m_axis_tdata xilinx.com:interface:axis_rtl:1.0 [ipx::current_core]

# Associate every bus interface with the clock
ipx::associate_bus_interfaces -busif s_axi -clock clk [ipx::current_core]
ipx::associate_bus_interfaces -busif s_axis -clock clk [ipx::current_core]
ipx::associate_bus_interfaces -busif m_axis -clock clk [ipx::current_core]

# Memory map for the AXI-Lite register file (4 KiB)
ipx::add_memory_map s_axi [ipx::current_core]
set_property slave_memory_map_ref s_axi [ipx::get_bus_interfaces s_axi -of_objects [ipx::current_core]]

ipx::add_address_block axi_lite_regs [ipx::get_memory_maps s_axi -of_objects [ipx::current_core]]
set_property range 4096 [ipx::get_address_blocks axi_lite_regs \
    -of_objects [ipx::get_memory_maps s_axi -of_objects [ipx::current_core]]]

# Family support level. Valid values are level_0 / level_1 / level_2; the
# original "optimized" is not one of them. The former
# "set_property driver_strength strong" line was removed because an IP core
# has no such property and the command aborts the script.
set_property auto_family_support_level level_2 [ipx::current_core]

# Customisation GUI page
ipgui::add_page -name {Basic} -component [ipx::current_core] \
    -display_name {Basic Configuration}
ipgui::add_param -name {ARRAY_SIZE} -component [ipx::current_core] \
    -parent [ipgui::get_pagespec -name Basic -component [ipx::current_core]]
ipgui::add_param -name {DATA_WIDTH} -component [ipx::current_core] \
    -parent [ipgui::get_pagespec -name Basic -component [ipx::current_core]]

# Regenerate GUI files and checksums, validate BEFORE saving so that a broken
# core is reported before it is written and archived.
ipx::create_xgui_files [ipx::current_core]
ipx::update_checksums [ipx::current_core]
ipx::check_integrity [ipx::current_core]
ipx::save_core [ipx::current_core]
ipx::archive_core $ip_proj_dir/conv_engine_ip_1.0.zip [ipx::current_core]

close_project

puts "IP核打包完成!"
puts "IP保存位置: $ip_proj_dir"

连接系统组件

创建TCL脚本 build_system_bd.tcl

# Build the PS + convolution-engine block design.
#
# Expects the block design conv_engine_system.bd to exist in the open project
# and the conv_engine IP (user.org:user:conv_engine:1.0) to be in the IP catalog.

# Connect src -> dst only when dst exists and is not wired yet. Connection
# automation may already have attached clocks and resets; connecting such a
# pin a second time to a different net aborts the script.
proc connect_if_open {src_pin dst_pin} {
    set dst [get_bd_pins -quiet $dst_pin]
    if {[llength $dst] == 0} {
        puts "WARNING: pin $dst_pin not found, skipping connection"
        return
    }
    if {[llength [get_bd_nets -quiet -of_objects $dst]] == 0} {
        connect_bd_net [get_bd_pins $src_pin] $dst
    }
}

# Open the block design
open_bd_design {conv_engine_system.bd}

# 1. Add the PS
if {[llength [get_bd_cells zynq_ultra_ps_e_0]] == 0} {
    create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_0
    apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e \
        -config {apply_board_preset "1"} [get_bd_cells zynq_ultra_ps_e_0]
}

# 2. Configure the PS
#    S_AXI_GP2 is the port exposed as S_AXI_HP0_FPD, the only PS slave port this
#    script uses. S_AXI_GP0 (S_AXI_HPC0_FPD) was enabled in the original but
#    never connected, leaving its clock pin floating, so it is disabled here.
#    PL-to-PS interrupt group 0 is enabled for step 13.
set_property -dict [list \
    CONFIG.PSU__USE__M_AXI_GP0 {1} \
    CONFIG.PSU__USE__M_AXI_GP1 {0} \
    CONFIG.PSU__USE__S_AXI_GP0 {0} \
    CONFIG.PSU__USE__S_AXI_GP2 {1} \
    CONFIG.PSU__SAXIGP2__DATA_WIDTH {128} \
    CONFIG.PSU__USE__IRQ0 {1} \
    CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {200} \
    CONFIG.PSU__CRL_APB__PL1_REF_CTRL__FREQMHZ {100} \
] [get_bd_cells zynq_ultra_ps_e_0]

# 3. Add the convolution engine IP
create_bd_cell -type ip -vlnv user.org:user:conv_engine:1.0 conv_engine_0

# 4. Add the AXI interconnect for the control path
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0
set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {1}] [get_bd_cells axi_interconnect_0]

# 5. Add the DMA controller (simple mode, 64-bit data paths)
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
set_property -dict [list \
    CONFIG.c_include_sg {0} \
    CONFIG.c_sg_include_stscntrl_strm {0} \
    CONFIG.c_sg_length_width {26} \
    CONFIG.c_m_axi_mm2s_data_width {64} \
    CONFIG.c_m_axis_mm2s_tdata_width {64} \
    CONFIG.c_mm2s_burst_size {256} \
    CONFIG.c_m_axi_s2mm_data_width {64} \
    CONFIG.c_s_axis_s2mm_tdata_width {64} \
    CONFIG.c_s2mm_burst_size {256} \
] [get_bd_cells axi_dma_0]

# 6. Add an AXI SmartConnect for the high-throughput data path
create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0
set_property -dict [list CONFIG.NUM_SI {2} CONFIG.NUM_MI {1}] [get_bd_cells smartconnect_0]

# 7. Control path: PS -> convolution engine registers
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
    Clk_master {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} \
    Slave {/conv_engine_0/s_axi} \
    ddr_seg {Auto} \
    intc_ip {/axi_interconnect_0} \
    master_apm {0}} [get_bd_intf_pins conv_engine_0/s_axi]

# 8. Control path: PS -> DMA registers
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
    Clk_master {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} \
    Slave {/axi_dma_0/S_AXI_LITE} \
    ddr_seg {Auto} \
    intc_ip {/axi_interconnect_0} \
    master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE]

# 9. Data path: DMA (both directions) -> DDR through S_AXI_HP0_FPD
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
    Clk_master {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Master {/axi_dma_0/M_AXI_MM2S} \
    Slave {/zynq_ultra_ps_e_0/S_AXI_HP0_FPD} \
    ddr_seg {Auto} \
    intc_ip {/smartconnect_0} \
    master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HP0_FPD]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
    Clk_master {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)} \
    Master {/axi_dma_0/M_AXI_S2MM} \
    Slave {/zynq_ultra_ps_e_0/S_AXI_HP0_FPD} \
    ddr_seg {Auto} \
    intc_ip {/smartconnect_0} \
    master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HP0_FPD]

# 10. Stream interfaces: DMA MM2S -> engine -> DMA S2MM
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] \
                    [get_bd_intf_pins conv_engine_0/s_axis]
connect_bd_intf_net [get_bd_intf_pins conv_engine_0/m_axis] \
                    [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]

# 11. Clocks (only pins that automation left unconnected)
connect_if_open zynq_ultra_ps_e_0/pl_clk0 conv_engine_0/clk
connect_if_open zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/m_axi_mm2s_aclk
connect_if_open zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/m_axi_s2mm_aclk

# 12. Resets (only pins that automation left unconnected)
connect_if_open zynq_ultra_ps_e_0/pl_resetn0 conv_engine_0/rst_n
connect_if_open zynq_ultra_ps_e_0/pl_resetn0 axi_dma_0/axi_resetn

# 13. Interrupts: route straight to the PS GIC through a concat block.
#     The original instantiated an AXI INTC without connecting its AXI port or
#     clock and wired only the MM2S interrupt, so the S2MM interrupt never
#     reached the PS. The device tree in this guide describes three separate
#     GIC interrupts (engine, MM2S, S2MM in that order), which is what the
#     bit order below produces on pl_ps_irq0[2:0].
create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 irq_concat_0
set_property -dict [list CONFIG.NUM_PORTS {3}] [get_bd_cells irq_concat_0]

# NOTE(review): the engine's interrupt pin name is not visible in this guide;
# "irq" is assumed -- adjust to the real port name of conv_engine_top.
set conv_irq_pin [get_bd_pins -quiet conv_engine_0/irq]
if {[llength $conv_irq_pin] > 0} {
    connect_bd_net $conv_irq_pin [get_bd_pins irq_concat_0/In0]
} else {
    puts "WARNING: conv_engine_0/irq not found; engine interrupt left unconnected"
}
connect_bd_net [get_bd_pins axi_dma_0/mm2s_introut] \
               [get_bd_pins irq_concat_0/In1]
connect_bd_net [get_bd_pins axi_dma_0/s2mm_introut] \
               [get_bd_pins irq_concat_0/In2]
connect_bd_net [get_bd_pins irq_concat_0/dout] \
               [get_bd_pins zynq_ultra_ps_e_0/pl_ps_irq0]

# 14. Optional ILA debug core monitoring the engine output stream
create_bd_cell -type ip -vlnv xilinx.com:ip:system_ila:1.1 system_ila_0
set_property -dict [list CONFIG.C_SLOT_0_AXI_PROTOCOL {AXI4S}] [get_bd_cells system_ila_0]
connect_bd_intf_net [get_bd_intf_pins system_ila_0/SLOT_0_AXIS] \
                    [get_bd_intf_pins conv_engine_0/m_axis]
connect_bd_net [get_bd_pins zynq_ultra_ps_e_0/pl_clk0] \
               [get_bd_pins system_ila_0/clk]

# 15. Validate the design
validate_bd_design

# 16. Save the block design
save_bd_design

# 17. Generate output products
generate_target all [get_files conv_engine_system.bd]

# 18. Create the HDL wrapper. make_wrapper returns the generated file's full
#     path; adding by bare file name only works if the current directory
#     happens to be the wrapper's output directory.
set wrapper_file [make_wrapper -files [get_files conv_engine_system.bd] -top]
add_files -norecurse $wrapper_file

# 19. Select the wrapper as top level
set_property top conv_engine_system_wrapper [current_fileset]

生成比特流

运行完整实现流程

# Run synthesis and stop if it did not complete: wait_on_run returns normally
# even when the run failed, so the progress has to be checked explicitly.
launch_runs synth_1 -jobs 8
wait_on_run synth_1
if {[get_property PROGRESS [get_runs synth_1]] ne "100%"} {
    error "synth_1 did not complete: [get_property STATUS [get_runs synth_1]]"
}

# Run implementation through bitstream generation in a single launch
# (place, route and write_bitstream are all steps of impl_1).
launch_runs impl_1 -to_step write_bitstream -jobs 8
wait_on_run impl_1
if {[get_property PROGRESS [get_runs impl_1]] ne "100%"} {
    error "impl_1 did not complete: [get_property STATUS [get_runs impl_1]]"
}

# Export the hardware platform, including the bitstream
write_hw_platform -fixed -include_bit -force \
    -file ./conv_engine_system.xsa

Linux驱动

创建内核驱动

创建文件 conv_engine_driver.c

/**
 * 卷积引擎Linux内核驱动
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_dma.h>
#include <linux/dmaengine.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/math64.h>

#define DRIVER_NAME "conv_engine"
#define DEVICE_NAME "conv_engine"

/* 寄存器偏移定义 */
#define CONV_CTRL_REG       0x00
#define CONV_STATUS_REG     0x04
#define CONV_IMG_SIZE_REG   0x08
#define CONV_KERNEL_REG     0x0C
#define CONV_CHANNEL_REG    0x10
#define CONV_STRIDE_PAD_REG 0x14
#define CONV_PERF_CNT_REG   0x18

/* 控制位定义 */
#define CTRL_START_BIT      (1 << 0)
#define CTRL_CLEAR_BIT      (1 << 1)
#define CTRL_WEIGHT_LOAD    (1 << 2)

/* 状态位定义 */
#define STATUS_BUSY_BIT     (1 << 0)
#define STATUS_DONE_BIT     (1 << 1)

/*
 * Per-device driver state. One instance is allocated in probe and reached
 * from file operations through the embedded cdev.
 */
struct conv_engine_dev {
    void __iomem *regs;             /* mapped register window */
    struct device *dev;             /* owning platform device's struct device */
    struct cdev cdev;               /* character device exposed to user space */
    dev_t devno;                    /* allocated major/minor number */
    
    /* DMA state */
    struct dma_chan *tx_chan;       /* channel requested under the name "tx" */
    struct dma_chan *rx_chan;       /* channel requested under the name "rx" */
    dma_addr_t tx_dma_handle;       /* bus address of tx_virt */
    dma_addr_t rx_dma_handle;       /* bus address of rx_virt */
    void *tx_virt;                  /* coherent bounce buffer for outgoing data */
    void *rx_virt;                  /* coherent bounce buffer for incoming data */
    size_t dma_size;                /* size in bytes of each bounce buffer */
    
    /* Interrupt state */
    int irq;
    /* Signalled by both the DMA completion callback and the engine ISR. */
    struct completion dma_complete;
    
    /* Performance statistics, updated in the ISR */
    u32 total_inferences;           /* number of completed runs */
    u64 total_cycles;               /* sum of CONV_PERF_CNT_REG over all runs */
};

static struct class *conv_engine_class;

/* 寄存器读写函数 */
/* Return the 32-bit register found `offset` bytes into the mapped window. */
static inline u32 conv_read_reg(struct conv_engine_dev *dev, u32 offset)
{
    void __iomem *reg_addr = dev->regs + offset;

    return ioread32(reg_addr);
}

/* Store `value` in the 32-bit register `offset` bytes into the mapped window. */
static inline void conv_write_reg(struct conv_engine_dev *dev, u32 offset, u32 value)
{
    void __iomem *reg_addr = dev->regs + offset;

    iowrite32(value, reg_addr);
}

/* 中断处理函数 */
/*
 * Interrupt handler for the convolution engine.
 *
 * Returns IRQ_NONE when the status register does not report DONE (the line is
 * requested as shared). Otherwise it writes the CLEAR control bit, updates the
 * run statistics from the performance counter, wakes the waiter on
 * dma_complete and returns IRQ_HANDLED.
 */
static irqreturn_t conv_engine_isr(int irq, void *dev_id)
{
    struct conv_engine_dev *dev = dev_id;
    u32 status = conv_read_reg(dev, CONV_STATUS_REG);

    /* Not our interrupt: nothing finished. */
    if (!(status & STATUS_DONE_BIT))
        return IRQ_NONE;

    /* Acknowledge the done flag. */
    conv_write_reg(dev, CONV_CTRL_REG, CTRL_CLEAR_BIT);

    /* Account for the finished run. */
    dev->total_inferences++;
    dev->total_cycles += conv_read_reg(dev, CONV_PERF_CNT_REG);

    /* Wake whoever is waiting for the run to finish. */
    complete(&dev->dma_complete);

    return IRQ_HANDLED;
}

/* DMA回调函数 */
/*
 * DMA engine completion callback. `completion` is the callback_param set on
 * the descriptor and is passed on to complete().
 */
static void dma_complete_callback(void *completion)
{
    struct completion *done = completion;

    complete(done);
}

/* 配置并启动DMA传输 */
/*
 * Run one blocking DMA transfer through the driver's bounce buffers.
 *
 * @dev:   device state
 * @src:   kernel buffer; source of the data when @is_tx, destination otherwise
 * @len:   number of bytes to move; must not exceed dev->dma_size
 * @is_tx: true  = memory -> device on tx_chan,
 *         false = device -> memory on rx_chan
 *
 * Returns 0 on success, -EINVAL if @len does not fit the bounce buffer or the
 * descriptor cannot be submitted, -ENOMEM if no descriptor could be prepared,
 * -ETIMEDOUT if the transfer does not finish within 5 seconds (the channel is
 * terminated in that case).
 *
 * The wait uses dev->dma_complete, which the engine ISR also signals; callers
 * must not have an inference in flight while calling this function.
 */
static int conv_engine_dma_transfer(struct conv_engine_dev *dev, 
                                   void *src, size_t len, bool is_tx)
{
    struct dma_async_tx_descriptor *tx_desc;
    struct dma_chan *chan = is_tx ? dev->tx_chan : dev->rx_chan;
    dma_addr_t buf_handle = is_tx ? dev->tx_dma_handle : dev->rx_dma_handle;
    struct completion *cmp = &dev->dma_complete;
    unsigned long time_left;
    dma_cookie_t cookie;

    /* The bounce buffers are dma_size bytes; a larger request would overrun them. */
    if (len > dev->dma_size) {
        dev_err(dev->dev, "DMA length %zu exceeds buffer size %zu\n",
                len, dev->dma_size);
        return -EINVAL;
    }

    /* Outgoing data is staged in the coherent TX buffer first. */
    if (is_tx)
        memcpy(dev->tx_virt, src, len);

    /* Prepare the descriptor; the device-side address is handled by the DMA controller. */
    tx_desc = dmaengine_prep_slave_single(chan, buf_handle, len,
                                         is_tx ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM,
                                         DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
    if (!tx_desc) {
        dev_err(dev->dev, "Failed to prepare DMA descriptor\n");
        return -ENOMEM;
    }

    /* Completion is re-armed before the descriptor can possibly finish. */
    reinit_completion(cmp);
    tx_desc->callback = dma_complete_callback;
    tx_desc->callback_param = cmp;

    /* Queue the descriptor */
    cookie = dmaengine_submit(tx_desc);
    if (dma_submit_error(cookie)) {
        dev_err(dev->dev, "Failed to submit DMA\n");
        return -EINVAL;
    }

    /* Start the transfer */
    dma_async_issue_pending(chan);

    /* Wait for the callback */
    time_left = wait_for_completion_timeout(cmp, msecs_to_jiffies(5000));
    if (time_left == 0) {
        dev_err(dev->dev, "DMA timeout\n");
        dmaengine_terminate_all(chan);
        return -ETIMEDOUT;
    }

    /* Incoming data is copied out of the coherent RX buffer. */
    if (!is_tx)
        memcpy(src, dev->rx_virt, len);

    return 0;
}

/* 运行卷积推理 */
/*
 * Run one inference: send the input over DMA, start the engine, wait for its
 * completion interrupt and read the result back over DMA.
 *
 * @input_data / @input_size:   kernel buffer holding the input
 * @output_data / @output_size: kernel buffer receiving the output
 *
 * Returns 0 on success, -EBUSY if the status register reports BUSY,
 * -ETIMEDOUT if the engine does not signal completion within one second, or
 * the error returned by conv_engine_dma_transfer().
 */
static int conv_engine_run_inference(struct conv_engine_dev *dev,
                                    void *input_data, size_t input_size,
                                    void *output_data, size_t output_size)
{
    unsigned long time_left;
    u32 status;
    int ret;

    /* Refuse to start while a previous run is still in progress. */
    status = conv_read_reg(dev, CONV_STATUS_REG);
    if (status & STATUS_BUSY_BIT) {
        dev_err(dev->dev, "Device is busy\n");
        return -EBUSY;
    }

    /* Send the input data */
    ret = conv_engine_dma_transfer(dev, input_data, input_size, true);
    if (ret) {
        dev_err(dev->dev, "Failed to send input data\n");
        return ret;
    }

    /*
     * The completion is shared with the DMA callback and the ISR. Re-arm it
     * before START so that a stale signal (for example a DONE interrupt raised
     * before this run) cannot satisfy the wait below.
     */
    reinit_completion(&dev->dma_complete);

    /* Start the convolution */
    conv_write_reg(dev, CONV_CTRL_REG, CTRL_START_BIT);

    /* Wait for the completion interrupt */
    time_left = wait_for_completion_timeout(&dev->dma_complete,
                                            msecs_to_jiffies(1000));
    if (time_left == 0) {
        dev_err(dev->dev, "Inference timeout\n");
        return -ETIMEDOUT;
    }

    /* Fetch the output data */
    ret = conv_engine_dma_transfer(dev, output_data, output_size, false);
    if (ret) {
        dev_err(dev->dev, "Failed to receive output data\n");
        return ret;
    }

    return 0;
}

/* 文件操作函数 */
/*
 * open(): recover the device state from the cdev embedded in it and stash it
 * in the file so later file operations can reach it. Always succeeds.
 */
static int conv_engine_open(struct inode *inode, struct file *file)
{
    file->private_data = container_of(inode->i_cdev,
                                      struct conv_engine_dev, cdev);
    return 0;
}

/* release(): nothing is allocated per open file, so there is nothing to undo. */
static int conv_engine_release(struct inode *inode, struct file *file)
{
    return 0;
}

static long conv_engine_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
    struct conv_engine_dev *dev = file->private_data;
    int ret = 0;
    
    switch (cmd) {
    case 0x1001: /* 设置图像尺寸 */
        conv_write_reg(dev, CONV_IMG_SIZE_REG, arg);
        break;
        
    case 0x1002: /* 设置卷积核尺寸 */
        conv_write_reg(dev, CONV_KERNEL_REG, arg);
        break;
        
    case 0x1003: /* 设置通道数 */
        conv_write_reg(dev, CONV_CHANNEL_REG, arg);
        break;
        
    case 0x1004: /* 获取性能计数 */
        ret = put_user(dev->total_cycles / dev->total_inferences, 
                      (u32 __user *)arg);
        break;
        
    default:
        ret = -EINVAL;
    }
    
    return ret;
}

/* File operations of the character device: open, release and ioctl only. */
static const struct file_operations conv_engine_fops = {
    .owner = THIS_MODULE,
    .open = conv_engine_open,
    .release = conv_engine_release,
    .unlocked_ioctl = conv_engine_ioctl,
};

/* 平台驱动probe函数 */
static int conv_engine_probe(struct platform_device *pdev)
{
    struct conv_engine_dev *dev;
    struct resource *res;
    int ret;
    
    dev_info(&pdev->dev, "Probing conv_engine driver\n");
    
    /* 分配设备结构 */
    dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL);
    if (!dev)
        return -ENOMEM;
    
    dev->dev = &pdev->dev;
    
    /* 获取并映射寄存器 */
    res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
    dev->regs = devm_ioremap_resource(&pdev->dev, res);
    if (IS_ERR(dev->regs))
        return PTR_ERR(dev->regs);
    
    /* 获取中断 */
    dev->irq = platform_get_irq(pdev, 0);
    if (dev->irq < 0)
        return dev->irq;
    
    /* 注册中断处理 */
    ret = devm_request_irq(&pdev->dev, dev->irq, conv_engine_isr,
                          IRQF_SHARED, DRIVER_NAME, dev);
    if (ret) {
        dev_err(&pdev->dev, "Failed to request IRQ\n");
        return ret;
    }
    
    /* 获取DMA通道 */
    dev->tx_chan = dma_request_slave_channel(&pdev->dev, "tx");
    if (!dev->tx_chan) {
        dev_err(&pdev->dev, "Failed to request TX DMA channel\n");
        return -EPROBE_DEFER;
    }
    
    dev->rx_chan = dma_request_slave_channel(&pdev->dev, "rx");
    if (!dev->rx_chan) {
        dev_err(&pdev->dev, "Failed to request RX DMA channel\n");
        dma_release_channel(dev->tx_chan);
        return -EPROBE_DEFER;
    }
    
    /* 分配DMA缓冲区 */
    dev->dma_size = 640 * 640 * 3;  /* 最大图像尺寸 */
    dev->tx_virt = dma_alloc_coherent(&pdev->dev, dev->dma_size,
                                      &dev->tx_dma_handle, GFP_KERNEL);
    if (!dev->tx_virt) {
        dev_err(&pdev->dev, "Failed to allocate TX DMA buffer\n");
        ret = -ENOMEM;
        goto err_dma_alloc;
    }
    
    dev->rx_virt = dma_alloc_coherent(&pdev->dev, dev->dma_size,
                                      &dev->rx_dma_handle, GFP_KERNEL);
    if (!dev->rx_virt) {
        dev_err(&pdev->dev, "Failed to allocate RX DMA buffer\n");
        ret = -ENOMEM;
        goto err_rx_alloc;
    }
    
    /* 初始化完成量 */
    init_completion(&dev->dma_complete);
    
    /* 注册字符设备 */
    ret = alloc_chrdev_region(&dev->devno, 0, 1, DEVICE_NAME);
    if (ret < 0) {
        dev_err(&pdev->dev, "Failed to allocate char device region\n");
        goto err_chrdev;
    }
    
    cdev_init(&dev->cdev, &conv_engine_fops);
    dev->cdev.owner = THIS_MODULE;
    
    ret = cdev_add(&dev->cdev, dev->devno, 1);
    if (ret) {
        dev_err(&pdev->dev, "Failed to add char device\n");
        goto err_cdev_add;
    }
    
    /* 创建设备节点 */
    device_create(conv_engine_class, &pdev->dev, dev->devno,
                 NULL, DEVICE_NAME);
    
    platform_set_drvdata(pdev, dev);
    
    dev_info(&pdev->dev, "Conv engine driver probed successfully\n");
    
    return 0;
    
err_cdev_add:
    unregister_chrdev_region(dev->devno, 1);
err_chrdev:
    dma_free_coherent(&pdev->dev, dev->dma_size,
                     dev->rx_virt, dev->rx_dma_handle);
err_rx_alloc:
    dma_free_coherent(&pdev->dev, dev->dma_size,
                     dev->tx_virt, dev->tx_dma_handle);
err_dma_alloc:
    dma_release_channel(dev->rx_chan);
    dma_release_channel(dev->tx_chan);
    
    return ret;
}

/*
 * Platform driver remove: tears down what probe set up, in reverse order
 * (device node, cdev, device number, DMA buffers, DMA channels). The register
 * mapping, IRQ and device state were obtained with devm_* calls in probe and
 * are not released here.
 */
static int conv_engine_remove(struct platform_device *pdev)
{
    struct conv_engine_dev *dev = platform_get_drvdata(pdev);
    
    /* Remove the user-visible node first so no new opens can arrive. */
    device_destroy(conv_engine_class, dev->devno);
    cdev_del(&dev->cdev);
    unregister_chrdev_region(dev->devno, 1);
    
    /* Free the coherent bounce buffers */
    dma_free_coherent(&pdev->dev, dev->dma_size,
                     dev->rx_virt, dev->rx_dma_handle);
    dma_free_coherent(&pdev->dev, dev->dma_size,
                     dev->tx_virt, dev->tx_dma_handle);
    
    /* Give the DMA channels back */
    dma_release_channel(dev->rx_chan);
    dma_release_channel(dev->tx_chan);
    
    return 0;
}

/* Device-tree match table: binds to nodes with compatible "xlnx,conv-engine-1.0". */
static const struct of_device_id conv_engine_of_match[] = {
    { .compatible = "xlnx,conv-engine-1.0", },
    {},  /* sentinel */
};
MODULE_DEVICE_TABLE(of, conv_engine_of_match);

/* Platform driver descriptor tying the match table to probe/remove. */
static struct platform_driver conv_engine_driver = {
    .driver = {
        .name = DRIVER_NAME,
        .of_match_table = conv_engine_of_match,
    },
    .probe = conv_engine_probe,
    .remove = conv_engine_remove,
};

/*
 * Module entry point: create the device class, then register the platform
 * driver. The class is destroyed again if registration fails.
 */
static int __init conv_engine_init(void)
{
    int err;

    conv_engine_class = class_create(THIS_MODULE, DEVICE_NAME);
    if (IS_ERR(conv_engine_class))
        return PTR_ERR(conv_engine_class);

    err = platform_driver_register(&conv_engine_driver);
    if (!err)
        return 0;

    /* Registration failed: undo the class creation. */
    class_destroy(conv_engine_class);
    return err;
}

/*
 * Module exit point: mirror of conv_engine_init() in reverse order.
 * The platform driver is unregistered first, then the device class
 * that conv_engine_init() created is destroyed.
 */
static void __exit conv_engine_exit(void)
{
    platform_driver_unregister(&conv_engine_driver);
    class_destroy(conv_engine_class);
}

/* Hook the init/exit functions defined above into module load/unload. */
module_init(conv_engine_init);
module_exit(conv_engine_exit);

/* Module metadata strings. */
MODULE_DESCRIPTION("Convolution Engine Driver for YOLO V10");
MODULE_AUTHOR("Your Name");
MODULE_LICENSE("GPL v2");

创建设备树配置

创建文件 conv_engine.dtsi

/ {
    /* Convolution accelerator: register window, one interrupt, two DMA streams. */
    conv_engine_0: conv_engine@a0000000 {
        compatible = "xlnx,conv-engine-1.0";
        reg = <0x0 0xa0000000 0x0 0x10000>;
        interrupt-parent = <&gic>;
        interrupts = <0 89 4>;
        interrupt-names = "conv_irq";

        /* The single specifier cell is the AXI DMA channel ID: 0 = MM2S (tx), 1 = S2MM (rx). */
        dmas = <&axi_dma_0 0
                &axi_dma_0 1>;
        dma-names = "tx", "rx";

        clocks = <&zynqmp_clk 71>;
        clock-names = "axi_clk";
    };

    axi_dma_0: dma@a0010000 {
        /*
         * The Linux xilinx_dma driver matches on "xlnx,axi-dma-1.00.a";
         * the IP-version string alone would leave the controller unbound,
         * so the generic string is listed as a fallback.
         */
        compatible = "xlnx,axi-dma-7.1", "xlnx,axi-dma-1.00.a";
        reg = <0x0 0xa0010000 0x0 0x10000>;
        interrupt-parent = <&gic>;
        interrupts = <0 90 4
                      0 91 4>;
        interrupt-names = "mm2s_introut", "s2mm_introut";

        clocks = <&zynqmp_clk 71>, <&zynqmp_clk 71>;
        clock-names = "s_axi_lite_aclk", "m_axi_sg_aclk";

        #dma-cells = <1>;
        dma-channels = <2>;

        /* NOTE(review): must equal the address width configured for the IP in Vivado. */
        xlnx,addrwidth = <0x20>;

        /*
         * The driver creates its channels from these child nodes; each
         * one needs its own compatible, interrupt and stream data width.
         * NOTE(review): xlnx,datawidth values are placeholders - set them
         * to the AXI-Stream width chosen in the block design.
         */
        dma-channel@a0010000 {
            compatible = "xlnx,axi-dma-mm2s-channel";
            interrupts = <0 90 4>;
            xlnx,datawidth = <0x20>;
        };

        dma-channel@a0010030 {
            compatible = "xlnx,axi-dma-s2mm-channel";
            interrupts = <0 91 4>;
            xlnx,datawidth = <0x20>;
        };
    };
};

创建用户空间应用

Python接口

创建文件 conv_engine_python.py

#!/usr/bin/env python3
"""
卷积引擎Python接口
"""

import numpy as np
import mmap
import os
import struct
import time
from ctypes import *

class ConvEngine:
    """User-space front end for the convolution engine character device.

    Layer parameters are sent with ioctl calls; weights and input images
    are pushed with ``write`` and results are fetched with ``read`` on the
    same file descriptor.

    The object can be used as a context manager, or released explicitly
    with :meth:`close`; ``__del__`` remains as a last-resort cleanup.
    """

    # Raw ioctl command numbers issued by this class.
    _CMD_SET_IMG_SIZE = 0x1001
    _CMD_SET_KERNEL = 0x1002
    _CMD_SET_CHANNELS = 0x1003
    _CMD_GET_PERF = 0x1004

    def __init__(self, dev_file="/dev/conv_engine"):
        """Open the device node and initialise the default layer geometry.

        Args:
            dev_file: Path of the character device to open read/write.

        Raises:
            OSError: If the device node cannot be opened.
        """
        self.dev_file = dev_file
        self.fd = os.open(self.dev_file, os.O_RDWR)

        # Register offsets (kept as attributes; not used by the methods below)
        self.CTRL_REG = 0x00
        self.STATUS_REG = 0x04
        self.IMG_SIZE_REG = 0x08
        self.KERNEL_REG = 0x0C
        self.CHANNEL_REG = 0x10
        self.STRIDE_PAD_REG = 0x14
        self.PERF_CNT_REG = 0x18

        # Default configuration
        self.img_height = 640
        self.img_width = 640
        self.kernel_size = 3
        self.in_channels = 3
        self.out_channels = 16
        self.stride = 1
        self.padding = 1

    def __enter__(self):
        """Return the engine itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the device on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    @staticmethod
    def _pack_fields(label, high, low, shift):
        """Pack two unsigned fields into one 32-bit word as ``(high << shift) | low``.

        Raises:
            ValueError: If either field is negative or too wide for its slot,
                which would otherwise corrupt the neighbouring field.
        """
        if not 0 <= low < (1 << shift) or not 0 <= high < (1 << (32 - shift)):
            raise ValueError(f"{label}超出寄存器字段范围: {(high, low)}")
        return (high << shift) | low

    @staticmethod
    def _quantize_int8(values):
        """Scale by 127, clamp to [-128, 127] and cast to int8."""
        return np.clip(np.asarray(values) * 127, -128, 127).astype(np.int8)

    def configure(self, img_height=640, img_width=640, 
                 kernel_size=3, in_channels=3, out_channels=16,
                 stride=1, padding=1):
        """Store the layer geometry and send it to the device.

        All arguments are validated before any state is changed, so a
        rejected call leaves the previous configuration intact.

        Raises:
            ValueError: If a value does not fit its register field, if
                ``stride`` is not positive, or if ``padding`` is negative.
        """
        img_size = self._pack_fields("img_size", img_height, img_width, 16)
        kernel_cfg = self._pack_fields("kernel_size", kernel_size, kernel_size, 8)
        channel_cfg = self._pack_fields("channels", in_channels, out_channels, 16)
        if stride < 1:
            # stride is a divisor in the output-size formula
            raise ValueError("stride必须为正整数")
        if padding < 0:
            raise ValueError("padding不能为负数")

        self.img_height = img_height
        self.img_width = img_width
        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        self.padding = padding

        # Write the configuration registers
        self._ioctl(self._CMD_SET_IMG_SIZE, img_size)
        self._ioctl(self._CMD_SET_KERNEL, kernel_cfg)
        self._ioctl(self._CMD_SET_CHANNELS, channel_cfg)
        # NOTE(review): stride/padding are only kept on the Python side.
        # The original code computed (padding << 4) | stride but never sent
        # it, and no ioctl command for it is visible here - confirm with
        # the driver whether the hardware needs it.

    def load_weights(self, weights):
        """Quantize the weights to INT8 and write them to the device."""
        os.write(self.fd, self._quantize_int8(weights).tobytes())

    def run_inference(self, input_image):
        """Run one convolution on the device.

        Args:
            input_image: Array of shape ``(img_height, img_width, in_channels)``.

        Returns:
            ``(output, elapsed_ms)`` where ``output`` is a float32 array of
            shape ``(out_h, out_w, out_channels)`` and ``elapsed_ms`` covers
            the write of the input and the read of the result.

        Raises:
            ValueError: If the input shape does not match the configuration,
                or if the device returns fewer bytes than one full output.
        """
        expected_shape = (self.img_height, self.img_width, self.in_channels)
        if input_image.shape != expected_shape:
            raise ValueError(f"输入图像尺寸不匹配,期望{expected_shape}")

        input_bytes = self._quantize_int8(input_image).tobytes()
        output_size = self._calculate_output_size()

        # perf_counter is monotonic and high resolution, unlike wall-clock
        # time.time(), which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        os.write(self.fd, input_bytes)
        output_bytes = os.read(self.fd, output_size)
        end_time = time.perf_counter()

        if len(output_bytes) != output_size:
            # Fail with a clear message instead of an opaque reshape error.
            raise ValueError(
                f"设备返回的数据长度不足,期望{output_size}字节,实际{len(output_bytes)}字节"
            )

        accumulators = np.frombuffer(output_bytes, dtype=np.int32)
        accumulators = accumulators.reshape(self._calculate_output_shape())

        # NOTE(review): divides by 127 only. If the INT32 accumulators carry
        # both the input and the weight scale (127 * 127) this under-scales;
        # confirm against the hardware's output format.
        output_float = accumulators.astype(np.float32) / 127.0

        inference_time = (end_time - start_time) * 1000  # ms

        return output_float, inference_time

    def get_performance_stats(self, clock_freq=200e6):
        """Query the device's average cycle count and derive time and FPS.

        Args:
            clock_freq: Engine clock in Hz used to convert cycles to time
                (200 MHz is assumed by default).
        """
        avg_cycles = self._ioctl(self._CMD_GET_PERF, 0)
        avg_time_ms = (avg_cycles / clock_freq) * 1000

        return {
            'average_cycles': avg_cycles,
            'average_time_ms': avg_time_ms,
            'throughput_fps': 1000.0 / avg_time_ms if avg_time_ms > 0 else 0
        }

    def _calculate_output_size(self):
        """Return the output size in bytes (one INT32 per output element)."""
        out_h, out_w, out_c = self._calculate_output_shape()
        return out_h * out_w * out_c * 4  # INT32

    def _calculate_output_shape(self):
        """Return ``(out_h, out_w, out_channels)`` for the current configuration."""
        out_h = (self.img_height + 2*self.padding - self.kernel_size) // self.stride + 1
        out_w = (self.img_width + 2*self.padding - self.kernel_size) // self.stride + 1
        return (out_h, out_w, self.out_channels)

    def _ioctl(self, cmd, arg):
        """Issue one ioctl on the device and return its result."""
        import fcntl
        return fcntl.ioctl(self.fd, cmd, arg)

    def benchmark(self, num_iterations=100):
        """Time ``num_iterations`` inferences on random input and print a summary.

        Returns:
            Array of per-inference latencies in milliseconds; empty when
            ``num_iterations`` is not positive.
        """
        print("开始性能基准测试...")

        if num_iterations <= 0:
            # Nothing to measure; min/max of an empty array would raise.
            print("迭代次数为0,跳过基准测试")
            return np.array([])

        # Random test input matching the configured geometry
        test_input = np.random.randn(self.img_height, self.img_width, self.in_channels)

        times = []
        for i in range(num_iterations):
            _, time_ms = self.run_inference(test_input)
            times.append(time_ms)

            if (i+1) % 10 == 0:
                print(f"  完成 {i+1}/{num_iterations} 次推理")

        times = np.array(times)

        print("\n基准测试结果:")
        print(f"  平均延迟: {np.mean(times):.2f} ms")
        print(f"  最小延迟: {np.min(times):.2f} ms")
        print(f"  最大延迟: {np.max(times):.2f} ms")
        print(f"  标准差: {np.std(times):.2f} ms")
        print(f"  吞吐量: {1000.0/np.mean(times):.2f} FPS")

        return times

    def close(self):
        """Close the device file descriptor; safe to call more than once."""
        fd = getattr(self, 'fd', None)
        if fd is None:
            return
        # Clear the attribute first so a second call can never close a
        # descriptor number that has since been reused.
        self.fd = None
        os.close(fd)

    def __del__(self):
        """Last-resort cleanup when the object is garbage collected."""
        self.close()

# Usage example
if __name__ == "__main__":
    # Demo layer: 32x32 input with 3 channels, sixteen 3x3 filters
    demo_layer = {
        "img_height": 32,
        "img_width": 32,
        "kernel_size": 3,
        "in_channels": 3,
        "out_channels": 16,
    }

    engine = ConvEngine()
    engine.configure(**demo_layer)

    # Random test weights laid out as [out_ch, in_ch, k_h, k_w]
    test_weights = np.random.randn(16, 3, 3, 3)
    engine.load_weights(test_weights)

    # One inference on a random image
    test_image = np.random.randn(32, 32, 3)
    output, time_ms = engine.run_inference(test_image)

    print("\n".join([
        f"推理完成!",
        f"  输出形状: {output.shape}",
        f"  推理时间: {time_ms:.2f} ms",
    ]))

    # Counters reported by the device itself
    stats = engine.get_performance_stats()
    print("\n".join([
        f"\n性能统计:",
        f"  平均周期数: {stats['average_cycles']}",
        f"  平均时间: {stats['average_time_ms']:.2f} ms",
        f"  吞吐量: {stats['throughput_fps']:.2f} FPS",
    ]))

    # Finish with the 100-iteration benchmark
    engine.benchmark(100)
posted @ 2025-09-18 21:09  李白的白  阅读(20)  评论(0)    收藏  举报