综合与实现流程【p3】--(DSP-存储)优化&PS系统集成
(一)资源优化
1 DSP优化
创建优化的DSP映射
创建文件 dsp_optimized_pe.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// 优化的PE模块 - 直接使用DSP48E2原语
//////////////////////////////////////////////////////////////////////////////////
// Dual INT8 multiply-accumulate processing element.
//
// Function (both generate branches):  p_out = a0*b0 + a1*b1 + c_in
// Latency: a*/b* -> p_out is 3 enabled clocks, c_in -> p_out is 2 enabled clocks.
// Reset is synchronous and active low; all registers hold while ce is low.
//
// Why the hand-instantiated DSP48E2 was removed:
//   * DSP48E2 has no ".CE" port (clock enables are per register: CEA2, CEB2,
//     CEM, CEP, ...), so the instance could not elaborate.
//   * USE_SIMD accepts ONE48 / TWO24 / FOUR12 only, and SIMD modes are not
//     allowed while the multiplier is in use.
//   * Packing {a1, a0} and {b1, b0} into one 27x18 multiply produces
//     a0*b0, a1*b1 AND the cross terms a0*b1 / a1*b0 in overlapping bit
//     ranges, so the two wanted products cannot be extracted from P.
//   * c_in was accumulated twice (DSP C port and the output adder) and the
//     P slices were treated as unsigned.
// NOTE(review): true INT8 packing on DSP48E2 needs one operand shared between
// both products; confirm against the DSP slice user guide before re-adding it.
module dsp_optimized_pe #(
    // 1: force the arithmetic onto DSP slices via the use_dsp attribute
    // 0: let the synthesis tool choose the mapping
    parameter USE_DSP_PACKING = 1
)(
    input  wire               clk,
    input  wire               rst_n,   // synchronous, active low
    input  wire               ce,      // clock enable for every pipeline stage
    input  wire signed [7:0]  a0, a1,  // two activations
    input  wire signed [7:0]  b0, b1,  // two weights
    input  wire signed [31:0] c_in,    // accumulator input
    output reg  signed [31:0] p_out    // a0*b0 + a1*b1 + c_in
);

    generate
        if (USE_DSP_PACKING) begin : gen_packed_dsp
            // Same pipeline as the generic branch, but pinned to DSP resources.
            (* use_dsp = "yes" *) reg signed [15:0] prod0_q;
            (* use_dsp = "yes" *) reg signed [15:0] prod1_q;
            (* use_dsp = "yes" *) reg signed [31:0] acc_q;

            always @(posedge clk) begin
                if (!rst_n) begin
                    prod0_q <= 16'sd0;
                    prod1_q <= 16'sd0;
                    acc_q   <= 32'sd0;
                    p_out   <= 32'sd0;
                end else if (ce) begin
                    // Stage 1: two independent signed 8x8 products
                    prod0_q <= a0 * b0;
                    prod1_q <= a1 * b1;
                    // Stage 2: sum of products plus accumulator input
                    // (operands are sign-extended to 32 bits by context)
                    acc_q   <= prod0_q + prod1_q + c_in;
                    // Stage 3: output register
                    p_out   <= acc_q;
                end
            end
        end else begin : gen_standard_dsp
            // Generic implementation; mapping is left to the synthesis tool.
            reg signed [15:0] prod0, prod1;
            reg signed [31:0] sum;

            always @(posedge clk) begin
                if (!rst_n) begin
                    prod0 <= 16'sd0;
                    prod1 <= 16'sd0;
                    sum   <= 32'sd0;
                    p_out <= 32'sd0;
                end else if (ce) begin
                    prod0 <= a0 * b0;
                    prod1 <= a1 * b1;
                    sum   <= prod0 + prod1 + c_in;
                    p_out <= sum;
                end
            end
        end
    endgenerate

endmodule
创建DSP资源监控模块
创建文件 dsp_monitor.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// DSP资源使用监控器
//////////////////////////////////////////////////////////////////////////////////
// DSP utilisation monitor.
//
// While `enable` is high the module counts, every clock:
//   * total_cycles           - number of enabled cycles
//   * active_cycles[i]       - enabled cycles in which dsp_active[i] was high
//   * peak_usage             - largest number of simultaneously active DSPs seen
// and once per MONITOR_PERIOD enabled cycles it publishes
//   * average_usage          - mean number of active DSPs over that period
//   * utilization_percent    - the same mean as a percentage of NUM_DSPS
//
// Fixes relative to the previous version: the per-cycle active count is now a
// real population count (the old non-blocking increment inside the loop was
// overwritten by an unconditional clear, so it was always 0); the percentage is
// computed from the period that just ended instead of the stale average; and
// the period is exactly MONITOR_PERIOD cycles long.
//
// NOTE: the unpacked-array output port requires the file to be compiled as
// SystemVerilog; the port list is kept unchanged for existing instantiations.
module dsp_monitor #(
    parameter NUM_DSPS       = 64,
    parameter MONITOR_PERIOD = 1000   // reporting period in enabled clock cycles
)(
    input  wire                clk,
    input  wire                rst_n,      // asynchronous, active low
    input  wire                enable,
    // One activity flag per monitored DSP
    input  wire [NUM_DSPS-1:0] dsp_active,
    // Monitor outputs
    output reg  [31:0]         total_cycles,
    output reg  [31:0]         active_cycles [0:NUM_DSPS-1],
    output reg  [7:0]          utilization_percent,
    output reg  [31:0]         peak_usage,
    output reg  [31:0]         average_usage
);

    reg  [31:0] period_counter;     // enabled cycles elapsed in the current period
    reg  [31:0] active_count;       // registered copy of the per-cycle popcount
    reg  [31:0] usage_accumulator;  // sum of popcounts in the current period
    reg         period_done;        // one-cycle pulse after the averages update
    reg  [31:0] active_now;         // combinational popcount of dsp_active

    // Period sum including the current cycle's contribution
    wire [31:0] period_total = usage_accumulator + active_now;

    integer i;
    integer k;

    // Population count of the activity vector (blocking assignments so the
    // loop really accumulates).
    always @(*) begin
        active_now = 32'd0;
        for (k = 0; k < NUM_DSPS; k = k + 1) begin
            active_now = active_now + dsp_active[k];
        end
    end

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            total_cycles        <= 0;
            utilization_percent <= 0;
            peak_usage          <= 0;
            average_usage       <= 0;
            period_counter      <= 0;
            active_count        <= 0;
            usage_accumulator   <= 0;
            period_done         <= 1'b0;
            for (i = 0; i < NUM_DSPS; i = i + 1) begin
                active_cycles[i] <= 0;
            end
        end else begin
            period_done <= 1'b0;
            if (enable) begin
                total_cycles <= total_cycles + 1;
                active_count <= active_now;

                // Per-DSP activity counters
                for (i = 0; i < NUM_DSPS; i = i + 1) begin
                    if (dsp_active[i]) begin
                        active_cycles[i] <= active_cycles[i] + 1;
                    end
                end

                // Peak simultaneous usage
                if (active_now > peak_usage) begin
                    peak_usage <= active_now;
                end

                // End of period: publish averages and restart the window
                if (period_counter == MONITOR_PERIOD - 1) begin
                    average_usage       <= period_total / MONITOR_PERIOD;
                    // 64-bit intermediate so the x100 scaling cannot overflow
                    utilization_percent <= ({32'd0, period_total} * 64'd100)
                                           / (MONITOR_PERIOD * NUM_DSPS);
                    period_counter      <= 0;
                    usage_accumulator   <= 0;
                    period_done         <= 1'b1;
                end else begin
                    period_counter    <= period_counter + 1;
                    usage_accumulator <= period_total;
                end
            end
        end
    end

    // Utilisation report (simulation only); printed the cycle after the
    // averages were refreshed so the values shown belong to the ended period.
`ifdef SIMULATION
    always @(posedge clk) begin
        if (period_done) begin
            $display("DSP利用率报告 @%0t:", $time);
            $display(" 平均使用: %0d/%0d DSPs", average_usage, NUM_DSPS);
            $display(" 利用率: %0d%%", utilization_percent);
            $display(" 峰值使用: %0d DSPs", peak_usage);
        end
    end
`endif

endmodule
2 存储优化
创建高效的存储管理器
创建文件 memory_manager.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// 智能存储管理器 - 优化BRAM/URAM使用
//////////////////////////////////////////////////////////////////////////////////
// Dual-bank storage with a sequential-access hit counter.
//
// Two equally sized memories (bank A / bank B) are inferred as URAM or BRAM
// depending on USE_URAM. When DOUBLE_BUFFER is non-zero, `buffer_swap` selects
// bank B for BOTH the write and the read port; otherwise bank A is always used.
// NOTE(review): a ping-pong buffer normally writes one bank while reading the
// other; the original same-bank selection is preserved here - confirm intent.
//
// Read timing: rd_data is valid one clock after rd_en, flagged by rd_valid.
//
// Fixes relative to the previous version:
//   * rd_data, rd_hits and rd_misses each have a single driver (they used to
//     be assigned from several always blocks).
//   * The "prefetch hit" path no longer returns data from a buffer that was
//     never written; PREFETCH now only tracks whether a read address equals
//     the previous read address + 1 and counts hits/misses accordingly.
//   * The ram_style attribute is applied to both banks, and the URAM banks
//     follow DATA_WIDTH / ADDR_WIDTH instead of assuming 8 / 10.
//   * Each bank has its own registered read port so the RAM templates match
//     what block/ultra RAM inference expects.
module memory_manager #(
    parameter DATA_WIDTH    = 8,
    parameter ADDR_WIDTH    = 10,
    parameter USE_URAM      = 1,   // 1 = URAM, 0 = BRAM
    parameter DOUBLE_BUFFER = 1,   // enable the second bank
    parameter PREFETCH      = 1    // enable sequential-access hit/miss counting
)(
    input  wire                  clk,
    input  wire                  rst_n,
    // Write port
    input  wire                  wr_en,
    input  wire [ADDR_WIDTH-1:0] wr_addr,
    input  wire [DATA_WIDTH-1:0] wr_data,
    // Read port
    input  wire                  rd_en,
    input  wire [ADDR_WIDTH-1:0] rd_addr,
    output reg  [DATA_WIDTH-1:0] rd_data,
    output reg                   rd_valid,
    // Control
    input  wire                  buffer_swap,   // selects bank B when double buffering
    output reg                   buffer_ready,
    // Statistics
    output reg  [31:0]           rd_hits,
    output reg  [31:0]           rd_misses
);

    localparam MEM_DEPTH = 2**ADDR_WIDTH;

    // Bank selection shared by the write and read ports
    wire use_bank_b = (DOUBLE_BUFFER != 0) && buffer_swap;

    // Registered read data of each bank and the bank chosen by the last read
    reg [DATA_WIDTH-1:0] rd_data_a;
    reg [DATA_WIDTH-1:0] rd_data_b;
    reg                  rd_bank_q;

    generate
        if (USE_URAM) begin : gen_uram
            (* ram_style = "ultra" *)
            reg [DATA_WIDTH-1:0] uram_array_a [0:MEM_DEPTH-1];
            (* ram_style = "ultra" *)
            reg [DATA_WIDTH-1:0] uram_array_b [0:MEM_DEPTH-1];

            always @(posedge clk) begin
                if (wr_en && !use_bank_b) begin
                    uram_array_a[wr_addr] <= wr_data;
                end
                if (rd_en) begin
                    rd_data_a <= uram_array_a[rd_addr];
                end
            end

            always @(posedge clk) begin
                if (wr_en && use_bank_b) begin
                    uram_array_b[wr_addr] <= wr_data;
                end
                if (rd_en) begin
                    rd_data_b <= uram_array_b[rd_addr];
                end
            end
        end else begin : gen_bram
            (* ram_style = "block" *)
            reg [DATA_WIDTH-1:0] bram_array_a [0:MEM_DEPTH-1];
            (* ram_style = "block" *)
            reg [DATA_WIDTH-1:0] bram_array_b [0:MEM_DEPTH-1];

            always @(posedge clk) begin
                if (wr_en && !use_bank_b) begin
                    bram_array_a[wr_addr] <= wr_data;
                end
                if (rd_en) begin
                    rd_data_a <= bram_array_a[rd_addr];
                end
            end

            always @(posedge clk) begin
                if (wr_en && use_bank_b) begin
                    bram_array_b[wr_addr] <= wr_data;
                end
                if (rd_en) begin
                    rd_data_b <= bram_array_b[rd_addr];
                end
            end
        end
    endgenerate

    // Remember which bank the last read targeted; the output mux follows it so
    // rd_data only changes as a consequence of a read, as before.
    always @(posedge clk) begin
        if (rd_en) begin
            rd_bank_q <= use_bank_b;
        end
    end

    always @(*) begin
        rd_data = rd_bank_q ? rd_data_b : rd_data_a;
    end

    // Sequential-access tracking and hit/miss counters (single driver).
    reg [ADDR_WIDTH-1:0] next_seq_addr;   // address expected for a "hit"
    reg                  next_seq_valid;  // set once the first read was seen

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            next_seq_addr  <= {ADDR_WIDTH{1'b0}};
            next_seq_valid <= 1'b0;
            rd_hits        <= 32'd0;
            rd_misses      <= 32'd0;
        end else if (rd_en && (PREFETCH != 0)) begin
            next_seq_addr  <= rd_addr + 1'b1;
            next_seq_valid <= 1'b1;
            if (next_seq_valid && (rd_addr == next_seq_addr)) begin
                rd_hits <= rd_hits + 1;
            end else begin
                rd_misses <= rd_misses + 1;
            end
        end
    end

    // rd_valid follows rd_en by one clock, matching the registered read
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            rd_valid <= 1'b0;
        end else begin
            rd_valid <= rd_en;
        end
    end

    // Buffer status: simplified, reports ready as soon as reset is released
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            buffer_ready <= 1'b0;
        end else begin
            buffer_ready <= 1'b1;
        end
    end

endmodule
(二)PS系统集成
创建Block Design
- 将卷积引擎打包为IP:
创建文件 package_conv_engine.tcl:
###############################################
# Package the convolution engine RTL as a Vivado IP core
###############################################

# Location of the IP packaging project
set ip_proj_dir "./ip_repo/conv_engine_ip"

# Scratch project used only for packaging
create_project conv_engine_ip $ip_proj_dir -part xczu9eg-ffvb1156-2-e -force
set_property board_part xilinx.com:zcu102:part0:3.4 [current_project]

# RTL sources that make up the IP
set rtl_sources [list \
    ./src/hdl/conv_engine/conv_engine_top.v \
    ./src/hdl/conv_engine/systolic_array.v \
    ./src/hdl/conv_engine/systolic_controller.v \
    ./src/hdl/primitives/processing_element.v \
    ./src/hdl/primitives/dsp_optimized_pe.v \
    ./src/hdl/utilities/memory_manager.v \
]
add_files -norecurse $rtl_sources

# Top-level module of the IP
set_property top conv_engine_top [current_fileset]

# Create the IP-XACT component
ipx::package_project -root_dir $ip_proj_dir -vendor user.org \
    -library user -name conv_engine -taxonomy /UserIP
set core [ipx::current_core]

# Identification / catalog metadata (applied in the listed order)
foreach {prop value} {
    vendor_display_name {YOLO V10 Conv Engine}
    display_name        {Convolution Engine for YOLO V10}
    description         {High-performance systolic array based convolution engine optimized for YOLO V10}
    company_url         {http://www.example.com}
    supported_families  {zynquplus Production}
    version             1.0
} {
    set_property $prop $value $core
}

# Bus interface inference: clock, reset, then the AXI memory-mapped and
# AXI-Stream interfaces.
# NOTE(review): each AXI interface is inferred from a single port name here;
# verify the resulting interfaces contain all of their signals.
foreach {port busdef} {
    clk          xilinx.com:signal:clock_rtl:1.0
    rst_n        xilinx.com:signal:reset_rtl:1.0
    s_axi_awaddr xilinx.com:interface:aximm_rtl:1.0
    s_axis_tdata xilinx.com:interface:axis_rtl:1.0
    m_axis_tdata xilinx.com:interface:axis_rtl:1.0
} {
    ipx::infer_bus_interface $port $busdef $core
}

# Tie every data interface to the single clock
foreach busif {s_axi s_axis m_axis} {
    ipx::associate_bus_interfaces -busif $busif -clock clk $core
}

# Memory map for the AXI-Lite register block (4 KiB)
ipx::add_memory_map s_axi $core
set_property slave_memory_map_ref s_axi [ipx::get_bus_interfaces s_axi -of_objects $core]
set axi_map [ipx::get_memory_maps s_axi -of_objects $core]
ipx::add_address_block axi_lite_regs $axi_map
set_property range 4096 [ipx::get_address_blocks axi_lite_regs -of_objects $axi_map]

# Core-level options carried over unchanged.
# NOTE(review): confirm that "driver_strength" is a valid property of an ipx
# core object in the Vivado version in use.
set_property driver_strength strong $core
set_property auto_family_support_level optimized $core

# Customisation GUI: one page exposing the two user parameters
ipgui::add_page -name {Basic} -component $core \
    -display_name {Basic Configuration}
set basic_page [ipgui::get_pagespec -name Basic -component $core]
foreach gui_param {ARRAY_SIZE DATA_WIDTH} {
    ipgui::add_param -name $gui_param -component $core -parent $basic_page
    # The page handle is looked up again for the next parameter, as before
    set basic_page [ipgui::get_pagespec -name Basic -component $core]
}

# Persist, check and archive the core, then close the packaging project
ipx::save_core $core
ipx::check_integrity $core
ipx::archive_core $ip_proj_dir/conv_engine_ip_1.0.zip $core
close_project
puts "IP核打包完成!"
puts "IP保存位置: $ip_proj_dir"
连接系统组件
创建TCL脚本 build_system_bd.tcl:
# Build the PS + DMA + convolution-engine block design.
#
# Changes relative to the previous script:
#   * The HDL wrapper is added using the path returned by make_wrapper (the
#     bare file name does not exist in the working directory).
#   * The second HP0 automation call targets the S2MM master pin, because the
#     HP0 slave pin is already connected by the first call.
#   * Manual clock/reset connections are only made when automation has not
#     already driven the pin, so no pin ends up with two drivers.
#   * DMA interrupts go straight to pl_ps_irq0 through a concat block. The
#     previous axi_intc had no AXI, clock or reset connection and dropped the
#     S2MM interrupt. Bit 0 is reserved for the engine, bits 1/2 carry
#     MM2S/S2MM, i.e. GIC SPI 89/90/91.
#   * The unused S_AXI_HPC0_FPD port is no longer enabled.

# Connect src -> dst only when dst is not on a net yet.
proc connect_if_open {src dst} {
    set dst_pin [get_bd_pins $dst]
    if {[llength [get_bd_nets -quiet -of_objects $dst_pin]] == 0} {
        connect_bd_net [get_bd_pins $src] $dst_pin
    }
}

# Run AXI connection automation with the common 200 MHz pl_clk0 settings.
proc auto_connect_axi {master slave xbar target} {
    set clk {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)}
    apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [list \
        Clk_master $clk \
        Clk_slave  $clk \
        Clk_xbar   $clk \
        Master     $master \
        Slave      $slave \
        ddr_seg    Auto \
        intc_ip    $xbar \
        master_apm 0] [get_bd_intf_pins $target]
}

# Open the block design; prefer the project's file object, fall back to the
# plain file name used previously.
set bd_file [get_files -quiet conv_engine_system.bd]
if {$bd_file eq ""} {
    set bd_file conv_engine_system.bd
}
open_bd_design $bd_file

# 1. Add the PS if it is not present yet
if {[llength [get_bd_cells -quiet zynq_ultra_ps_e_0]] == 0} {
    create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_0
    apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e \
        -config {apply_board_preset "1"} [get_bd_cells zynq_ultra_ps_e_0]
}

# 2. Configure the PS: one control master (HPM0_FPD), one data slave (HP0_FPD),
#    PL-to-PS interrupt group 0, 200 MHz / 100 MHz PL clocks.
#    NOTE(review): the board preset may enable further AXI ports; any enabled
#    but unused port must be disabled or clocked before validation.
set_property -dict [list \
    CONFIG.PSU__USE__M_AXI_GP0 {1} \
    CONFIG.PSU__USE__M_AXI_GP1 {0} \
    CONFIG.PSU__USE__S_AXI_GP0 {0} \
    CONFIG.PSU__USE__S_AXI_GP2 {1} \
    CONFIG.PSU__SAXIGP2__DATA_WIDTH {128} \
    CONFIG.PSU__USE__IRQ0 {1} \
    CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {200} \
    CONFIG.PSU__CRL_APB__PL1_REF_CTRL__FREQMHZ {100} \
] [get_bd_cells zynq_ultra_ps_e_0]

# 3. Convolution engine IP
create_bd_cell -type ip -vlnv user.org:user:conv_engine:1.0 conv_engine_0

# 4. AXI interconnect for the control path
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0
set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {1}] [get_bd_cells axi_interconnect_0]

# 5. AXI DMA (simple mode, 64-bit data paths)
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
set_property -dict [list \
    CONFIG.c_include_sg {0} \
    CONFIG.c_sg_include_stscntrl_strm {0} \
    CONFIG.c_sg_length_width {26} \
    CONFIG.c_m_axi_mm2s_data_width {64} \
    CONFIG.c_m_axis_mm2s_tdata_width {64} \
    CONFIG.c_mm2s_burst_size {256} \
    CONFIG.c_m_axi_s2mm_data_width {64} \
    CONFIG.c_s_axis_s2mm_tdata_width {64} \
    CONFIG.c_s2mm_burst_size {256} \
] [get_bd_cells axi_dma_0]

# 6. SmartConnect for the DMA data path
create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0
set_property -dict [list CONFIG.NUM_SI {2} CONFIG.NUM_MI {1}] [get_bd_cells smartconnect_0]

# 7. Control path: PS -> convolution engine registers
auto_connect_axi /zynq_ultra_ps_e_0/M_AXI_HPM0_FPD /conv_engine_0/s_axi \
    /axi_interconnect_0 conv_engine_0/s_axi

# 8. Control path: PS -> DMA registers
auto_connect_axi /zynq_ultra_ps_e_0/M_AXI_HPM0_FPD /axi_dma_0/S_AXI_LITE \
    /axi_interconnect_0 axi_dma_0/S_AXI_LITE

# 9. Data path: DMA -> DDR through HP0. The first call is applied on the slave
#    pin, the second on the remaining unconnected master pin.
auto_connect_axi /axi_dma_0/M_AXI_MM2S /zynq_ultra_ps_e_0/S_AXI_HP0_FPD \
    /smartconnect_0 zynq_ultra_ps_e_0/S_AXI_HP0_FPD
auto_connect_axi /axi_dma_0/M_AXI_S2MM /zynq_ultra_ps_e_0/S_AXI_HP0_FPD \
    /smartconnect_0 axi_dma_0/M_AXI_S2MM

# 10. AXI-Stream links between the DMA and the engine
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] \
    [get_bd_intf_pins conv_engine_0/s_axis]
connect_bd_intf_net [get_bd_intf_pins conv_engine_0/m_axis] \
    [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]

# 11. Clocks (skipped for pins the automation already connected)
connect_if_open zynq_ultra_ps_e_0/pl_clk0 conv_engine_0/clk
connect_if_open zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/m_axi_mm2s_aclk
connect_if_open zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/m_axi_s2mm_aclk

# 12. Resets (skipped for pins the automation already connected)
connect_if_open zynq_ultra_ps_e_0/pl_resetn0 conv_engine_0/rst_n
connect_if_open zynq_ultra_ps_e_0/pl_resetn0 axi_dma_0/axi_resetn

# 13. Interrupts: concat -> pl_ps_irq0
#     In0: convolution engine (connected only if a matching pin exists)
#     In1: DMA MM2S, In2: DMA S2MM
create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 irq_concat
set_property CONFIG.NUM_PORTS {3} [get_bd_cells irq_concat]
foreach irq_name {irq interrupt intr} {
    set engine_irq [get_bd_pins -quiet conv_engine_0/$irq_name]
    if {$engine_irq ne ""} {
        connect_bd_net $engine_irq [get_bd_pins irq_concat/In0]
        break
    }
}
connect_bd_net [get_bd_pins axi_dma_0/mm2s_introut] [get_bd_pins irq_concat/In1]
connect_bd_net [get_bd_pins axi_dma_0/s2mm_introut] [get_bd_pins irq_concat/In2]
connect_bd_net [get_bd_pins irq_concat/dout] \
    [get_bd_pins zynq_ultra_ps_e_0/pl_ps_irq0]

# 14. Optional ILA on the engine's output stream
create_bd_cell -type ip -vlnv xilinx.com:ip:system_ila:1.1 system_ila_0
set_property -dict [list CONFIG.C_SLOT_0_AXI_PROTOCOL {AXI4S}] [get_bd_cells system_ila_0]
connect_bd_intf_net [get_bd_intf_pins system_ila_0/SLOT_0_AXIS] \
    [get_bd_intf_pins conv_engine_0/m_axis]
connect_bd_net [get_bd_pins zynq_ultra_ps_e_0/pl_clk0] \
    [get_bd_pins system_ila_0/clk]

# 15. Validate
validate_bd_design

# 16. Save
save_bd_design

# 17. Generate output products
generate_target all [get_files conv_engine_system.bd]

# 18. Create the HDL wrapper and add the generated file by its real path
set wrapper_file [make_wrapper -files [get_files conv_engine_system.bd] -top]
add_files -norecurse $wrapper_file
update_compile_order -fileset sources_1

# 19. Make the wrapper the design top
set_property top conv_engine_system_wrapper [current_fileset]
生成比特流
运行完整实现流程
# Launch a run, wait for it, and stop the script if it did not finish.
# wait_on_run returns for failed runs as well, so the result must be checked
# explicitly before continuing to the next stage.
proc run_and_check {run_name args} {
    launch_runs $run_name {*}$args
    wait_on_run $run_name
    set run [get_runs $run_name]
    if {[get_property PROGRESS $run] ne "100%"} {
        error "Run '$run_name' did not complete: [get_property STATUS $run]"
    }
}

# Synthesis
run_and_check synth_1 -jobs 8

# Implementation through bitstream generation in a single launch
run_and_check impl_1 -to_step write_bitstream -jobs 8

# Export the hardware platform, bitstream included
write_hw_platform -fixed -include_bit -force \
    -file ./conv_engine_system.xsa
Linux驱动
创建内核驱动
创建文件 conv_engine_driver.c:
/**
* 卷积引擎Linux内核驱动
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_dma.h>
#include <linux/dmaengine.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/cdev.h>
/* Names used for the platform driver and for the character device / class. */
#define DRIVER_NAME "conv_engine"
#define DEVICE_NAME "conv_engine"
/* Register offsets, relative to the mapped register base */
#define CONV_CTRL_REG 0x00 /* control; written with the CTRL_* bits below */
#define CONV_STATUS_REG 0x04 /* status; read for the STATUS_* bits below */
#define CONV_IMG_SIZE_REG 0x08 /* written by ioctl 0x1001 */
#define CONV_KERNEL_REG 0x0C /* written by ioctl 0x1002 */
#define CONV_CHANNEL_REG 0x10 /* written by ioctl 0x1003 */
#define CONV_STRIDE_PAD_REG 0x14 /* not accessed anywhere in this driver */
#define CONV_PERF_CNT_REG 0x18 /* read by the ISR after a completed run */
/* Control register bits */
#define CTRL_START_BIT (1 << 0)
#define CTRL_CLEAR_BIT (1 << 1)
#define CTRL_WEIGHT_LOAD (1 << 2) /* defined but not used in this driver */
/* Status register bits */
#define STATUS_BUSY_BIT (1 << 0)
#define STATUS_DONE_BIT (1 << 1)
/*
 * Per-device state, allocated in probe and stored as platform driver data.
 * The embedded cdev lets open() recover this structure via container_of().
 */
struct conv_engine_dev {
void __iomem *regs; /* mapped register block */
struct device *dev; /* &pdev->dev, used for logging and DMA allocation */
struct cdev cdev;
dev_t devno;
/* DMA state */
struct dma_chan *tx_chan; /* channel requested under the name "tx" */
struct dma_chan *rx_chan; /* channel requested under the name "rx" */
dma_addr_t tx_dma_handle;
dma_addr_t rx_dma_handle;
void *tx_virt; /* coherent bounce buffer for outgoing data */
void *rx_virt; /* coherent bounce buffer for incoming data */
size_t dma_size; /* size of each bounce buffer in bytes */
/* Interrupt */
int irq;
/* Signalled both by the DMA completion callback and by the ISR */
struct completion dma_complete;
/* Statistics, updated in the ISR */
u32 total_inferences;
u64 total_cycles;
};
/* Device class created at module init; used by device_create()/device_destroy() */
static struct class *conv_engine_class;
/* 32-bit register accessors; @offset is a byte offset from the register base. */
static inline u32 conv_read_reg(struct conv_engine_dev *dev, u32 offset)
{
	void __iomem *addr = dev->regs + offset;

	return ioread32(addr);
}

static inline void conv_write_reg(struct conv_engine_dev *dev, u32 offset, u32 value)
{
	void __iomem *addr = dev->regs + offset;

	iowrite32(value, addr);
}
/*
 * Interrupt handler.
 *
 * Returns IRQ_NONE when the status register does not report DONE (the line is
 * requested as shared). Otherwise writes the clear bit, updates the run
 * statistics from the performance counter and signals dma_complete.
 * NOTE(review): the performance counter is read after the clear bit is
 * written; confirm the clear does not reset that counter.
 */
static irqreturn_t conv_engine_isr(int irq, void *dev_id)
{
	struct conv_engine_dev *engine = dev_id;

	if (!(conv_read_reg(engine, CONV_STATUS_REG) & STATUS_DONE_BIT))
		return IRQ_NONE;

	/* Acknowledge the completed run */
	conv_write_reg(engine, CONV_CTRL_REG, CTRL_CLEAR_BIT);

	/* Statistics */
	engine->total_inferences++;
	engine->total_cycles += conv_read_reg(engine, CONV_PERF_CNT_REG);

	/* Wake whoever is waiting for the run to finish */
	complete(&engine->dma_complete);

	return IRQ_HANDLED;
}
/* DMA completion callback: @completion is the struct completion to signal. */
static void dma_complete_callback(void *completion)
{
	complete(completion);
}

/*
 * Run one blocking DMA transfer through the driver's bounce buffers.
 *
 * @dev:   device state
 * @src:   kernel buffer; source of the data when @is_tx is true, destination
 *         of the received data otherwise
 * @len:   number of bytes; must be non-zero and fit in the bounce buffer
 * @is_tx: true for memory-to-device (tx channel), false for device-to-memory
 *
 * Returns 0 on success, -EINVAL for bad arguments or a failed submit,
 * -ENOMEM when no descriptor could be prepared, -ETIMEDOUT after 5 s.
 * Must be called from process context (it sleeps).
 */
static int conv_engine_dma_transfer(struct conv_engine_dev *dev,
				    void *src, size_t len, bool is_tx)
{
	struct dma_async_tx_descriptor *tx_desc;
	struct completion *cmp = &dev->dma_complete;
	struct dma_chan *chan = is_tx ? dev->tx_chan : dev->rx_chan;
	dma_addr_t buf = is_tx ? dev->tx_dma_handle : dev->rx_dma_handle;
	dma_cookie_t cookie;

	/* The bounce buffers have a fixed size; never copy past their end. */
	if (!src || !len || len > dev->dma_size) {
		dev_err(dev->dev, "Invalid DMA length %zu (max %zu)\n",
			len, dev->dma_size);
		return -EINVAL;
	}

	if (is_tx)
		memcpy(dev->tx_virt, src, len);

	/* Only the memory-side address is supplied to the slave channel. */
	tx_desc = dmaengine_prep_slave_single(chan, buf, len,
					      is_tx ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM,
					      DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
	if (!tx_desc) {
		dev_err(dev->dev, "Failed to prepare DMA descriptor\n");
		return -ENOMEM;
	}

	tx_desc->callback = dma_complete_callback;
	tx_desc->callback_param = cmp;

	/* Arm the completion before the descriptor can possibly finish. */
	init_completion(cmp);
	cookie = dmaengine_submit(tx_desc);
	if (dma_submit_error(cookie)) {
		dev_err(dev->dev, "Failed to submit DMA\n");
		return -EINVAL;
	}

	dma_async_issue_pending(chan);

	if (!wait_for_completion_timeout(cmp, msecs_to_jiffies(5000))) {
		dev_err(dev->dev, "DMA timeout\n");
		/* Synchronous terminate: the callback cannot run after this. */
		dmaengine_terminate_sync(chan);
		return -ETIMEDOUT;
	}

	if (!is_tx)
		memcpy(src, dev->rx_virt, len);

	return 0;
}
/*
 * Run one inference: send the input, start the engine, wait for the DONE
 * interrupt, then fetch the output.
 *
 * @input_data / @input_size:   kernel buffer with the input and its length
 * @output_data / @output_size: kernel buffer for the result and its length
 *
 * Returns 0 on success, -EINVAL when a size does not fit the DMA buffers,
 * -EBUSY when the engine reports busy, -ETIMEDOUT when the engine does not
 * signal completion within 1 s, or the error of a failed DMA transfer.
 *
 * NOTE(review): the receive DMA is queued only after the engine reports DONE;
 * confirm the engine can hold its complete output until then.
 */
static int conv_engine_run_inference(struct conv_engine_dev *dev,
				     void *input_data, size_t input_size,
				     void *output_data, size_t output_size)
{
	int ret;
	u32 status;

	/* Reject impossible sizes before the hardware is touched. */
	if (!input_size || input_size > dev->dma_size ||
	    !output_size || output_size > dev->dma_size) {
		dev_err(dev->dev, "Invalid buffer size (in %zu, out %zu, max %zu)\n",
			input_size, output_size, dev->dma_size);
		return -EINVAL;
	}

	status = conv_read_reg(dev, CONV_STATUS_REG);
	if (status & STATUS_BUSY_BIT) {
		dev_err(dev->dev, "Device is busy\n");
		return -EBUSY;
	}

	ret = conv_engine_dma_transfer(dev, input_data, input_size, true);
	if (ret) {
		dev_err(dev->dev, "Failed to send input data\n");
		return ret;
	}

	/*
	 * The completion is shared with the DMA callback; clear any stale
	 * signal so only the upcoming DONE interrupt can end the wait below.
	 */
	reinit_completion(&dev->dma_complete);
	conv_write_reg(dev, CONV_CTRL_REG, CTRL_START_BIT);

	if (!wait_for_completion_timeout(&dev->dma_complete,
					 msecs_to_jiffies(1000))) {
		dev_err(dev->dev, "Inference timeout\n");
		return -ETIMEDOUT;
	}

	ret = conv_engine_dma_transfer(dev, output_data, output_size, false);
	if (ret) {
		dev_err(dev->dev, "Failed to receive output data\n");
		return ret;
	}

	return 0;
}
/* File operations */

/* open(): stash the device state that embeds this inode's cdev. */
static int conv_engine_open(struct inode *inode, struct file *file)
{
	file->private_data = container_of(inode->i_cdev,
					  struct conv_engine_dev, cdev);
	return 0;
}

/* release(): nothing to undo per open file. */
static int conv_engine_release(struct inode *inode, struct file *file)
{
	return 0;
}
static long conv_engine_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct conv_engine_dev *dev = file->private_data;
int ret = 0;
switch (cmd) {
case 0x1001: /* 设置图像尺寸 */
conv_write_reg(dev, CONV_IMG_SIZE_REG, arg);
break;
case 0x1002: /* 设置卷积核尺寸 */
conv_write_reg(dev, CONV_KERNEL_REG, arg);
break;
case 0x1003: /* 设置通道数 */
conv_write_reg(dev, CONV_CHANNEL_REG, arg);
break;
case 0x1004: /* 获取性能计数 */
ret = put_user(dev->total_cycles / dev->total_inferences,
(u32 __user *)arg);
break;
default:
ret = -EINVAL;
}
return ret;
}
/*
 * Character-device entry points.
 * NOTE(review): no .read or .write handler is registered, so read()/write()
 * on the device node are not served by this table; data movement is only
 * reachable through functions that nothing in this table calls.
 */
static const struct file_operations conv_engine_fops = {
.owner = THIS_MODULE,
.open = conv_engine_open,
.release = conv_engine_release,
.unlocked_ioctl = conv_engine_ioctl,
};
/* 平台驱动probe函数 */
static int conv_engine_probe(struct platform_device *pdev)
{
struct conv_engine_dev *dev;
struct resource *res;
int ret;
dev_info(&pdev->dev, "Probing conv_engine driver\n");
/* 分配设备结构 */
dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL);
if (!dev)
return -ENOMEM;
dev->dev = &pdev->dev;
/* 获取并映射寄存器 */
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
dev->regs = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(dev->regs))
return PTR_ERR(dev->regs);
/* 获取中断 */
dev->irq = platform_get_irq(pdev, 0);
if (dev->irq < 0)
return dev->irq;
/* 注册中断处理 */
ret = devm_request_irq(&pdev->dev, dev->irq, conv_engine_isr,
IRQF_SHARED, DRIVER_NAME, dev);
if (ret) {
dev_err(&pdev->dev, "Failed to request IRQ\n");
return ret;
}
/* 获取DMA通道 */
dev->tx_chan = dma_request_slave_channel(&pdev->dev, "tx");
if (!dev->tx_chan) {
dev_err(&pdev->dev, "Failed to request TX DMA channel\n");
return -EPROBE_DEFER;
}
dev->rx_chan = dma_request_slave_channel(&pdev->dev, "rx");
if (!dev->rx_chan) {
dev_err(&pdev->dev, "Failed to request RX DMA channel\n");
dma_release_channel(dev->tx_chan);
return -EPROBE_DEFER;
}
/* 分配DMA缓冲区 */
dev->dma_size = 640 * 640 * 3; /* 最大图像尺寸 */
dev->tx_virt = dma_alloc_coherent(&pdev->dev, dev->dma_size,
&dev->tx_dma_handle, GFP_KERNEL);
if (!dev->tx_virt) {
dev_err(&pdev->dev, "Failed to allocate TX DMA buffer\n");
ret = -ENOMEM;
goto err_dma_alloc;
}
dev->rx_virt = dma_alloc_coherent(&pdev->dev, dev->dma_size,
&dev->rx_dma_handle, GFP_KERNEL);
if (!dev->rx_virt) {
dev_err(&pdev->dev, "Failed to allocate RX DMA buffer\n");
ret = -ENOMEM;
goto err_rx_alloc;
}
/* 初始化完成量 */
init_completion(&dev->dma_complete);
/* 注册字符设备 */
ret = alloc_chrdev_region(&dev->devno, 0, 1, DEVICE_NAME);
if (ret < 0) {
dev_err(&pdev->dev, "Failed to allocate char device region\n");
goto err_chrdev;
}
cdev_init(&dev->cdev, &conv_engine_fops);
dev->cdev.owner = THIS_MODULE;
ret = cdev_add(&dev->cdev, dev->devno, 1);
if (ret) {
dev_err(&pdev->dev, "Failed to add char device\n");
goto err_cdev_add;
}
/* 创建设备节点 */
device_create(conv_engine_class, &pdev->dev, dev->devno,
NULL, DEVICE_NAME);
platform_set_drvdata(pdev, dev);
dev_info(&pdev->dev, "Conv engine driver probed successfully\n");
return 0;
err_cdev_add:
unregister_chrdev_region(dev->devno, 1);
err_chrdev:
dma_free_coherent(&pdev->dev, dev->dma_size,
dev->rx_virt, dev->rx_dma_handle);
err_rx_alloc:
dma_free_coherent(&pdev->dev, dev->dma_size,
dev->tx_virt, dev->tx_dma_handle);
err_dma_alloc:
dma_release_channel(dev->rx_chan);
dma_release_channel(dev->tx_chan);
return ret;
}
/*
 * Platform remove: tear down in the reverse order of probe.
 * Outstanding DMA is stopped before the bounce buffers are freed so the
 * controller cannot access memory that has been returned to the allocator.
 */
static int conv_engine_remove(struct platform_device *pdev)
{
	struct conv_engine_dev *dev = platform_get_drvdata(pdev);

	/* Remove the user-visible entry points first */
	device_destroy(conv_engine_class, dev->devno);
	cdev_del(&dev->cdev);
	unregister_chrdev_region(dev->devno, 1);

	/* Quiesce both channels; returns only after callbacks have finished */
	dmaengine_terminate_sync(dev->rx_chan);
	dmaengine_terminate_sync(dev->tx_chan);

	dma_free_coherent(&pdev->dev, dev->dma_size,
			  dev->rx_virt, dev->rx_dma_handle);
	dma_free_coherent(&pdev->dev, dev->dma_size,
			  dev->tx_virt, dev->tx_dma_handle);

	dma_release_channel(dev->rx_chan);
	dma_release_channel(dev->tx_chan);

	return 0;
}
/* Device-tree match table: binds to nodes with this compatible string. */
static const struct of_device_id conv_engine_of_match[] = {
{ .compatible = "xlnx,conv-engine-1.0", },
{},
};
MODULE_DEVICE_TABLE(of, conv_engine_of_match);
/* Platform driver descriptor tying the match table to probe/remove. */
static struct platform_driver conv_engine_driver = {
.driver = {
.name = DRIVER_NAME,
.of_match_table = conv_engine_of_match,
},
.probe = conv_engine_probe,
.remove = conv_engine_remove,
};
/*
 * Module init: create the device class, then register the platform driver.
 * The class is destroyed again if registration fails.
 */
static int __init conv_engine_init(void)
{
	int err;

	conv_engine_class = class_create(THIS_MODULE, DEVICE_NAME);
	if (IS_ERR(conv_engine_class))
		return PTR_ERR(conv_engine_class);

	err = platform_driver_register(&conv_engine_driver);
	if (!err)
		return 0;

	class_destroy(conv_engine_class);
	return err;
}

/* Module exit: unregister the driver before destroying the class it uses. */
static void __exit conv_engine_exit(void)
{
	platform_driver_unregister(&conv_engine_driver);
	class_destroy(conv_engine_class);
}

module_init(conv_engine_init);
module_exit(conv_engine_exit);

MODULE_DESCRIPTION("Convolution Engine Driver for YOLO V10");
MODULE_AUTHOR("Your Name");
MODULE_LICENSE("GPL v2");
创建设备树配置
创建文件 conv_engine.dtsi:
/*
 * Convolution engine and its AXI DMA.
 *
 * The DMA node follows the Xilinx AXI DMA binding used by the mainline
 * driver: the generic "xlnx,axi-dma-1.00.a" compatible, one child node per
 * channel carrying that channel's interrupt, and clock names for a core
 * without scatter-gather. DMA specifier 0 selects MM2S ("tx"), 1 selects
 * S2MM ("rx").
 *
 * NOTE(review): base addresses and interrupt numbers must match the address
 * map and interrupt wiring of the exported hardware design.
 */
/ {
	conv_engine_0: conv_engine@a0000000 {
		compatible = "xlnx,conv-engine-1.0";
		reg = <0x0 0xa0000000 0x0 0x10000>;
		interrupt-parent = <&gic>;
		interrupts = <0 89 4>;
		interrupt-names = "conv_irq";
		dmas = <&axi_dma_0 0>, <&axi_dma_0 1>;
		dma-names = "tx", "rx";
		clocks = <&zynqmp_clk 71>;
		clock-names = "axi_clk";
	};

	axi_dma_0: dma@a0010000 {
		compatible = "xlnx,axi-dma-7.1", "xlnx,axi-dma-1.00.a";
		reg = <0x0 0xa0010000 0x0 0x10000>;
		#dma-cells = <1>;
		interrupt-parent = <&gic>;
		interrupts = <0 90 4>, <0 91 4>;
		interrupt-names = "mm2s_introut", "s2mm_introut";
		clocks = <&zynqmp_clk 71>, <&zynqmp_clk 71>, <&zynqmp_clk 71>;
		clock-names = "s_axi_lite_aclk", "m_axi_mm2s_aclk", "m_axi_s2mm_aclk";
		xlnx,addrwidth = <0x20>;
		xlnx,sg-length-width = <0x1a>;

		/* Memory-to-stream channel (feeds the engine) */
		dma-channel@a0010000 {
			compatible = "xlnx,axi-dma-mm2s-channel";
			dma-channels = <0x1>;
			interrupts = <0 90 4>;
			xlnx,datawidth = <0x40>;
			xlnx,device-id = <0x0>;
		};

		/* Stream-to-memory channel (drains the engine) */
		dma-channel@a0010030 {
			compatible = "xlnx,axi-dma-s2mm-channel";
			dma-channels = <0x1>;
			interrupts = <0 91 4>;
			xlnx,datawidth = <0x40>;
			xlnx,device-id = <0x0>;
		};
	};
};
创建用户空间应用
Python接口
创建文件 conv_engine_python.py:
#!/usr/bin/env python3
"""
卷积引擎Python接口
"""
import numpy as np
import mmap
import os
import struct
import time
from ctypes import *
class ConvEngine:
    """User-space wrapper around the /dev/conv_engine character device.

    Configuration values are sent with ioctl commands 0x1001-0x1003, the
    average cycle count is fetched with 0x1004, and tensor data is moved with
    write()/read() on the device file descriptor.
    """

    def __init__(self):
        # Open the device node
        self.dev_file = "/dev/conv_engine"
        self.fd = os.open(self.dev_file, os.O_RDWR)
        # Register offsets (kept for reference; registers are reached via ioctl)
        self.CTRL_REG = 0x00
        self.STATUS_REG = 0x04
        self.IMG_SIZE_REG = 0x08
        self.KERNEL_REG = 0x0C
        self.CHANNEL_REG = 0x10
        self.STRIDE_PAD_REG = 0x14
        self.PERF_CNT_REG = 0x18
        # Default layer configuration
        self.img_height = 640
        self.img_width = 640
        self.kernel_size = 3
        self.in_channels = 3
        self.out_channels = 16
        self.stride = 1
        self.padding = 1

    def configure(self, img_height=640, img_width=640,
                  kernel_size=3, in_channels=3, out_channels=16,
                  stride=1, padding=1):
        """Store the layer parameters and push them to the device.

        Image size, kernel size and channel counts are each packed into one
        32-bit word and sent with ioctl 0x1001 / 0x1002 / 0x1003.
        """
        self.img_height = img_height
        self.img_width = img_width
        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        self.padding = padding

        img_size = (img_height << 16) | img_width
        kernel_cfg = (kernel_size << 8) | kernel_size
        channel_cfg = (in_channels << 16) | out_channels
        # NOTE(review): stride and padding are only used locally for the
        # output-shape computation; no ioctl is issued for them here.
        self._ioctl(0x1001, img_size)
        self._ioctl(0x1002, kernel_cfg)
        self._ioctl(0x1003, channel_cfg)

    def load_weights(self, weights):
        """Quantise `weights` to INT8 (scale 127, clipped) and write them."""
        weights_int8 = np.clip(weights * 127, -128, 127).astype(np.int8)
        self._write_all(weights_int8.tobytes())

    def run_inference(self, input_image):
        """Send one image, read back the result.

        Args:
            input_image: array of shape (img_height, img_width, in_channels).

        Returns:
            (output, inference_time_ms) where output is float32 with the
            shape given by `_calculate_output_shape()`.

        Raises:
            ValueError: if the image shape does not match the configuration.
            IOError: if the device accepts or returns fewer bytes than needed.
        """
        if input_image.shape != (self.img_height, self.img_width, self.in_channels):
            raise ValueError(f"输入图像尺寸不匹配,期望{(self.img_height, self.img_width, self.in_channels)}")

        # Quantise to INT8
        input_int8 = np.clip(input_image * 127, -128, 127).astype(np.int8)

        start_time = time.time()
        self._write_all(input_int8.tobytes())
        output_bytes = self._read_exact(self._calculate_output_size())
        end_time = time.time()

        # Interpret the result as INT32 and de-quantise
        output_array = np.frombuffer(output_bytes, dtype=np.int32)
        output_array = output_array.reshape(self._calculate_output_shape())
        output_float = output_array.astype(np.float32) / 127.0

        inference_time = (end_time - start_time) * 1000  # ms
        return output_float, inference_time

    def get_performance_stats(self):
        """Return average cycles, time (ms) and throughput (FPS).

        ioctl 0x1004 stores a 32-bit value through the pointer it is given,
        so a 4-byte mutable buffer is passed and decoded afterwards; the
        ioctl return value itself is only a status code.
        """
        result = bytearray(4)
        self._ioctl(0x1004, result)
        avg_cycles = struct.unpack("=I", bytes(result))[0]

        # Assumes a 200 MHz engine clock
        clock_freq = 200e6
        avg_time_ms = (avg_cycles / clock_freq) * 1000
        return {
            'average_cycles': avg_cycles,
            'average_time_ms': avg_time_ms,
            'throughput_fps': 1000.0 / avg_time_ms if avg_time_ms > 0 else 0
        }

    def _output_hw(self):
        """Output height and width for the current configuration."""
        span = 2 * self.padding - self.kernel_size
        out_h = (self.img_height + span) // self.stride + 1
        out_w = (self.img_width + span) // self.stride + 1
        return out_h, out_w

    def _calculate_output_size(self):
        """Output size in bytes (INT32 elements)."""
        out_h, out_w = self._output_hw()
        return out_h * out_w * self.out_channels * 4

    def _calculate_output_shape(self):
        """Output shape as (height, width, out_channels)."""
        out_h, out_w = self._output_hw()
        return (out_h, out_w, self.out_channels)

    def _ioctl(self, cmd, arg):
        """Issue an ioctl on the device; `arg` is an int or a mutable buffer."""
        import fcntl
        return fcntl.ioctl(self.fd, cmd, arg)

    def _write_all(self, payload):
        """Write every byte of `payload`, retrying after short writes."""
        view = memoryview(payload)
        sent = 0
        while sent < len(view):
            written = os.write(self.fd, view[sent:])
            if written <= 0:
                raise IOError(f"device accepted {sent} of {len(view)} bytes")
            sent += written

    def _read_exact(self, nbytes):
        """Read exactly `nbytes`, retrying after short reads."""
        chunks = []
        remaining = nbytes
        while remaining > 0:
            chunk = os.read(self.fd, remaining)
            if not chunk:
                raise IOError(f"device returned {nbytes - remaining} of {nbytes} bytes")
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks)

    def benchmark(self, num_iterations=100):
        """Run `num_iterations` inferences on random input and print latency stats."""
        print("开始性能基准测试...")
        test_input = np.random.randn(self.img_height, self.img_width, self.in_channels)
        times = []
        for i in range(num_iterations):
            _, time_ms = self.run_inference(test_input)
            times.append(time_ms)
            if (i+1) % 10 == 0:
                print(f" 完成 {i+1}/{num_iterations} 次推理")
        times = np.array(times)
        print("\n基准测试结果:")
        print(f" 平均延迟: {np.mean(times):.2f} ms")
        print(f" 最小延迟: {np.min(times):.2f} ms")
        print(f" 最大延迟: {np.max(times):.2f} ms")
        print(f" 标准差: {np.std(times):.2f} ms")
        print(f" 吞吐量: {1000.0/np.mean(times):.2f} FPS")
        return times

    def close(self):
        """Close the device file descriptor; safe to call more than once."""
        fd = getattr(self, 'fd', None)
        if fd is not None:
            self.fd = None
            os.close(fd)

    def __del__(self):
        """Release the descriptor when the object is collected."""
        try:
            self.close()
        except OSError:
            # Best effort during interpreter shutdown
            pass
# 使用示例
if __name__ == "__main__":
    # Small demo layer: 32x32 RGB input, 3x3 kernel, 16 output channels
    demo_layer = {
        "img_height": 32,
        "img_width": 32,
        "kernel_size": 3,
        "in_channels": 3,
        "out_channels": 16,
    }

    engine = ConvEngine()
    engine.configure(**demo_layer)

    # Random weights laid out as [out_ch, in_ch, k_h, k_w]
    engine.load_weights(np.random.randn(16, 3, 3, 3))

    # Single inference on a random image
    result, elapsed_ms = engine.run_inference(np.random.randn(32, 32, 3))
    print(f"推理完成!")
    print(f" 输出形状: {result.shape}")
    print(f" 推理时间: {elapsed_ms:.2f} ms")

    # Device-side statistics
    perf = engine.get_performance_stats()
    print(f"\n性能统计:")
    print(f" 平均周期数: {perf['average_cycles']}")
    print(f" 平均时间: {perf['average_time_ms']:.2f} ms")
    print(f" 吞吐量: {perf['throughput_fps']:.2f} FPS")

    # Latency benchmark
    engine.benchmark(100)

浙公网安备 33010602011771号