卷积引擎实现-- P1
卷积引擎实现
第1部分:卷积引擎架构设计与准备
1.1 创建卷积引擎工程结构
步骤1.1.1:创建卷积引擎子工程
步骤1.1.2:创建源文件目录结构
# Build the source tree for the convolution engine underneath the directory
# of the currently open Vivado project.
cd [get_property DIRECTORY [current_project]]

# Directories are created one at a time, parents before children, in the
# same order as the original flat list.
foreach dir {
    src
    src/hdl
    src/hdl/conv_engine
    src/hdl/primitives
    src/hdl/utilities
    src/constraints
    src/testbench
    src/ip
    docs
    sim_results
} {
    file mkdir $dir
}

puts "目录结构创建完成!"
1.2 卷积运算的硬件映射
1.2.1. 硬件并行化策略
- 输出通道并行度 (Pof): 8
- 输入通道并行度 (Pif): 4
- 空间并行度 (Pp): 2
1.2.2. 数据流模式
采用权重固定数据流:
- 权重保持在PE中
- 输入特征图流过PE阵列
- 部分和在PE间传递
基于上述策略,创建PE模块文件 processing_element.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: Your Organization
// Engineer: Your Name
//
// Create Date: 2024/11/14
// Module Name: processing_element
// Project Name: YOLO V10 Convolution Engine
// Target Devices: xczu9eg-ffvb1156-2-e (ZCU102)
// Tool Versions: Vivado 2024.2
// Description:
// 单个处理单元(PE),执行乘累加(MAC)操作
// 支持INT8权重和激活值,INT32累加器
//
// Revision: 1.0
//////////////////////////////////////////////////////////////////////////////////
// Single processing element (PE): one signed multiply-accumulate per enabled
// clock cycle, with a locally stored weight.
//
// Parameters
//   WEIGHT_WIDTH      - bit width of the stored weight
//   ACTIVATION_WIDTH  - bit width of the activation operand
//   ACCUMULATOR_WIDTH - bit width of the accumulator and partial-sum ports
//
// Behaviour (all registers reset asynchronously to 0 when rst_n is low)
//   weight_load : weight_reg <= weight_in (independent of enable)
//   enable      : activation_out <= activation_in
//                 sum = (clear_acc ? partial_sum_in : accumulator)
//                       + weight_reg * activation_in
//                 accumulator <= sum, partial_sum_out <= sum
//
// Fix relative to the previous revision: partial_sum_out used to register the
// accumulator value from *before* the current MAC, so the output lagged the
// accumulator by one enabled cycle and the product of the last enabled cycle
// never reached the output. It now registers the freshly computed sum.
module processing_element #(
    parameter WEIGHT_WIDTH      = 8,   // weight width (INT8)
    parameter ACTIVATION_WIDTH  = 8,   // activation width (INT8)
    parameter ACCUMULATOR_WIDTH = 32   // accumulator width
)(
    // Clock and active-low asynchronous reset
    input  wire clk,
    input  wire rst_n,
    // Control
    input  wire enable,        // PE enable: gates the MAC and the activation forward
    input  wire weight_load,   // capture weight_in into the weight register
    input  wire clear_acc,     // start a new sum from partial_sum_in instead of the accumulator
    // Data inputs
    input  wire signed [WEIGHT_WIDTH-1:0]      weight_in,       // weight to store
    input  wire signed [ACTIVATION_WIDTH-1:0]  activation_in,   // activation operand
    input  wire signed [ACCUMULATOR_WIDTH-1:0] partial_sum_in,  // seed used when clear_acc is set
    // Data outputs
    output reg  signed [ACTIVATION_WIDTH-1:0]  activation_out,  // activation_in delayed by one enabled cycle
    output reg  signed [ACCUMULATOR_WIDTH-1:0] partial_sum_out  // running sum including the latest product
);

    // Internal state
    reg signed [WEIGHT_WIDTH-1:0]      weight_reg;   // stored weight
    reg signed [ACCUMULATOR_WIDTH-1:0] accumulator;  // running sum

    // Combinational datapath.
    // The product is full width (WEIGHT_WIDTH + ACTIVATION_WIDTH bits); every
    // operand is declared signed, so the product is sign-extended when it is
    // added to the wider accumulator.
    wire signed [WEIGHT_WIDTH+ACTIVATION_WIDTH-1:0] product;
    wire signed [ACCUMULATOR_WIDTH-1:0]             acc_base;
    wire signed [ACCUMULATOR_WIDTH-1:0]             acc_next;

    assign product  = weight_reg * activation_in;
    assign acc_base = clear_acc ? partial_sum_in : accumulator;
    assign acc_next = acc_base + product;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            // Asynchronous reset
            weight_reg      <= {WEIGHT_WIDTH{1'b0}};
            accumulator     <= {ACCUMULATOR_WIDTH{1'b0}};
            activation_out  <= {ACTIVATION_WIDTH{1'b0}};
            partial_sum_out <= {ACCUMULATOR_WIDTH{1'b0}};
        end else begin
            // Weight load is not gated by enable.
            if (weight_load) begin
                weight_reg <= weight_in;
            end

            if (enable) begin
                // Forward the activation to the neighbouring PE.
                activation_out  <= activation_in;
                // Accumulator and output receive the same new sum, so the
                // output always includes the product of the current cycle.
                accumulator     <= acc_next;
                partial_sum_out <= acc_next;
            end
        end
    end

    // Elaboration banner, only when SIMULATION is defined
`ifdef SIMULATION
    initial begin
        $display("PE实例化: WEIGHT_WIDTH=%d, ACTIVATION_WIDTH=%d",
                 WEIGHT_WIDTH, ACTIVATION_WIDTH);
    end
`endif

endmodule
步骤1.2.3:创建PE的testbench
创建仿真文件:
tb_processing_element.v
`timescale 1ns / 1ps
// Directed testbench for processing_element.
//
// Applies five stimulus groups (weight load, single MAC, accumulation,
// negative weight, maximum positive operands) and prints partial_sum_out one
// clock period after each stimulus change. The checks are visual only: the
// expected arithmetic is given in the comments next to each stimulus.
//
// Fix relative to the previous revision: negative constants were written as
// 8'sd-2 / 8'sd-3, which is not a legal Verilog number literal (the digits of
// a sized literal cannot contain a sign). They are now written as a unary
// minus applied to a sized signed literal.
module tb_processing_element;

    // Parameters forwarded to the DUT
    parameter WEIGHT_WIDTH      = 8;
    parameter ACTIVATION_WIDTH  = 8;
    parameter ACCUMULATOR_WIDTH = 32;

    // DUT stimulus
    reg clk;
    reg rst_n;
    reg enable;
    reg weight_load;
    reg clear_acc;
    reg signed [WEIGHT_WIDTH-1:0]      weight_in;
    reg signed [ACTIVATION_WIDTH-1:0]  activation_in;
    reg signed [ACCUMULATOR_WIDTH-1:0] partial_sum_in;

    // DUT responses
    wire signed [ACTIVATION_WIDTH-1:0]  activation_out;
    wire signed [ACCUMULATOR_WIDTH-1:0] partial_sum_out;

    // Device under test
    processing_element #(
        .WEIGHT_WIDTH(WEIGHT_WIDTH),
        .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
        .ACCUMULATOR_WIDTH(ACCUMULATOR_WIDTH)
    ) DUT (
        .clk(clk),
        .rst_n(rst_n),
        .enable(enable),
        .weight_load(weight_load),
        .clear_acc(clear_acc),
        .weight_in(weight_in),
        .activation_in(activation_in),
        .partial_sum_in(partial_sum_in),
        .activation_out(activation_out),
        .partial_sum_out(partial_sum_out)
    );

    // 200 MHz clock: toggles every 2.5 ns, rising edges at 2.5, 7.5, 12.5 ... ns.
    // Stimulus below changes on multiples of 5 ns, i.e. on falling edges.
    initial begin
        clk = 0;
        forever #2.5 clk = ~clk;
    end

    // Stimulus sequence
    initial begin
        // Initial values, reset asserted
        $display("=== PE测试开始 ===");
        $display("时间\t操作\t\t权重\t激活\t输出");
        rst_n = 0;
        enable = 0;
        weight_load = 0;
        clear_acc = 0;
        weight_in = 0;
        activation_in = 0;
        partial_sum_in = 0;

        // Release reset
        #20 rst_n = 1;
        #10;

        // Test 1: load weight = 5
        $display("\n--- 测试1:加载权重 ---");
        weight_in = 8'sd5;
        weight_load = 1;
        #5;
        weight_load = 0;
        #5;

        // Test 2: single MAC starting from partial_sum_in = 0 (3 * 5 = 15)
        $display("\n--- 测试2:基本MAC操作 ---");
        enable = 1;
        clear_acc = 1;
        activation_in = 8'sd3;
        partial_sum_in = 32'd0;
        #5;
        $display("%t\tMAC\t\t%d\t%d\t%d", $time, 5, activation_in, partial_sum_out);

        // Test 3: accumulation (15 + 4*5 = 35, then 35 + (-2)*5 = 25)
        $display("\n--- 测试3:累加操作 ---");
        clear_acc = 0;
        activation_in = 8'sd4;
        #5;
        $display("%t\t累加\t\t%d\t%d\t%d", $time, 5, activation_in, partial_sum_out);
        activation_in = -8'sd2;
        #5;
        $display("%t\t累加\t\t%d\t%d\t%d", $time, 5, activation_in, partial_sum_out);

        // Test 4: negative weight (6 * -3 = -18).
        // enable stays high while the new weight is being loaded, so the PE
        // performs one more MAC with the old weight during the load cycle;
        // clear_acc = 1 afterwards restarts the sum from partial_sum_in.
        $display("\n--- 测试4:负数权重 ---");
        weight_in = -8'sd3;
        weight_load = 1;
        #5;
        weight_load = 0;
        clear_acc = 1;
        activation_in = 8'sd6;
        #5;
        $display("%t\t负数MAC\t%d\t%d\t%d", $time, -3, activation_in, partial_sum_out);

        // Test 5: maximum positive operands (127 * 127 = 16129)
        $display("\n--- 测试5:大数值测试 ---");
        weight_in = 8'sd127;
        weight_load = 1;
        #5;
        weight_load = 0;
        activation_in = 8'sd127;
        clear_acc = 1;
        #5;
        $display("%t\t最大值\t%d\t%d\t%d", $time, 127, 127, partial_sum_out);

        #20;
        $display("\n=== PE测试完成 ===");
        $finish;
    end

    // Waveform dump for offline viewing
    initial begin
        $dumpfile("pe_test.vcd");
        $dumpvars(0, tb_processing_element);
    end

endmodule
1.3 构建脉动阵列
步骤1.3.1:创建脉动阵列模块
创建文件 systolic_array.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Module Name: systolic_array
// Description: 8x8脉动阵列,用于卷积计算
//////////////////////////////////////////////////////////////////////////////////
// ARRAY_SIZE x ARRAY_SIZE grid of processing_element instances.
//
// Connectivity
//   - Activations enter at column 0 of every row and are forwarded to the
//     right through each PE's activation_out.
//   - Partial sums enter at row 0 of every column and are forwarded downward
//     through each PE's partial_sum_out; the last row drives result_out.
//   - Every PE receives the same compute_enable, weight_load and
//     accumulate_clear, and its own weight_data[row][col].
//
// result_valid[col] is activation_valid[col] delayed by ARRAY_SIZE + col + 1
// clock cycles in which `enable` is high.
// NOTE(review): that delay is inherited from the original code and has not
// been re-derived from the PE latency - confirm against the intended dataflow.
//
// Fixes relative to the previous revision
//   - The valid shift register loaded the 1-bit activation_valid[0] into an
//     ARRAY_SIZE-bit word, so bits 1..ARRAY_SIZE-1 were always 0 and
//     result_valid[1..] could never assert. All activation_valid bits are now
//     packed into the word that enters the shift register.
//   - Removed the unused activation_delay / psum_delay register arrays.
//   - The array-edge nets activation_h[row][0] and partial_sum_v[0][col] are
//     now driven from the external inputs instead of being left undriven and
//     bypassed with a per-PE multiplexer; PE inputs are unchanged.
//
// NOTE: the unpacked-array ports require SystemVerilog port semantics; compile
// this file in SystemVerilog mode. weight_data is declared unsigned but is
// connected bit-for-bit to the PE's signed weight_in.
module systolic_array #(
    parameter ARRAY_SIZE        = 8,   // array dimension (ARRAY_SIZE x ARRAY_SIZE)
    parameter WEIGHT_WIDTH      = 8,   // weight width
    parameter ACTIVATION_WIDTH  = 8,   // activation width
    parameter ACCUMULATOR_WIDTH = 32   // accumulator width
)(
    // Clock and active-low asynchronous reset
    input  wire clk,
    input  wire rst_n,
    // Global control
    input  wire enable,            // advances the result_valid shift register
    input  wire weight_load,       // all PEs capture their weight_data entry
    input  wire compute_enable,    // PE enable (MAC and activation forwarding)
    input  wire accumulate_clear,  // PEs restart their sum from partial_sum_in
    // Weights, one per PE
    input  wire [WEIGHT_WIDTH-1:0] weight_data [0:ARRAY_SIZE-1][0:ARRAY_SIZE-1],
    // Activations, one per row (left edge)
    input  wire signed [ACTIVATION_WIDTH-1:0] activation_in [0:ARRAY_SIZE-1],
    input  wire activation_valid [0:ARRAY_SIZE-1],
    // Partial sums, one per column (top edge)
    input  wire signed [ACCUMULATOR_WIDTH-1:0] partial_sum_in [0:ARRAY_SIZE-1],
    // Results, one per column (bottom edge)
    output wire signed [ACCUMULATOR_WIDTH-1:0] result_out [0:ARRAY_SIZE-1],
    output wire result_valid [0:ARRAY_SIZE-1]
);

    // Inter-PE nets. Index [row][col] is the value entering PE(row, col);
    // index [row][col+1] / [row+1][col] is what that PE drives.
    wire signed [ACTIVATION_WIDTH-1:0]  activation_h  [0:ARRAY_SIZE-1][0:ARRAY_SIZE];  // horizontal
    wire signed [ACCUMULATOR_WIDTH-1:0] partial_sum_v [0:ARRAY_SIZE][0:ARRAY_SIZE-1];  // vertical

    // activation_valid packed into a vector so it can enter the shift register as one word.
    wire [ARRAY_SIZE-1:0] activation_valid_vec;

    genvar row, col;

    // Left edge: external activations and their valid flags.
    generate
        for (row = 0; row < ARRAY_SIZE; row = row + 1) begin : left_edge
            assign activation_h[row][0]      = activation_in[row];
            assign activation_valid_vec[row] = activation_valid[row];
        end
    endgenerate

    // Top edge: external partial sums.
    generate
        for (col = 0; col < ARRAY_SIZE; col = col + 1) begin : top_edge
            assign partial_sum_v[0][col] = partial_sum_in[col];
        end
    endgenerate

    // PE grid
    generate
        for (row = 0; row < ARRAY_SIZE; row = row + 1) begin : pe_row
            for (col = 0; col < ARRAY_SIZE; col = col + 1) begin : pe_col
                processing_element #(
                    .WEIGHT_WIDTH(WEIGHT_WIDTH),
                    .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
                    .ACCUMULATOR_WIDTH(ACCUMULATOR_WIDTH)
                ) pe_inst (
                    .clk(clk),
                    .rst_n(rst_n),
                    .enable(compute_enable),
                    .weight_load(weight_load),
                    .clear_acc(accumulate_clear),
                    .weight_in(weight_data[row][col]),
                    .activation_in(activation_h[row][col]),
                    .partial_sum_in(partial_sum_v[row][col]),
                    .activation_out(activation_h[row][col+1]),
                    .partial_sum_out(partial_sum_v[row+1][col])
                );
            end
        end
    endgenerate

    // Results are taken from below the last row.
    generate
        for (col = 0; col < ARRAY_SIZE; col = col + 1) begin : output_assign
            assign result_out[col] = partial_sum_v[ARRAY_SIZE][col];
        end
    endgenerate

    // Valid pipeline: 2*ARRAY_SIZE stages, each stage one ARRAY_SIZE-bit word.
    reg [ARRAY_SIZE-1:0] valid_shift_reg [0:2*ARRAY_SIZE-1];
    integer i;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (i = 0; i < 2*ARRAY_SIZE; i = i + 1) begin
                valid_shift_reg[i] <= {ARRAY_SIZE{1'b0}};
            end
        end else if (enable) begin
            // Stage 0 captures every lane's valid flag; later stages shift.
            valid_shift_reg[0] <= activation_valid_vec;
            for (i = 1; i < 2*ARRAY_SIZE; i = i + 1) begin
                valid_shift_reg[i] <= valid_shift_reg[i-1];
            end
        end
    end

    // Column `col` taps bit `col` of stage ARRAY_SIZE + col.
    generate
        for (col = 0; col < ARRAY_SIZE; col = col + 1) begin : valid_gen
            assign result_valid[col] = valid_shift_reg[ARRAY_SIZE + col][col];
        end
    endgenerate

endmodule
步骤1.3.2:创建脉动阵列控制器
创建文件 systolic_controller.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Module Name: systolic_controller
// Description: 脉动阵列控制器,管理数据流和时序
//////////////////////////////////////////////////////////////////////////////////
// Sequencer for the systolic array.
//
// State flow: IDLE -> LOAD_WEIGHTS -> INIT_COMPUTE -> COMPUTE -> DRAIN -> DONE -> IDLE
//   LOAD_WEIGHTS : ARRAY_SIZE*ARRAY_SIZE cycles; sa_weight_load and
//                  weight_buffer_rd_en asserted, weight_buffer_addr counts up.
//   INIT_COMPUTE : one cycle; asserts sa_compute and sa_acc_clear.
//   COMPUTE      : input_height*input_width + 2*ARRAY_SIZE + 1 cycles; reads the
//                  input buffer for the first input_height*input_width cycles and
//                  writes the output buffer from cycle ARRAY_SIZE onward.
//   DRAIN        : ARRAY_SIZE + 1 cycles of additional output writes.
//   DONE         : one cycle; `done` is then high for exactly one clock.
//
// cycle_count counts the clocks spent in LOAD_WEIGHTS, COMPUTE and DRAIN.
//
// kernel_size, input_channels and output_channels are accepted but not read by
// any logic in this module; the current_* registers are reset but otherwise
// unused.
//
// Fixes relative to the previous revision
//   - `(compute_cycle_cnt - ARRAY_SIZE)[7:0]` is not legal Verilog (a
//     part-select cannot be applied to an expression); the difference is now
//     computed on a named wire.
//   - compute_cycle_cnt was 10 bits wide but is compared against
//     input_height*input_width + 2*ARRAY_SIZE, which exceeds 1023 for frames
//     as small as 32x32, so COMPUTE never terminated. The counter is now wide
//     enough for the largest product of two 10-bit dimensions.
//   - cycle_count was cleared on every IDLE cycle, i.e. one clock after
//     `done`, so it could not be read back after a run. It is now cleared when
//     a new run starts and holds its value while idle.
module systolic_controller #(
    parameter ARRAY_SIZE          = 8,
    parameter INPUT_BUFFER_DEPTH  = 256,
    parameter WEIGHT_BUFFER_DEPTH = 64
)(
    input  wire clk,
    input  wire rst_n,
    // Control
    input  wire        start,
    input  wire [9:0]  input_height,     // input feature-map height
    input  wire [9:0]  input_width,      // input feature-map width
    input  wire [9:0]  kernel_size,      // kernel size (unused here)
    input  wire [9:0]  input_channels,   // input channel count (unused here)
    input  wire [9:0]  output_channels,  // output channel count (unused here)
    // Status
    output reg         busy,
    output reg         done,
    output reg  [31:0] cycle_count,
    // Systolic-array control
    output reg         sa_enable,
    output reg         sa_weight_load,
    output reg         sa_compute,
    output reg         sa_acc_clear,
    // Buffer control
    output reg         input_buffer_rd_en,
    output reg  [7:0]  input_buffer_addr,
    output reg         weight_buffer_rd_en,
    output reg  [7:0]  weight_buffer_addr,
    output reg         output_buffer_wr_en,
    output reg  [7:0]  output_buffer_addr
);

    // FSM encoding
    localparam IDLE         = 4'b0000;
    localparam LOAD_WEIGHTS = 4'b0001;
    localparam INIT_COMPUTE = 4'b0010;
    localparam COMPUTE      = 4'b0011;
    localparam DRAIN        = 4'b0100;
    localparam DONE         = 4'b0101;

    // 1023*1023 + 2*ARRAY_SIZE fits in 21 bits for the default ARRAY_SIZE.
    localparam CNT_W = 21;

    reg [3:0] state, next_state;

    // Counters
    reg [9:0]       weight_load_cnt;
    reg [CNT_W-1:0] compute_cycle_cnt;
    reg [9:0]       drain_cnt;
    reg [9:0]       current_och;  // current output channel (not used yet)
    reg [9:0]       current_ich;  // current input channel (not used yet)
    reg [9:0]       current_ky;   // current kernel row (not used yet)
    reg [9:0]       current_kx;   // current kernel column (not used yet)

    // Derived quantities
    wire [CNT_W-1:0] pixel_count   = input_height * input_width;
    wire [CNT_W-1:0] compute_limit = pixel_count + 2*ARRAY_SIZE;
    // Only meaningful (and only used) when compute_cycle_cnt >= ARRAY_SIZE.
    wire [CNT_W-1:0] output_index  = compute_cycle_cnt - ARRAY_SIZE;

    // State register
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            state <= IDLE;
        end else begin
            state <= next_state;
        end
    end

    // Next-state logic
    always @(*) begin
        next_state = state;
        case (state)
            IDLE: begin
                if (start)
                    next_state = LOAD_WEIGHTS;
            end
            LOAD_WEIGHTS: begin
                if (weight_load_cnt >= ARRAY_SIZE * ARRAY_SIZE - 1)
                    next_state = INIT_COMPUTE;
            end
            INIT_COMPUTE: begin
                next_state = COMPUTE;
            end
            COMPUTE: begin
                if (compute_cycle_cnt >= compute_limit)
                    next_state = DRAIN;
            end
            DRAIN: begin
                if (drain_cnt >= ARRAY_SIZE)
                    next_state = DONE;
            end
            DONE: begin
                next_state = IDLE;
            end
            default: next_state = IDLE;
        endcase
    end

    // Registered outputs and counters
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            busy                <= 1'b0;
            done                <= 1'b0;
            cycle_count         <= 32'd0;
            sa_enable           <= 1'b0;
            sa_weight_load      <= 1'b0;
            sa_compute          <= 1'b0;
            sa_acc_clear        <= 1'b0;
            input_buffer_rd_en  <= 1'b0;
            input_buffer_addr   <= 8'd0;
            weight_buffer_rd_en <= 1'b0;
            weight_buffer_addr  <= 8'd0;
            output_buffer_wr_en <= 1'b0;
            output_buffer_addr  <= 8'd0;
            weight_load_cnt     <= 10'd0;
            compute_cycle_cnt   <= {CNT_W{1'b0}};
            drain_cnt           <= 10'd0;
            current_och         <= 10'd0;
            current_ich         <= 10'd0;
            current_ky          <= 10'd0;
            current_kx          <= 10'd0;
        end else begin
            // Defaults: strobes are low unless the active state raises them.
            done                <= 1'b0;
            sa_weight_load      <= 1'b0;
            input_buffer_rd_en  <= 1'b0;
            weight_buffer_rd_en <= 1'b0;
            output_buffer_wr_en <= 1'b0;

            case (state)
                IDLE: begin
                    busy       <= 1'b0;
                    sa_enable  <= 1'b0;
                    sa_compute <= 1'b0;
                    // Restart the cycle counter only when a new run begins,
                    // so the previous run's count stays readable while idle.
                    if (start) begin
                        cycle_count <= 32'd0;
                    end
                end

                LOAD_WEIGHTS: begin
                    busy                <= 1'b1;
                    sa_enable           <= 1'b1;
                    sa_weight_load      <= 1'b1;
                    // Read the weight buffer
                    weight_buffer_rd_en <= 1'b1;
                    weight_buffer_addr  <= weight_load_cnt[7:0];
                    weight_load_cnt     <= weight_load_cnt + 1'b1;
                    cycle_count         <= cycle_count + 1'b1;
                end

                INIT_COMPUTE: begin
                    sa_weight_load    <= 1'b0;
                    sa_compute        <= 1'b1;
                    sa_acc_clear      <= 1'b1;  // first MAC starts a fresh sum
                    compute_cycle_cnt <= {CNT_W{1'b0}};
                end

                COMPUTE: begin
                    sa_acc_clear <= 1'b0;
                    // Read input data while pixels remain
                    if (compute_cycle_cnt < pixel_count) begin
                        input_buffer_rd_en <= 1'b1;
                        input_buffer_addr  <= compute_cycle_cnt[7:0];
                    end
                    // Write outputs once ARRAY_SIZE cycles have elapsed
                    if (compute_cycle_cnt >= ARRAY_SIZE) begin
                        output_buffer_wr_en <= 1'b1;
                        output_buffer_addr  <= output_index[7:0];
                    end
                    compute_cycle_cnt <= compute_cycle_cnt + 1'b1;
                    cycle_count       <= cycle_count + 1'b1;
                end

                DRAIN: begin
                    // Flush the remaining pipeline contents
                    output_buffer_wr_en <= 1'b1;
                    output_buffer_addr  <= output_buffer_addr + 1'b1;
                    drain_cnt           <= drain_cnt + 1'b1;
                    cycle_count         <= cycle_count + 1'b1;
                end

                DONE: begin
                    busy       <= 1'b0;
                    done       <= 1'b1;
                    sa_enable  <= 1'b0;
                    sa_compute <= 1'b0;
                    // Re-arm the counters for the next run
                    weight_load_cnt   <= 10'd0;
                    compute_cycle_cnt <= {CNT_W{1'b0}};
                    drain_cnt         <= 10'd0;
                end
            endcase
        end
    end

    // Performance counter: adds ARRAY_SIZE*ARRAY_SIZE per COMPUTE cycle in
    // which sa_compute is high. It is not connected to any port.
    reg [31:0] mac_operations;
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            mac_operations <= 32'd0;
        end else if (state == COMPUTE && sa_compute) begin
            mac_operations <= mac_operations + (ARRAY_SIZE * ARRAY_SIZE);
        end
    end

endmodule
1.4 实现卷积引擎顶层
步骤1.4.1:创建完整的卷积引擎
创建文件 conv_engine_top.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Module Name: conv_engine_top
// Description: 完整的卷积引擎顶层模块
//////////////////////////////////////////////////////////////////////////////////
// Convolution-engine top level: AXI4-Lite register file, AXI4-Stream input
// and output, on-chip buffers, the systolic array and its controller.
//
// Register map (byte offsets, 32-bit registers)
//   0x00 ctrl        [0] start (self-clears once conv_busy is high)
//                    [1] clear_done (self-clears)
//                    [2] weight_load_mode (enables the stream input)
//   0x04 status      [0] busy, [1] done (sticky; cleared by clear_done or start)
//   0x08 img_size    [31:16] height, [15:0] width
//   0x0C kernel_size [23:16] height, [7:0] width
//   0x10 channels    [31:16] input channels, [15:0] output channels
//   0x14 stride_pad  [19:16] padding, [3:0] stride
//   0x18 perf        controller cycle count (read only)
//
// Known limitations visible in this module (not changed here)
//   - weight_buffer is read but never written by any logic in this module.
//   - The stream input only accepts data while ctrl[2] is set and always
//     writes to input_buffer.
//   - s_axi_wstrb is not used; writes always update all 32 bits.
//   - Only output_buffer columns 0 and 1 are sent on the output stream.
//   - The controller only drives 8 address bits; the upper 2 bits of the
//     10-bit buffer read/write addresses are tied to 0.
//   - NOTE(review): input_buffer and weight_buffer are read combinationally,
//     which may prevent the requested ram_style from being inferred - confirm
//     with the synthesis report.
//   - The unpacked-array connections to systolic_array need SystemVerilog.
//
// Fixes relative to the previous revision
//   - conv_busy, conv_done, performance_counter, weight_addr, output_wr_en and
//     the buffer addresses were declared `reg` yet driven by instance output
//     ports, which is illegal; they are now nets (port directions and widths
//     are unchanged).
//   - input_rd_en / weight_rd_en were implicit nets; they are now declared.
//   - The upper bits of input_rd_addr / output_wr_addr were undriven.
//   - kernel_h and padding were taken from bit fields that did not match the
//     reset defaults (0x00030003 -> 3x3, 0x00010001 -> stride 1, pad 1); they
//     now use the same 16-bit field layout as the other registers.
//   - The status done bit was a one-clock pulse; it is now sticky.
//   - Output stream: TVALID no longer waits for TREADY, the first beat carries
//     real data instead of the reset value, the last beat is held until it is
//     accepted, and the element counter is 32 bits so large frames terminate.
//   - The stream unpacking is written as a loop bounded by ARRAY_SIZE and the
//     64-bit bus width instead of eight fixed lanes.
//   - Removed never-used registers (status_reg, perf_counter_reg,
//     input_wr_en, pixel_counter).
module conv_engine_top #(
    parameter ARRAY_SIZE        = 8,
    parameter DATA_WIDTH        = 8,
    parameter WEIGHT_WIDTH      = 8,
    parameter ACCUMULATOR_WIDTH = 32,
    parameter MAX_IMAGE_SIZE    = 640,
    parameter MAX_KERNEL_SIZE   = 7
)(
    // System
    input  wire        clk,
    input  wire        rst_n,
    // AXI4-Lite slave (configuration)
    input  wire [31:0] s_axi_awaddr,
    input  wire        s_axi_awvalid,
    output reg         s_axi_awready,
    input  wire [31:0] s_axi_wdata,
    input  wire [3:0]  s_axi_wstrb,
    input  wire        s_axi_wvalid,
    output reg         s_axi_wready,
    output reg  [1:0]  s_axi_bresp,
    output reg         s_axi_bvalid,
    input  wire        s_axi_bready,
    input  wire [31:0] s_axi_araddr,
    input  wire        s_axi_arvalid,
    output reg         s_axi_arready,
    output reg  [31:0] s_axi_rdata,
    output reg  [1:0]  s_axi_rresp,
    output reg         s_axi_rvalid,
    input  wire        s_axi_rready,
    // AXI4-Stream input (feature map)
    input  wire [63:0] s_axis_tdata,
    input  wire        s_axis_tlast,
    input  wire        s_axis_tvalid,
    output reg         s_axis_tready,
    // AXI4-Stream output (results)
    output reg  [63:0] m_axis_tdata,
    output reg         m_axis_tlast,
    output reg         m_axis_tvalid,
    input  wire        m_axis_tready,
    // Status and debug (driven by the controller instance)
    output wire        conv_busy,
    output wire        conv_done,
    output wire [31:0] performance_counter
);

    // ========================================
    // Configuration registers
    // ========================================
    reg [31:0] ctrl_reg;         // 0x00
    reg [31:0] img_size_reg;     // 0x08
    reg [31:0] kernel_size_reg;  // 0x0C
    reg [31:0] channel_reg;      // 0x10
    reg [31:0] stride_pad_reg;   // 0x14
    reg        done_sticky;      // 0x04 bit 1

    // Control bits
    wire start_conv       = ctrl_reg[0];
    wire clear_done       = ctrl_reg[1];
    wire weight_load_mode = ctrl_reg[2];

    // Decoded parameters
    wire [15:0] img_height   = img_size_reg[31:16];
    wire [15:0] img_width    = img_size_reg[15:0];
    wire [7:0]  kernel_h     = kernel_size_reg[23:16];
    wire [7:0]  kernel_w     = kernel_size_reg[7:0];
    wire [15:0] in_channels  = channel_reg[31:16];
    wire [15:0] out_channels = channel_reg[15:0];
    wire [3:0]  stride       = stride_pad_reg[3:0];
    wire [3:0]  padding      = stride_pad_reg[19:16];

    // ========================================
    // Buffers
    // ========================================
    // Input buffer: one column of 1024 entries per array row
    (* ram_style = "block" *)
    reg [DATA_WIDTH-1:0] input_buffer [0:ARRAY_SIZE-1][0:1023];
    reg  [9:0] input_wr_addr;
    wire [7:0] input_rd_addr_lo;                           // from the controller
    wire [9:0] input_rd_addr = {2'b00, input_rd_addr_lo};
    wire       input_rd_en;

    // Weight buffer: 256 entries per PE
    (* ram_style = "ultra" *)
    reg [WEIGHT_WIDTH-1:0] weight_buffer [0:ARRAY_SIZE-1][0:ARRAY_SIZE-1][0:255];
    wire [7:0] weight_addr;                                // from the controller
    wire       weight_rd_en;
    wire [WEIGHT_WIDTH-1:0] weight_data [0:ARRAY_SIZE-1][0:ARRAY_SIZE-1];

    // Output buffer: one column of 1024 entries per array column
    (* ram_style = "block" *)
    reg [ACCUMULATOR_WIDTH-1:0] output_buffer [0:ARRAY_SIZE-1][0:1023];
    wire [7:0] output_wr_addr_lo;                          // from the controller
    wire [9:0] output_wr_addr = {2'b00, output_wr_addr_lo};
    reg  [9:0] output_rd_addr;
    wire       output_wr_en;

    // ========================================
    // Systolic array
    // ========================================
    wire sa_enable;
    wire sa_weight_load;
    wire sa_compute;
    wire sa_acc_clear;
    wire signed [DATA_WIDTH-1:0]        sa_activation_in    [0:ARRAY_SIZE-1];
    wire                                sa_activation_valid [0:ARRAY_SIZE-1];
    wire signed [ACCUMULATOR_WIDTH-1:0] sa_partial_sum_in   [0:ARRAY_SIZE-1];
    wire signed [ACCUMULATOR_WIDTH-1:0] sa_result_out       [0:ARRAY_SIZE-1];
    wire                                sa_result_valid     [0:ARRAY_SIZE-1];

    systolic_array #(
        .ARRAY_SIZE(ARRAY_SIZE),
        .WEIGHT_WIDTH(WEIGHT_WIDTH),
        .ACTIVATION_WIDTH(DATA_WIDTH),
        .ACCUMULATOR_WIDTH(ACCUMULATOR_WIDTH)
    ) sa_inst (
        .clk(clk),
        .rst_n(rst_n),
        .enable(sa_enable),
        .weight_load(sa_weight_load),
        .compute_enable(sa_compute),
        .accumulate_clear(sa_acc_clear),
        .weight_data(weight_data),
        .activation_in(sa_activation_in),
        .activation_valid(sa_activation_valid),
        .partial_sum_in(sa_partial_sum_in),
        .result_out(sa_result_out),
        .result_valid(sa_result_valid)
    );

    // ========================================
    // Controller
    // ========================================
    // The controller's dimension ports are 10 bits wide; the register fields
    // are truncated / zero-extended explicitly to match.
    systolic_controller #(
        .ARRAY_SIZE(ARRAY_SIZE)
    ) controller_inst (
        .clk(clk),
        .rst_n(rst_n),
        .start(start_conv),
        .input_height(img_height[9:0]),
        .input_width(img_width[9:0]),
        .kernel_size({2'b00, kernel_h}),
        .input_channels(in_channels[9:0]),
        .output_channels(out_channels[9:0]),
        .busy(conv_busy),
        .done(conv_done),
        .cycle_count(performance_counter),
        .sa_enable(sa_enable),
        .sa_weight_load(sa_weight_load),
        .sa_compute(sa_compute),
        .sa_acc_clear(sa_acc_clear),
        .input_buffer_rd_en(input_rd_en),
        .input_buffer_addr(input_rd_addr_lo),
        .weight_buffer_rd_en(weight_rd_en),
        .weight_buffer_addr(weight_addr),
        .output_buffer_wr_en(output_wr_en),
        .output_buffer_addr(output_wr_addr_lo)
    );

    // ========================================
    // AXI4-Lite interface
    // ========================================
    // Write address channel: AWREADY pulses for one clock while AWVALID is high.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s_axi_awready <= 1'b0;
        end else begin
            s_axi_awready <= s_axi_awvalid && !s_axi_awready;
        end
    end

    // Write data channel and register update. The register is written on
    // every clock in which both AWVALID and WVALID are high.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s_axi_wready    <= 1'b0;
            ctrl_reg        <= 32'h0;
            img_size_reg    <= 32'h02800280;  // default 640x640
            kernel_size_reg <= 32'h00030003;  // default 3x3
            channel_reg     <= 32'h00030010;  // default 3 in, 16 out
            stride_pad_reg  <= 32'h00010001;  // default stride 1, pad 1
        end else begin
            if (s_axi_wvalid && s_axi_awvalid) begin
                s_axi_wready <= 1'b1;
                case (s_axi_awaddr[7:0])
                    8'h00: ctrl_reg        <= s_axi_wdata;
                    8'h08: img_size_reg    <= s_axi_wdata;
                    8'h0C: kernel_size_reg <= s_axi_wdata;
                    8'h10: channel_reg     <= s_axi_wdata;
                    8'h14: stride_pad_reg  <= s_axi_wdata;
                    default: ;  // writes to other offsets are ignored
                endcase
            end else begin
                s_axi_wready <= 1'b0;
            end

            // These later assignments take priority over a simultaneous bus write.
            // Self-clear the start bit once the controller reports busy.
            if (conv_busy) begin
                ctrl_reg[0] <= 1'b0;
            end
            // Self-clear the clear_done request.
            if (clear_done) begin
                ctrl_reg[1] <= 1'b0;
            end
        end
    end

    // Sticky done flag for the status register.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            done_sticky <= 1'b0;
        end else if (conv_done) begin
            done_sticky <= 1'b1;
        end else if (clear_done || start_conv) begin
            done_sticky <= 1'b0;
        end
    end

    // Write response channel
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s_axi_bvalid <= 1'b0;
            s_axi_bresp  <= 2'b00;
        end else begin
            if (s_axi_wready && s_axi_wvalid && !s_axi_bvalid) begin
                s_axi_bvalid <= 1'b1;
                s_axi_bresp  <= 2'b00;  // OKAY
            end else if (s_axi_bready && s_axi_bvalid) begin
                s_axi_bvalid <= 1'b0;
            end
        end
    end

    // Read address channel: ARREADY pulses for one clock while ARVALID is high.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s_axi_arready <= 1'b0;
        end else begin
            s_axi_arready <= s_axi_arvalid && !s_axi_arready;
        end
    end

    // Read data channel
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s_axi_rvalid <= 1'b0;
            s_axi_rdata  <= 32'h0;
            s_axi_rresp  <= 2'b00;
        end else begin
            if (s_axi_arvalid && s_axi_arready) begin
                s_axi_rvalid <= 1'b1;
                s_axi_rresp  <= 2'b00;
                case (s_axi_araddr[7:0])
                    8'h00: s_axi_rdata <= ctrl_reg;
                    8'h04: s_axi_rdata <= {30'b0, conv_done | done_sticky, conv_busy};
                    8'h08: s_axi_rdata <= img_size_reg;
                    8'h0C: s_axi_rdata <= kernel_size_reg;
                    8'h10: s_axi_rdata <= channel_reg;
                    8'h14: s_axi_rdata <= stride_pad_reg;
                    8'h18: s_axi_rdata <= performance_counter;
                    default: s_axi_rdata <= 32'h0;
                endcase
            end else if (s_axi_rready && s_axi_rvalid) begin
                s_axi_rvalid <= 1'b0;
            end
        end
    end

    // ========================================
    // AXI-Stream input
    // ========================================
    localparam IN_IDLE = 3'b000;
    localparam IN_RECV = 3'b001;
    localparam IN_DONE = 3'b010;

    reg [2:0] input_state;
    integer   lane;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s_axis_tready <= 1'b0;
            input_wr_addr <= 10'b0;
            input_state   <= IN_IDLE;
        end else begin
            case (input_state)
                IN_IDLE: begin
                    // The stream is only opened while ctrl[2] is set.
                    if (weight_load_mode) begin
                        s_axis_tready <= 1'b1;
                        input_state   <= IN_RECV;
                    end
                end

                IN_RECV: begin
                    if (s_axis_tvalid && s_axis_tready) begin
                        // Split the 64-bit beat into DATA_WIDTH-bit lanes, lane 0
                        // in the least-significant bits. Lanes that do not fit in
                        // the bus are skipped.
                        for (lane = 0; lane < ARRAY_SIZE; lane = lane + 1) begin
                            if ((lane + 1) * DATA_WIDTH <= 64) begin
                                input_buffer[lane][input_wr_addr] <=
                                    s_axis_tdata[lane*DATA_WIDTH +: DATA_WIDTH];
                            end
                        end
                        input_wr_addr <= input_wr_addr + 1'b1;
                        if (s_axis_tlast) begin
                            s_axis_tready <= 1'b0;
                            input_state   <= IN_DONE;
                        end
                    end
                end

                IN_DONE: begin
                    input_state   <= IN_IDLE;
                    input_wr_addr <= 10'b0;
                end

                default: begin
                    input_state <= IN_IDLE;
                end
            endcase
        end
    end

    // ========================================
    // AXI-Stream output
    // ========================================
    localparam OUT_IDLE = 3'b000;
    localparam OUT_SEND = 3'b001;

    reg  [2:0]  output_state;
    reg  [31:0] output_counter;                           // elements loaded so far
    wire [31:0] total_outputs = img_height * img_width;   // elements to send

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            m_axis_tvalid  <= 1'b0;
            m_axis_tlast   <= 1'b0;
            m_axis_tdata   <= 64'b0;
            output_rd_addr <= 10'b0;
            output_counter <= 32'b0;
            output_state   <= OUT_IDLE;
        end else begin
            case (output_state)
                OUT_IDLE: begin
                    if (conv_done) begin
                        // Present the first beat immediately; TVALID does not
                        // depend on TREADY.
                        m_axis_tdata[31:0]  <= output_buffer[0][10'd0];
                        m_axis_tdata[63:32] <= output_buffer[1][10'd0];
                        m_axis_tvalid       <= 1'b1;
                        m_axis_tlast        <= (total_outputs <= 32'd2);
                        output_rd_addr      <= 10'd1;
                        output_counter      <= 32'd2;
                        output_state        <= OUT_SEND;
                    end
                end

                OUT_SEND: begin
                    // TVALID is high throughout this state, so TREADY alone
                    // marks an accepted beat.
                    if (m_axis_tready) begin
                        if (m_axis_tlast) begin
                            // Final beat accepted: close the stream.
                            m_axis_tvalid  <= 1'b0;
                            m_axis_tlast   <= 1'b0;
                            output_rd_addr <= 10'b0;
                            output_counter <= 32'b0;
                            output_state   <= OUT_IDLE;
                        end else begin
                            // Two 32-bit results per 64-bit beat (columns 0 and 1).
                            m_axis_tdata[31:0]  <= output_buffer[0][output_rd_addr];
                            m_axis_tdata[63:32] <= output_buffer[1][output_rd_addr];
                            output_rd_addr      <= output_rd_addr + 1'b1;
                            output_counter      <= output_counter + 32'd2;
                            m_axis_tlast        <= (output_counter + 32'd2 >= total_outputs);
                        end
                    end
                end

                default: begin
                    output_state <= OUT_IDLE;
                end
            endcase
        end
    end

    // ========================================
    // Data connections
    // ========================================
    genvar gi, gj;
    generate
        // Input buffer -> array activations
        for (gi = 0; gi < ARRAY_SIZE; gi = gi + 1) begin : connect_input
            assign sa_activation_in[gi]    = input_buffer[gi][input_rd_addr];
            assign sa_activation_valid[gi] = input_rd_en;
        end
        // Weight buffer -> array weights
        for (gi = 0; gi < ARRAY_SIZE; gi = gi + 1) begin : connect_weight_row
            for (gj = 0; gj < ARRAY_SIZE; gj = gj + 1) begin : connect_weight_col
                assign weight_data[gi][gj] = weight_buffer[gi][gj][weight_addr];
            end
        end
        // Partial sums entering the top of the array are tied to zero
        for (gi = 0; gi < ARRAY_SIZE; gi = gi + 1) begin : init_psum
            assign sa_partial_sum_in[gi] = {ACCUMULATOR_WIDTH{1'b0}};
        end
    endgenerate

    // Store array results: a column is written only when its valid flag is high.
    integer k;
    always @(posedge clk) begin
        if (output_wr_en) begin
            for (k = 0; k < ARRAY_SIZE; k = k + 1) begin
                if (sa_result_valid[k]) begin
                    output_buffer[k][output_wr_addr] <= sa_result_out[k];
                end
            end
        end
    end

endmodule
1.5 综合测试与验证
步骤1.5.1:创建综合testbench
创建文件 tb_conv_engine_complete.v:
`timescale 1ns / 1ps
// End-to-end testbench for conv_engine_top.
//
// Sequence: reset, program the configuration registers over AXI4-Lite, read
// one back, stream two data blocks into the engine, start the convolution,
// wait for conv_done, read the performance counter and consume the output
// stream until TLAST. A watchdog ends the simulation after 100000 time units.
//
// Fixes relative to the previous revision
//   - CLK_PERIOD/2 was an integer division (5/2 = 2), giving a 4 ns clock
//     instead of the intended 5 ns / 200 MHz; the half period is now 2.5.
//   - read_data was declared in the middle of a procedural block after
//     executable statements; it is now a module-level register.
//   - `break` is SystemVerilog-only; the receive loop now exits with
//     `disable` on a named block.
//   - Bus stimulus is driven with non-blocking assignments so values changed
//     right after a clock edge cannot race with the DUT sampling that edge.
//   - Read data is sampled on the falling edge after RVALID rises, when
//     RDATA is guaranteed to have settled.
module tb_conv_engine_complete;

    // Test parameters
    parameter CLK_PERIOD = 5;  // ns, 200 MHz

    // Clock and reset
    reg clk;
    reg rst_n;

    // AXI4-Lite
    reg  [31:0] s_axi_awaddr;
    reg         s_axi_awvalid;
    wire        s_axi_awready;
    reg  [31:0] s_axi_wdata;
    reg  [3:0]  s_axi_wstrb;
    reg         s_axi_wvalid;
    wire        s_axi_wready;
    wire [1:0]  s_axi_bresp;
    wire        s_axi_bvalid;
    reg         s_axi_bready;
    reg  [31:0] s_axi_araddr;
    reg         s_axi_arvalid;
    wire        s_axi_arready;
    wire [31:0] s_axi_rdata;
    wire [1:0]  s_axi_rresp;
    wire        s_axi_rvalid;
    reg         s_axi_rready;

    // AXI-Stream
    reg  [63:0] s_axis_tdata;
    reg         s_axis_tlast;
    reg         s_axis_tvalid;
    wire        s_axis_tready;
    wire [63:0] m_axis_tdata;
    wire        m_axis_tlast;
    wire        m_axis_tvalid;
    reg         m_axis_tready;

    // Status
    wire        conv_busy;
    wire        conv_done;
    wire [31:0] performance_counter;

    // Register read-back value
    reg [31:0] read_data;

    // Device under test
    conv_engine_top DUT (
        .clk(clk),
        .rst_n(rst_n),
        .s_axi_awaddr(s_axi_awaddr),
        .s_axi_awvalid(s_axi_awvalid),
        .s_axi_awready(s_axi_awready),
        .s_axi_wdata(s_axi_wdata),
        .s_axi_wstrb(s_axi_wstrb),
        .s_axi_wvalid(s_axi_wvalid),
        .s_axi_wready(s_axi_wready),
        .s_axi_bresp(s_axi_bresp),
        .s_axi_bvalid(s_axi_bvalid),
        .s_axi_bready(s_axi_bready),
        .s_axi_araddr(s_axi_araddr),
        .s_axi_arvalid(s_axi_arvalid),
        .s_axi_arready(s_axi_arready),
        .s_axi_rdata(s_axi_rdata),
        .s_axi_rresp(s_axi_rresp),
        .s_axi_rvalid(s_axi_rvalid),
        .s_axi_rready(s_axi_rready),
        .s_axis_tdata(s_axis_tdata),
        .s_axis_tlast(s_axis_tlast),
        .s_axis_tvalid(s_axis_tvalid),
        .s_axis_tready(s_axis_tready),
        .m_axis_tdata(m_axis_tdata),
        .m_axis_tlast(m_axis_tlast),
        .m_axis_tvalid(m_axis_tvalid),
        .m_axis_tready(m_axis_tready),
        .conv_busy(conv_busy),
        .conv_done(conv_done),
        .performance_counter(performance_counter)
    );

    // Clock: real-valued half period so an odd CLK_PERIOD is not truncated.
    initial begin
        clk = 0;
        forever #(CLK_PERIOD/2.0) clk = ~clk;
    end

    // AXI4-Lite write: drives address and data together, waits for both
    // ready signals, then waits for the write response.
    task axi_write;
        input [31:0] addr;
        input [31:0] data;
        begin
            @(posedge clk);
            s_axi_awaddr  <= addr;
            s_axi_awvalid <= 1'b1;
            s_axi_wdata   <= data;
            s_axi_wstrb   <= 4'hF;
            s_axi_wvalid  <= 1'b1;
            s_axi_bready  <= 1'b1;
            wait (s_axi_awready && s_axi_wready);
            @(posedge clk);
            s_axi_awvalid <= 1'b0;
            s_axi_wvalid  <= 1'b0;
            wait (s_axi_bvalid);
            @(posedge clk);
            s_axi_bready  <= 1'b0;
        end
    endtask

    // AXI4-Lite read: returns the word read from `addr` in `data`.
    task axi_read;
        input  [31:0] addr;
        output [31:0] data;
        begin
            @(posedge clk);
            s_axi_araddr  <= addr;
            s_axi_arvalid <= 1'b1;
            s_axi_rready  <= 1'b1;
            wait (s_axi_arready);
            @(posedge clk);
            s_axi_arvalid <= 1'b0;
            wait (s_axi_rvalid);
            // Sample mid-cycle, before the edge that completes the R handshake.
            @(negedge clk);
            data = s_axi_rdata;
            @(posedge clk);
            s_axi_rready  <= 1'b0;
        end
    endtask

    // Stream num_pixels/8 beats into the DUT; every byte of beat i holds
    // i[7:0], and TLAST marks the final beat.
    task send_input_data;
        input [31:0] num_pixels;
        integer i;
        begin
            @(posedge clk);
            for (i = 0; i < num_pixels/8; i = i + 1) begin
                s_axis_tdata  <= {8{i[7:0]}};
                s_axis_tvalid <= 1'b1;
                s_axis_tlast  <= (i == num_pixels/8 - 1);
                wait (s_axis_tready);
                @(posedge clk);
            end
            s_axis_tvalid <= 1'b0;
            s_axis_tlast  <= 1'b0;
        end
    endtask

    // Main sequence
    initial begin
        // Initial values, reset asserted
        $display("=== 卷积引擎完整测试开始 ===");
        rst_n = 0;
        s_axi_awaddr = 0;
        s_axi_awvalid = 0;
        s_axi_wdata = 0;
        s_axi_wstrb = 0;
        s_axi_wvalid = 0;
        s_axi_bready = 0;
        s_axi_araddr = 0;
        s_axi_arvalid = 0;
        s_axi_rready = 0;
        s_axis_tdata = 0;
        s_axis_tlast = 0;
        s_axis_tvalid = 0;
        m_axis_tready = 1;

        // Release reset
        #100 rst_n = 1;
        #100;

        // Program the convolution parameters
        $display("\n--- 配置卷积参数 ---");
        axi_write(32'h08, 32'h00200020);  // 32x32 image
        axi_write(32'h0C, 32'h00030003);  // 3x3 kernel
        axi_write(32'h10, 32'h00030010);  // 3 input channels, 16 output channels
        axi_write(32'h14, 32'h00010001);  // stride = 1, pad = 1

        // Read one register back
        axi_read(32'h08, read_data);
        $display("图像尺寸配置: 0x%08X", read_data);

        // Weight load (simplified)
        $display("\n--- 加载权重 ---");
        axi_write(32'h00, 32'h00000004);  // set weight-load mode
        #20;
        send_input_data(32*32);

        // Allow the load to settle
        #200;

        // Input feature map
        $display("\n--- 发送输入特征图 ---");
        send_input_data(32*32);

        // Start the convolution
        $display("\n--- 启动卷积计算 ---");
        axi_write(32'h00, 32'h00000001);

        // Wait for completion
        wait (conv_done);
        $display("卷积完成!");

        // Read the performance counter
        axi_read(32'h18, read_data);
        $display("执行周期数: %d", read_data);

        // Consume the output stream until TLAST or until TVALID drops
        $display("\n--- 接收输出数据 ---");
        wait (m_axis_tvalid);
        begin : rx_loop
            while (m_axis_tvalid) begin
                @(posedge clk);
                if (m_axis_tlast) begin
                    $display("收到最后数据");
                    disable rx_loop;
                end
            end
        end

        #100;
        $display("\n=== 测试完成 ===");
        $finish;
    end

    // Watchdog
    initial begin
        #100000;
        $display("ERROR: 测试超时!");
        $finish;
    end

    // Waveform dump
    initial begin
        $dumpfile("conv_engine_test.vcd");
        $dumpvars(0, tb_conv_engine_complete);
    end

endmodule

浙公网安备 33010602011771号