卷积引擎实现(p2):综合
P2:综合与实现流程
综合约束
####################################################################################
# 卷积引擎时序约束文件
# 目标器件: xczu9eg
# 创建日期: 2025/9/14
####################################################################################
# ===========================================
# 1. Clock definitions
# ===========================================
# Main processing clock - 200 MHz (5 ns period), defined on the top-level port clk
create_clock -period 5.000 -name clk_main [get_ports clk]
set_property IOSTANDARD LVCMOS18 [get_ports clk]
# NOTE(review): confirm AL8 is a single-ended, clock-capable 1.8 V pin on the target board
set_property PACKAGE_PIN AL8 [get_ports clk]
# AXI clock (if it is an independent clock) - 100 MHz
# create_clock -period 10.000 -name axi_clk [get_ports axi_clk]
# ===========================================
# 2. Clock uncertainty and jitter
# ===========================================
# Clock uncertainty applied to clk_main: 0.200 ns for setup, 0.100 ns for hold
set_clock_uncertainty -setup 0.200 [get_clocks clk_main]
set_clock_uncertainty -hold 0.100 [get_clocks clk_main]
# Input jitter of 0.100 ns on clk_main
set_input_jitter [get_clocks clk_main] 0.100
# ===========================================
# 3. Input/output delay constraints
# ===========================================
# Virtual clock (10 ns period) used as the reference for the s_axi_* I/O constraints
create_clock -period 10.000 -name virtual_io_clk
# The s_axi_* / s_axis_* / m_axis_* wildcards match ports of both directions
# (for example the ready/valid handshake signals run opposite to the data).
# An input delay is only meaningful on an input port and an output delay on an
# output port, so every constraint below is restricted by port DIRECTION.
set_input_delay -clock virtual_io_clk -max 2.000 [get_ports -filter {DIRECTION == IN} s_axi_*]
set_input_delay -clock virtual_io_clk -min 0.500 [get_ports -filter {DIRECTION == IN} s_axi_*]
set_output_delay -clock virtual_io_clk -max 2.000 [get_ports -filter {DIRECTION == OUT} s_axi_*]
set_output_delay -clock virtual_io_clk -min 0.500 [get_ports -filter {DIRECTION == OUT} s_axi_*]
# AXI-Stream interface constraints, referenced to clk_main
set_input_delay -clock clk_main -max 1.000 [get_ports -filter {DIRECTION == IN} {s_axis_* m_axis_*}]
set_input_delay -clock clk_main -min 0.200 [get_ports -filter {DIRECTION == IN} {s_axis_* m_axis_*}]
set_output_delay -clock clk_main -max 1.000 [get_ports -filter {DIRECTION == OUT} {s_axis_* m_axis_*}]
set_output_delay -clock clk_main -min 0.200 [get_ports -filter {DIRECTION == OUT} {s_axis_* m_axis_*}]
# ===========================================
# 4. Multicycle path constraints
# ===========================================
# NOTE(review): the get_pins patterns in sections 4 and 6 are not -hierarchical, while
# the get_cells queries elsewhere in this file are; verify the patterns match the
# intended registers in the synthesized netlist.
# Weight-load paths are allowed two cycles for setup (hold check kept on the launch edge)
set_multicycle_path -setup 2 -from [get_pins */weight_buffer_reg*/C] -to [get_pins */weight_reg*/D]
set_multicycle_path -hold 1 -from [get_pins */weight_buffer_reg*/C] -to [get_pins */weight_reg*/D]
# Accumulator-to-partial-sum paths are relaxed the same way
set_multicycle_path -setup 2 -from [get_pins */accumulator_reg*/C] -to [get_pins */partial_sum_out_reg*/D]
set_multicycle_path -hold 1 -from [get_pins */accumulator_reg*/C] -to [get_pins */partial_sum_out_reg*/D]
# ===========================================
# 5. False path definitions
# ===========================================
# The reset signal is asynchronous
set_false_path -from [get_ports rst_n]
# Paths from the configuration registers into the systolic-array datapath
set_false_path -from [get_cells -hierarchical -filter {NAME =~ */ctrl_reg_reg*}] \
-to [get_cells -hierarchical -filter {NAME =~ */sa_inst/*}]
# Status-register read path (performance counters to the AXI read-data port)
set_false_path -from [get_cells -hierarchical -filter {NAME =~ */performance_counter_reg*}] \
-to [get_ports {s_axi_rdata[*]}]
# ===========================================
# 6. Maximum delay constraints (critical paths)
# ===========================================
# DSP chain path: weight register to partial-sum output register, 4.5 ns
set_max_delay 4.500 -from [get_pins */pe_inst/weight_reg_reg*/C] \
-to [get_pins */pe_inst/partial_sum_out_reg*/D]
# Systolic-array data propagation path: activation in to activation out, 4.0 ns
set_max_delay 4.000 -from [get_pins */activation_in_reg*/C] \
-to [get_pins */activation_out_reg*/D]
# ===========================================
# 7. Fanout constraints
# ===========================================
# Fanout limits on the control signals.
# Vivado does not implement the SDC set_max_fanout command; the limit is
# expressed through the MAX_FANOUT property on the net instead.
set_property MAX_FANOUT 32 [get_nets -hierarchical -filter {NAME =~ */sa_enable}]
set_property MAX_FANOUT 32 [get_nets -hierarchical -filter {NAME =~ */sa_compute}]
set_property MAX_FANOUT 16 [get_nets -hierarchical -filter {NAME =~ */weight_load}]
# ===========================================
# 8. Physical constraints (area constraints)
# ===========================================
# Create a physical region (Pblock) for the systolic array and assign every cell
# below */sa_inst/ to it
create_pblock pblock_systolic_array
add_cells_to_pblock pblock_systolic_array [get_cells -hierarchical -filter {NAME =~ */sa_inst/*}]
resize_pblock pblock_systolic_array -add {SLICE_X40Y120:SLICE_X79Y179}
# DSP48E2 site range added to the same Pblock
resize_pblock pblock_systolic_array -add {DSP48E2_X8Y48:DSP48E2_X15Y71}
# BRAM placement constraint
# NOTE(review): the */*buffer* pattern may also match cells already assigned to
# pblock_systolic_array (anything under sa_inst with "buffer" in its name) - confirm.
create_pblock pblock_memory
add_cells_to_pblock pblock_memory [get_cells -hierarchical -filter {NAME =~ */*buffer*}]
resize_pblock pblock_memory -add {RAMB36_X4Y24:RAMB36_X7Y35}
# URAM site range added to the memory Pblock
resize_pblock pblock_memory -add {URAM288_X2Y32:URAM288_X3Y39}
# ===========================================
# 9. Clock domain crossing (CDC) constraints
# ===========================================
# If the design has several clock domains, declare them as asynchronous groups
# set_clock_groups -asynchronous \
# -group [get_clocks clk_main] \
# -group [get_clocks axi_clk]
# ===========================================
# 10. Power optimization constraints
# ===========================================
# Default switching activity used for power estimation
set_switching_activity -default_toggle_rate 12.5 -default_static_probability 0.5
# Specific values for a low-activity signal (weight_load)
set_switching_activity -toggle_rate 2.0 -static_probability 0.1 \
[get_nets -hierarchical -filter {NAME =~ */weight_load}]
# ===========================================
# 11. Configuration-specific implementation strategy
# ===========================================
# NOTE(review): the commands in this section operate on project run objects
# (get_runs). They look like Tcl-console / project-script commands rather than
# design constraints - confirm they are accepted when this file is read as XDC,
# otherwise move them into the project Tcl script.
# Synthesis strategy properties
set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1]
set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1]
set_property STEPS.SYNTH_DESIGN.ARGS.FSM_EXTRACTION auto [get_runs synth_1]
set_property STEPS.SYNTH_DESIGN.ARGS.KEEP_EQUIVALENT_REGISTERS true [get_runs synth_1]
# Implementation strategy properties
set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE ExploreWithRemap [get_runs impl_1]
set_property STEPS.PLACE_DESIGN.ARGS.DIRECTIVE ExtraNetDelay_high [get_runs impl_1]
set_property STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1]
# ===========================================
# 12. DRC waivers (if needed)
# ===========================================
# Create a waiver for a specific DRC check
# create_waiver -type DRC -id {TIMING-17} -user "conv_engine" \
# -desc "The clock pin is driven by combinatorial logic acceptable for this design"
####################################################################################
# End of constraint file
####################################################################################
物理约束
创建文件 conv_engine_physical.xdc:
####################################################################################
# Physical implementation constraint file
####################################################################################
# ===========================================
# 1. I/O location constraints (adjust for the ZCU102)
# ===========================================
# System reset (push button SW19)
set_property PACKAGE_PIN AM13 [get_ports rst_n]
set_property IOSTANDARD LVCMOS33 [get_ports rst_n]
# LED indicators
set_property PACKAGE_PIN AG14 [get_ports conv_busy]
set_property PACKAGE_PIN AF13 [get_ports conv_done]
set_property IOSTANDARD LVCMOS33 [get_ports {conv_busy conv_done}]
# ===========================================
# 2. Configuration constraints
# ===========================================
# Configuration bank voltage
set_property CFGBVS GND [current_design]
set_property CONFIG_VOLTAGE 1.8 [current_design]
# Configuration mode
set_property CONFIG_MODE SPIx4 [current_design]
set_property BITSTREAM.CONFIG.SPI_BUSWIDTH 4 [current_design]
# ===========================================
# 3. Performance optimization settings
# ===========================================
# Enable incremental compile from a previously routed checkpoint
# NOTE(review): this targets a project run object (get_runs); confirm it is
# accepted inside an XDC file, otherwise set it from the project Tcl script.
set_property INCREMENTAL_CHECKPOINT ./conv_engine_routed.dcp [get_runs impl_1]
# Preserve the hierarchy of the systolic-array instance
set_property KEEP_HIERARCHY true [get_cells -hierarchical -filter {NAME =~ */sa_inst}]
# ===========================================
# 4. Critical path group definitions
# ===========================================
# One path group per functional area: systolic array, AXI interface, stream interface
group_path -name SYSTOLIC_ARRAY -from [get_pins */sa_inst/*/C] -to [get_pins */sa_inst/*/D]
group_path -name AXI_INTERFACE -from [get_ports s_axi_*] -to [get_ports s_axi_*]
group_path -name STREAM_INTERFACE -from [get_ports s_axis_*] -to [get_ports m_axis_*]
综合
配置综合
配置综合选项:
# Select a performance-oriented *synthesis* strategy first.
# (Performance_ExploreWithRemap is an implementation strategy and is rejected
# by a synthesis run.) Assigning a strategy resets the individual step
# options, so it must come before the per-option overrides below.
set_property strategy Flow_PerfOptimized_high [get_runs synth_1]
# Per-option overrides, each set through its dedicated run property so that no
# switch is passed to synth_design twice via MORE OPTIONS.
set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1]
set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1]
set_property STEPS.SYNTH_DESIGN.ARGS.NO_LC true [get_runs synth_1]
set_property STEPS.SYNTH_DESIGN.ARGS.SHREG_MIN_SIZE 10 [get_runs synth_1]
# Parallelism: synth_design has no NUM_JOBS argument. The thread count is a
# tool parameter; the number of parallel runs is given with "launch_runs -jobs".
set_param general.maxThreads 8
# Automatic incremental-synthesis checkpoint
set_property AUTO_INCREMENTAL_CHECKPOINT 1 [get_runs synth_1]
在GUI中设置:
more options: -shreg_min_size 10 -keep_equivalent_registers
启动
-
保存文件:
save_project_as conv_engine_synthesis ./conv_engine_synthesis -force
运行综合:
reset_run synth_1
launch_runs synth_1 -jobs 8
监控进度:
- 查看右上角的 Design Runs 窗口
- 或在 TCL Console 执行:
wait_on_run synth_1
-
完成后打开:
open_run synth_1 -name synth_1
分析
资源利用报告
- 生成详细利用率报告:
# Make sure the report directory exists before any "-file" option writes into it
file mkdir ./reports
# Hierarchical utilization report written to a file (four levels deep)
report_utilization -file ./reports/post_synth_utilization.rpt -hierarchical -hierarchical_depth 4
# Print the hierarchical summary in the console
report_utilization -hierarchical
# Utilization of one specific module
report_utilization -cells [get_cells sa_inst]
- 创建自动分析脚本
analyze_synth_results.tcl:
###############################################
# Synthesis result analysis script
###############################################

# Count the primitive cells of the open design whose REF_NAME matches
# ref_pattern, optionally restricted to the hierarchy below scope.
proc conv_count_prims {ref_pattern {scope ""}} {
    set cell_filter "IS_PRIMITIVE && REF_NAME =~ $ref_pattern"
    if {$scope ne ""} {
        append cell_filter " && NAME =~ $scope/*"
    }
    return [llength [get_cells -quiet -hierarchical -filter $cell_filter]]
}

# Format used/total as a percentage with two decimals.
proc conv_percent {used total} {
    return [format "%.2f" [expr {$used * 100.0 / $total}]]
}

# Print a synthesis summary for the currently open synthesized design:
# resource utilization, WNS/TNS, the worst paths, DSP and memory inference,
# per-module LUT/FF counts, and finally write the report files to ./reports.
# Takes no arguments and returns nothing; all output goes to the console and
# to ./reports.
proc analyze_synthesis_results {} {
    puts "\n================================================"
    puts " 卷积引擎综合结果分析"
    puts "================================================\n"

    # 1. Resource utilization.
    # Counted from the netlist (REF_NAME) instead of per-resource STATS.*
    # run properties, which run objects are not guaranteed to provide.
    puts "1. 资源利用率:"
    puts "----------------"
    set lut_used  [conv_count_prims LUT*]
    set ff_used   [conv_count_prims FD*]
    # A RAMB18 occupies half of a RAMB36 tile
    set bram_used [expr {[conv_count_prims RAMB36*] + 0.5 * [conv_count_prims RAMB18*]}]
    set uram_used [conv_count_prims URAM288*]
    set dsp_used  [conv_count_prims DSP48E2]
    puts " LUT使用: $lut_used / 274080 ([conv_percent $lut_used 274080]%)"
    puts " FF使用: $ff_used / 548160 ([conv_percent $ff_used 548160]%)"
    puts " BRAM使用: $bram_used / 912 ([conv_percent $bram_used 912]%)"
    puts " URAM使用: $uram_used / 96 ([conv_percent $uram_used 96]%)"
    puts " DSP使用: $dsp_used / 2520 ([conv_percent $dsp_used 2520]%)"

    # 2. Timing analysis
    puts "\n2. 时序摘要:"
    puts "-------------"
    set timing_summary [report_timing_summary -return_string -max_paths 10]
    # The Design Timing Summary is a table: a header row starting with
    # "WNS(ns)  TNS(ns) ...", a dashed separator row, then the value row.
    # The parentheses must be escaped to be matched literally.
    regexp {WNS\(ns\)[ \t]+TNS\(ns\)[^\n]*\n[ \t-]+\n[ \t]*(-?[0-9.]+)[ \t]+(-?[0-9.]+)} \
        $timing_summary match wns tns
    if {[info exists wns]} {
        puts " 最差负时序裕量(WNS): ${wns} ns"
        if {$wns < 0} {
            puts " ⚠️ 警告:时序未满足!"
        } else {
            puts " ✓ 时序满足"
        }
    }
    if {[info exists tns]} {
        puts " 总负时序裕量(TNS): ${tns} ns"
    }

    # 3. Critical path analysis: report the five worst setup paths
    puts "\n3. 关键路径:"
    puts "------------"
    report_timing -max_paths 5 -nworst 1 -delay_type max -sort_by slack

    # 4. DSP inference analysis
    puts "\n4. DSP使用详情:"
    puts "---------------"
    set dsp_cells [get_cells -quiet -hierarchical -filter {PRIMITIVE_SUBGROUP == DSP}]
    set dsp_count [llength $dsp_cells]
    puts " 推断的DSP48E2数量: $dsp_count"
    if {$dsp_count > 0} {
        puts " DSP实例列表(前10个):"
        # Only the first ten instances are listed
        foreach cell [lrange $dsp_cells 0 9] {
            puts " - $cell"
        }
    }

    # 5. BRAM/URAM inference analysis
    puts "\n5. 存储器使用详情:"
    puts "------------------"
    set bram_cells [get_cells -quiet -hierarchical -filter {PRIMITIVE_SUBGROUP == BRAM}]
    set uram_cells [get_cells -quiet -hierarchical -filter {PRIMITIVE_SUBGROUP == URAM}]
    puts " BRAM实例: [llength $bram_cells]"
    puts " URAM实例: [llength $uram_cells]"

    # 6. Per-module resource breakdown.
    # Primitives are matched by hierarchical name so that cells in nested
    # sub-modules are included, not only the module's direct children.
    puts "\n6. 模块级资源分配:"
    puts "------------------"
    foreach module {sa_inst controller_inst} {
        if {[llength [get_cells -quiet $module]] > 0} {
            puts "\n 模块 $module:"
            puts " LUTs: [conv_count_prims LUT* $module]"
            puts " FFs: [conv_count_prims FD* $module]"
        }
    }

    # 7. Write the report files
    puts "\n7. 生成报告文件..."
    puts "-----------------"
    file mkdir ./reports
    report_utilization -file ./reports/utilization_hierarchical.rpt -hierarchical
    report_timing_summary -file ./reports/timing_summary.rpt
    report_power -file ./reports/power_estimate.rpt
    report_clock_utilization -file ./reports/clock_utilization.rpt
    puts " ✓ 报告已保存到 ./reports/ 目录"
    puts "\n================================================"
    puts " 分析完成"
    puts "================================================\n"
}

# Run the analysis
analyze_synthesis_results
- 运行分析脚本:
source analyze_synth_results.tcl
时序分析和优化
- 查看时序报告:
# Detailed timing summary covering both min and max analysis
report_timing_summary \
    -file ./reports/timing_detail.rpt \
    -delay_type min_max \
    -max_paths 10 \
    -report_unconstrained \
    -check_timing_verbose \
    -input_pins \
    -routable_nets

# Setup analysis: the 20 worst max-delay paths, sorted by path group
report_timing -file ./reports/setup_timing.rpt \
    -setup -delay_type max -max_paths 20 -sort_by group

# Hold analysis: the 20 worst min-delay paths, sorted by path group
report_timing -file ./reports/hold_timing.rpt \
    -hold -delay_type min -max_paths 20 -sort_by group

# Relationships between clocks
report_clock_interaction -file ./reports/clock_interaction.rpt -delay_type min_max
- 如果时序不满足,创建优化脚本
timing_optimization.tcl:
###############################################
# Timing optimization script
###############################################

# Inspect the ten worst setup paths of the open design, print a suggestion for
# every violating path, apply the matching synthesis-run settings, and
# optionally re-run synthesis.
#
# rerun_synth - "y" re-runs synthesis, any other non-empty value skips it.
#               When omitted (empty) the user is asked on stdin, as before.
proc optimize_timing_violations {{rerun_synth ""}} {
    puts "开始时序优化..."
    set synth_run [get_runs synth_1]

    # 1. Identify the critical paths
    set critical_paths [get_timing_paths -max_paths 10 -nworst 1 \
        -delay_type max -sort_by slack]

    # Run-level changes are collected here and applied once after the loop.
    set want_retiming 0
    set want_fanout_limit 0
    set want_strategy 0
    set marginal_endpoints {}

    foreach path $critical_paths {
        set slack [get_property SLACK $path]
        # Skip unconstrained paths (no numeric slack) and paths that meet timing
        if {![string is double -strict $slack] || $slack >= 0} {
            continue
        }
        puts "发现时序违例: Slack = $slack ns"
        set startpoint [get_property STARTPOINT_PIN $path]
        set endpoint [get_property ENDPOINT_PIN $path]
        puts " 起点: $startpoint"
        puts " 终点: $endpoint"

        # Strategy 1: pipeline / retime badly failing paths
        if {$slack < -2.0} {
            puts " 建议: 在该路径插入流水线寄存器"
            set want_retiming 1
        }

        # Strategy 2: logic replication.
        # The startpoint of a register path is the clock pin, whose net is the
        # clock network. The fanout that matters is that of the nets driven
        # by the launching cell's output pins.
        set max_fanout 0
        set launch_cell [get_cells -quiet -of_objects [get_pins -quiet $startpoint]]
        if {[llength $launch_cell] > 0} {
            set out_pins [get_pins -quiet -of_objects $launch_cell -filter {DIRECTION == OUT}]
            if {[llength $out_pins] > 0} {
                foreach net [get_nets -quiet -of_objects $out_pins] {
                    # FLAT_PIN_COUNT includes the driver pin
                    set fanout [expr {[get_property FLAT_PIN_COUNT $net] - 1}]
                    if {$fanout > $max_fanout} {
                        set max_fanout $fanout
                    }
                }
            }
        }
        if {$max_fanout > 32} {
            puts " 建议: 复制高扇出网络"
            set want_fanout_limit 1
        }

        # Strategy 3: switch the synthesis strategy
        if {$slack < -1.0} {
            puts " 建议: 使用Flow_PerfOptimized_high策略"
            set want_strategy 1
        } else {
            # Marginal violation (between -1.0 ns and 0): prioritize this endpoint
            set endpoint_pin [get_pins -quiet $endpoint]
            if {[llength $endpoint_pin] > 0} {
                lappend marginal_endpoints $endpoint_pin
            }
        }
    }

    # Apply the strategy first: assigning a strategy resets the step options,
    # so the individual overrides have to follow it.
    if {$want_strategy} {
        set_property strategy Flow_PerfOptimized_high $synth_run
    }
    if {$want_retiming} {
        set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true $synth_run
    }
    if {$want_fanout_limit} {
        # synth_design's fanout switch is -fanout_limit
        set_property STEPS.SYNTH_DESIGN.ARGS.FANOUT_LIMIT 16 $synth_run
    }

    # 2. Physical optimization hints
    puts "\n应用物理优化选项..."
    # Raise the priority of marginally failing paths by placing them in a
    # weighted path group.
    if {[llength $marginal_endpoints] > 0} {
        group_path -name MARGINAL_SETUP -weight 2 -to $marginal_endpoints
    }

    # 3. Re-run synthesis (if requested)
    if {$rerun_synth eq ""} {
        puts "\n是否需要重新运行综合?(y/n)"
        flush stdout
        gets stdin rerun_synth
    }
    if {$rerun_synth eq "y"} {
        reset_run synth_1
        launch_runs synth_1 -jobs 8
        wait_on_run synth_1
    }
    puts "时序优化完成!"
}

# Run the optimization
optimize_timing_violations
布局布线
策略
- 设置实现选项:
# Configure the implementation run.
# Each property/value pair is applied to impl_1 in the order listed. The last
# two pairs enable the extra post-route physical optimization intended for
# designs with timing problems.
foreach {impl_prop impl_value} {
    STEPS.OPT_DESIGN.ARGS.DIRECTIVE                  Default
    STEPS.PLACE_DESIGN.ARGS.DIRECTIVE                ExtraNetDelay_high
    STEPS.PHYS_OPT_DESIGN.IS_ENABLED                 true
    STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE             AggressiveExplore
    STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE                AggressiveExplore
    STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED      true
    STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE  Explore
} {
    set_property $impl_prop $impl_value [get_runs impl_1]
}
- 运行实现:
# Launch implementation
launch_runs impl_1 -jobs 8
# Wait for it to finish
wait_on_run impl_1
# Check the implementation result
set impl_run [get_runs impl_1]
set impl_status [get_property STATUS $impl_run]
puts "实现状态: $impl_status"
# Completion is judged from PROGRESS rather than an exact STATUS string: the
# STATUS text names the last executed step (post-route phys_opt is enabled for
# this run) and can carry a timing suffix, so it is not a stable value to
# compare against.
if {[get_property PROGRESS $impl_run] eq "100%"} {
    puts "实现成功完成!"
    open_run impl_1
} else {
    puts "实现失败!请检查错误信息"
}
分析实现结果
创建脚本 analyze_implementation.tcl:
###############################################
# Implementation result analysis script
###############################################

# Print a detailed summary of the impl_1 results (utilization, timing, power,
# routing, DRC, clocks, I/O timing), write the report files to ./reports, and
# offer to generate the bitstream when timing is met and DRC shows no errors.
#
# gen_bitstream - "y" generates the bitstream, any other non-empty value
#                 skips it. When omitted (empty) the user is asked on stdin.
proc analyze_implementation_results {{gen_bitstream ""}} {
    puts "\n================================================"
    puts " 实现结果详细分析"
    puts "================================================\n"

    # Make sure the implemented design is open; reuse it if it already is
    set impl_run [get_runs impl_1]
    if {[llength [get_designs -quiet impl_1]] > 0} {
        current_design impl_1
    } else {
        open_run impl_1
    }
    # Every "-file" report below is written into this directory
    file mkdir ./reports

    # 1. Final resource utilization
    puts "1. 最终资源利用率:"
    puts "------------------"
    report_utilization

    # 2. Final timing result
    puts "\n2. 时序结果:"
    puts "------------"
    set wns [get_property STATS.WNS $impl_run]
    # A missing or non-numeric WNS is treated as "timing not met"
    set timing_met [expr {[string is double -strict $wns] && $wns >= 0}]
    if {$timing_met} {
        puts "✓ 时序约束满足!"
        puts " WNS: $wns ns"
        puts " WHS: [get_property STATS.WHS $impl_run] ns"
    } else {
        puts "✗ 时序约束未满足!"
        puts " WNS: $wns ns"
        puts " TNS: [get_property STATS.TNS $impl_run] ns"
        # Show the failing paths
        puts "\n 失败的时序路径:"
        report_timing -setup -max_paths 5
    }

    # 3. Power estimate
    puts "\n3. 功耗估算:"
    puts "-----------"
    report_power -file ./reports/power_impl.rpt
    # The power summary is a table of "| <label> (W) | <value> |" rows; the
    # parentheses are escaped so they are matched literally.
    set power_report [report_power -return_string]
    foreach {label pattern} {
        "总功耗"   {Total On-Chip Power \(W\)\s*\|\s*([\d.]+)}
        "动态功耗" {Dynamic \(W\)\s*\|\s*([\d.]+)}
        "静态功耗" {Static \(W\)\s*\|\s*([\d.]+)}
    } {
        if {[regexp $pattern $power_report match watts]} {
            puts " $label: ${watts} W"
        }
    }

    # 4. Routing status
    puts "\n4. 布线质量:"
    puts "------------"
    report_route_status -file ./reports/route_status.rpt

    # 5. Design rule check (DRC)
    puts "\n5. DRC检查:"
    puts "-----------"
    report_drc -file ./reports/drc_report.rpt
    # Sum the violation counts per severity from the rows of the report's
    # summary table: | Rule | Severity | Description | Violations |
    set drc_report [report_drc -return_string]
    set errors 0
    set critical_warnings 0
    set warnings 0
    set drc_rows [regexp -all -inline -line \
        {^\|[ \t]*\S+[ \t]*\|[ \t]*(Error|Critical Warning|Warning|Advisory)[ \t]*\|.*\|[ \t]*(\d+)[ \t]*\|[ \t]*$} \
        $drc_report]
    foreach {row severity count} $drc_rows {
        switch -- $severity {
            "Error"            { incr errors $count }
            "Critical Warning" { incr critical_warnings $count }
            "Warning"          { incr warnings $count }
        }
    }
    if {$errors > 0} {
        puts " ✗ 发现 $errors 个错误!"
        puts " 请查看 ./reports/drc_report.rpt 获取详情"
    } else {
        puts " ✓ 无DRC错误"
        if {[llength $drc_rows] > 0} {
            puts " ⚠ $warnings 个警告, $critical_warnings 个关键警告"
        }
    }

    # 6. Clock analysis
    puts "\n6. 时钟域分析:"
    puts "--------------"
    report_clock_utilization -file ./reports/clock_util_impl.rpt
    report_clock_networks -file ./reports/clock_networks.rpt

    # 7. I/O timing analysis
    puts "\n7. IO时序:"
    puts "----------"
    report_io_timing -file ./reports/io_timing.rpt

    # 8. Implementation statistics.
    # These run properties may not exist in every tool version, so a failed
    # or empty lookup is reported as N/A instead of aborting the script.
    puts "\n8. 实现统计:"
    puts "------------"
    foreach {label prop} {"布线网络数" STATS.ROUTE_NETS "布线实例数" STATS.ROUTE_INSTS} {
        if {[catch {get_property $prop $impl_run} value] || $value eq ""} {
            set value "N/A"
        }
        puts " $label: $value"
    }
    puts "\n================================================"
    puts " 分析完成"
    puts "================================================\n"

    # Decide whether to generate the bitstream
    if {$timing_met && $errors == 0} {
        puts "设计准备好生成比特流!"
        if {$gen_bitstream eq ""} {
            puts "是否生成比特流?(y/n)"
            flush stdout
            gets stdin gen_bitstream
        }
        if {$gen_bitstream eq "y"} {
            launch_runs impl_1 -to_step write_bitstream -jobs 8
            wait_on_run impl_1
            puts "比特流生成完成!"
        }
    } else {
        puts "⚠ 设计存在问题,建议先解决后再生成比特流"
    }
}

# Run the analysis
analyze_implementation_results

浙公网安备 33010602011771号