[置顶] cluster light Vulkan实现(gpu)
实现GPU版本cluster light的步骤
1.分割视椎体为三维网格
- 具体实现:
选取合适的网格大小(例如16×9×24的网格,分别对应屏幕的x、y、z方向),每个cluster表示一个空间区域,包含可能影响该区域像素的光源。
按照对数分割,计算AABB(轴对齐包围盒),进行光源与cluster的相交测试。
- GPU设置:将cluster边界存储在结构化缓冲区(Structured Buffer)或3D纹理中,供计算着色器使用。
2.光源分配(注入阶段)
逐个Tile收集
- 使用计算着色器并行处理所有cluster和光源。
- 对每个光源,计算其影响范围(如球形光源,其位置半径决定影响范围)。
- 进行相交检测,与AABB包围盒进行相交测试。
逐个光源注入
- 遍历光源,将其分配到所有与之相交cluster,减少重复计算。
压缩阶段
- 使用计算着色器清理光源列表,移除重复或空项目,优化内存使用。
GPU细节处理:
- 使用线程组并行处理cluster。
- 将光源索引存储在每个cluster的固定大小数组或链表中。
- 使用原子操作处理多线程写入冲突。
3.着色阶段
按照像素所属cluster计算所有光源的贡献,完成着色。
实现:
- 在片段着色器中,根据像素的屏幕坐标和深度缓冲区,计算其视空间位置。
- 确定像素所属的cluster(通过网格映射)。
- 从注入阶段的缓冲区中读取该cluster的光源索引列表。
- 遍历光源列表,计算每个光源的贡献,使用PBR模型。
- 累加所有光源的贡献,完成像素着色。
可能的优化
*应用GPU纹理可快速访问光源数据和cluster分配。
修改思路
为了实现GPU版本的Cluster Lights,需要:
1.调整cluster,采用对数深度分布。
在c++中的VulkanExample类中,调整updateLightsCluster()函数,使用对数深度分割,为后续GPU计算着色器做准备。
void updateLightsCluster() {
// 清空集群数据
memset(clusterIndexList.indices, 0, sizeof(clusterIndexList.indices));
memset(clusterData.cluster, 0, sizeof(clusterData.cluster));
glm::mat4 viewProj = uboMatrices.projection * uboMatrices.view;
float zNear = 0.1f;
float zFar = 256.0f;
float logZFarNear = log(zFar / zNear); // 预计算 log(zFar/zNear)
// 分配临时数组记录已分配的光源
std::vector<std::vector<bool>> assignedLights(TOTAL_CLUSTERS, std::vector<bool>(maxnumLights, false));
for (int lightIdx = 0; lightIdx < maxnumLights; lightIdx++) {
Light& light = uboParams.lights[lightIdx];
float radius = light.colorAndRadius.w;
// 变换光源位置到视图空间
glm::vec4 viewPos = uboMatrices.view * light.position;
float viewZ = -viewPos.z; // Vulkan 视图空间 Z 为负
if (viewZ < zNear || viewZ > zFar) continue; // 跳过视锥体外的光源
// 计算光源在视图空间的 AABB
glm::vec3 minAABB = glm::vec3(viewPos) - glm::vec3(radius);
glm::vec3 maxAABB = glm::vec3(viewPos) + glm::vec3(radius);
// 变换到 NDC 空间
glm::vec4 clipMin = viewProj * glm::vec4(minAABB, 1.0f);
glm::vec4 clipMax = viewProj * glm::vec4(maxAABB, 1.0f);
clipMin /= clipMin.w;
clipMax /= clipMax.w;
// 映射到集群索引
uint32_t minClusterX = static_cast<uint32_t>((clipMin.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
uint32_t maxClusterX = static_cast<uint32_t>((clipMax.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
uint32_t minClusterY = static_cast<uint32_t>((clipMin.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
uint32_t maxClusterY = static_cast<uint32_t>((clipMax.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
// 对数深度分割
float minZ = glm::max(viewZ - radius, zNear);
float maxZ = glm::min(viewZ + radius, zFar);
uint32_t minClusterZ = static_cast<uint32_t>((log(minZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);
uint32_t maxClusterZ = static_cast<uint32_t>((log(maxZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);
minClusterX = glm::clamp(minClusterX, 0u, CLUSTER_SIZE_X - 1);
maxClusterX = glm::clamp(maxClusterX, 0u, CLUSTER_SIZE_X - 1);
minClusterY = glm::clamp(minClusterY, 0u, CLUSTER_SIZE_Y - 1);
maxClusterY = glm::clamp(maxClusterY, 0u, CLUSTER_SIZE_Y - 1);
minClusterZ = glm::clamp(minClusterZ, 0u, CLUSTER_SIZE_Z - 1);
maxClusterZ = glm::clamp(maxClusterZ, 0u, CLUSTER_SIZE_Z - 1);
for (uint32_t z = minClusterZ; z <= maxClusterZ; ++z) {
for (uint32_t y = minClusterY; y <= maxClusterY; ++y) {
for (uint32_t x = minClusterX; x <= maxClusterX; ++x) {
uint32_t clusterIdx = z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + y * CLUSTER_SIZE_X + x;
if (!assignedLights[clusterIdx][lightIdx] && clusterData.cluster[clusterIdx].count < maxnumLights) {
clusterData.cluster[clusterIdx].count++;
assignedLights[clusterIdx][lightIdx] = true;
}
}
}
}
}
// 计算偏移
uint32_t runningSum = 0;
for (uint32_t i = 0; i < TOTAL_CLUSTERS; i++) {
clusterData.cluster[i].offset = runningSum;
runningSum += clusterData.cluster[i].count;
}
// 填充光源索引列表
std::vector<uint32_t> tempOffsets(TOTAL_CLUSTERS, 0);
for (int lightIdx = 0; lightIdx < maxnumLights; lightIdx++) {
Light& light = uboParams.lights[lightIdx];
float radius = light.colorAndRadius.w;
glm::vec4 viewPos = uboMatrices.view * light.position;
float viewZ = -viewPos.z;
if (viewZ < zNear || viewZ > zFar) continue;
glm::vec3 minAABB = glm::vec3(viewPos) - glm::vec3(radius);
glm::vec3 maxAABB = glm::vec3(viewPos) + glm::vec3(radius);
glm::vec4 clipMin = viewProj * glm::vec4(minAABB, 1.0f);
glm::vec4 clipMax = viewProj * glm::vec4(maxAABB, 1.0f);
clipMin /= clipMin.w;
clipMax /= clipMax.w;
uint32_t minClusterX = static_cast<uint32_t>((clipMin.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
uint32_t maxClusterX = static_cast<uint32_t>((clipMax.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
uint32_t minClusterY = static_cast<uint32_t>((clipMin.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
uint32_t maxClusterY = static_cast<uint32_t>((clipMax.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
float minZ = glm::max(viewZ - radius, zNear);
float maxZ = glm::min(viewZ + radius, zFar);
uint32_t minClusterZ = static_cast<uint32_t>((log(minZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);
uint32_t maxClusterZ = static_cast<uint32_t>((log(maxZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);
minClusterX = glm::clamp(minClusterX, 0u, CLUSTER_SIZE_X - 1);
maxClusterX = glm::clamp(maxClusterX, 0u, CLUSTER_SIZE_X - 1);
minClusterY = glm::clamp(minClusterY, 0u, CLUSTER_SIZE_Y - 1);
maxClusterY = glm::clamp(maxClusterY, 0u, CLUSTER_SIZE_Y - 1);
minClusterZ = glm::clamp(minClusterZ, 0u, CLUSTER_SIZE_Z - 1);
maxClusterZ = glm::clamp(maxClusterZ, 0u, CLUSTER_SIZE_Z - 1);
for (uint32_t z = minClusterZ; z <= maxClusterZ; ++z) {
for (uint32_t y = minClusterY; y <= maxClusterY; ++y) {
for (uint32_t x = minClusterX; x <= maxClusterX; ++x) {
uint32_t clusterIdx = z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + y * CLUSTER_SIZE_X + x;
uint32_t offset = clusterData.cluster[clusterIdx].offset + tempOffsets[clusterIdx];
if (offset < lightIndexListnum && tempOffsets[clusterIdx] < clusterData.cluster[clusterIdx].count) {
clusterIndexList.indices[offset].clusterIndexList = lightIdx;
tempOffsets[clusterIdx]++;
}
}
}
}
}
memcpy(uniformBuffers.clusterData.mapped, &clusterData, sizeof(clusterData));
memcpy(uniformBuffers.clusterIndexList.mapped, &clusterIndexList, sizeof(clusterIndexList));
}
2.添加GPU注入阶段
添加一个计算着色器来替代上面c++中的updateLightsCluster,并使用存储缓冲区(Storage Buffer)而非Uniform Buffer,以支持更大容量和写操作。
c++端代码修改
void prepareUniformBuffers() {
VkPhysicalDeviceProperties properties;
vkGetPhysicalDeviceProperties(physicalDevice, &properties);
VkDeviceSize minAlignment = properties.limits.minStorageBufferOffsetAlignment;
VkDeviceSize alignedSizeClusterIndexList = ((sizeof(clusterIndexList) + minAlignment - 1) / minAlignment) * minAlignment;
// 矩阵缓冲区(仍为 Uniform Buffer)
VK_CHECK_RESULT(vulkanDevice->createBuffer(
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
&uniformBuffers.object,
sizeof(uboMatrices)));
// 光源数据缓冲区(仍为 Uniform Buffer)
VK_CHECK_RESULT(vulkanDevice->createBuffer(
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
&uniformBuffers.params,
sizeof(uboParams)));
// 集群计数和偏移(Storage Buffer)
VK_CHECK_RESULT(vulkanDevice->createBuffer(
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
&uniformBuffers.clusterData,
sizeof(clusterData)));
// 光源索引列表(Storage Buffer)
VK_CHECK_RESULT(vulkanDevice->createBuffer(
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
&uniformBuffers.clusterIndexList,
alignedSizeClusterIndexList));
VK_CHECK_RESULT(uniformBuffers.object.map());
VK_CHECK_RESULT(uniformBuffers.params.map());
VK_CHECK_RESULT(uniformBuffers.clusterData.map());
VK_CHECK_RESULT(uniformBuffers.clusterIndexList.map());
prepareSphereBuffers();
}
2.添加计算管线
在VulkanExample类中添加计算管线的相关成员。
// Excerpt of the example class showing only the members added for the compute pass.
class VulkanExample : public VulkanExampleBase {
public:
// ... other members ...
// Compute pipeline that runs the light culling shader.
VkPipeline computePipeline{ VK_NULL_HANDLE };
// Pipeline layout used when binding the compute descriptor set.
VkPipelineLayout computePipelineLayout{ VK_NULL_HANDLE };
// Descriptor set bound for the compute dispatches.
VkDescriptorSet computeDescriptorSet{ VK_NULL_HANDLE };
// Layout describing the bindings of the compute descriptor set.
VkDescriptorSetLayout computeDescriptorSetLayout{ VK_NULL_HANDLE };
};
3.设置描述符集
在setupDescriptors()中添加计算描述符集布局:
// Creates the descriptor pool and the two descriptor sets used by this example:
// one for the graphics pipeline and one for the compute (light culling) pipeline.
// Both sets reference the same four buffers.
void setupDescriptors() {
    // Pool sizing: each of the two sets consumes 2 uniform-buffer and 2 storage-buffer
    // descriptors, i.e. 4 + 4 in total. One extra storage-buffer descriptor is reserved as
    // headroom for the globalOffset binding that the index-fill pass adds to the compute set.
    std::vector<VkDescriptorPoolSize> poolSizes = {
        vks::initializers::descriptorPoolSize(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 4),
        vks::initializers::descriptorPoolSize(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 5),
    };
    VkDescriptorPoolCreateInfo descriptorPoolInfo = vks::initializers::descriptorPoolCreateInfo(poolSizes, 3);
    VK_CHECK_RESULT(vkCreateDescriptorPool(device, &descriptorPoolInfo, nullptr, &descriptorPool));

    // Graphics descriptor set layout.
    std::vector<VkDescriptorSetLayoutBinding> setLayoutBindings = {
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, 0),  // matrices
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT, 1),  // lights
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT, 2),  // light index list
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT, 3),  // cluster count/offset
    };
    VkDescriptorSetLayoutCreateInfo descriptorLayout = vks::initializers::descriptorSetLayoutCreateInfo(setLayoutBindings);
    VK_CHECK_RESULT(vkCreateDescriptorSetLayout(device, &descriptorLayout, nullptr, &descriptorSetLayout));

    VkDescriptorSetAllocateInfo allocInfo = vks::initializers::descriptorSetAllocateInfo(descriptorPool, &descriptorSetLayout, 1);
    VK_CHECK_RESULT(vkAllocateDescriptorSets(device, &allocInfo, &descriptorSet));

    std::vector<VkWriteDescriptorSet> writeDescriptorSets = {
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 0, &uniformBuffers.object.descriptor),
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, &uniformBuffers.params.descriptor),
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2, &uniformBuffers.clusterIndexList.descriptor),
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3, &uniformBuffers.clusterData.descriptor),
    };
    vkUpdateDescriptorSets(device, static_cast<uint32_t>(writeDescriptorSets.size()), writeDescriptorSets.data(), 0, nullptr);

    // Compute descriptor set layout.
    // NOTE(review): bindings 2 and 3 are swapped relative to the graphics set
    // (cluster table at 2, index list at 3) -- confirm the shaders expect this.
    std::vector<VkDescriptorSetLayoutBinding> computeBindings = {
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 0),  // matrices
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 1),  // lights
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 2),  // cluster count/offset
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 3),  // light index list
    };
    VkDescriptorSetLayoutCreateInfo computeLayoutInfo = vks::initializers::descriptorSetLayoutCreateInfo(computeBindings);
    VK_CHECK_RESULT(vkCreateDescriptorSetLayout(device, &computeLayoutInfo, nullptr, &computeDescriptorSetLayout));

    allocInfo = vks::initializers::descriptorSetAllocateInfo(descriptorPool, &computeDescriptorSetLayout, 1);
    VK_CHECK_RESULT(vkAllocateDescriptorSets(device, &allocInfo, &computeDescriptorSet));

    std::vector<VkWriteDescriptorSet> computeWriteDescriptorSets = {
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 0, &uniformBuffers.object.descriptor),
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, &uniformBuffers.params.descriptor),
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2, &uniformBuffers.clusterData.descriptor),
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3, &uniformBuffers.clusterIndexList.descriptor),
    };
    vkUpdateDescriptorSets(device, static_cast<uint32_t>(computeWriteDescriptorSets.size()), computeWriteDescriptorSets.data(), 0, nullptr);
}
4.创建计算管线
在preparePipelines()后添加计算管线准备函数prepareComputePipeline()
void prepareComputePipeline() {
VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = vks::initializers::pipelineLayoutCreateInfo(&computeDescriptorSetLayout, 1);
VK_CHECK_RESULT(vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, nullptr, &computePipelineLayout));
VkComputePipelineCreateInfo computePipelineCreateInfo = vks::initializers::computePipelineCreateInfo(computePipelineLayout);
VkPipelineShaderStageCreateInfo shaderStage = loadShader(getShadersPath() + "pbrbasic/lightcull.comp.hlsl", VK_SHADER_STAGE_COMPUTE_BIT);
computePipelineCreateInfo.stage = shaderStage;
VK_CHECK_RESULT(vkCreateComputePipelines(device, pipelineCache, 1, &computePipelineCreateInfo, nullptr, &computePipeline));
}
在prepare()中调用该函数:
// One-time setup of the example. The order matters: descriptors reference the buffers,
// and both pipelines reference the descriptor set layouts created in setupDescriptors().
void prepare() {
VulkanExampleBase::prepare();
loadAssets();
prepareUniformBuffers();
setupDescriptors();
preparePipelines();
prepareComputePipeline(); // new: compute pipeline for light culling
buildCommandBuffers();
// From here on render() is allowed to do work.
prepared = true;
}
5.更新命令缓冲区以运行计算着色器
在buildCommandBuffers()之前添加一个计算命令缓冲区。
VkCommandBuffer computeCmdBuffer{ VK_NULL_HANDLE };
void prepareComputeCommandBuffer() {
VkCommandBufferAllocateInfo allocInfo = vks::initializers::commandBufferAllocateInfo(commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1);
VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &allocInfo, &computeCmdBuffer));
VkCommandBufferBeginInfo beginInfo = vks::initializers::commandBufferBeginInfo();
VK_CHECK_RESULT(vkBeginCommandBuffer(computeCmdBuffer, &beginInfo));
vkCmdBindPipeline(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline);
vkCmdBindDescriptorSets(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipelineLayout, 0, 1, &computeDescriptorSet, 0, nullptr);
vkCmdDispatch(computeCmdBuffer, CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z); // 启动 8x8x8 线程组
VK_CHECK_RESULT(vkEndCommandBuffer(computeCmdBuffer));
}
在render()中调用:
// Per-frame entry point: updates the uniform data, runs the light culling compute pass
// (unless paused) and then draws the frame.
void render() {
    if (!prepared) return;

    updateUniformBuffers();

    // NOTE(review): while paused the clusters are not rebuilt. If updateUniformBuffers()
    // still changes the view matrix in that state, the light assignment goes stale -- confirm.
    if (!paused) {
        updateLights();
        prepareComputeCommandBuffer();  // record (or reuse) the compute commands

        VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo();
        computeSubmitInfo.commandBufferCount = 1;
        computeSubmitInfo.pCommandBuffers = &computeCmdBuffer;
        VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &computeSubmitInfo, VK_NULL_HANDLE));

        // Block until the compute work has finished so the fragment shader sees the results.
        // The return value is checked so that a lost device is not silently ignored.
        VK_CHECK_RESULT(vkQueueWaitIdle(queue));
    }

    draw();
}
6.添加计算着色器
创建文件 shaders/pbrbasic/lightcull.comp.hlsl:
// Light culling pass: shared constants, data layouts and resource bindings.

#define NUM_LIGHTS 64
#define CLUSTER_SIZE_X 8
#define CLUSTER_SIZE_Y 8
#define CLUSTER_SIZE_Z 8
#define TOTAL_CLUSTERS (CLUSTER_SIZE_X * CLUSTER_SIZE_Y * CLUSTER_SIZE_Z)

// One light source; the radius of influence is stored in colorAndRadius.w.
struct Light {
    float4 position;
    float4 colorAndRadius;
    float4 direction;
    float4 cutOff;
};

// Per-cluster entry: number of lights and start offset into the index list (padded to 16 bytes).
struct Cluster {
    uint counts;
    uint offsets;
    float2 padding;
};

// One entry of the light index list (padded to 16 bytes).
struct Indices {
    uint clusterIndexList;
    float3 padding;
};

// Binding 0: camera matrices.
cbuffer ubo : register(b0) {
    float4x4 projection;
    float4x4 model;
    float4x4 view;
    float3 camPos;
};

// Binding 1: light array.
cbuffer uboParams : register(b1) {
    Light lights[NUM_LIGHTS];
};

// The register numbers follow the descriptor bindings of the compute set (2 = cluster table,
// 3 = index list). With the default register-to-binding mapping, u0/u1 would land on
// bindings 0/1 and collide with the two constant buffers above.
RWStructuredBuffer<Cluster> clusterCountsandOffsets : register(u2);
RWStructuredBuffer<Indices> indices : register(u3);
// Counts, for one cluster per workgroup, how many lights overlap that cluster.
// Only `counts` is produced here; `offsets` is written as 0 and the index list is untouched.
[numthreads(1, 1, 1)]
void main(uint3 groupID : SV_GroupID, uint3 threadID : SV_GroupThreadID) {
    uint clusterIdx = groupID.z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + groupID.y * CLUSTER_SIZE_X + groupID.x;
    if (clusterIdx >= TOTAL_CLUSTERS) return;

    const float zNear = 0.1f;
    const float zFar = 256.0f;

    // Cluster extent: logarithmic slices in view depth, uniform cells in NDC x/y.
    float zStart = zNear * pow(zFar / zNear, float(groupID.z) / CLUSTER_SIZE_Z);
    float zEnd = zNear * pow(zFar / zNear, float(groupID.z + 1) / CLUSTER_SIZE_Z);
    float xStart = -1.0f + 2.0f * groupID.x / CLUSTER_SIZE_X;
    float xEnd = -1.0f + 2.0f * (groupID.x + 1) / CLUSTER_SIZE_X;
    float yStart = -1.0f + 2.0f * groupID.y / CLUSTER_SIZE_Y;
    float yEnd = -1.0f + 2.0f * (groupID.y + 1) / CLUSTER_SIZE_Y;

    // Accumulate locally and write the result once instead of a buffer
    // read-modify-write per overlapping light.
    uint count = 0;
    for (uint lightIdx = 0; lightIdx < NUM_LIGHTS; lightIdx++) {
        float4 viewPos = mul(view, lights[lightIdx].position);
        float viewZ = -viewPos.z;  // the camera looks down -Z in view space
        float radius = lights[lightIdx].colorAndRadius.w;
        if (viewZ < zNear || viewZ > zFar) continue;

        // Depth extent of the light, clamped to the sliced range. Clamping the front depth
        // to zNear also keeps the projected corners in front of the camera.
        float frontZ = max(viewZ - radius, zNear);
        float backZ = min(viewZ + radius, zFar);
        if (frontZ > zEnd || backZ < zStart) continue;

        // Project the box corners at the front and back depth and take the min/max:
        // after the perspective divide the "min" corner is not guaranteed to stay the minimum.
        // NOTE(review): assumes a perspective projection with positive clip w for
        // depths >= zNear; confirm.
        float2 lo = viewPos.xy - radius;
        float2 hi = viewPos.xy + radius;
        float4 c0 = mul(projection, float4(lo, -frontZ, 1.0));
        float4 c1 = mul(projection, float4(hi, -frontZ, 1.0));
        float4 c2 = mul(projection, float4(lo, -backZ, 1.0));
        float4 c3 = mul(projection, float4(hi, -backZ, 1.0));
        float2 n0 = c0.xy / c0.w;
        float2 n1 = c1.xy / c1.w;
        float2 n2 = c2.xy / c2.w;
        float2 n3 = c3.xy / c3.w;
        float2 ndcMin = min(min(n0, n1), min(n2, n3));
        float2 ndcMax = max(max(n0, n1), max(n2, n3));

        // Interval overlap test against the cluster's NDC rectangle.
        if (ndcMin.x <= xEnd && ndcMax.x >= xStart &&
            ndcMin.y <= yEnd && ndcMax.y >= yStart) {
            count++;
        }
    }

    clusterCountsandOffsets[clusterIdx].counts = count;
    clusterCountsandOffsets[clusterIdx].offsets = 0;
}
"pbrbasic/lightcull.comp.hlsl"
- 注意,在创建文件的过程中,省略.hlsl。
- 在hlsl文件中,使用RWStructuredBuffer写入集群计数。
- 每个线程处理一个集群,计算其在NDC空间的范围,并测试光源AABB是否相交。
- 暂时只计算counts,偏移和索引列表将在后续阶段完成。
7.添加第二阶段计算着色器(填充索引列表)
创建文件shaders/pbrbasic/lightindex.comp.hlsl:
// Index fill pass: shared constants, data layouts and resource bindings.

#define NUM_LIGHTS 64
#define CLUSTER_SIZE_X 8
#define CLUSTER_SIZE_Y 8
#define CLUSTER_SIZE_Z 8
#define TOTAL_CLUSTERS (CLUSTER_SIZE_X * CLUSTER_SIZE_Y * CLUSTER_SIZE_Z)

// One light source; the radius of influence is stored in colorAndRadius.w.
struct Light {
    float4 position;
    float4 colorAndRadius;
    float4 direction;
    float4 cutOff;
};

// Per-cluster entry: number of lights and start offset into the index list (padded to 16 bytes).
struct Cluster {
    uint counts;
    uint offsets;
    float2 padding;
};

// One entry of the light index list (padded to 16 bytes).
struct Indices {
    uint clusterIndexList;
    float3 padding;
};

// Binding 0: camera matrices.
cbuffer ubo : register(b0) {
    float4x4 projection;
    float4x4 model;
    float4x4 view;
    float3 camPos;
};

// Binding 1: light array.
cbuffer uboParams : register(b1) {
    Light lights[NUM_LIGHTS];
};

// The register numbers follow the descriptor bindings of the compute set
// (2 = cluster table, 3 = index list, 4 = global offset counter). With the default
// register-to-binding mapping, t0/u1/u2 would not match those bindings.
StructuredBuffer<Cluster> clusterCountsandOffsets : register(t2);
RWStructuredBuffer<Indices> indices : register(u3);
RWStructuredBuffer<uint> globalOffset : register(u4); // global offset counter
// Writes light indices into the index list; one thread per light.
//
// NOTE(review): the culling pass stores 0 in every cluster's `offsets`, so at this point the
// write position is effectively just the global counter. The entries are therefore not
// grouped per cluster until a prefix sum over `counts` fills in real offsets -- confirm the
// intended layout before the fragment shader relies on offset/count slices.
[numthreads(64, 1, 1)]
void main(uint3 groupID : SV_GroupID, uint3 threadID : SV_GroupThreadID) {
    uint lightIdx = threadID.x;
    if (lightIdx >= NUM_LIGHTS) return;

    const float zNear = 0.1f;
    const float zFar = 256.0f;
    const float logZFarNear = log(zFar / zNear);

    float4 viewPos = mul(view, lights[lightIdx].position);
    float viewZ = -viewPos.z;  // the camera looks down -Z in view space
    float radius = lights[lightIdx].colorAndRadius.w;
    if (viewZ < zNear || viewZ > zFar) return;

    // Depth extent of the light, clamped to the sliced range. Clamping the front depth to
    // zNear also keeps the projected corners in front of the camera.
    float minZ = max(viewZ - radius, zNear);
    float maxZ = min(viewZ + radius, zFar);

    // Project the box corners at the front and back depth and take the min/max:
    // after the perspective divide the "min" corner is not guaranteed to stay the minimum.
    // NOTE(review): assumes a perspective projection with positive clip w for
    // depths >= zNear; confirm.
    float2 lo = viewPos.xy - radius;
    float2 hi = viewPos.xy + radius;
    float4 c0 = mul(projection, float4(lo, -minZ, 1.0));
    float4 c1 = mul(projection, float4(hi, -minZ, 1.0));
    float4 c2 = mul(projection, float4(lo, -maxZ, 1.0));
    float4 c3 = mul(projection, float4(hi, -maxZ, 1.0));
    float2 n0 = c0.xy / c0.w;
    float2 n1 = c1.xy / c1.w;
    float2 n2 = c2.xy / c2.w;
    float2 n3 = c3.xy / c3.w;
    float2 ndcMin = min(min(n0, n1), min(n2, n3));
    float2 ndcMax = max(max(n0, n1), max(n2, n3));

    // Clamp in float (saturate) before converting to uint: converting a negative float to an
    // unsigned integer does not give a usable cell index.
    float2 cellMin = saturate(ndcMin * 0.5f + 0.5f) * float2(CLUSTER_SIZE_X, CLUSTER_SIZE_Y);
    float2 cellMax = saturate(ndcMax * 0.5f + 0.5f) * float2(CLUSTER_SIZE_X, CLUSTER_SIZE_Y);
    float sliceMin = saturate(log(minZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z;
    float sliceMax = saturate(log(maxZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z;

    uint minClusterX = min(uint(cellMin.x), uint(CLUSTER_SIZE_X - 1));
    uint maxClusterX = min(uint(cellMax.x), uint(CLUSTER_SIZE_X - 1));
    uint minClusterY = min(uint(cellMin.y), uint(CLUSTER_SIZE_Y - 1));
    uint maxClusterY = min(uint(cellMax.y), uint(CLUSTER_SIZE_Y - 1));
    uint minClusterZ = min(uint(sliceMin), uint(CLUSTER_SIZE_Z - 1));
    uint maxClusterZ = min(uint(sliceMax), uint(CLUSTER_SIZE_Z - 1));

    for (uint z = minClusterZ; z <= maxClusterZ; ++z) {
        for (uint y = minClusterY; y <= maxClusterY; ++y) {
            for (uint x = minClusterX; x <= maxClusterX; ++x) {
                uint clusterIdx = z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + y * CLUSTER_SIZE_X + x;

                // InterlockedAdd returns void; the previous counter value comes back through
                // the third (out) argument.
                uint slot;
                InterlockedAdd(globalOffset[0], 1u, slot);

                uint offset = clusterCountsandOffsets[clusterIdx].offsets + slot;
                if (offset < NUM_LIGHTS * TOTAL_CLUSTERS) {
                    indices[offset].clusterIndexList = lightIdx;
                }
            }
        }
    }
}
说明:
- 每个线程处理一个光源,计算其影响的集群范围。
- 使用InterlockedAdd原子操作更新全局偏移,确保线程安全。
- 需要一个额外的全局索引globalOffset 存储缓冲区来跟踪索引分配。
8.添加全局偏移缓冲区
在c++端的VulkanExample类中添加
// All GPU buffers owned by the example. Despite the name, the cluster buffers and
// globalOffset are created as storage buffers.
struct {
vks::Buffer object; // matrices (sized from uboMatrices)
vks::Buffer params; // light parameters (sized from uboParams)
vks::Buffer clusterData; // per-cluster count and offset
vks::Buffer clusterIndexList; // light index list
vks::Buffer globalOffset; // new: global offset counter for the index fill pass
vks::Buffer sphereVertex;
vks::Buffer sphereIndex;
vks::Buffer sphereNormal;
} uniformBuffers;
同步更新prepareUniformBuffers()。
在 prepareUniformBuffers() 中添加:
// Global offset counter: a single uint32_t in a host-visible, host-coherent storage buffer.
VK_CHECK_RESULT(vulkanDevice->createBuffer(
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
&uniformBuffers.globalOffset,
sizeof(uint32_t)));
// Map the buffer so that its `mapped` pointer can be written from the CPU.
VK_CHECK_RESULT(uniformBuffers.globalOffset.map());
在 setupDescriptors() 的计算描述符中添加绑定:
// Compute descriptor set layout: two uniform buffers followed by three storage buffers.
// The buffer behind each binding is given by the matching write further down.
std::vector<VkDescriptorSetLayoutBinding> computeBindings = {
vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 0),
vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 1),
vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 2),
vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 3),
vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 4), // globalOffset
};
// Descriptor writes: one per binding above, in the same order.
std::vector<VkWriteDescriptorSet> computeWriteDescriptorSets = {
vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 0, &uniformBuffers.object.descriptor),
vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, &uniformBuffers.params.descriptor),
vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2, &uniformBuffers.clusterData.descriptor),
vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3, &uniformBuffers.clusterIndexList.descriptor),
vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 4, &uniformBuffers.globalOffset.descriptor),
};
在 prepareComputeCommandBuffer() 中初始化 globalOffset:
void prepareComputeCommandBuffer() {
uint32_t zero = 0;
memcpy(uniformBuffers.globalOffset.mapped, &zero, sizeof(uint32_t)); // 重置偏移
VkCommandBufferAllocateInfo allocInfo = vks::initializers::commandBufferAllocateInfo(commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1);
VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &allocInfo, &computeCmdBuffer));
VkCommandBufferBeginInfo beginInfo = vks::initializers::commandBufferBeginInfo();
VK_CHECK_RESULT(vkBeginCommandBuffer(computeCmdBuffer, &beginInfo));
// 第一次 Dispatch:计算计数
vkCmdBindPipeline(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline);
vkCmdBindDescriptorSets(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipelineLayout, 0, 1, &computeDescriptorSet, 0, nullptr);
vkCmdDispatch(computeCmdBuffer, CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z);
// 计算偏移
vkCmdFillBuffer(computeCmdBuffer, uniformBuffers.clusterData.buffer, 0, sizeof(clusterData), 0); // 清空 clusterData
VkBufferMemoryBarrier barrier = vks::initializers::bufferMemoryBarrier();
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
barrier.buffer = uniformBuffers.clusterData.buffer;
barrier.size = sizeof(clusterData);
vkCmdPipelineBarrier(computeCmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 1, &barrier, 0, nullptr);
// 第二次 Dispatch:填充索引
VkPipeline computePipelineIndex;
VkPipelineShaderStageCreateInfo shaderStage = loadShader(getShadersPath() + "pbrbasic/lightindex.comp.hlsl", VK_SHADER_STAGE_COMPUTE_BIT);
VkComputePipelineCreateInfo computePipelineCreateInfo = vks::initializers::computePipelineCreateInfo(computePipelineLayout);
computePipelineCreateInfo.stage = shaderStage;
VK_CHECK_RESULT(vkCreateComputePipelines(device, pipelineCache, 1, &computePipelineCreateInfo, nullptr, &computePipelineIndex));
vkCmdBindPipeline(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipelineIndex);
vkCmdDispatch(computeCmdBuffer, 1, 1, 1); // 64 个光源
VK_CHECK_RESULT(vkEndCommandBuffer(computeCmdBuffer));
}
优化之前的片段着色器:
修改 pbr.frag.hlsl,确保正确处理 3D 集群,并优化光源遍历:
// Fragment shader entry point: finds the cluster containing the fragment and accumulates the
// contribution of the lights listed for that cluster, followed by ambient and gamma correction.
float4 main(VSOutput input) : SV_TARGET {
    const float zNear = 0.1;
    const float zFar = 256.0;
    const uint maxLightsPerCluster = 32; // upper bound on lights evaluated per cluster

    float3 N = normalize(input.Normal);
    float3 V = normalize(ubo.camPos - input.WorldPos);
    float roughness = material.roughness;

    // Fragment position in view space and in [0,1] screen space.
    float4 viewPos = mul(ubo.view, float4(input.WorldPos, 1.0));
    float4 clipPos = mul(ubo.projection, viewPos);
    clipPos /= clipPos.w;
    float2 screenPos = clipPos.xy * 0.5 + 0.5;
    float viewZ = -viewPos.z;

    // Cluster coordinates: logarithmic depth slice, uniform x/y cells.
    uint clusterZ = clamp(uint((log(max(viewZ, zNear) / zNear) / log(zFar / zNear)) * CLUSTER_SIZE_Z), 0u, CLUSTER_SIZE_Z - 1);
    uint clusterX = clamp(uint(screenPos.x * CLUSTER_SIZE_X), 0u, CLUSTER_SIZE_X - 1);
    uint clusterY = clamp(uint(screenPos.y * CLUSTER_SIZE_Y), 0u, CLUSTER_SIZE_Y - 1);
    uint clusterIdx = clusterZ * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + clusterY * CLUSTER_SIZE_X + clusterX;

    // Slice of the index list that belongs to this cluster, capped at maxLightsPerCluster.
    uint firstEntry = clusterCountsandOffsets[clusterIdx].offsets;
    uint entryCount = min(clusterCountsandOffsets[clusterIdx].counts, maxLightsPerCluster);
    uint lastEntry = firstEntry + entryCount;

    float3 Lo = float3(0.0, 0.0, 0.0);
    for (uint entry = firstEntry; entry < lastEntry; entry++) {
        uint lightIndex = indices[entry].clusterIndexList;
        float3 toLight = lights[lightIndex].position.xyz - input.WorldPos;
        float3 L = normalize(toLight);
        float radianceFactor = radiance(lights[lightIndex].colorAndRadius.w, toLight, N, L);
        float3 lightColor = lights[lightIndex].colorAndRadius.xyz;
        Lo += BRDF(L, V, N, material.metallic, roughness) * lightColor * radianceFactor;
    }

    // Constant ambient term plus direct lighting, then gamma correction.
    float3 color = materialcolor() * 0.02;
    color += Lo;
    color = pow(color, float3(0.4545, 0.4545, 0.4545));
    return float4(color, 1.0);
}
说明:
- 深度分割:使用对数深度公式计算 clusterZ,与计算着色器保持一致。
- 光源限制:添加 maxLightsPerCluster(设为 32),避免过多的光源遍历。
- 缓冲区访问:使用 StructuredBuffer(对应 Storage Buffer)访问 clusterCountsandOffsets 和 indices。

浙公网安备 33010602011771号