[置顶] cluster light Vulkan实现(gpu)

实现GPU版本cluster light的步骤
1.分割视椎体为三维网格

  • 具体实现:
    选取合适的网格大小(例如16×9×24的网格,三个维度分别对应屏幕的x、y方向和深度z)。每个cluster表示一个空间区域,并记录可能影响该区域内像素的光源。

按照对数分割,计算AABB(轴对齐包围盒),进行光源与cluster的相交测试。

  • GPU设置:将cluster边界存储在结构化缓冲区域或3D纹理中,供计算着色器使用。

2.光源分配(注入阶段)

逐个Tile收集

  • 使用计算着色器并行处理所有cluster和光源。
  • 对每个光源,计算其影响范围(如球形光源,其位置半径决定影响范围)。
  • 进行相交检测,与AABB包围盒进行相交测试。

逐个光源注入

  • 遍历光源,将其分配到所有与之相交cluster,减少重复计算。

压缩阶段

  • 使用计算着色器清理光源列表,移除重复或空项目,优化内存使用。

GPU细节处理:

  • 使用线程组并行处理cluster。
  • 将光源索引存储在每个cluster的固定大小数组或链表中。
  • 使用原子操作处理多线程写入冲突。

3.着色阶段
按照像素所属cluster计算所有光源的贡献,完成着色。

实现:

  • 在片段着色器中,根据像素的屏幕坐标和深度缓冲区,计算其视空间位置。
  • 确定像素所属的cluster(通过网格映射)。
  • 从注入阶段的缓冲区中读取该cluster的光源索引列表。
  • 遍历光源列表,计算每个光源的贡献,使用PBR模型。
  • 累加所有光源的贡献,完成像素着色。

可能的优化
*应用GPU纹理可快速访问光源数据和cluster分配。

修改思路

为了实现GPU版本的Cluster Lights,需要:

1.调整cluster,采用对数深度分布。
在C++端的VulkanExample类中,调整updateLightsCluster()函数,使用对数深度分割,为后续的GPU计算着色器做准备。

void updateLightsCluster() {
    // 清空集群数据
    memset(clusterIndexList.indices, 0, sizeof(clusterIndexList.indices));
    memset(clusterData.cluster, 0, sizeof(clusterData.cluster));

    glm::mat4 viewProj = uboMatrices.projection * uboMatrices.view;
    float zNear = 0.1f;
    float zFar = 256.0f;
    float logZFarNear = log(zFar / zNear); // 预计算 log(zFar/zNear)

    // 分配临时数组记录已分配的光源
    std::vector<std::vector<bool>> assignedLights(TOTAL_CLUSTERS, std::vector<bool>(maxnumLights, false));

    for (int lightIdx = 0; lightIdx < maxnumLights; lightIdx++) {
        Light& light = uboParams.lights[lightIdx];
        float radius = light.colorAndRadius.w;

        // 变换光源位置到视图空间
        glm::vec4 viewPos = uboMatrices.view * light.position;
        float viewZ = -viewPos.z; // Vulkan 视图空间 Z 为负
        if (viewZ < zNear || viewZ > zFar) continue; // 跳过视锥体外的光源

        // 计算光源在视图空间的 AABB
        glm::vec3 minAABB = glm::vec3(viewPos) - glm::vec3(radius);
        glm::vec3 maxAABB = glm::vec3(viewPos) + glm::vec3(radius);

        // 变换到 NDC 空间
        glm::vec4 clipMin = viewProj * glm::vec4(minAABB, 1.0f);
        glm::vec4 clipMax = viewProj * glm::vec4(maxAABB, 1.0f);
        clipMin /= clipMin.w;
        clipMax /= clipMax.w;

        // 映射到集群索引
        uint32_t minClusterX = static_cast<uint32_t>((clipMin.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
        uint32_t maxClusterX = static_cast<uint32_t>((clipMax.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
        uint32_t minClusterY = static_cast<uint32_t>((clipMin.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
        uint32_t maxClusterY = static_cast<uint32_t>((clipMax.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);

        // 对数深度分割
        float minZ = glm::max(viewZ - radius, zNear);
        float maxZ = glm::min(viewZ + radius, zFar);
        uint32_t minClusterZ = static_cast<uint32_t>((log(minZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);
        uint32_t maxClusterZ = static_cast<uint32_t>((log(maxZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);

        minClusterX = glm::clamp(minClusterX, 0u, CLUSTER_SIZE_X - 1);
        maxClusterX = glm::clamp(maxClusterX, 0u, CLUSTER_SIZE_X - 1);
        minClusterY = glm::clamp(minClusterY, 0u, CLUSTER_SIZE_Y - 1);
        maxClusterY = glm::clamp(maxClusterY, 0u, CLUSTER_SIZE_Y - 1);
        minClusterZ = glm::clamp(minClusterZ, 0u, CLUSTER_SIZE_Z - 1);
        maxClusterZ = glm::clamp(maxClusterZ, 0u, CLUSTER_SIZE_Z - 1);

        for (uint32_t z = minClusterZ; z <= maxClusterZ; ++z) {
            for (uint32_t y = minClusterY; y <= maxClusterY; ++y) {
                for (uint32_t x = minClusterX; x <= maxClusterX; ++x) {
                    uint32_t clusterIdx = z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + y * CLUSTER_SIZE_X + x;
                    if (!assignedLights[clusterIdx][lightIdx] && clusterData.cluster[clusterIdx].count < maxnumLights) {
                        clusterData.cluster[clusterIdx].count++;
                        assignedLights[clusterIdx][lightIdx] = true;
                    }
                }
            }
        }
    }

    // 计算偏移
    uint32_t runningSum = 0;
    for (uint32_t i = 0; i < TOTAL_CLUSTERS; i++) {
        clusterData.cluster[i].offset = runningSum;
        runningSum += clusterData.cluster[i].count;
    }

    // 填充光源索引列表
    std::vector<uint32_t> tempOffsets(TOTAL_CLUSTERS, 0);
    for (int lightIdx = 0; lightIdx < maxnumLights; lightIdx++) {
        Light& light = uboParams.lights[lightIdx];
        float radius = light.colorAndRadius.w;
        glm::vec4 viewPos = uboMatrices.view * light.position;
        float viewZ = -viewPos.z;

        if (viewZ < zNear || viewZ > zFar) continue;

        glm::vec3 minAABB = glm::vec3(viewPos) - glm::vec3(radius);
        glm::vec3 maxAABB = glm::vec3(viewPos) + glm::vec3(radius);
        glm::vec4 clipMin = viewProj * glm::vec4(minAABB, 1.0f);
        glm::vec4 clipMax = viewProj * glm::vec4(maxAABB, 1.0f);
        clipMin /= clipMin.w;
        clipMax /= clipMax.w;

        uint32_t minClusterX = static_cast<uint32_t>((clipMin.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
        uint32_t maxClusterX = static_cast<uint32_t>((clipMax.x * 0.5f + 0.5f) * CLUSTER_SIZE_X);
        uint32_t minClusterY = static_cast<uint32_t>((clipMin.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
        uint32_t maxClusterY = static_cast<uint32_t>((clipMax.y * 0.5f + 0.5f) * CLUSTER_SIZE_Y);
        float minZ = glm::max(viewZ - radius, zNear);
        float maxZ = glm::min(viewZ + radius, zFar);
        uint32_t minClusterZ = static_cast<uint32_t>((log(minZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);
        uint32_t maxClusterZ = static_cast<uint32_t>((log(maxZ / zNear) / logZFarNear) * CLUSTER_SIZE_Z);

        minClusterX = glm::clamp(minClusterX, 0u, CLUSTER_SIZE_X - 1);
        maxClusterX = glm::clamp(maxClusterX, 0u, CLUSTER_SIZE_X - 1);
        minClusterY = glm::clamp(minClusterY, 0u, CLUSTER_SIZE_Y - 1);
        maxClusterY = glm::clamp(maxClusterY, 0u, CLUSTER_SIZE_Y - 1);
        minClusterZ = glm::clamp(minClusterZ, 0u, CLUSTER_SIZE_Z - 1);
        maxClusterZ = glm::clamp(maxClusterZ, 0u, CLUSTER_SIZE_Z - 1);

        for (uint32_t z = minClusterZ; z <= maxClusterZ; ++z) {
            for (uint32_t y = minClusterY; y <= maxClusterY; ++y) {
                for (uint32_t x = minClusterX; x <= maxClusterX; ++x) {
                    uint32_t clusterIdx = z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + y * CLUSTER_SIZE_X + x;
                    uint32_t offset = clusterData.cluster[clusterIdx].offset + tempOffsets[clusterIdx];
                    if (offset < lightIndexListnum && tempOffsets[clusterIdx] < clusterData.cluster[clusterIdx].count) {
                        clusterIndexList.indices[offset].clusterIndexList = lightIdx;
                        tempOffsets[clusterIdx]++;
                    }
                }
            }
        }
    }

    memcpy(uniformBuffers.clusterData.mapped, &clusterData, sizeof(clusterData));
    memcpy(uniformBuffers.clusterIndexList.mapped, &clusterIndexList, sizeof(clusterIndexList));
}

2.添加GPU注入阶段
添加一个计算着色器来替代上面c++中的updateLightsCluster,并使用存储缓冲区(Storage Buffer)而非Uniform Buffer,以支持更大容量和写操作。

c++端代码修改

void prepareUniformBuffers() {
    VkPhysicalDeviceProperties properties;
    vkGetPhysicalDeviceProperties(physicalDevice, &properties);
    VkDeviceSize minAlignment = properties.limits.minStorageBufferOffsetAlignment;
    VkDeviceSize alignedSizeClusterIndexList = ((sizeof(clusterIndexList) + minAlignment - 1) / minAlignment) * minAlignment;

    // 矩阵缓冲区(仍为 Uniform Buffer)
    VK_CHECK_RESULT(vulkanDevice->createBuffer(
        VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        &uniformBuffers.object,
        sizeof(uboMatrices)));

    // 光源数据缓冲区(仍为 Uniform Buffer)
    VK_CHECK_RESULT(vulkanDevice->createBuffer(
        VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        &uniformBuffers.params,
        sizeof(uboParams)));

    // 集群计数和偏移(Storage Buffer)
    VK_CHECK_RESULT(vulkanDevice->createBuffer(
        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        &uniformBuffers.clusterData,
        sizeof(clusterData)));

    // 光源索引列表(Storage Buffer)
    VK_CHECK_RESULT(vulkanDevice->createBuffer(
        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        &uniformBuffers.clusterIndexList,
        alignedSizeClusterIndexList));

    VK_CHECK_RESULT(uniformBuffers.object.map());
    VK_CHECK_RESULT(uniformBuffers.params.map());
    VK_CHECK_RESULT(uniformBuffers.clusterData.map());
    VK_CHECK_RESULT(uniformBuffers.clusterIndexList.map());

    prepareSphereBuffers();
}

2.添加计算管线

在VulkanExample类中添加计算管线的相关成员。

class VulkanExample : public VulkanExampleBase {
public:
    // ... other members ...

    // Objects used by the compute light-culling pass. All handles start out
    // null until the corresponding setup function creates them.
    VkPipeline computePipeline = VK_NULL_HANDLE;
    VkPipelineLayout computePipelineLayout = VK_NULL_HANDLE;
    VkDescriptorSet computeDescriptorSet = VK_NULL_HANDLE;
    VkDescriptorSetLayout computeDescriptorSetLayout = VK_NULL_HANDLE;
};

3.设置描述符集

在setupDescriptors()中添加计算描述符集布局:

void setupDescriptors() {
    // 图形描述符
    std::vector<VkDescriptorPoolSize> poolSizes = {
        vks::initializers::descriptorPoolSize(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 4),
        vks::initializers::descriptorPoolSize(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2), // 为计算着色器添加
    };
    VkDescriptorPoolCreateInfo descriptorPoolInfo = vks::initializers::descriptorPoolCreateInfo(poolSizes, 3); // 增加描述符集数
    VK_CHECK_RESULT(vkCreateDescriptorPool(device, &descriptorPoolInfo, nullptr, &descriptorPool));

    // 图形描述符集布局
    std::vector<VkDescriptorSetLayoutBinding> setLayoutBindings = {
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, 0),
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT, 1),
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT, 2), // 改为 Storage Buffer
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT, 3), // 改为 Storage Buffer
    };
    VkDescriptorSetLayoutCreateInfo descriptorLayout = vks::initializers::descriptorSetLayoutCreateInfo(setLayoutBindings);
    VK_CHECK_RESULT(vkCreateDescriptorSetLayout(device, &descriptorLayout, nullptr, &descriptorSetLayout));

    VkDescriptorSetAllocateInfo allocInfo = vks::initializers::descriptorSetAllocateInfo(descriptorPool, &descriptorSetLayout, 1);
    VK_CHECK_RESULT(vkAllocateDescriptorSets(device, &allocInfo, &descriptorSet));

    std::vector<VkWriteDescriptorSet> writeDescriptorSets = {
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 0, &uniformBuffers.object.descriptor),
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, &uniformBuffers.params.descriptor),
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2, &uniformBuffers.clusterIndexList.descriptor),
        vks::initializers::writeDescriptorSet(descriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3, &uniformBuffers.clusterData.descriptor),
    };
    vkUpdateDescriptorSets(device, static_cast<uint32_t>(writeDescriptorSets.size()), writeDescriptorSets.data(), 0, NULL);

    // 计算描述符集布局
    std::vector<VkDescriptorSetLayoutBinding> computeBindings = {
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 0), // 矩阵
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 1), // 光源
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 2), // 集群计数和偏移
        vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 3), // 光源索引列表
    };
    VkDescriptorSetLayoutCreateInfo computeLayoutInfo = vks::initializers::descriptorSetLayoutCreateInfo(computeBindings);
    VK_CHECK_RESULT(vkCreateDescriptorSetLayout(device, &computeLayoutInfo, nullptr, &computeDescriptorSetLayout));

    allocInfo = vks::initializers::descriptorSetAllocateInfo(descriptorPool, &computeDescriptorSetLayout, 1);
    VK_CHECK_RESULT(vkAllocateDescriptorSets(device, &allocInfo, &computeDescriptorSet));

    std::vector<VkWriteDescriptorSet> computeWriteDescriptorSets = {
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 0, &uniformBuffers.object.descriptor),
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, &uniformBuffers.params.descriptor),
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2, &uniformBuffers.clusterData.descriptor),
        vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3, &uniformBuffers.clusterIndexList.descriptor),
    };
    vkUpdateDescriptorSets(device, static_cast<uint32_t>(computeWriteDescriptorSets.size()), computeWriteDescriptorSets.data(), 0, NULL);
}

4.创建计算管线

在preparePipelines()后添加计算管线准备函数prepareComputePipeline()

void prepareComputePipeline() {
    VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = vks::initializers::pipelineLayoutCreateInfo(&computeDescriptorSetLayout, 1);
    VK_CHECK_RESULT(vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, nullptr, &computePipelineLayout));

    VkComputePipelineCreateInfo computePipelineCreateInfo = vks::initializers::computePipelineCreateInfo(computePipelineLayout);
    VkPipelineShaderStageCreateInfo shaderStage = loadShader(getShadersPath() + "pbrbasic/lightcull.comp.hlsl", VK_SHADER_STAGE_COMPUTE_BIT);
    computePipelineCreateInfo.stage = shaderStage;
    VK_CHECK_RESULT(vkCreateComputePipelines(device, pipelineCache, 1, &computePipelineCreateInfo, nullptr, &computePipeline));
}

在prepare()中调用该函数

// One-time setup of the example. The steps run in dependency order (buffers
// before descriptors, descriptors before pipelines); `prepared` is set last
// and is the flag render() checks before doing any work.
void prepare() {
    VulkanExampleBase::prepare();
    loadAssets();
    prepareUniformBuffers();
    setupDescriptors();
    preparePipelines();
    prepareComputePipeline(); // added: pipeline for the compute light-culling pass
    buildCommandBuffers();
    prepared = true;
}

5.更新命令缓冲区以运行计算着色器

在buildCommandBuffers()之前添加一个计算命令缓冲区。

VkCommandBuffer computeCmdBuffer{ VK_NULL_HANDLE };

void prepareComputeCommandBuffer() {
    VkCommandBufferAllocateInfo allocInfo = vks::initializers::commandBufferAllocateInfo(commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1);
    VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &allocInfo, &computeCmdBuffer));

    VkCommandBufferBeginInfo beginInfo = vks::initializers::commandBufferBeginInfo();
    VK_CHECK_RESULT(vkBeginCommandBuffer(computeCmdBuffer, &beginInfo));

    vkCmdBindPipeline(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline);
    vkCmdBindDescriptorSets(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipelineLayout, 0, 1, &computeDescriptorSet, 0, nullptr);
    vkCmdDispatch(computeCmdBuffer, CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z); // 启动 8x8x8 线程组
    VK_CHECK_RESULT(vkEndCommandBuffer(computeCmdBuffer));
}

在render()中调用:

// Per-frame entry point: updates uniforms, animates the lights unless paused,
// runs the compute light-culling pass and then draws.
void render() {
    if (!prepared) return;
    updateUniformBuffers();
    if (!paused) {
        updateLights();
    }

    // The culling pass runs every frame, not only while unpaused: cluster
    // assignment depends on the view matrix as well as on the light positions.
    // NOTE(review): this assumes updateUniformBuffers() uploads the current
    // camera matrices even while paused - confirm.
    prepareComputeCommandBuffer();
    VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo();
    computeSubmitInfo.commandBufferCount = 1;
    computeSubmitInfo.pCommandBuffers = &computeCmdBuffer;
    VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &computeSubmitInfo, VK_NULL_HANDLE));
    // Block until the compute pass has finished before the draw is submitted;
    // a failure here (e.g. device lost) is no longer silently ignored.
    VK_CHECK_RESULT(vkQueueWaitIdle(queue));

    draw();
}

6.添加计算着色器
创建文件 shaders/pbrbasic/lightcull.comp.hlsl

// Light-culling compute shader: shared constants, data layouts and resources.

#define NUM_LIGHTS 64
#define CLUSTER_SIZE_X 8
#define CLUSTER_SIZE_Y 8
#define CLUSTER_SIZE_Z 8
#define TOTAL_CLUSTERS (CLUSTER_SIZE_X * CLUSTER_SIZE_Y * CLUSTER_SIZE_Z)

// One light; the radius is stored in colorAndRadius.w.
struct Light {
    float4 position;
    float4 colorAndRadius;
    float4 direction;
    float4 cutOff;
};

// Per-cluster entry, padded to 16 bytes.
struct Cluster {
    uint counts;   // number of lights assigned to the cluster
    uint offsets;  // first entry of the cluster in the light index list
    float2 padding;
};

// One light index list entry, padded to 16 bytes.
struct Indices {
    uint clusterIndexList;
    float3 padding;
};

// Descriptor binding 0: matrices.
cbuffer ubo : register(b0) {
    float4x4 projection;
    float4x4 model;
    float4x4 view;
    float3 camPos;
};

// Descriptor binding 1: light parameters.
cbuffer uboParams : register(b1) {
    Light lights[NUM_LIGHTS];
};

// Without an explicit Vulkan binding, register(u0)/register(u1) are translated
// to bindings 0 and 1, which collide with the two constant buffers above and
// do not match the compute descriptor set (storage buffers at bindings 2 and 3).
// The vk::binding attribute pins the Vulkan binding; the register is kept for
// non-Vulkan targets.
[[vk::binding(2, 0)]] RWStructuredBuffer<Cluster> clusterCountsandOffsets : register(u0);
[[vk::binding(3, 0)]] RWStructuredBuffer<Indices> indices : register(u1);

// Projects a view-space point and returns its NDC xy position.
float2 projectToNdc(float3 viewSpacePoint) {
    float4 clip = mul(projection, float4(viewSpacePoint, 1.0f));
    return clip.xy / clip.w;
}

// Computes the inclusive range of cluster coordinates a light's bounding
// sphere may touch. Returns false when the light cannot affect any cluster.
// X/Y cells are uniform in NDC, Z slices are uniform in log(view depth).
bool lightClusterRange(Light light, out uint3 rangeLo, out uint3 rangeHi) {
    rangeLo = uint3(0, 0, 0);
    rangeHi = uint3(0, 0, 0);

    const float zNear = 0.1f;
    const float zFar = 256.0f;

    float4 viewPos = mul(view, light.position);
    float viewZ = -viewPos.z; // the camera looks down -Z in view space
    float radius = light.colorAndRadius.w;

    // Cull by the sphere's depth extent rather than its centre only.
    if (viewZ + radius < zNear || viewZ - radius > zFar) return false;
    float nearZ = max(viewZ - radius, zNear);
    float farZ = min(viewZ + radius, zFar);

    // Project both XY corners at the nearest and farthest (clamped) depth.
    // Taking min/max over the four results gives a conservative rectangle and
    // stays correct when the projection flips an axis.
    float2 boxLo = viewPos.xy - radius;
    float2 boxHi = viewPos.xy + radius;
    float2 p0 = projectToNdc(float3(boxLo, -nearZ));
    float2 p1 = projectToNdc(float3(boxHi, -nearZ));
    float2 p2 = projectToNdc(float3(boxLo, -farZ));
    float2 p3 = projectToNdc(float3(boxHi, -farZ));
    float2 ndcLo = min(min(p0, p1), min(p2, p3));
    float2 ndcHi = max(max(p0, p1), max(p2, p3));

    // Entirely off screen.
    if (any(ndcHi < -1.0f) || any(ndcLo > 1.0f)) return false;

    float logZFarNear = log(zFar / zNear);
    float3 gridSize = float3(CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z);
    float3 tLo = float3(ndcLo * 0.5f + 0.5f, log(nearZ / zNear) / logZFarNear);
    float3 tHi = float3(ndcHi * 0.5f + 0.5f, log(farZ / zNear) / logZFarNear);

    // Clamp in float before converting so negative values never reach uint.
    rangeLo = uint3(clamp(floor(tLo * gridSize), float3(0.0f, 0.0f, 0.0f), gridSize - 1.0f));
    rangeHi = uint3(clamp(floor(tHi * gridSize), float3(0.0f, 0.0f, 0.0f), gridSize - 1.0f));
    return true;
}

// Counting pass: one work group (a single thread) per cluster. Counts the
// lights whose cluster range contains this cluster and stores the result;
// offsets are left at 0 for a later pass to fill in.
[numthreads(1, 1, 1)]
void main(uint3 groupID : SV_GroupID, uint3 threadID : SV_GroupThreadID) {
    // Ignore work groups outside the cluster grid.
    if (any(groupID >= uint3(CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z))) return;
    uint clusterIdx = groupID.z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + groupID.y * CLUSTER_SIZE_X + groupID.x;

    // Accumulate locally and write the buffer once instead of doing a
    // read-modify-write on storage memory for every light.
    uint count = 0;
    for (uint lightIdx = 0; lightIdx < NUM_LIGHTS; lightIdx++) {
        uint3 rangeLo;
        uint3 rangeHi;
        if (!lightClusterRange(lights[lightIdx], rangeLo, rangeHi)) continue;
        if (all(groupID >= rangeLo) && all(groupID <= rangeHi)) {
            count++;
        }
    }

    clusterCountsandOffsets[clusterIdx].counts = count;
    clusterCountsandOffsets[clusterIdx].offsets = 0;
}

"pbrbasic/lightcull.comp.hlsl"

  • 注意,在创建文件的过程中,省略.hlsl。
  • 在hlsl文件中,使用RWStructuredBuffer写入集群计数。
  • 每个线程处理一个集群,计算其在NDC空间的范围,并测试光源AABB是否相交。
  • 暂时只计算counts,偏移和索引列表将在后续阶段完成。

7.添加第二阶段计算着色器(填充索引列表)
创建文件shaders/pbrbasic/lightindex.comp.hlsl:

// Index-fill compute shader (second pass of clustered light assignment).
//
// Input:  per-cluster light counts written by the counting pass.
// Output: per-cluster offsets (exclusive prefix sum of the counts), the light
//         index list, and the final per-cluster counts.
//
// The shader is written for a single work group (it is dispatched as 1x1x1):
// cluster offsets and append cursors live in group-shared memory.

#define NUM_LIGHTS 64
#define CLUSTER_SIZE_X 8
#define CLUSTER_SIZE_Y 8
#define CLUSTER_SIZE_Z 8
#define TOTAL_CLUSTERS (CLUSTER_SIZE_X * CLUSTER_SIZE_Y * CLUSTER_SIZE_Z)
#define LIGHT_THREADS 64 // threads in the work group

// One light; the radius is stored in colorAndRadius.w.
struct Light {
    float4 position;
    float4 colorAndRadius;
    float4 direction;
    float4 cutOff;
};

// Per-cluster entry, padded to 16 bytes.
struct Cluster {
    uint counts;   // number of lights assigned to the cluster
    uint offsets;  // first entry of the cluster in the light index list
    float2 padding;
};

// One light index list entry, padded to 16 bytes.
struct Indices {
    uint clusterIndexList;
    float3 padding;
};

cbuffer ubo : register(b0) {
    float4x4 projection;
    float4x4 model;
    float4x4 view;
    float3 camPos;
};

cbuffer uboParams : register(b1) {
    Light lights[NUM_LIGHTS];
};

// Explicit Vulkan bindings: plain register(t0/u1/u2) would be translated to
// bindings 0/1/2 and collide with the constant buffers instead of matching the
// storage buffers at bindings 2, 3 and 4 of the compute descriptor set.
// The cluster buffer is read-write because this pass stores offsets and the
// final counts.
[[vk::binding(2, 0)]] RWStructuredBuffer<Cluster> clusterCountsandOffsets : register(u0);
[[vk::binding(3, 0)]] RWStructuredBuffer<Indices> indices : register(u1);
[[vk::binding(4, 0)]] RWStructuredBuffer<uint> globalOffset : register(u2); // receives the total number of list entries

groupshared uint clusterCapacity[TOTAL_CLUSTERS]; // counts from the counting pass
groupshared uint clusterBase[TOTAL_CLUSTERS];     // exclusive prefix sum of the capacities
groupshared uint clusterCursor[TOTAL_CLUSTERS];   // entries appended so far

// Projects a view-space point and returns its NDC xy position.
float2 projectToNdc(float3 viewSpacePoint) {
    float4 clip = mul(projection, float4(viewSpacePoint, 1.0f));
    return clip.xy / clip.w;
}

// Computes the inclusive range of cluster coordinates a light's bounding
// sphere may touch. Returns false when the light cannot affect any cluster.
bool lightClusterRange(Light light, out uint3 rangeLo, out uint3 rangeHi) {
    rangeLo = uint3(0, 0, 0);
    rangeHi = uint3(0, 0, 0);

    const float zNear = 0.1f;
    const float zFar = 256.0f;

    float4 viewPos = mul(view, light.position);
    float viewZ = -viewPos.z; // the camera looks down -Z in view space
    float radius = light.colorAndRadius.w;

    // Cull by the sphere's depth extent rather than its centre only.
    if (viewZ + radius < zNear || viewZ - radius > zFar) return false;
    float nearZ = max(viewZ - radius, zNear);
    float farZ = min(viewZ + radius, zFar);

    // Conservative NDC rectangle: both XY corners at the nearest and farthest
    // (clamped) depth, min/max over the four projections.
    float2 boxLo = viewPos.xy - radius;
    float2 boxHi = viewPos.xy + radius;
    float2 p0 = projectToNdc(float3(boxLo, -nearZ));
    float2 p1 = projectToNdc(float3(boxHi, -nearZ));
    float2 p2 = projectToNdc(float3(boxLo, -farZ));
    float2 p3 = projectToNdc(float3(boxHi, -farZ));
    float2 ndcLo = min(min(p0, p1), min(p2, p3));
    float2 ndcHi = max(max(p0, p1), max(p2, p3));

    // Entirely off screen.
    if (any(ndcHi < -1.0f) || any(ndcLo > 1.0f)) return false;

    float logZFarNear = log(zFar / zNear);
    float3 gridSize = float3(CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z);
    float3 tLo = float3(ndcLo * 0.5f + 0.5f, log(nearZ / zNear) / logZFarNear);
    float3 tHi = float3(ndcHi * 0.5f + 0.5f, log(farZ / zNear) / logZFarNear);

    // Clamp in float before converting so negative values never reach uint.
    rangeLo = uint3(clamp(floor(tLo * gridSize), float3(0.0f, 0.0f, 0.0f), gridSize - 1.0f));
    rangeHi = uint3(clamp(floor(tHi * gridSize), float3(0.0f, 0.0f, 0.0f), gridSize - 1.0f));
    return true;
}

[numthreads(LIGHT_THREADS, 1, 1)]
void main(uint3 groupID : SV_GroupID, uint3 threadID : SV_GroupThreadID) {
    // Only the first work group does the work; the check is uniform per group,
    // so the barriers below stay in uniform control flow.
    if (any(groupID != uint3(0, 0, 0))) return;

    const uint lane = threadID.x;
    uint c;

    // Phase 1: cache the counts from the counting pass and reset the cursors.
    for (c = lane; c < TOTAL_CLUSTERS; c += LIGHT_THREADS) {
        clusterCapacity[c] = min(clusterCountsandOffsets[c].counts, (uint)NUM_LIGHTS);
        clusterCursor[c] = 0;
    }
    GroupMemoryBarrierWithGroupSync();

    // Phase 2: exclusive prefix sum of the counts gives every cluster the start
    // of its slice in the index list. 512 clusters are cheap to scan serially.
    if (lane == 0) {
        uint running = 0;
        for (uint i = 0; i < TOTAL_CLUSTERS; i++) {
            clusterBase[i] = running;
            running += clusterCapacity[i];
        }
        globalOffset[0] = running; // total number of index list entries
    }
    GroupMemoryBarrierWithGroupSync();

    // Phase 3: every thread appends its light(s) to all clusters they touch.
    // InterlockedAdd returns the previous value through its third argument;
    // the per-cluster cursor hands out a unique slot inside the cluster slice.
    for (uint lightIdx = lane; lightIdx < NUM_LIGHTS; lightIdx += LIGHT_THREADS) {
        uint3 rangeLo;
        uint3 rangeHi;
        if (!lightClusterRange(lights[lightIdx], rangeLo, rangeHi)) continue;

        for (uint z = rangeLo.z; z <= rangeHi.z; ++z) {
            for (uint y = rangeLo.y; y <= rangeHi.y; ++y) {
                for (uint x = rangeLo.x; x <= rangeHi.x; ++x) {
                    uint clusterIdx = z * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + y * CLUSTER_SIZE_X + x;
                    uint slot;
                    InterlockedAdd(clusterCursor[clusterIdx], 1, slot);
                    // Never write past the space reserved by the counting pass.
                    if (slot < clusterCapacity[clusterIdx]) {
                        uint dst = clusterBase[clusterIdx] + slot;
                        if (dst < NUM_LIGHTS * TOTAL_CLUSTERS) {
                            indices[dst].clusterIndexList = lightIdx;
                        }
                    }
                }
            }
        }
    }
    GroupMemoryBarrierWithGroupSync();

    // Phase 4: publish offsets and the number of entries really written, so a
    // reader never walks past valid data.
    for (c = lane; c < TOTAL_CLUSTERS; c += LIGHT_THREADS) {
        clusterCountsandOffsets[c].offsets = clusterBase[c];
        clusterCountsandOffsets[c].counts = min(clusterCursor[c], clusterCapacity[c]);
    }
}

说明:

  • 每个线程处理一个光源,计算其影响的集群范围。
  • 使用InterlockedAdd原子操作更新全局偏移,确保线程安全。
  • 需要一个额外的全局索引globalOffset 存储缓冲区来跟踪索引分配。

8.添加全局偏移缓冲区

在c++端的VulkanExample类中添加

// GPU buffers owned by the example (uniform, storage and sphere mesh buffers).
struct {
    vks::Buffer object;
    vks::Buffer params;
    vks::Buffer clusterData;
    vks::Buffer clusterIndexList;
    vks::Buffer globalOffset; // added: global offset counter used by the index-fill compute pass
    vks::Buffer sphereVertex;
    vks::Buffer sphereIndex;
    vks::Buffer sphereNormal;
} uniformBuffers;

同步更新prepareUniformBuffers()。
在 prepareUniformBuffers() 中添加:

// Global offset counter: a single uint32_t in a host-visible, host-coherent
// storage buffer, so the CPU can reset it through the mapped pointer.
VK_CHECK_RESULT(vulkanDevice->createBuffer(
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
    &uniformBuffers.globalOffset,
    sizeof(uint32_t)));
// Keep it mapped for later host writes.
VK_CHECK_RESULT(uniformBuffers.globalOffset.map());

在 setupDescriptors() 的计算描述符中添加绑定:

std::vector<VkDescriptorSetLayoutBinding> computeBindings = {
    vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 0),
    vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 1),
    vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 2),
    vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 3),
    vks::initializers::descriptorSetLayoutBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT, 4), // globalOffset
};
std::vector<VkWriteDescriptorSet> computeWriteDescriptorSets = {
    vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 0, &uniformBuffers.object.descriptor),
    vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, &uniformBuffers.params.descriptor),
    vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2, &uniformBuffers.clusterData.descriptor),
    vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3, &uniformBuffers.clusterIndexList.descriptor),
    vks::initializers::writeDescriptorSet(computeDescriptorSet, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 4, &uniformBuffers.globalOffset.descriptor),
};

在 prepareComputeCommandBuffer() 中初始化 globalOffset:

void prepareComputeCommandBuffer() {
    uint32_t zero = 0;
    memcpy(uniformBuffers.globalOffset.mapped, &zero, sizeof(uint32_t)); // 重置偏移

    VkCommandBufferAllocateInfo allocInfo = vks::initializers::commandBufferAllocateInfo(commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1);
    VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &allocInfo, &computeCmdBuffer));

    VkCommandBufferBeginInfo beginInfo = vks::initializers::commandBufferBeginInfo();
    VK_CHECK_RESULT(vkBeginCommandBuffer(computeCmdBuffer, &beginInfo));

    // 第一次 Dispatch:计算计数
    vkCmdBindPipeline(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline);
    vkCmdBindDescriptorSets(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipelineLayout, 0, 1, &computeDescriptorSet, 0, nullptr);
    vkCmdDispatch(computeCmdBuffer, CLUSTER_SIZE_X, CLUSTER_SIZE_Y, CLUSTER_SIZE_Z);

    // 计算偏移
    vkCmdFillBuffer(computeCmdBuffer, uniformBuffers.clusterData.buffer, 0, sizeof(clusterData), 0); // 清空 clusterData
    VkBufferMemoryBarrier barrier = vks::initializers::bufferMemoryBarrier();
    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
    barrier.buffer = uniformBuffers.clusterData.buffer;
    barrier.size = sizeof(clusterData);
    vkCmdPipelineBarrier(computeCmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 1, &barrier, 0, nullptr);

    // 第二次 Dispatch:填充索引
    VkPipeline computePipelineIndex;
    VkPipelineShaderStageCreateInfo shaderStage = loadShader(getShadersPath() + "pbrbasic/lightindex.comp.hlsl", VK_SHADER_STAGE_COMPUTE_BIT);
    VkComputePipelineCreateInfo computePipelineCreateInfo = vks::initializers::computePipelineCreateInfo(computePipelineLayout);
    computePipelineCreateInfo.stage = shaderStage;
    VK_CHECK_RESULT(vkCreateComputePipelines(device, pipelineCache, 1, &computePipelineCreateInfo, nullptr, &computePipelineIndex));

    vkCmdBindPipeline(computeCmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipelineIndex);
    vkCmdDispatch(computeCmdBuffer, 1, 1, 1); // 64 个光源

    VK_CHECK_RESULT(vkEndCommandBuffer(computeCmdBuffer));
}

优化之前的片段着色器:
修改 pbr.frag.hlsl,确保正确处理 3D 集群,并优化光源遍历:

// Fragment shader entry point: finds the cluster containing the fragment and
// accumulates the PBR contribution of the lights listed for that cluster.
float4 main(VSOutput input) : SV_TARGET {
    float3 N = normalize(input.Normal);
    float3 V = normalize(ubo.camPos - input.WorldPos);
    float roughness = material.roughness;

    // Fragment position in view space and in NDC.
    float4 viewPos = mul(ubo.view, float4(input.WorldPos, 1.0));
    float4 clipPos = mul(ubo.projection, viewPos);
    clipPos /= clipPos.w;
    float2 screenPos = clipPos.xy * 0.5 + 0.5;

    // Depth slice: uniform in log(view depth) between zNear and zFar.
    float zNear = 0.1;
    float zFar = 256.0;
    float viewZ = -viewPos.z;
    uint clusterZ = clamp(uint((log(max(viewZ, zNear) / zNear) / log(zFar / zNear)) * CLUSTER_SIZE_Z), 0u, CLUSTER_SIZE_Z - 1);

    // Screen tile.
    uint clusterX = clamp(uint(screenPos.x * CLUSTER_SIZE_X), 0u, CLUSTER_SIZE_X - 1);
    uint clusterY = clamp(uint(screenPos.y * CLUSTER_SIZE_Y), 0u, CLUSTER_SIZE_Y - 1);

    uint clusterIdx = clusterZ * CLUSTER_SIZE_X * CLUSTER_SIZE_Y + clusterY * CLUSTER_SIZE_X + clusterX;
    uint firstEntry = clusterCountsandOffsets[clusterIdx].offsets;

    // Cap the number of lights evaluated per cluster.
    const uint maxLightsPerCluster = 32;
    uint lightCount = min(clusterCountsandOffsets[clusterIdx].counts, maxLightsPerCluster);

    float3 Lo = float3(0.0, 0.0, 0.0);
    for (uint n = 0; n < lightCount; n++) {
        uint lightIndex = indices[firstEntry + n].clusterIndexList;
        float3 lightVec = lights[lightIndex].position.xyz - input.WorldPos;
        float3 L = normalize(lightVec);
        float radianceFactor = radiance(lights[lightIndex].colorAndRadius.w, lightVec, N, L);
        float3 lightColor = lights[lightIndex].colorAndRadius.xyz;
        Lo += BRDF(L, V, N, material.metallic, roughness) * lightColor * radianceFactor;
    }

    // Small ambient term plus the accumulated light, then gamma correction.
    float3 color = materialcolor() * 0.02;
    color += Lo;
    color = pow(color, float3(0.4545, 0.4545, 0.4545));
    return float4(color, 1.0);
}

说明:

  • 深度分割:使用对数深度公式计算 clusterZ,与计算着色器保持一致。
  • 光源限制:添加 maxLightsPerCluster(设为 32),避免过多的光源遍历。
  • 缓冲区访问:使用 StructuredBuffer(对应 Storage Buffer)访问 clusterCountsandOffsets 和 indices。
posted @ 2025-06-14 22:54  BlueTOberry  阅读(41)  评论(0)    收藏  举报