cuda 学习

纯手写 GEMM 达到了 10.6 TFLOPS 的算力(逼近 cuBLAS 74% 的性能),转置带宽跑满了 622 GB/s(达到 cuBLAS 91% 的性能)。

在此总结一下这一路走来的优化经验:哪些是真神技,哪些是看似聪明的“反作用力”毒药。

一、 矩阵转置 (Transpose):访存的艺术

矩阵转置是一个纯粹的 Memory-Bound(访存受限) 任务。计算核心 (ALU) 几乎在睡大觉,所有的优化都在围绕显存控制器做文章。

✅ 核心有效操作 (The Good)

  1. 共享内存分块 (Shared Memory Tiling)
    • 原理:直接在全局显存 (Global Memory) 中进行 out[x][y] = in[y][x] 会导致致命的“跳跃访存”。利用 Shared Memory 作为中转站,先按块读入,再按块写出。
    • 收益:大幅降低对全局显存的访问延迟。
  2. 合并访存 (Coalesced Access)
    • 原理:永远记住 CUDA 的黄金法则:变化最快的线程索引 tx 必须对应内存中连续变化的维度(列)。即 x 对应列,y 对应行。
    • 收益:确保 32 个线程组成的 Warp 能一次性吃满显存的位宽(如 128-byte 事务),避免带宽浪费。
  3. Padding 消除 Bank Conflict (魔法 +1)
    • 原理:当 Shared Memory 的维度是 32 的倍数(如 [32][32])时,按列读取会产生严重的 32 路 Bank Conflict。将声明改为 __shared__ float tile[32][33] 即可解决。
    • 收益:极其廉价的一行代码,通过错位打破了内存块的对齐,彻底消灭 Bank Conflict,带宽瞬间飙升。
#include <stdio.h>
#include <cublas_v2.h>

// Check the status of a CUDA runtime call; on failure, print the file/line
// and the human-readable reason, then terminate.
// Wrapped in do { } while (0) so the macro expands to exactly one statement:
// the original `{ ... }` form followed by the caller's `;` breaks
// `if (cond) CHECK(...); else ...` (dangling-else / stray-semicolon bug).
#define CHECK(call) \
do { \
    const cudaError_t error = call; \
    if (error != cudaSuccess) { \
        printf("Error: %s:%d, ", __FILE__, __LINE__); \
        printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(1); \
    } \
} while (0)

// Hand-optimized tiled matrix transpose: out = transpose of `in`, where
// `in` is height rows x width cols, row-major.
// Launch contract (see main below): blockDim = (32, 8); each block handles
// one 32x32 tile, each thread moving 4 tile rows via the `j += 8` loops.
__global__ void transposeOptimized(float *out, float *in, int width, int height) {
    __shared__ float tile[32][33]; // 32x33: the +1 pad skews rows across banks so column reads avoid bank conflicts

    // Global coordinates of this thread's element in the INPUT matrix.
    int x = blockIdx.x * 32 + threadIdx.x;
    int y = blockIdx.y * 32 + threadIdx.y;

    // 1. Coalesced read: threadIdx.x walks the contiguous (column) dimension of `in`.
    #pragma unroll
    for (int j = 0; j < 32; j += 8) {
        if (x < width && (y + j) < height) {
            tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
        }
    }

    __syncthreads(); // tile must be fully populated before cross-thread reads below

    // 2. Recompute global coordinates for the transposed OUTPUT tile:
    //    block indices swap roles (block (bx,by) writes tile (by,bx) of `out`).
    int new_x = blockIdx.y * 32 + threadIdx.x;
    int new_y = blockIdx.x * 32 + threadIdx.y;

    // 3. Coalesced write: consecutive threadIdx.x hits consecutive `out` addresses;
    //    the transpose itself happens by indexing `tile` with swapped thread indices.
    #pragma unroll
    for (int j = 0; j < 32; j += 8) {
        if (new_x < height && (new_y + j) < width) {
            out[(new_y + j) * height + new_x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}

// Benchmark driver: times the hand-written transpose kernel against
// cublasSgeam on an N x N matrix and reports effective bandwidth.
int main() {
    int N = 8192;
    size_t size = (size_t)N * N * sizeof(float);   // size_t math avoids int overflow for larger N

    // Bytes moved per transpose: read A once + write B once.
    double total_bytes = 2.0 * (double)N * N * sizeof(float);

    float *d_A, *d_B_custom, *d_B_cublas;
    CHECK(cudaMalloc(&d_A, size));
    CHECK(cudaMalloc(&d_B_custom, size));
    CHECK(cudaMalloc(&d_B_cublas, size));

    // Byte-wise fill (each float becomes a tiny bit pattern) — the values are
    // irrelevant for a pure bandwidth benchmark.
    CHECK(cudaMemset(d_A, 1, size));

    dim3 threads(32, 8);   // matches the kernel's contract: 4 tile rows per thread
    dim3 blocks((N + 32 - 1) / 32, (N + 32 - 1) / 32);

    // --- High-resolution GPU timers ---
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    float ms = 0;

    printf("Matrix Size: %d x %d\n", N, N);
    printf("--------------------------------------------------\n");

    // ==========================================
    // 1. Benchmark the hand-written kernel
    // ==========================================
    // Warmup launch (absorbs one-time driver/JIT overhead).
    transposeOptimized<<<blocks, threads>>>(d_B_custom, d_A, N, N);
    CHECK(cudaGetLastError());          // launches don't return errors; query explicitly
    CHECK(cudaDeviceSynchronize());

    int iter = 10;
    CHECK(cudaEventRecord(start));
    for (int i = 0; i < iter; i++) {
        transposeOptimized<<<blocks, threads>>>(d_B_custom, d_A, N, N);
    }
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&ms, start, stop));

    float custom_time = ms / iter;
    double custom_bandwidth = (total_bytes / (custom_time / 1000.0)) / 1e9; // GB/s
    printf("Custom Kernel Time: %8.3f ms | Bandwidth: %6.1f GB/s\n", custom_time, custom_bandwidth);

    // ==========================================
    // 2. Benchmark cuBLAS (cublasSgeam)
    // ==========================================
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {   // cuBLAS has its own status type, not cudaError_t
        printf("Error: cublasCreate failed\n");
        return 1;
    }
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Warmup.
    cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, N, &alpha, d_A, N, &beta, d_B_cublas, N, d_B_cublas, N);
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaEventRecord(start));
    for (int i = 0; i < iter; i++) {
        // geam computes C = alpha*op(A) + beta*op(B); with beta == 0 this is a
        // pure transpose of A (op(B) = N is irrelevant).
        cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, N, &alpha, d_A, N, &beta, d_B_cublas, N, d_B_cublas, N);
    }
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&ms, start, stop));

    float cublas_time = ms / iter;
    double cublas_bandwidth = (total_bytes / (cublas_time / 1000.0)) / 1e9;
    printf("cuBLAS sgeam  Time: %8.3f ms | Bandwidth: %6.1f GB/s\n", cublas_time, cublas_bandwidth);
    printf("--------------------------------------------------\n");

    // ==========================================
    // 3. Relative performance vs cuBLAS
    // ==========================================
    double percentage = (custom_bandwidth / cublas_bandwidth) * 100.0;
    printf("Your transpose kernel reached \033[1;32m%.2f%%\033[0m of cuBLAS performance!\n", percentage);

    // Cleanup — the events were previously leaked; destroy them as well.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    cublasDestroy(handle);
    cudaFree(d_A); cudaFree(d_B_custom); cudaFree(d_B_cublas);
    return 0;
}

二、 矩阵乘法 (GEMM):算力的狂飙

GEMM (\(O(N^3)\) 计算量,\(O(N^2)\) 数据量) 注定是一个 Compute-Bound(计算受限) 的任务。优化的核心目标是:打破指令墙,喂饱计算单元。

✅ 核心有效操作 (The Good)

  1. 寄存器分块 / 线程粗化 (Register Tiling / Thread Coarsening)
    • 原理:这是手写极致性能的灵魂!从 1 个线程算 1 个点,改为 1 个线程算 \(4 \times 4 = 16\) 个点。利用外积(Outer Product)的数学特性,只需要将 A 的 4 个数和 B 的 4 个数读入私有寄存器,就能交叉相乘算出 16 个结果。
    • 收益:将读写计算比从 1:1 直接压低到 1:2,完美掩盖访存延迟,计算单元利用率爆炸式增长(算力从 2.3 TFLOPS 直接飙升至 8.4 TFLOPS)。
  2. 向量化访存 (float4)
    • 原理:在协作搬运数据到 Shared Memory 时,将指针强转为 float4*,让一个线程一次性扛起 128-bit (16 字节) 的数据。
    • 收益:全局内存请求指令数量暴降 75%,进一步释放显存带宽,算力冲破 9.2 TFLOPS。
  3. 编译期循环展开 (#pragma unroll)
    • 原理:在搬砖和计算的内层循环上加上该预处理指令。
    • 收益:消除了 for 循环的分支判断开销和地址累加开销,同时避免了纯手工平铺代码时极易发生的坐标笔误和 Segfault 越界。

❌ 踩坑与反作用力操作 (The Pitfalls)

  1. 强行加 Padding ([64][65]) 导致性能暴跌
    • 踩坑:在转置中大放异彩的 Padding 魔法,在 GEMM + float4 的场景下成了毒药。
    • 原因:
      1. 破坏 float4 对齐:float4 指令强制要求 16 字节对齐。加 1 导致换行后地址不对齐,编译器直接废掉向量化,退化为低效的单步读取。
      2. Occupancy Cliff (占用率悬崖):共享内存刚好越过了让 SM 多驻留一个 Block 的临界点,导致并发 Warp 数量暴跌 33%。
      3. 多此一举:由于采用了外积复用,读取 shared_A[ty][k] 时整个半 Warp 读的是同一个地址,触发了硬件的 Broadcast (广播) 机制,天然零 Bank Conflict。
  2. 纯 C++ 手写双缓冲 (Double Buffering) 导致寄存器溢出
    • 踩坑:试图用两个 Shared Memory 数组实现“边算边搬”的流水线,结果性能不升反降(9.2 TFLOPS 跌回 7.7 TFLOPS)。
    • 原因:双缓冲迫使编译器在同一时刻保留大量的存取指针和中间变量,导致寄存器爆炸 (Register Spilling),数据溢出到缓慢的 L1/Local Memory。同时,if 分支打断了指令重排。现代双缓冲必须依赖硬件级异步指令(如 cp.async)才能真正发挥威力。
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Check the status of a CUDA runtime call; on failure, print the file/line
// and the human-readable reason, then terminate.
// Wrapped in do { } while (0) so the macro expands to exactly one statement:
// the original `{ ... }` form followed by the caller's `;` breaks
// `if (cond) CHECK(...); else ...` (dangling-else / stray-semicolon bug).
#define CHECK(call) \
do { \
    const cudaError_t error = call; \
    if (error != cudaSuccess) { \
        printf("Error: %s:%d, ", __FILE__, __LINE__); \
        printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(1); \
    } \
} while (0)

// Fill a (height x width) row-major matrix with a deterministic pattern:
// element (row, col) = ((row + col) % 100) * 0.01f, bounded in [0, 0.99]
// so later matrix products stay small.
// Expects a 2D launch where the .x dimension spans columns and .y spans rows;
// threads outside the matrix simply return.
__global__ void initMatrix(float *matrix, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;   // column index
    int y = blockDim.y * blockIdx.y + threadIdx.y;   // row index
    if (x >= width || y >= height) return;           // guard the grid tail
    matrix[y * width + x] = 0.01f * (float)((x + y) % 100);
}

// 64x64-tile SGEMM with 4x4-per-thread register tiling: C = A * B,
// all matrices row-major, square N x N. (The "9.25 TFLOPS champion"
// kernel from the write-up above.)
//
// Launch contract (see main below): blockDim = (16, 16); each block computes
// one 64x64 tile of C, each thread a 4x4 sub-block strided by 16 in both
// dimensions. The indexing assumes N is a multiple of 64 (numPhases = N / 64
// truncates otherwise) and that A and B are 16-byte aligned so the float4
// loads are legal.
__global__ void matrixMulRegisterTiled(float *C, float *A, float *B, int N) {
    // One 64x64 tile of A and one of B per phase: 2 x 16 KB of shared memory.
    __shared__ float shared_A[64][64];
    __shared__ float shared_B[64][64];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Top-left corner of this block's 64x64 output tile in C.
    int row_start = blockIdx.y * 64;
    int col_start = blockIdx.x * 64;

    // 4x4 register accumulator grid: sum<r><c> accumulates
    // C[row_start + ty + 16*r][col_start + tx + 16*c].
    float sum00 = 0.0f; float sum01 = 0.0f; float sum02 = 0.0f; float sum03 = 0.0f;
    float sum10 = 0.0f; float sum11 = 0.0f; float sum12 = 0.0f; float sum13 = 0.0f;
    float sum20 = 0.0f; float sum21 = 0.0f; float sum22 = 0.0f; float sum23 = 0.0f;
    float sum30 = 0.0f; float sum31 = 0.0f; float sum32 = 0.0f; float sum33 = 0.0f;

    // Reinterpret global rows as float4 so each load moves 128 bits.
    float4 *A_vec = (float4 *)A;
    float4 *B_vec = (float4 *)B;

    // Walk the shared K dimension 64 elements at a time.
    int numPhases = N / 64;
    for (int p = 0; p < numPhases; ++p) {
        // Cooperative load: the 16x16 threads fill both 64x64 tiles; each
        // thread writes 4 tile rows (i loop) of 4 consecutive floats (float4).
        #pragma unroll
        for (int i = 0; i < 4; ++i) { 
            int current_row = ty + i * 16;
            // A tile: rows [row_start, row_start+64), K-columns [p*64, p*64+64).
            float4 vec_A = A_vec[(row_start + current_row) * (N / 4) + (p * 16 + tx)];
            shared_A[current_row][tx * 4 + 0] = vec_A.x;
            shared_A[current_row][tx * 4 + 1] = vec_A.y;
            shared_A[current_row][tx * 4 + 2] = vec_A.z;
            shared_A[current_row][tx * 4 + 3] = vec_A.w;

            // B tile: K-rows [p*64, p*64+64), columns [col_start, col_start+64).
            float4 vec_B = B_vec[(p * 64 + current_row) * (N / 4) + (col_start / 4 + tx)];
            shared_B[current_row][tx * 4 + 0] = vec_B.x;
            shared_B[current_row][tx * 4 + 1] = vec_B.y;
            shared_B[current_row][tx * 4 + 2] = vec_B.z;
            shared_B[current_row][tx * 4 + 3] = vec_B.w;
        }

        __syncthreads(); // both tiles fully staged before any thread computes

        // Outer-product accumulation: load 4 values from A's column and 4 from
        // B's row into registers, then form all 16 cross products.
        #pragma unroll
        for (int k = 0; k < 64; ++k) {
            float reg_A0 = shared_A[ty][k];
            float reg_A1 = shared_A[ty + 16][k];
            float reg_A2 = shared_A[ty + 32][k];
            float reg_A3 = shared_A[ty + 48][k];
            float reg_B0 = shared_B[k][tx];
            float reg_B1 = shared_B[k][tx + 16];
            float reg_B2 = shared_B[k][tx + 32];
            float reg_B3 = shared_B[k][tx + 48];

            sum00 += reg_A0 * reg_B0; sum01 += reg_A0 * reg_B1; sum02 += reg_A0 * reg_B2; sum03 += reg_A0 * reg_B3;
            sum10 += reg_A1 * reg_B0; sum11 += reg_A1 * reg_B1; sum12 += reg_A1 * reg_B2; sum13 += reg_A1 * reg_B3;
            sum20 += reg_A2 * reg_B0; sum21 += reg_A2 * reg_B1; sum22 += reg_A2 * reg_B2; sum23 += reg_A2 * reg_B3;
            sum30 += reg_A3 * reg_B0; sum31 += reg_A3 * reg_B1; sum32 += reg_A3 * reg_B2; sum33 += reg_A3 * reg_B3;
        }
        __syncthreads(); // all reads done before the next phase overwrites the tiles
    }

    // Write the 4x4 result grid back to global memory (no bounds checks:
    // relies on the N % 64 == 0 precondition above).
    C[(row_start + ty) * N + (col_start + tx)] = sum00;
    C[(row_start + ty) * N + (col_start + tx + 16)] = sum01;
    C[(row_start + ty) * N + (col_start + tx + 32)] = sum02;
    C[(row_start + ty) * N + (col_start + tx + 48)] = sum03;
    C[(row_start + ty + 16) * N + (col_start + tx)] = sum10;
    C[(row_start + ty + 16) * N + (col_start + tx + 16)] = sum11;
    C[(row_start + ty + 16) * N + (col_start + tx + 32)] = sum12;
    C[(row_start + ty + 16) * N + (col_start + tx + 48)] = sum13;
    C[(row_start + ty + 32) * N + (col_start + tx)] = sum20;
    C[(row_start + ty + 32) * N + (col_start + tx + 16)] = sum21;
    C[(row_start + ty + 32) * N + (col_start + tx + 32)] = sum22;
    C[(row_start + ty + 32) * N + (col_start + tx + 48)] = sum23;
    C[(row_start + ty + 48) * N + (col_start + tx)] = sum30;
    C[(row_start + ty + 48) * N + (col_start + tx + 16)] = sum31;
    C[(row_start + ty + 48) * N + (col_start + tx + 32)] = sum32;
    C[(row_start + ty + 48) * N + (col_start + tx + 48)] = sum33;
}

// Benchmark driver: times the register-tiled GEMM kernel against
// cublasSgemm on N x N matrices and reports GFLOPS.
int main() {
    // N = 4096: 1024 is too small for cuBLAS to saturate a high-end GPU's pipelines.
    int N = 4096;
    size_t size = (size_t)N * N * sizeof(float);   // size_t math avoids int overflow for larger N

    // Total floating-point operations of one GEMM: 2 * N^3 (mul + add per term).
    double total_flops = 2.0 * (double)N * (double)N * (double)N;

    float *d_A, *d_B, *d_C_custom, *d_C_cublas;
    CHECK(cudaMalloc(&d_A, size));
    CHECK(cudaMalloc(&d_B, size));
    CHECK(cudaMalloc(&d_C_custom, size));
    CHECK(cudaMalloc(&d_C_cublas, size));

    // Fill A and B with a deterministic pattern on the device.
    dim3 init_blocks((N + 15) / 16, (N + 15) / 16);
    dim3 init_threads(16, 16);
    initMatrix<<<init_blocks, init_threads>>>(d_A, N, N);
    initMatrix<<<init_blocks, init_threads>>>(d_B, N, N);
    CHECK(cudaGetLastError());          // launches don't return errors; query explicitly
    CHECK(cudaDeviceSynchronize());

    // One block per 64x64 output tile, 16x16 threads each (kernel contract).
    dim3 threads(16, 16);
    dim3 blocks((N + 64 - 1) / 64, (N + 64 - 1) / 64);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    float ms = 0;

    printf("Matrix Size: %d x %d\n", N, N);
    printf("Total FLOPs: %.2f Giga FLOPs\n", total_flops / 1e9);
    printf("--------------------------------------------------\n");

    // ==========================================
    // 1. Benchmark the hand-written kernel
    // ==========================================
    matrixMulRegisterTiled<<<blocks, threads>>>(d_C_custom, d_A, d_B, N);  // warmup
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    int iter = 10;
    CHECK(cudaEventRecord(start));
    for (int i = 0; i < iter; i++) {
        matrixMulRegisterTiled<<<blocks, threads>>>(d_C_custom, d_A, d_B, N);
    }
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&ms, start, stop));

    float custom_time = ms / iter;
    double custom_gflops = (total_flops / (custom_time / 1000.0)) / 1e9;
    printf("Custom 4x4 Tiled Time: %8.3f ms | Performance: %8.2f GFLOPS\n", custom_time, custom_gflops);

    // ==========================================
    // 2. Benchmark cuBLAS (cublasSgemm)
    // ==========================================
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {   // cuBLAS has its own status type, not cudaError_t
        printf("Error: cublasCreate failed\n");
        return 1;
    }
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Layout trick: C/C++ is row-major, cuBLAS is column-major. Computing
    // C = A * B in row-major is equivalent to C^T = B^T * A^T in column-major,
    // so d_B is passed first and d_A second — no explicit transposes needed.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_B, N, d_A, N, &beta, d_C_cublas, N);  // warmup
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaEventRecord(start));
    for (int i = 0; i < iter; i++) {
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_B, N, d_A, N, &beta, d_C_cublas, N);
    }
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&ms, start, stop));

    float cublas_time = ms / iter;
    double cublas_gflops = (total_flops / (cublas_time / 1000.0)) / 1e9;
    printf("cuBLAS sgemm     Time: %8.3f ms | Performance: %8.2f GFLOPS\n", cublas_time, cublas_gflops);
    printf("--------------------------------------------------\n");

    // ==========================================
    // 3. Relative performance vs cuBLAS
    // ==========================================
    printf("Your kernel reached \033[1;32m%.2f%%\033[0m of cuBLAS performance!\n", (custom_gflops / cublas_gflops) * 100.0);

    // Cleanup — the events were previously leaked; destroy them as well.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    cublasDestroy(handle);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C_custom); cudaFree(d_C_cublas);
    return 0;
}
posted @ 2026-03-31 16:47  Grice  阅读(8)  评论(0)    收藏  举报