cuda 学习
纯手写 GEMM 达到了 10.6 TFLOPS 的算力(逼近 cuBLAS 74% 的性能),转置带宽跑满了 622 GB/s(达到 cuBLAS 91% 的性能)。
在此总结一下这一路走来的优化经验:哪些是真神技,哪些是看似聪明的“反作用力”毒药。
一、 矩阵转置 (Transpose):访存的艺术
矩阵转置是一个纯粹的 Memory-Bound(访存受限) 任务。计算核心 (ALU) 几乎在睡大觉,所有的优化都在围绕显存控制器做文章。
✅ 核心有效操作 (The Good)
- 共享内存分块 (Shared Memory Tiling)
- 原理:直接在全局显存 (Global Memory) 中进行 out[x][y] = in[y][x] 会导致致命的“跳跃访存”。利用 Shared Memory 作为中转站,先按块读入,再按块写出。
- 收益:大幅降低对全局显存的访问延迟。
- 合并访存 (Coalesced Access)
- 原理:永远记住 CUDA 的黄金法则:变化最快的线程索引 tx 必须对应内存中连续变化的维度(列)。即 x 对应列,y 对应行。
- 收益:确保 32 个线程组成的 Warp 能一次性吃满显存的位宽(如 128-byte 事务),避免带宽浪费。
- Padding 消除 Bank Conflict(魔法 +1)
- 原理:当 Shared Memory 的维度是 32 的倍数(如 [32][32])时,按列读取会产生严重的 32 路 Bank Conflict。将声明改为 __shared__ float tile[32][33]。
- 收益:极其廉价的一行代码,通过错位打破了内存块的对齐,彻底消灭 Bank Conflict,带宽瞬间飙升。
#include <stdio.h>
#include <cublas_v2.h>
// Check the return code of a CUDA runtime API call; print location and
// reason, then abort on failure. The do { } while (0) wrapper makes the
// macro expand to a single statement, so `if (x) CHECK(...); else ...`
// works correctly (a bare { ... } block would break such call sites).
#define CHECK(call) \
do { \
const cudaError_t error = call; \
if (error != cudaSuccess) { \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
} while (0)
// 你的终极手写优化版 (转置)
// Tiled matrix transpose: out = in^T for a row-major `height x width` input.
// Launch geometry: dim3 threads(32, 8), one block per 32x32 tile; each
// thread copies 4 rows of the tile (j loop strides by blockDim.y == 8).
__global__ void transposeOptimized(float *out, float *in, int width, int height) {
// 33-wide rows: the extra column offsets each row into a different bank,
// eliminating the 32-way bank conflict the column reads below would hit.
__shared__ float tile[32][33];
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coalesced load: consecutive tx values touch consecutive columns of `in`.
const int srcCol = blockIdx.x * 32 + tx;
const int srcRow = blockIdx.y * 32 + ty;
#pragma unroll
for (int j = 0; j < 32; j += 8) {
if (srcCol < width && srcRow + j < height) {
tile[ty + j][tx] = in[(srcRow + j) * width + srcCol];
}
}
__syncthreads();
// Swap the block coordinates so the store side is also coalesced in `out`;
// the transpose itself happens by reading the tile with swapped indices.
const int dstCol = blockIdx.y * 32 + tx;
const int dstRow = blockIdx.x * 32 + ty;
#pragma unroll
for (int j = 0; j < 32; j += 8) {
if (dstCol < height && dstRow + j < width) {
out[(dstRow + j) * height + dstCol] = tile[tx][ty + j];
}
}
}
// Benchmark driver: times the hand-written transpose kernel against
// cuBLAS (cublasSgeam) on an 8192x8192 float matrix and reports bandwidth.
int main() {
int N = 8192;
size_t size = N * N * sizeof(float);
// Bytes moved per transpose: read A once + write B once = 2 * N^2 * 4 bytes.
double total_bytes = 2.0 * N * N * sizeof(float);
float *d_A, *d_B_custom, *d_B_cublas;
CHECK(cudaMalloc(&d_A, size));
CHECK(cudaMalloc(&d_B_custom, size));
CHECK(cudaMalloc(&d_B_cublas, size));
// Contents are irrelevant for a bandwidth benchmark; any byte pattern works.
CHECK(cudaMemset(d_A, 1, size));
// 32x8 threads per block: each thread handles 4 rows of a 32x32 tile.
dim3 threads(32, 8);
dim3 blocks((N + 32 - 1) / 32, (N + 32 - 1) / 32);
// High-resolution GPU-side timers.
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
float ms = 0;
printf("Matrix Size: %d x %d\n", N, N);
printf("--------------------------------------------------\n");
// ==========================================
// 1. Hand-written kernel (Custom Kernel)
// ==========================================
// Warmup launch pays one-time clock/cache costs outside the timed loop.
transposeOptimized<<<blocks, threads>>>(d_B_custom, d_A, N, N);
CHECK(cudaGetLastError());      // catch bad launch configuration
CHECK(cudaDeviceSynchronize()); // catch in-kernel faults before timing
int iter = 10;
CHECK(cudaEventRecord(start));
for (int i = 0; i < iter; i++) {
transposeOptimized<<<blocks, threads>>>(d_B_custom, d_A, N, N);
}
CHECK(cudaGetLastError());
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&ms, start, stop));
float custom_time = ms / iter;
double custom_bandwidth = (total_bytes / (custom_time / 1000.0)) / 1e9; // GB/s
printf("Custom Kernel Time: %8.3f ms | Bandwidth: %6.1f GB/s\n", custom_time, custom_bandwidth);
// ==========================================
// 2. Official cuBLAS baseline (cublasSgeam)
// ==========================================
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
printf("Error: cublasCreate failed\n");
exit(1);
}
const float alpha = 1.0f;
const float beta = 0.0f;
// Warmup. Args: transpose A (OP_T); B is irrelevant since beta == 0 (OP_N);
// then m, n, alpha, A, lda, beta, B, ldb, C, ldc. In-place B == C with
// OP_N is permitted by cuBLAS.
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, N, &alpha, d_A, N, &beta, d_B_cublas, N, d_B_cublas, N);
CHECK(cudaDeviceSynchronize());
CHECK(cudaEventRecord(start));
for (int i = 0; i < iter; i++) {
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, N, &alpha, d_A, N, &beta, d_B_cublas, N, d_B_cublas, N);
}
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&ms, start, stop));
float cublas_time = ms / iter;
double cublas_bandwidth = (total_bytes / (cublas_time / 1000.0)) / 1e9;
printf("cuBLAS sgeam Time: %8.3f ms | Bandwidth: %6.1f GB/s\n", cublas_time, cublas_bandwidth);
printf("--------------------------------------------------\n");
// ==========================================
// 3. How close did we get to cuBLAS?
// ==========================================
double percentage = (custom_bandwidth / cublas_bandwidth) * 100.0;
printf("Your transpose kernel reached \033[1;32m%.2f%%\033[0m of cuBLAS performance!\n", percentage);
// Cleanup. NOTE: the original version leaked the two CUDA events.
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
cublasDestroy(handle);
CHECK(cudaFree(d_A));
CHECK(cudaFree(d_B_custom));
CHECK(cudaFree(d_B_cublas));
return 0;
}
二、 矩阵乘法 (GEMM):算力的狂飙
GEMM (\(O(N^3)\) 计算量,\(O(N^2)\) 数据量) 注定是一个 Compute-Bound(计算受限) 的任务。优化的核心目标是:打破指令墙,喂饱计算单元。
✅ 核心有效操作 (The Good)
- 寄存器分块 / 线程粗化 (Register Tiling / Thread Coarsening)
- 原理:这是手写极致性能的灵魂!从 1 个线程算 1 个点,改为 1 个线程算 \(4 \times 4 = 16\) 个点。利用外积(Outer Product)的数学特性,只需要将 A 的 4 个数和 B 的 4 个数读入私有寄存器,就能交叉相乘算出 16 个结果。
- 收益:将读写计算比从 1:1 直接压低到 1:2,完美掩盖访存延迟,计算单元利用率爆炸式增长(算力从 2.3 TFLOPS 直接飙升至 8.4 TFLOPS)。
- 向量化访存 (float4)
- 原理:在协作搬运数据到 Shared Memory 时,将指针强转为 float4*,让一个线程一次性扛起 128-bit (16 字节) 的数据。
- 收益:全局内存请求指令数量暴降 75%,进一步释放显存带宽,算力冲破 9.2 TFLOPS。
- 编译期循环展开 (#pragma unroll)
- 原理:在搬砖和计算的内层循环上加上该预处理指令。
- 收益:消除了 for 循环的分支判断开销和地址累加开销,同时避免了纯手工平铺代码时极易发生的坐标笔误和 Segfault 越界。
❌ 踩坑与反作用力操作 (The Pitfalls)
- 强行加 Padding ([64][65]) 导致性能暴跌
- 踩坑:在转置中大放异彩的 Padding 魔法,在 GEMM + float4 的场景下成了毒药。
- 原因:
  - 破坏 float4 对齐:float4 指令强制要求 16 字节对齐。加 1 导致换行后地址不对齐,编译器直接废掉向量化,退化为低效的单步读取。
  - Occupancy Cliff (占用率悬崖):共享内存刚好越过了让 SM 多驻留一个 Block 的临界点,导致并发 Warp 数量暴跌 33%。
  - 多此一举:由于采用了外积复用,读取 shared_A[ty][k] 时整个半 Warp 读的是同一个地址,触发了硬件的 Broadcast (广播) 机制,天然零 Bank Conflict。
- 纯 C++ 手写双缓冲 (Double Buffering) 导致寄存器溢出
- 踩坑:试图用两个 Shared Memory 数组实现“边算边搬”的流水线,结果性能不升反降(9.2 TFLOPS 跌回 7.7 TFLOPS)。
- 原因:双缓冲迫使编译器在同一时刻保留大量的存取指针和中间变量,导致寄存器爆炸 (Register Spilling),数据溢出到缓慢的 L1/Local Memory。同时,if 分支打断了指令重排。现代双缓冲必须依赖硬件级异步指令(如 cp.async)才能真正发挥威力。
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
// Check the return code of a CUDA runtime API call; print location and
// reason, then abort on failure. The do { } while (0) wrapper makes the
// macro expand to a single statement, so `if (x) CHECK(...); else ...`
// works correctly (a bare { ... } block would break such call sites).
#define CHECK(call) \
do { \
const cudaError_t error = call; \
if (error != cudaSuccess) { \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
} while (0)
// 初始化矩阵
// Fill a row-major `height x width` matrix with a small deterministic
// pattern in [0.00, 0.99] so that accumulated GEMM sums stay well within
// float range (avoids overflow during the later 2*N^3 FLOP benchmark).
__global__ void initMatrix(float *matrix, int width, int height) {
int c = blockDim.x * blockIdx.x + threadIdx.x;
int r = blockDim.y * blockIdx.y + threadIdx.y;
if (r < height && c < width) {
matrix[r * width + c] = (float)((r + c) % 100) * 0.01f;
}
}
// Register-tiled GEMM: C = A * B for square, row-major N x N float matrices.
// Launch geometry: dim3 threads(16, 16); each block computes one 64x64 tile
// of C, so each thread accumulates a 4x4 sub-tile (thread coarsening via
// outer products). Preconditions (not checked): N is a multiple of 64, and
// A/B are 16-byte aligned — required by the float4 vector loads below.
__global__ void matrixMulRegisterTiled(float *C, float *A, float *B, int N) {
// One 64x64 tile of A and of B per phase of the K loop. Deliberately NOT
// padded: the inner product reads shared_A[ty][k] uniformly across lanes
// (hardware broadcast), so padding would only hurt float4 alignment.
__shared__ float shared_A[64][64];
__shared__ float shared_B[64][64];
int tx = threadIdx.x;
int ty = threadIdx.y;
// Top-left corner of this block's 64x64 output tile.
int row_start = blockIdx.y * 64;
int col_start = blockIdx.x * 64;
// 4x4 accumulator registers: sum<r><c> holds C[row_start+ty+16r][col_start+tx+16c].
float sum00 = 0.0f; float sum01 = 0.0f; float sum02 = 0.0f; float sum03 = 0.0f;
float sum10 = 0.0f; float sum11 = 0.0f; float sum12 = 0.0f; float sum13 = 0.0f;
float sum20 = 0.0f; float sum21 = 0.0f; float sum22 = 0.0f; float sum23 = 0.0f;
float sum30 = 0.0f; float sum31 = 0.0f; float sum32 = 0.0f; float sum33 = 0.0f;
// Reinterpret A/B as float4 arrays: one 128-bit load moves 4 consecutive floats.
float4 *A_vec = (float4 *)A;
float4 *B_vec = (float4 *)B;
// Walk the K dimension 64 columns at a time.
int numPhases = N / 64;
for (int p = 0; p < numPhases; ++p) {
// Cooperative load: the 16x16 thread block fills both 64x64 tiles.
// Each thread loads 4 rows (i loop) x 1 float4 (4 columns) per tile.
#pragma unroll
for (int i = 0; i < 4; ++i) {
int current_row = ty + i * 16;
// A tile: row (row_start+current_row), float4-column (p*16 + tx);
// N/4 is the row stride of A in float4 units.
float4 vec_A = A_vec[(row_start + current_row) * (N / 4) + (p * 16 + tx)];
shared_A[current_row][tx * 4 + 0] = vec_A.x;
shared_A[current_row][tx * 4 + 1] = vec_A.y;
shared_A[current_row][tx * 4 + 2] = vec_A.z;
shared_A[current_row][tx * 4 + 3] = vec_A.w;
// B tile: row (p*64+current_row), float4-column (col_start/4 + tx).
float4 vec_B = B_vec[(p * 64 + current_row) * (N / 4) + (col_start / 4 + tx)];
shared_B[current_row][tx * 4 + 0] = vec_B.x;
shared_B[current_row][tx * 4 + 1] = vec_B.y;
shared_B[current_row][tx * 4 + 2] = vec_B.z;
shared_B[current_row][tx * 4 + 3] = vec_B.w;
}
// All tile writes must land before any thread reads them.
__syncthreads();
// Outer-product accumulation: load 4 values of A (one column slice) and
// 4 of B (one row slice) into registers, then form all 16 cross products.
#pragma unroll
for (int k = 0; k < 64; ++k) {
float reg_A0 = shared_A[ty][k];
float reg_A1 = shared_A[ty + 16][k];
float reg_A2 = shared_A[ty + 32][k];
float reg_A3 = shared_A[ty + 48][k];
float reg_B0 = shared_B[k][tx];
float reg_B1 = shared_B[k][tx + 16];
float reg_B2 = shared_B[k][tx + 32];
float reg_B3 = shared_B[k][tx + 48];
sum00 += reg_A0 * reg_B0; sum01 += reg_A0 * reg_B1; sum02 += reg_A0 * reg_B2; sum03 += reg_A0 * reg_B3;
sum10 += reg_A1 * reg_B0; sum11 += reg_A1 * reg_B1; sum12 += reg_A1 * reg_B2; sum13 += reg_A1 * reg_B3;
sum20 += reg_A2 * reg_B0; sum21 += reg_A2 * reg_B1; sum22 += reg_A2 * reg_B2; sum23 += reg_A2 * reg_B3;
sum30 += reg_A3 * reg_B0; sum31 += reg_A3 * reg_B1; sum32 += reg_A3 * reg_B2; sum33 += reg_A3 * reg_B3;
}
// Keep fast threads from overwriting the tiles for the next phase.
__syncthreads();
}
// Write back the 4x4 result: rows and columns are 16 apart, matching the
// ty/tx + {0,16,32,48} layout used during accumulation.
C[(row_start + ty) * N + (col_start + tx)] = sum00;
C[(row_start + ty) * N + (col_start + tx + 16)] = sum01;
C[(row_start + ty) * N + (col_start + tx + 32)] = sum02;
C[(row_start + ty) * N + (col_start + tx + 48)] = sum03;
C[(row_start + ty + 16) * N + (col_start + tx)] = sum10;
C[(row_start + ty + 16) * N + (col_start + tx + 16)] = sum11;
C[(row_start + ty + 16) * N + (col_start + tx + 32)] = sum12;
C[(row_start + ty + 16) * N + (col_start + tx + 48)] = sum13;
C[(row_start + ty + 32) * N + (col_start + tx)] = sum20;
C[(row_start + ty + 32) * N + (col_start + tx + 16)] = sum21;
C[(row_start + ty + 32) * N + (col_start + tx + 32)] = sum22;
C[(row_start + ty + 32) * N + (col_start + tx + 48)] = sum23;
C[(row_start + ty + 48) * N + (col_start + tx)] = sum30;
C[(row_start + ty + 48) * N + (col_start + tx + 16)] = sum31;
C[(row_start + ty + 48) * N + (col_start + tx + 32)] = sum32;
C[(row_start + ty + 48) * N + (col_start + tx + 48)] = sum33;
}
// Benchmark driver: times the register-tiled GEMM kernel against cuBLAS
// (cublasSgemm) on 4096x4096 float matrices and reports GFLOPS.
int main() {
// N = 4096: large enough that cuBLAS can saturate a high-end GPU's
// pipelines (1024 would under-utilize it). Must be a multiple of 64
// for matrixMulRegisterTiled's tiling preconditions.
int N = 4096;
size_t size = N * N * sizeof(float);
// Total floating-point operations for GEMM: 2 * N^3 (one mul + one add per term).
double total_flops = 2.0 * (double)N * (double)N * (double)N;
float *d_A, *d_B, *d_C_custom, *d_C_cublas;
CHECK(cudaMalloc(&d_A, size));
CHECK(cudaMalloc(&d_B, size));
CHECK(cudaMalloc(&d_C_custom, size));
CHECK(cudaMalloc(&d_C_cublas, size));
// Deterministic initialization on-device (no host round trip needed).
dim3 init_blocks((N + 15) / 16, (N + 15) / 16);
dim3 init_threads(16, 16);
initMatrix<<<init_blocks, init_threads>>>(d_A, N, N);
initMatrix<<<init_blocks, init_threads>>>(d_B, N, N);
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize());
// 16x16 threads per block; one block per 64x64 output tile.
dim3 threads(16, 16);
dim3 blocks((N + 64 - 1) / 64, (N + 64 - 1) / 64);
// High-resolution GPU-side timers.
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
float ms = 0;
printf("Matrix Size: %d x %d\n", N, N);
printf("Total FLOPs: %.2f Giga FLOPs\n", total_flops / 1e9);
printf("--------------------------------------------------\n");
// ==========================================
// 1. Hand-written kernel (Custom Kernel)
// ==========================================
// Warmup launch pays one-time clock/cache costs outside the timed loop.
matrixMulRegisterTiled<<<blocks, threads>>>(d_C_custom, d_A, d_B, N);
CHECK(cudaGetLastError());      // catch bad launch configuration
CHECK(cudaDeviceSynchronize()); // catch in-kernel faults before timing
int iter = 10;
CHECK(cudaEventRecord(start));
for (int i = 0; i < iter; i++) {
matrixMulRegisterTiled<<<blocks, threads>>>(d_C_custom, d_A, d_B, N);
}
CHECK(cudaGetLastError());
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&ms, start, stop));
float custom_time = ms / iter;
double custom_gflops = (total_flops / (custom_time / 1000.0)) / 1e9;
printf("Custom 4x4 Tiled Time: %8.3f ms | Performance: %8.2f GFLOPS\n", custom_time, custom_gflops);
// ==========================================
// 2. Official cuBLAS baseline (cublasSgemm)
// ==========================================
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
printf("Error: cublasCreate failed\n");
exit(1);
}
const float alpha = 1.0f;
const float beta = 0.0f;
// Column-major trick: C/C++ stores row-major while cuBLAS assumes
// column-major. Computing row-major C = A * B equals column-major
// C^T = B^T * A^T, so we pass d_B first and d_A second with OP_N.
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_B, N, d_A, N, &beta, d_C_cublas, N);
CHECK(cudaDeviceSynchronize());
CHECK(cudaEventRecord(start));
for (int i = 0; i < iter; i++) {
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_B, N, d_A, N, &beta, d_C_cublas, N);
}
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&ms, start, stop));
float cublas_time = ms / iter;
double cublas_gflops = (total_flops / (cublas_time / 1000.0)) / 1e9;
printf("cuBLAS sgemm Time: %8.3f ms | Performance: %8.2f GFLOPS\n", cublas_time, cublas_gflops);
printf("--------------------------------------------------\n");
// ==========================================
// 3. How close did we get to cuBLAS?
// ==========================================
printf("Your kernel reached \033[1;32m%.2f%%\033[0m of cuBLAS performance!\n", (custom_gflops / cublas_gflops) * 100.0);
// Cleanup. NOTE: the original version leaked the two CUDA events.
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
cublasDestroy(handle);
CHECK(cudaFree(d_A));
CHECK(cudaFree(d_B));
CHECK(cudaFree(d_C_custom));
CHECK(cudaFree(d_C_cublas));
return 0;
}

浙公网安备 33010602011771号