A few examples I worked through while learning CUDA programming. Each one is a single .cu file that can be built with nvcc (for example, nvcc demo.cu -o demo) and run directly.

Vector addition

#include <iostream>

// Each thread computes one element of c = a + b.
__global__ void vectorAdd(int n, const float* a, const float* b, float* c) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {  // guard the last, partially filled block
        c[i] = a[i] + b[i];
    }
}

int main() {
    int n = 1 << 20;
    size_t size = n * sizeof(float);
    float *a, *b, *c;
    // Managed (unified) memory is accessible from both host and device.
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);
    for (int i = 0; i < n; i++) {
        a[i] = 1.1f;
        b[i] = 2.3f;
    }
    int threadPerBlock = 256;
    // Round up so every element is covered by a thread.
    int blockPerGrid = (n + threadPerBlock - 1) / threadPerBlock;
    vectorAdd<<<blockPerGrid, threadPerBlock>>>(n, a, b, c);
    cudaDeviceSynchronize();  // wait for the kernel to finish before reading c on the host
    // Every element should be 1.1 + 2.3 = 3.4.
    std::cout << c[0] << " " << c[n / 2] << " " << c[n - 1] << std::endl;
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
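
For larger arrays, or when the grid is launched with fewer threads than elements, a common variant is the grid-stride loop, where each thread handles several elements. A minimal sketch, assuming the same buffers as above (the kernel name vectorAddStride and the launch configuration are illustrative, not part of the original example):

// Grid-stride sketch: each thread strides across the array, so the kernel is
// still correct when gridDim.x * blockDim.x < n.
__global__ void vectorAddStride(int n, const float* a, const float* b, float* c) {
    int stride = gridDim.x * blockDim.x;
    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Example launch with a fixed-size grid (assumed values):
// vectorAddStride<<<128, 256>>>(n, a, b, c);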

Matrix multiplication

#include <iostream>

__global__ void matrixMul(int m,
                          int n,
                          int k,
                          const float* a,
                          const float* b,
                          float* c) {
    // Shape: (m,n) @ (n,k) = (m,k)
    int col = blockDim.x * blockIdx.x + threadIdx.x;  // output column
    int row = blockDim.y * blockIdx.y + threadIdx.y;  // output row
    if (col < k && row < m) {
        float sum = 0.0f;
        // Dot product of row `row` of a with column `col` of b.
        for (int i = 0; i < n; i++) {
            sum += a[n * row + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}

int main() {
    int m = 3;
    int n = 2;
    int k = 4;
    size_t size_a = m * n * sizeof(float);
    size_t size_b = n * k * sizeof(float);
    size_t size_c = m * k * sizeof(float);
    float *a, *b, *c;
    cudaMallocManaged(&a, size_a);
    cudaMallocManaged(&b, size_b);
    cudaMallocManaged(&c, size_c);
    for (int i = 0; i < m * n; i++) {
        a[i] = 2.0f;
    }
    for (int i = 0; i < n * k; i++) {
        b[i] = 3.0f;
    }
    dim3 threadPerBlock(16, 16);
    // x covers the k columns, y covers the m rows; round up in each dimension.
    dim3 blockPerGrid((k + threadPerBlock.x - 1) / threadPerBlock.x,
                      (m + threadPerBlock.y - 1) / threadPerBlock.y);
    matrixMul<<<blockPerGrid, threadPerBlock>>>(m, n, k, a, b, c);
    cudaDeviceSynchronize();
    // Every element of c should be n * 2.0 * 3.0 = 12.
    std::cout << c[0] << " " << c[2 * k] << " " << c[(m - 1) * k + k - 1]
              << std::endl;
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
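
Once the naive version works, a common next step is a tiled kernel that stages blocks of a and b through shared memory so each loaded value is reused by the whole thread block. The sketch below is one standard way to write it; the TILE constant, the kernel name matrixMulTiled, and the boundary handling are my assumptions, not part of the original example. It assumes the same row-major (m,n) @ (n,k) layout and can be launched with the same 16 x 16 threadPerBlock / blockPerGrid configuration, since TILE matches the block size.

#define TILE 16

// Tiled sketch: each block computes a TILE x TILE patch of c, loading one tile
// of a and one tile of b into shared memory per iteration of the t loop.
__global__ void matrixMulTiled(int m, int n, int k,
                               const float* a, const float* b, float* c) {
    __shared__ float tileA[TILE][TILE];
    __shared__ float tileB[TILE][TILE];
    int col = blockIdx.x * TILE + threadIdx.x;
    int row = blockIdx.y * TILE + threadIdx.y;
    float sum = 0.0f;
    for (int t = 0; t < (n + TILE - 1) / TILE; t++) {
        int aCol = t * TILE + threadIdx.x;  // column of a read by this thread
        int bRow = t * TILE + threadIdx.y;  // row of b read by this thread
        tileA[threadIdx.y][threadIdx.x] =
            (row < m && aCol < n) ? a[row * n + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (bRow < n && col < k) ? b[bRow * k + col] : 0.0f;
        __syncthreads();  // wait until the whole tile is loaded
        for (int i = 0; i < TILE; i++) {
            sum += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
        }
        __syncthreads();  // keep the tile alive until every thread has used it
    }
    if (row < m && col < k) {
        c[row * k + col] = sum;
    }
}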

Single-channel 2D convolution (no channel dimension)

#include <iostream>

__global__ void conv2D(int H,
                       int W,
                       int kH,
                       int kW,
                       int padH,
                       int padW,
                       int strideH,
                       int strideW,
                       int Hout,
                       int Wout,
                       const float* img,
                       const float* kernel,
                       float* output) {
    int ox = blockDim.x * blockIdx.x + threadIdx.x;  // output column handled by this thread
    int oy = blockDim.y * blockIdx.y + threadIdx.y;  // output row handled by this thread
    if (ox >= Wout || oy >= Hout) {
        return;
    }
    float sum = 0.0f;
    // Top-left corner of the receptive field in the input (may be negative because of padding).
    int in_x0 = ox * strideW - padW;
    int in_y0 = oy * strideH - padH;
    for (int i = 0; i < kH; i++) {
        for (int j = 0; j < kW; j++) {
            int in_xi = in_x0 + j;
            int in_yi = in_y0 + i;
            if (in_xi >= 0 && in_xi < W && in_yi >= 0 && in_yi < H) {  // zero padding: skip out-of-bounds taps
                sum += img[in_yi * W + in_xi] * kernel[i * kW + j];
            }
        }
    }
    output[oy * Wout + ox] = sum;
}

int main() {
    int H = 12, W = 12;
    int kH = 3, kW = 3;
    int padH = 1, padW = 1;
    int strideH = 1, strideW = 1;
    // Standard output-size formula: floor((size + 2 * pad - kernel) / stride) + 1.
    int Hout = (H + 2 * padH - kH) / strideH + 1;
    int Wout = (W + 2 * padW - kW) / strideW + 1;
    size_t img_size = H * W * sizeof(float);
    size_t kernel_size = kH * kW * sizeof(float);
    size_t out_size = Hout * Wout * sizeof(float);
    float *img, *kernel, *output;
    cudaMallocManaged(&img, img_size);
    cudaMallocManaged(&kernel, kernel_size);
    cudaMallocManaged(&output, out_size);
    for (int i = 0; i < H * W; i++) {
        img[i] = 10.0f;
    }
    for (int i = 0; i < kH * kW; i++) {
        kernel[i] = 0.5f;
    }
    dim3 threadPerBlock(16, 16);
    dim3 blockPerGrid((Wout + threadPerBlock.x - 1) / threadPerBlock.x,
                      (Hout + threadPerBlock.y - 1) / threadPerBlock.y);
    conv2D<<<blockPerGrid, threadPerBlock>>>(H, W, kH, kW, padH, padW, strideH,
                                             strideW, Hout, Wout, img, kernel,
                                             output);
    cudaDeviceSynchronize();
    for (int i = 0; i < Hout; i++) {
        for (int j = 0; j < Wout; j++) {
            std::cout << output[i * Wout + j] << " ";
        }
        std::cout << std::endl;
    }
    cudaFree(img);
    cudaFree(kernel);
    cudaFree(output);
    return 0;
}
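
To sanity-check the GPU output, the same computation can be done on the CPU and compared element by element. A minimal sketch, assuming the buffers above (the function name conv2DRef is only illustrative):

// CPU reference for the same single-channel convolution, used only for verification.
void conv2DRef(int H, int W, int kH, int kW, int padH, int padW,
               int strideH, int strideW, int Hout, int Wout,
               const float* img, const float* kernel, float* out) {
    for (int oy = 0; oy < Hout; oy++) {
        for (int ox = 0; ox < Wout; ox++) {
            float sum = 0.0f;
            for (int i = 0; i < kH; i++) {
                for (int j = 0; j < kW; j++) {
                    int yi = oy * strideH - padH + i;
                    int xi = ox * strideW - padW + j;
                    if (xi >= 0 && xi < W && yi >= 0 && yi < H) {
                        sum += img[yi * W + xi] * kernel[i * kW + j];
                    }
                }
            }
            out[oy * Wout + ox] = sum;
        }
    }
}

With the constant inputs above, interior outputs should be 9 * 10 * 0.5 = 45, edge outputs 30, and the four corners 20, because the zero padding drops taps near the border.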

