A few worked examples from learning CUDA programming.
Vector addition:
#include <cuda_runtime.h>
#include <iostream>

// Each thread handles one element: c[i] = a[i] + b[i].
__global__ void vectorAdd(int n, const float* a, const float* b, float* c) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    c[i] = a[i] + b[i];
  }
}

int main() {
  int n = 1 << 20;
  size_t size = n * sizeof(float);
  float *a, *b, *c;
  // Unified memory: the same pointers are usable on host and device.
  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);
  for (int i = 0; i < n; i++) {
    a[i] = 1.1f;
    b[i] = 2.3f;
  }
  int threadPerBlock = 256;
  // Round up so the grid covers all n elements.
  int blockPerGrid = (n + threadPerBlock - 1) / threadPerBlock;
  vectorAdd<<<blockPerGrid, threadPerBlock>>>(n, a, b, c);
  cudaDeviceSynchronize();
  std::cout << c[0] << " " << c[n / 2] << " " << c[n - 1] << std::endl;
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  return 0;
}
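All three programs can be built with nvcc alone, e.g. nvcc vector_add.cu -o vector_add (the file name is arbitrary). To sanity-check the result, every element of c should be 1.1f + 2.3f = 3.4f up to float rounding. A minimal host-side check, a sketch assuming the same a, b, c and n as above, placed after cudaDeviceSynchronize() (needs #include <cmath> for fabsf):

  // Host-side sanity check (sketch): the GPU result should match a CPU add.
  // Assumes a, b, c and n are the unified-memory buffers from the example.
  int errors = 0;
  for (int i = 0; i < n; i++) {
    float expected = a[i] + b[i];          // here: 1.1f + 2.3f = 3.4f
    if (fabsf(c[i] - expected) > 1e-5f) {  // tolerate float rounding
      errors++;
    }
  }
  std::cout << "mismatches: " << errors << std::endl;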
Matrix multiplication:
#include <cuda_runtime.h>
#include <iostream>

// Shape: (m,n) @ (n,k) = (m,k); each thread computes one output element.
__global__ void matrixMul(int m, int n, int k,
                          const float* a, const float* b, float* c) {
  int col = blockDim.x * blockIdx.x + threadIdx.x;
  int row = blockDim.y * blockIdx.y + threadIdx.y;
  if (col < k && row < m) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
      sum += a[n * row + i] * b[i * k + col];
    }
    c[row * k + col] = sum;
  }
}

int main() {
  int m = 3;
  int n = 2;
  int k = 4;
  size_t size_a = m * n * sizeof(float);
  size_t size_b = n * k * sizeof(float);
  size_t size_c = m * k * sizeof(float);
  float *a, *b, *c;
  cudaMallocManaged(&a, size_a);
  cudaMallocManaged(&b, size_b);
  cudaMallocManaged(&c, size_c);
  for (int i = 0; i < m * n; i++) {
    a[i] = 2.0f;
  }
  for (int i = 0; i < n * k; i++) {
    b[i] = 3.0f;
  }
  dim3 threadPerBlock(16, 16);
  // x covers the k columns, y covers the m rows; round up in both dimensions.
  dim3 blockPerGrid((k + threadPerBlock.x - 1) / threadPerBlock.x,
                    (m + threadPerBlock.y - 1) / threadPerBlock.y);
  matrixMul<<<blockPerGrid, threadPerBlock>>>(m, n, k, a, b, c);
  cudaDeviceSynchronize();
  std::cout << c[0] << " " << c[2 * k] << " " << c[(m - 1) * k + k - 1]
            << std::endl;
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  return 0;
}
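Since a is filled with 2.0f and b with 3.0f, every element of c is a sum of n = 2 products of 2.0 * 3.0, i.e. 12, so the program prints 12 12 12. For larger, non-constant inputs a naive CPU reference is handy for cross-checking the kernel. A sketch (the helper name matmulRef and the c_ref buffer are just for illustration):

  // CPU reference matmul (sketch); c_ref must hold m * k floats.
  void matmulRef(int m, int n, int k,
                 const float* a, const float* b, float* c_ref) {
    for (int row = 0; row < m; row++) {
      for (int col = 0; col < k; col++) {
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
          sum += a[n * row + i] * b[i * k + col];
        }
        c_ref[row * k + col] = sum;  // with the constant inputs above: 12.0f
      }
    }
  }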
2D convolution (no channels):
#include <cuda_runtime.h>
#include <iostream>

// One thread per output position (oy, ox).
__global__ void conv2D(int H, int W, int kH, int kW,
                       int padH, int padW, int strideH, int strideW,
                       int Hout, int Wout,
                       const float* img, const float* kernel, float* output) {
  int ox = blockDim.x * blockIdx.x + threadIdx.x;  // current output column
  int oy = blockDim.y * blockIdx.y + threadIdx.y;  // current output row
  if (ox >= Wout || oy >= Hout) {
    return;
  }
  float sum = 0.0f;
  // Top-left input coordinate of the patch covered by this output position.
  int in_x0 = ox * strideW - padW;
  int in_y0 = oy * strideH - padH;
  for (int i = 0; i < kH; i++) {
    for (int j = 0; j < kW; j++) {
      int in_xi = in_x0 + j;
      int in_yi = in_y0 + i;
      // Skip taps that fall into the zero padding.
      if (in_xi >= 0 && in_xi < W && in_yi >= 0 && in_yi < H) {
        sum += img[in_yi * W + in_xi] * kernel[i * kW + j];
      }
    }
  }
  output[oy * Wout + ox] = sum;
}

int main() {
  int H = 12, W = 12;
  int kH = 3, kW = 3;
  int padH = 1, padW = 1;
  int strideH = 1, strideW = 1;
  int Hout = (H + 2 * padH - kH) / strideH + 1;
  int Wout = (W + 2 * padW - kW) / strideW + 1;
  size_t img_size = H * W * sizeof(float);
  size_t kernel_size = kH * kW * sizeof(float);
  size_t out_size = Hout * Wout * sizeof(float);
  float *img, *kernel, *output;
  cudaMallocManaged(&img, img_size);
  cudaMallocManaged(&kernel, kernel_size);
  cudaMallocManaged(&output, out_size);
  for (int i = 0; i < H * W; i++) {
    img[i] = 10.0f;
  }
  for (int i = 0; i < kH * kW; i++) {
    kernel[i] = 0.5f;
  }
  dim3 threadPerBlock(16, 16);
  dim3 blockPerGrid((Wout + threadPerBlock.x - 1) / threadPerBlock.x,
                    (Hout + threadPerBlock.y - 1) / threadPerBlock.y);
  conv2D<<<blockPerGrid, threadPerBlock>>>(H, W, kH, kW, padH, padW, strideH,
                                           strideW, Hout, Wout, img, kernel,
                                           output);
  cudaDeviceSynchronize();
  for (int i = 0; i < Hout; i++) {
    for (int j = 0; j < Wout; j++) {
      std::cout << output[i * Wout + j] << " ";
    }
    std::cout << std::endl;
  }
  cudaFree(img);
  cudaFree(kernel);
  cudaFree(output);
  return 0;
}
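With a constant 10.0f image and a 3x3 kernel of 0.5f values, the printed grid is easy to verify by hand: interior positions see all 9 taps (9 * 10 * 0.5 = 45), non-corner border positions lose one row or column of taps to the zero padding (6 taps, 30), and the four corners keep only 4 taps (20). A small spot check, a sketch reusing output and Wout from above, placed after cudaDeviceSynchronize():

  // Spot-check a corner, an edge and an interior element (sketch).
  // Assumes the constant inputs above: img = 10.0f, kernel = 0.5f, pad = 1.
  std::cout << "corner:   " << output[0] << " (expect 20)" << std::endl;
  std::cout << "edge:     " << output[1] << " (expect 30)" << std::endl;
  std::cout << "interior: " << output[Wout + 1] << " (expect 45)" << std::endl;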
