CUDA programming
- CUDA C++ Programming Guide
- CUDA C++ Programming Guide
- 《CUDA C Programming Guide》(《CUDA C 编程指南》)导读
- Tutorial 01: Say Hello to CUDA
- An Easy Introduction to CUDA C and C++
- Using the NVIDIA CUDA Stream-Ordered Memory Allocator, Part 1
- Using the NVIDIA CUDA Stream-Ordered Memory Allocator, Part 2
demo
saxpy.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// SAXPY kernel: each thread whose global index is below n computes
// y[idx] = a*x[idx] + y[idx].
//
// In addition, the thread with global index 0 performs the same update at
// index n+5. This is the demo's intentional out-of-bounds access (see the
// NOTE below); it is only in bounds if x and y hold at least n+6 elements.
//
// Indexing uses only the .x components of blockIdx/blockDim/threadIdx.
__global__
void saxpy(int n, float a, float *x, float *y)
{
    const int idx = threadIdx.x + blockDim.x*blockIdx.x;

    if (idx < n) {
        y[idx] = a*x[idx] + y[idx];
    }

    // NOTE: test out of bounds
    if (idx != 0) {
        return;
    }
    const int past = n + 5;
    y[past] = a*x[past] + y[past];
}
// Prints a diagnostic on stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *expr, int line)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error at line %d: %s -> %s\n",
            line, expr, cudaGetErrorString(status));
    return false;
}
#define CUDA_TRY(expr) cudaSucceeded((expr), #expr, __LINE__)

// Host driver for the SAXPY demo.
//
// Allocates two N-element float arrays on the host and on the device, fills
// them with 1.0f (x) and 2.0f (y), launches saxpy with a = 2.0f, copies y back
// and prints the largest deviation from the expected value 4.0f.
//
// Every CUDA runtime call is checked. After the first failure the remaining
// GPU steps and the verification are skipped, resources are released, and the
// process exits with status 1 instead of reporting a result computed from
// stale host data. Returns 0 when every step succeeded.
int main(void)
{
    const int N = 1<<20;
    const size_t bytes = N*sizeof(float);
    float *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;

    x = (float*)malloc(bytes);
    y = (float*)malloc(bytes);
    bool ok = (x != NULL) && (y != NULL);
    if (!ok) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
    }

    ok = ok && CUDA_TRY(cudaMalloc(&d_x, bytes));
    ok = ok && CUDA_TRY(cudaMalloc(&d_y, bytes));

    if (ok) {
        // %p expects a void*, so cast explicitly.
        printf("addr of d_x = %p\n", (void*)d_x);
        printf("addr of d_y = %p\n", (void*)d_y);

        for (int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }
    }

    ok = ok && CUDA_TRY(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
    ok = ok && CUDA_TRY(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

    if (ok) {
        // Perform SAXPY on 1M elements
        saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
        // A launch does not return a status; launch errors are read here.
        ok = CUDA_TRY(cudaGetLastError());
    }

    // Blocking copy: errors raised while the kernel ran are reported by this call.
    ok = ok && CUDA_TRY(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

    if (ok) {
        float maxError = 0.0f;
        for (int i = 0; i < N; i++) {
            maxError = fmaxf(maxError, fabsf(y[i]-4.0f));
        }
        printf("Max error: %f\n", maxError);
    }

    // Cleanup is best-effort: the pointers are NULL if allocation never
    // happened, and failures here are not reported a second time.
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);

    return ok ? 0 : 1;
}
Makefile
# Build and run the saxpy demo.
.PHONY: all clean run
TARGET = saxpy

# Default goal: build the demo binary. 'all' was declared phony but had no
# rule, so 'make all' used to fail.
all: $(TARGET)

$(TARGET): saxpy.cu
	nvcc $< -o $@

clean:
	rm -f $(TARGET)

# Run the binary directly, then under each memory checker.
# The leading '-' tells make to keep going when a command exits non-zero or is
# not installed (cuda-memcheck may be missing on newer CUDA toolkits), so the
# remaining tools still get to run.
run: $(TARGET)
	-./$(TARGET)
	-cuda-memcheck ./$(TARGET)
	-compute-sanitizer ./$(TARGET)

浙公网安备 33010602011771号