cuda编程作业(stream & unified_mem)

步骤一

// Todo 1
// Allocate pinned (page-locked) host memory for h_x, h_y and h_z using
// cudaMallocHost. Pinned memory is required for cudaMemcpyAsync to actually
// overlap with kernel execution later on.
CHECK(cudaMallocHost(&h_x, N * sizeof(DTYPE)));
CHECK(cudaMallocHost(&h_y, N * sizeof(DTYPE)));
CHECK(cudaMallocHost(&h_z, N * sizeof(DTYPE)));
// Todo 2
// Release the pinned buffers with cudaFreeHost (never plain free). Wrap the
// calls in CHECK for consistency with the allocations above so a failed free
// is not silently ignored.
CHECK(cudaFreeHost(h_x));
CHECK(cudaFreeHost(h_y));
CHECK(cudaFreeHost(h_z));
// Todo 3
// Using multiple streams to implement the following function, overlapping the
// asynchronous memory copies [cudaMemcpyAsync] with kernel computation
// Adds two N-element vectors on the GPU using NUM_STREAMS CUDA streams.
// The data is split into NUM_STREAMS equal chunks; each chunk's H2D copies,
// kernel launch and D2H copy are enqueued on its own stream so copies and
// compute of different chunks can overlap.
// Preconditions: h_x, h_y, h_z are pinned buffers (cudaMallocHost) of N
// elements (async copies on pageable memory degrade to synchronous), and N is
// divisible by NUM_STREAMS — TODO confirm both at the call site.
void vec_add_multiple_streams_overlapped(const DTYPE *h_x, const DTYPE *h_y, DTYPE *h_z, const int n)
{
    DTYPE *d_x, *d_y, *d_z;

    // CPU reference result: the original compared h_z against an
    // uninitialized buffer, so validation was meaningless.
    DTYPE *h_ref = (DTYPE *)malloc(sizeof(DTYPE) * N);
    for (int i = 0; i < N; i++) h_ref[i] = h_x[i] + h_y[i];

    CHECK(cudaMalloc(&d_x, N * sizeof(DTYPE)));
    CHECK(cudaMalloc(&d_y, N * sizeof(DTYPE)));
    CHECK(cudaMalloc(&d_z, N * sizeof(DTYPE)));

    // Fixed typos: sizeof(cuda_Steam_t) / &sream[i] did not compile.
    cudaStream_t *stream = (cudaStream_t *)malloc(NUM_STREAMS * sizeof(cudaStream_t));
    for (int i = 0; i < NUM_STREAMS; i++) CHECK(cudaStreamCreate(&stream[i]));

    const int cnt = N / NUM_STREAMS;                   // elements per chunk
    const int grid_size = (cnt - 1) / BLOCK_SIZE + 1;  // ceil(cnt / BLOCK_SIZE)

    GpuTimer timer;
    timer.Start();
    for (int i = 0; i < NUM_STREAMS; i++) {
        const int off = i * cnt;
        CHECK(cudaMemcpyAsync(d_x + off, h_x + off, cnt * sizeof(DTYPE), cudaMemcpyHostToDevice, stream[i]));
        CHECK(cudaMemcpyAsync(d_y + off, h_y + off, cnt * sizeof(DTYPE), cudaMemcpyHostToDevice, stream[i]));
        // The 3rd launch parameter is dynamic shared memory (0 here); the
        // stream goes in the 4th slot. The original passed the stream as the
        // shared-memory size, so every kernel ran on the default stream.
        vec_add_kernel<<<grid_size, BLOCK_SIZE, 0, stream[i]>>>(d_x + off, d_y + off, d_z + off, cnt);
        // Fixed typo: cudaMemcpyDeviceToHOst.
        CHECK(cudaMemcpyAsync(h_z + off, d_z + off, cnt * sizeof(DTYPE), cudaMemcpyDeviceToHost, stream[i]));
    }
    // All of the above is asynchronous: synchronize BEFORE stopping the timer,
    // otherwise only the launch overhead is measured.
    CHECK(cudaDeviceSynchronize());
    timer.Stop();

    printf("[vec_add_multiple_streams_overlapped] Time cost: %f ms\n", timer.Elapsed());
    if (vec_compare(h_ref, h_z, N) == 1) { printf("  PASSED!\n"); } else { printf("  FAILED\n"); }

    // Cleanup: destroy streams, free the malloc'd buffers with free (the
    // original called cudaFreeHost on the wrong, malloc'd pointer).
    for (int i = 0; i < NUM_STREAMS; i++) CHECK(cudaStreamDestroy(stream[i]));
    free(stream);
    free(h_ref);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
}

步骤二

// Adds two host-resident Vectors on the GPU: deep-copies each Vector (struct
// plus its float payload) to the device, patches the device struct's `data`
// field to point at the device payload, launches kernel_vec_add, and copies
// the result payload back into vec_out->data.
// Assumes vec1->length == vec2->length == vec_out->length — TODO confirm.
void gpu_vec_add(Vector *vec1, Vector *vec2, Vector *vec_out)
{
    Vector *d_vec1, *d_vec2, *d_vec_out;
    float *d_data1, *d_data2, *d_data_out;

    // vec1: copy the struct, copy the payload, then overwrite the struct's
    // (stale host) data pointer with the device payload pointer.
    cudaMalloc((void **) &d_vec1, sizeof(Vector));
    cudaMalloc((void **) &d_data1, (vec1->length) * sizeof(float));
    cudaMemcpy(d_vec1, vec1, sizeof(Vector), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data1, vec1->data, vec1->length * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(&(d_vec1->data), &d_data1, sizeof(float *), cudaMemcpyHostToDevice);

    // vec2: same deep-copy-and-patch sequence.
    cudaMalloc((void **) &d_vec2, sizeof(Vector));
    cudaMalloc((void **) &d_data2, (vec2->length) * sizeof(float));
    cudaMemcpy(d_vec2, vec2, sizeof(Vector), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data2, vec2->data, vec2->length * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(&(d_vec2->data), &d_data2, sizeof(float *), cudaMemcpyHostToDevice);

    // vec_out: only the struct needs copying in; the payload is written by
    // the kernel.
    cudaMalloc((void **) &d_vec_out, sizeof(Vector));
    cudaMalloc((void **) &d_data_out, (vec_out->length) * sizeof(float));
    cudaMemcpy(d_vec_out, vec_out, sizeof(Vector), cudaMemcpyHostToDevice);
    cudaMemcpy(&(d_vec_out->data), &d_data_out, sizeof(float *), cudaMemcpyHostToDevice);

    // Ceil-division: the original length/BLOCK_SIZE + 1 launched one wasted
    // block whenever length was an exact multiple of BLOCK_SIZE.
    const int grid_size = (vec1->length - 1) / BLOCK_SIZE + 1;
    kernel_vec_add<<<grid_size, BLOCK_SIZE>>>(d_vec1, d_vec2, d_vec_out);
    // Blocking D2H copy also synchronizes with the kernel above.
    cudaMemcpy(vec_out->data, d_data_out, vec_out->length * sizeof(float), cudaMemcpyDeviceToHost);

    // The original leaked all six device allocations on every call.
    cudaFree(d_data1);
    cudaFree(d_data2);
    cudaFree(d_data_out);
    cudaFree(d_vec1);
    cudaFree(d_vec2);
    cudaFree(d_vec_out);
}
void gpu_vec_add(Vector *vec1, Vector *vec2, Vector *vec_out){
    // Unified-memory version: the Vectors are presumably allocated with
    // cudaMallocManaged (TODO confirm at the call site), so no explicit
    // host<->device copies are needed — launch directly on the caller's
    // pointers.
    const int n = vec_out->length;
    const int blocks = (n - 1) / BLOCK_SIZE + 1;   // ceil(n / BLOCK_SIZE)
    kernel_vec_add<<<blocks, BLOCK_SIZE>>>(vec1, vec2, vec_out);
    // Block until the kernel finishes so the result is safe to read on the
    // host immediately after this call returns.
    cudaDeviceSynchronize();
}
posted @ 2023-06-08 15:03  缙云山车神  阅读(27)  评论(0编辑  收藏  举报