cuda基础--流

1:运行时API实现

  事件(event)可用于测量程序的运行时间,也可用于协调CPU与GPU的并发执行(让CPU等待GPU上的某个事件完成)

 // Time a region of GPU work with a pair of CUDA runtime events.

 // Create the two events that bracket the timed region.
 cudaEvent_t start;
 cudaEvent_t stop;
 cudaEventCreate(&start);
 cudaEventCreate(&stop);

 // Both events are recorded into stream 0.
 cudaEventRecord(start, 0);
 // ... the work to be timed goes here ...
 cudaEventRecord(stop, 0);

 // Wait on the host until the stop event has completed; only then can the
 // elapsed time between the two events be queried.
 cudaEventSynchronize(stop);

 // The CUDA runtime documentation specifies this value in milliseconds.
 float elapsedTime;
 cudaEventElapsedTime(&elapsedTime, start, stop);

 // Release both events.
 cudaEventDestroy(start);
 cudaEventDestroy(stop);

2:驱动API实现

 // Time a region of GPU work with a pair of CUDA driver-API events.
 // NOTE(review): driver-API calls need cuInit() and a current context first;
 // that setup is not part of this fragment.

 // cuEventCreate takes a flags argument in addition to the output handle;
 // CU_EVENT_DEFAULT requests a default event.
 CUevent start, stop;
 cuEventCreate(&start, CU_EVENT_DEFAULT);
 cuEventCreate(&stop, CU_EVENT_DEFAULT);

 // Both events are recorded into stream 0.
 cuEventRecord(start, 0);
 // ... the work to be timed goes here ...
 cuEventRecord(stop, 0);

 // Wait on the host until the stop event has completed before querying the
 // elapsed time between the two events.
 cuEventSynchronize(stop);
 float elapsedTime;
 cuEventElapsedTime(&elapsedTime, start, stop);

 // Release both events.
 cuEventDestroy(start);
 cuEventDestroy(stop);

3:simpleStream实例

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <cutil_inline.h>
  3 
  /*
   * Adds *factor to one element of g_data, num_iterations times.
   *
   * Each thread updates the element at its flat 1D global index
   * (blockIdx.x * blockDim.x + threadIdx.x). There is no bounds check, so the
   * launch must not create more threads than g_data has elements; elements
   * beyond the launched thread count are left untouched.
   *
   * g_data         : array updated in place by the kernel
   * factor         : pointer to the increment; dereferenced inside the kernel
   * num_iterations : how many times the increment is applied per element
   */
  __global__ void
  init_array(int *g_data, int *factor, int num_iterations)
  {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;

      // The loop is bounded by the num_iterations parameter; the earlier
      // version referred to an undeclared name `n` here.
      for (int i = 0; i < num_iterations; i++)
          g_data[idx] += *factor;
  }
 11 
 /*
  * Checks that each of the first n elements of a equals c.
  *
  * a : array to inspect
  * n : number of elements to check
  * c : expected value of every element
  *
  * Returns 1 when all n elements match (also when n <= 0, since there is
  * nothing to reject). On the first mismatch it prints "index:actual expected"
  * and returns 0.
  */
 int correct_data(int *a, const int n, const int c)
 {
     for (int i = 0; i < n; i++)
     {
         if (a[i] != c)
         {
             printf("%d:%d %d\n", i, a[i], c);
             return 0;
         }
     }

     // Success is reported only after the whole range has been scanned. The
     // earlier version returned 1 from inside the loop, so it never looked past
     // element 0 and had no return value at all for an empty range.
     return 1;
 }
 24 
 25 
 /* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
 static void check_cuda(cudaError_t status, const char *what)
 {
     if (status != cudaSuccess)
     {
         fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
         exit(EXIT_FAILURE);
     }
 }

 /*
  * Times a host-to-device copy, a single init_array launch in a stream, and
  * nreps non-streamed launch+copy-back repetitions, then verifies the result.
  *
  * argv[1] (optional) selects the CUDA device index; the default is device 0.
  *
  * Returns 0 when the final array matches the expected value, 1 when no usable
  * device is found, the device index is out of range, host allocation fails,
  * or verification fails. Any failing CUDA call terminates through check_cuda.
  */
 int main(int argc, char *argv[])
 {
     int CUDA_device = 0;
     int nstreams = 4;
     int nreps = 10;                 // repetitions of the non-streamed pass
     int niterations = 5;            // loop count inside the kernel (was read uninitialized)
     int n = 16 * 1024 * 1024;       // number of array elements
     int nbytes = n * sizeof(int);
     dim3 threads, blocks;
     float elapsed_time, time_memcpy, time_kernel;

     if (argc > 1)
     {
         CUDA_device = atoi(argv[1]);
     }

     // Validate the requested device index against the number of devices.
     // A failing query is reported the same way as an empty device list.
     int num_devices = 0;
     if (cudaGetDeviceCount(&num_devices) != cudaSuccess || 0 == num_devices)
     {
         printf("no device\n");
         return 1;
     }
     if (CUDA_device < 0 || CUDA_device >= num_devices)
     {
         printf("CUDA_device between 0 and %d\n", num_devices - 1);
         return 1;
     }

     check_cuda(cudaSetDevice(CUDA_device), "cudaSetDevice");
     cudaDeviceProp device_properties;
     check_cuda(cudaGetDeviceProperties(&device_properties, CUDA_device),
                "cudaGetDeviceProperties");
     printf("running on:%s\n", device_properties.name);

     // Host buffer from cudaMallocHost, zeroed so the timed copy below
     // transfers defined data.
     int c = 5;
     int *a = 0;
     check_cuda(cudaMallocHost((void **)&a, nbytes), "cudaMallocHost");
     memset(a, 0, nbytes);

     // Device buffers: the data array and the single-int increment.
     int *d_a = 0, *d_c = 0;
     check_cuda(cudaMalloc((void **)&d_a, nbytes), "cudaMalloc(d_a)");
     check_cuda(cudaMalloc((void **)&d_c, sizeof(int)), "cudaMalloc(d_c)");
     check_cuda(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice),
                "cudaMemcpy(d_c)");

     // Create the streams.
     cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
     if (streams == NULL)
     {
         fprintf(stderr, "out of host memory\n");
         return 1;
     }
     for (int i = 0; i < nstreams; i++)
     {
         check_cuda(cudaStreamCreate(&streams[i]), "cudaStreamCreate");
     }

     // Create the timing events.
     cudaEvent_t start_event, stop_event;
     check_cuda(cudaEventCreate(&start_event), "cudaEventCreate(start)");
     check_cuda(cudaEventCreate(&stop_event), "cudaEventCreate(stop)");

     // Time the host-to-device copy. The events are recorded in stream 0; the
     // original author's note is that this makes the timing wait for all
     // previously issued CUDA calls (NOTE(review): relies on legacy
     // default-stream semantics — confirm build flags).
     check_cuda(cudaEventRecord(start_event, 0), "cudaEventRecord(start)");
     check_cuda(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, streams[0]),
                "cudaMemcpyAsync");
     check_cuda(cudaEventRecord(stop_event, 0), "cudaEventRecord(stop)");
     // The stop event must have completed before the elapsed time is queried.
     check_cuda(cudaEventSynchronize(stop_event), "cudaEventSynchronize");
     check_cuda(cudaEventElapsedTime(&time_memcpy, start_event, stop_event),
                "cudaEventElapsedTime(memcpy)");
     printf("memcopy:%f\n", time_memcpy);

     // Time a single kernel launch issued into streams[0].
     // n is a multiple of 512, so the grid covers the array exactly; the kernel
     // has no bounds check and depends on that.
     threads = dim3(512, 1);
     blocks = dim3(n / threads.x, 1);
     check_cuda(cudaEventRecord(start_event, 0), "cudaEventRecord(start)");
     init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
     check_cuda(cudaGetLastError(), "init_array launch (stream)");
     check_cuda(cudaEventRecord(stop_event, 0), "cudaEventRecord(stop)");
     check_cuda(cudaEventSynchronize(stop_event), "cudaEventSynchronize");
     check_cuda(cudaEventElapsedTime(&time_kernel, start_event, stop_event),
                "cudaEventElapsedTime(kernel)");
     printf("kernel:%f\n", time_kernel);

     // Reset the device array so the nreps passes below start from zero and the
     // final value is exactly c * nreps * niterations.
     check_cuda(cudaMemset(d_a, 0, nbytes), "cudaMemset(d_a)");

     // Time nreps launch + copy-back repetitions without using a stream.
     threads = dim3(512, 1);
     blocks = dim3(n / threads.x, 1);
     check_cuda(cudaEventRecord(start_event, 0), "cudaEventRecord(start)");
     for (int i = 0; i < nreps; i++)
     {
         init_array<<<blocks, threads>>>(d_a, d_c, niterations);
         check_cuda(cudaGetLastError(), "init_array launch (non-streamed)");
         check_cuda(cudaMemcpy(a, d_a, nbytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(a)");
     }
     check_cuda(cudaEventRecord(stop_event, 0), "cudaEventRecord(stop)");
     check_cuda(cudaEventSynchronize(stop_event), "cudaEventSynchronize");
     check_cuda(cudaEventElapsedTime(&elapsed_time, start_event, stop_event),
                "cudaEventElapsedTime(non-streamed)");
     printf("non-streamed:%f\n", elapsed_time / nreps);

     // Verify the result.
     int passed = correct_data(a, n, c * nreps * niterations);
     if (passed)
         printf("passed\n");

     // Release resources.
     for (int i = 0; i < nstreams; i++)
         check_cuda(cudaStreamDestroy(streams[i]), "cudaStreamDestroy");
     free(streams);
     check_cuda(cudaEventDestroy(start_event), "cudaEventDestroy(start)");
     check_cuda(cudaEventDestroy(stop_event), "cudaEventDestroy(stop)");
     check_cuda(cudaFreeHost(a), "cudaFreeHost");
     check_cuda(cudaFree(d_a), "cudaFree(d_a)");
     check_cuda(cudaFree(d_c), "cudaFree(d_c)");

     // cudaDeviceReset replaces the deprecated cudaThreadExit.
     check_cuda(cudaDeviceReset(), "cudaDeviceReset");
     return passed ? 0 : 1;
 }

 

posted @ 2020-06-10 11:24  糖糖_彭  阅读(217)  评论(0编辑  收藏  举报