CUDA计算向量内积的程序(源自CUDA范例编程)
__syncthreads() acts as a barrier at which all threads in the block must wait before any is allowed to proceed.
// Dot product of two vectors on the GPU.
//
// Every block accumulates its share of the element-wise products a[i]*b[i]
// into one partial sum using shared memory; the host then adds the per-block
// partial sums to obtain the final result.
#include <stdio.h>
#include <stdlib.h>

// Minimum of two values. Arguments are parenthesized so that expressions
// containing low-precedence operators expand correctly. Note that each
// argument may be evaluated twice.
#define imin(a, b) ((a) < (b) ? (a) : (b))

// Closed form of 0^2 + 1^2 + ... + x^2; used to validate the GPU result.
#define sum_squares(x) ((x) * ((x) + 1) * (2 * (x) + 1) / 6)

// Evaluates a CUDA runtime call and terminates with a diagnostic if it did
// not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// N is the length of the input vectors.
const int N = 33 * 1024;

// Threads per block. The in-block reduction in dot() halves the active range
// on every step, so this value must be a power of two.
const int threadsPerBlock = 256;

// Number of blocks launched: enough to cover N, capped at 32.
const int blocksPerGrid =
    imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);

// Computes per-block partial sums of the dot product of a and b.
//
//   a, b : input arrays of N floats each (device pointers)
//   c    : output array with one float per block; c[blockIdx.x] receives the
//          partial sum produced by that block
//
// Expects a 1D launch with blockDim.x == threadsPerBlock, because the shared
// buffer is sized by threadsPerBlock and the reduction assumes a power-of-two
// block size.
__global__ void dot(float *a, float *b, float *c)
{
    // One slot per thread; each block has its own independent copy.
    __shared__ float cache[threadsPerBlock];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;  // global thread offset
    const int cacheIndex = threadIdx.x;
    const int stride = blockDim.x * gridDim.x;        // total threads in grid

    // Each thread accumulates the products of the elements it owns,
    // advancing by the total number of launched threads.
    float temp = 0.0f;
    for (; tid < N; tid += stride) {
        temp += a[tid] * b[tid];
    }
    cache[cacheIndex] = temp;

    // All per-thread sums must be visible before the reduction reads them.
    __syncthreads();

    // Pairwise reduction inside the block. The barrier sits outside the `if`
    // so that every thread of the block reaches it on every iteration.
    for (int i = blockDim.x / 2; i != 0; i /= 2) {
        if (cacheIndex < i) {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        __syncthreads();
    }

    // Thread 0 publishes the block's partial sum.
    if (cacheIndex == 0) {
        c[blockIdx.x] = cache[0];
    }
}

// Fills two test vectors (a[i] = i, b[i] = 2*i), runs dot() on the GPU, sums
// the per-block partial results on the host and prints the value next to the
// closed-form expectation 2 * sum_squares(N - 1).
int main(void)
{
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    const size_t vecBytes = N * sizeof(float);
    const size_t partialBytes = blocksPerGrid * sizeof(float);

    // Host buffers.
    a = (float *)malloc(vecBytes);
    b = (float *)malloc(vecBytes);
    partial_c = (float *)malloc(partialBytes);
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);          // free(NULL) is a no-op
        free(b);
        free(partial_c);
        return EXIT_FAILURE;
    }

    // Device buffers.
    CUDA_CHECK(cudaMalloc((void **)&dev_a, vecBytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, vecBytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_partial_c, partialBytes));

    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    CUDA_CHECK(cudaMemcpy(dev_a, a, vecBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, vecBytes, cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
    // A kernel launch returns nothing; launch errors are picked up here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: also waits for the kernel to finish.
    CUDA_CHECK(cudaMemcpy(partial_c, dev_partial_c, partialBytes,
                          cudaMemcpyDeviceToHost));

    // Finish the reduction on the host.
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // With a[i] = i and b[i] = 2*i the dot product equals twice the sum of
    // the squares 0^2 .. (N-1)^2, which gives a reference value to compare.
    printf("Does GPU value %.6g = %.6g?\n", c,
           2 * sum_squares((float)(N - 1)));

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_partial_c));
    free(a);
    free(b);
    free(partial_c);

    // Block until the user enters a value (presumably to keep a console
    // window open); the value itself is not used.
    int j;
    scanf("%d", &j);
    (void)j;

    return 0;
}