// Stage each thread's y0y1 / y2y3 / uv value in block-shared tiles, then let
// thread (0,0) of the block copy the staged ys0/ys1 tiles out to pDst, one
// tile row at a time.

// Block dimensions, used below as the tile width/height for the write-back loops.
int bw = blockDim.x;
int bh = blockDim.y;
// NOTE(review): threadIdx.x < blockDim.x and threadIdx.y < blockDim.y always
// hold, so these modulo operations leave the indices unchanged.
int tx = threadIdx.x%bw;
int ty = threadIdx.y%bh;
// Shared staging tiles with a fixed 16x16 extent.
// NOTE(review): indexing with [ty][tx] stays in bounds only if blockDim.x <= 16
// and blockDim.y <= 16; no such check is visible here -- confirm against the
// launch configuration.
__shared__ uchar2 ys0[16][16];
__shared__ uchar2 ys1[16][16];
__shared__ uchar2 uvs[16][16];
// Each thread deposits its own values at its (ty, tx) slot.
ys0[ty][tx] = y0y1;
ys1[ty][tx] = y2y3;
uvs[ty][tx] = uv;
// Barrier: every thread's stores above must be complete before thread (0,0)
// reads the tiles below.
// NOTE(review): all threads of the block must reach this call -- verify that
// no thread can return earlier in the kernel.
__syncthreads();
// Only thread (0,0) performs the write-back; it walks all bh * bw staged
// entries sequentially while the other threads of the block skip this section.
if (threadIdx.x == 0 && threadIdx.y == 0) {
for (int j = 0; j != bh; ++j) {
// Destination pointers for tile row j: py0 targets pDst row (iy + j) * 2 and
// py1 the row directly after it; both start at offset ix * 2 within the row
// and use nPitch as the row stride.
// NOTE(review): ix / iy are the values held by thread (0,0), presumably the
// block's origin, and pDst is presumably a byte pointer -- confirm both
// against the part of the kernel not shown here.
uchar2* py0 = (uchar2*)(pDst + (iy + j) * 2 * nPitch + ix * 2);
uchar2* py1 = (uchar2*)(pDst + ((iy + j) * 2+1) * nPitch + ix * 2);
// puv is computed but never dereferenced, because the store in the inner loop
// is commented out.
// NOTE(review): this row offset uses nWidth, whereas the Y rows use nPitch --
// confirm which stride is intended before re-enabling the UV store.
uchar2* puv = (uchar2*)(pDstUv + (iy + j)*nWidth + ix * 2);
for (int i = 0; i != bw; ++i) {
// Copy one staged uchar2 per output row and advance both row pointers.
*py0++ = ys0[j][i];
*py1++ = ys1[j][i];
// UV write-back is disabled: uvs is filled above but not read anywhere in the
// code visible here.
//*puv++ = uvs[j][i];
}
}
}