opencl初探-sobel检测

sobel检测的C版本、neon版本和GPU版本的时间比较。


Platform: LG G3, Adreno 330 ,img size 3264x2448


sobel:

| C code | neon | GPU |
| --- | --- | --- |
| 73 | 13 | 42+3.7+6.6 |

单位:ms。GPU时间 = memory time + Queued time + Run time




|  | Sobel org | Sobel vector | Sobel vector + mem_fence |
| --- | --- | --- | --- |
| Queued time | 4.6 | 7.2 | 2.8 |
| Wait time | 0.07 | 0.09 | 0.07 |
| Run time | 66.9 | 7.3 | 6.6 |






typedef unsigned char BYTE;

/*
 * 3x3 Sobel gradients of an 8-bit single-channel image stored row by row
 * with a stride of exactly `w` bytes.
 *
 *   src : input image, w*h bytes (read only).
 *   w,h : image width and height in pixels.
 *   Ix  : output, w*h bytes; receives (right column - left column), weighted
 *         1,2,1 over the three rows, shifted right by 3.
 *   Iy  : output, w*h bytes; receives (top row - bottom row), weighted
 *         1,2,1 over the three columns, shifted right by 3.
 *
 * Only interior pixels (1 <= x <= w-2, 1 <= y <= h-2) are written; the
 * one-pixel border of Ix/Iy is left untouched. The raw sums lie in
 * [-1020, 1020], so after >>3 they fit in [-128, 127]; they are stored as
 * the low 8 bits of that signed value in a BYTE.
 *
 * Images with fewer than 3 rows or 3 columns have no interior pixels and the
 * function returns without touching the outputs.
 */
void sobel(BYTE *src,int w,int h,BYTE *Ix,BYTE *Iy)
{
    /* Without this guard, h < 2 made the row counter start negative and the
     * count-down loop never reached zero, walking past the end of the
     * buffers. */
    if (w < 3 || h < 3)
        return;

    const int step = w;
    const BYTE *top = src;
    BYTE *dstX = Ix + step;   /* first output row is row 1 */
    BYTE *dstY = Iy + step;

    for (int rows = h - 2; rows > 0; rows--) {
        const BYTE *mid = top + step;
        const BYTE *bot = mid + step;

        for (int x = 1; x < w - 1; x++) {
            /* Multiply by 2 instead of `<< 1`: the column difference can be
             * negative, and left-shifting a negative value is undefined. */
            int gx = (top[x + 1] - top[x - 1])
                   + 2 * (mid[x + 1] - mid[x - 1])
                   + (bot[x + 1] - bot[x - 1]);
            int gy = (top[x - 1] + 2 * top[x] + top[x + 1])
                   - (bot[x - 1] + 2 * bot[x] + bot[x + 1]);

            /* Relies on >> of a negative int being an arithmetic shift
             * (implementation-defined in C, true on mainstream compilers). */
            dstX[x] = (BYTE)(gx >> 3);
            dstY[x] = (BYTE)(gy >> 3);
        }

        top  += step;
        dstX += step;
        dstY += step;
    }
}


/*
 * NEON variant of the 3x3 Sobel filter: same inputs, outputs and arithmetic
 * as the scalar formula in the tail loop below, but eight interior pixels
 * are processed per iteration where the row is wide enough.
 *
 *   src : input image, w*h bytes, row stride == w (read only).
 *   w,h : image width and height in pixels.
 *   Ix  : output, w*h bytes; horizontal gradient (right - left), >> 3.
 *   Iy  : output, w*h bytes; vertical gradient (top - bottom), >> 3.
 *
 * Only interior pixels are written; the one-pixel border of Ix/Iy is left
 * untouched. Results are the low 8 bits of the signed, shifted sum.
 * Images with fewer than 3 rows or 3 columns are a no-op.
 *
 * NOTE(review): needs the NEON intrinsics header (<arm_neon.h>); its include
 * is not visible in this part of the file.
 */
void sobel_neon(BYTE *src,int w,int h,BYTE *Ix,BYTE *Iy)
{
    /* Without this guard, h < 2 made the row counter start negative and the
     * count-down loop never reached zero, walking past the end of the
     * buffers. */
    if( w < 3 || h < 3 )
        return;

    int src_step = w;
    int dst_step = w;
    int x, height;
    BYTE* dstX = Ix+dst_step;   /* first output row is row 1 */
    BYTE* dstY = Iy+dst_step;

    for( height = h - 2; height > 0; height--, src += src_step, dstX += dst_step, dstY += dst_step )
    {
        const BYTE* src2 = src + src_step;
        const BYTE* src3 = src + src_step*2;

        x = 1;
        /* Vector part: the widest load below starts at x+1 and covers 8
         * bytes, i.e. indices x+1 .. x+8, so x+8 must not exceed w-1. */
        while( (x+8) <= w-1 )
        {
            /* Top row: widen to 16 bits; the unsigned wrap-around of the
             * subtraction is the correct two's-complement result once the
             * lanes are reinterpreted as signed. */
            uint8x8_t left  = vld1_u8(src+x-1);
            uint8x8_t mid   = vld1_u8(src+x);
            uint8x8_t right = vld1_u8(src+x+1);

            int16x8_t t0 = vreinterpretq_s16_u16( vsubl_u8(right,left) );
            int16x8_t t1 = vaddq_s16( vreinterpretq_s16_u16( vaddl_u8(left,right) ),
                                      vreinterpretq_s16_u16( vshll_n_u8(mid,1) ) );

            /* Middle row: contributes only to the horizontal gradient,
             * with weight 2. */
            left  = vld1_u8(src2+x-1);
            right = vld1_u8(src2+x+1);
            int16x8_t temp = vreinterpretq_s16_u16( vsubl_u8(right,left) );
            t0 = vaddq_s16(t0,vshlq_n_s16(temp,1));

            /* Bottom row: added to the horizontal gradient, subtracted
             * from the vertical one. */
            left  = vld1_u8(src3+x-1);
            mid   = vld1_u8(src3+x);
            right = vld1_u8(src3+x+1);
            t0 = vaddq_s16(t0,vreinterpretq_s16_u16( vsubl_u8(right,left) ));
            temp = vaddq_s16( vreinterpretq_s16_u16( vaddl_u8(left,right) ),
                              vreinterpretq_s16_u16( vshll_n_u8(mid,1) ) );
            t1 = vsubq_s16(t1,temp);

            /* Shift right by 3 and narrow to 8 bits, then store 8 results. */
            vst1_s8((int8_t*)dstX+x,vshrn_n_s16(t0,3));
            vst1_s8((int8_t*)dstY+x,vshrn_n_s16(t1,3));
            x += 8;
        }

        /* Scalar tail for the remaining (< 8) interior pixels of the row. */
        while( x < w-1 )
        {
            /* Multiply by 2 instead of `<< 1`: the column difference can be
             * negative, and left-shifting a negative value is undefined. */
            int t0 = (src[x+1] - src[x-1])
                   + 2*(src2[x+1] - src2[x-1])
                   + (src3[x+1] - src3[x-1]);
            int t1 = (src[x-1] + 2*src[x] + src[x+1])
                   - (src3[x-1] + 2*src3[x] + src3[x+1]);

            /* Relies on >> of a negative int being an arithmetic shift
             * (implementation-defined in C, true on mainstream compilers). */
            dstX[x] = (BYTE)(t0>>3);
            dstY[x] = (BYTE)(t1>>3);
            x++;
        }
    }
}
View Code

 

posted @ 2015-12-11 15:47  mlj318  阅读(1053)  评论(0编辑  收藏  举报