len3d

  博客园  :: 首页  :: 新随笔  :: 联系  :: 订阅  :: 管理

Here is the code:

#include <stdio.h>
#include <xmmintrin.h>
#include <windows.h>

// Element type of the images handled in this file: one __m128 value per element.
typedef __m128 Vec;

// Unsigned 64-bit tick count, used for the timestamps returned by now().
typedef unsigned long long value_t;

// Samples the Windows performance counter and returns the raw tick value.
// The result is in counter ticks, not in time units; callers must scale it
// by the counter frequency to obtain a duration.
__forceinline value_t now()
{
    LARGE_INTEGER ticks;
    QueryPerformanceCounter(&ticks);

    const value_t stamp = ticks.QuadPart;
    return stamp;
}

// Straightforward (untiled) transpose of a row-major image.
//
//   dst_img  destination buffer, written as src_w rows of src_h elements
//   src_img  source buffer, read as src_h rows of src_w elements
//   src_w    number of elements per source row
//   src_h    number of source rows
//
// Element (row, col) of the source ends up at (col, row) of the destination.
// The outer loop is annotated for OpenMP; each iteration writes its own
// destination row.
inline void img_transpose(
    Vec *dst_img, 
    Vec *src_img, 
    const int src_w, 
    const int src_h)
{
#pragma omp parallel for
    for (int col = 0; col < src_w; ++col)
    {
        // Destination row `col` is filled from source column `col`.
        Vec *out_row = dst_img + col * src_h;
        const Vec *in_col = src_img + col;

        for (int row = 0; row < src_h; ++row)
        {
            out_row[row] = in_col[row * src_w];
        }
    }
}

// Tiled transpose of a row-major image.
//
//   dst_img  destination buffer, written as src_w rows of src_h elements
//   src_img  source buffer, read as src_h rows of src_w elements
//   src_w    number of elements per source row
//   src_h    number of source rows
//
// The image is walked in square tiles and each tile is transposed on its
// own, so that the strided accesses stay within a small region at a time.
// Tiles on the right/bottom edge are clipped, so src_w and src_h do not have
// to be multiples of the tile size. Non-positive dimensions result in no work.
inline void img_transpose_block(
    Vec *dst_img, 
    Vec *src_img, 
    const int src_w, 
    const int src_h)
{
    // Tile edge length in elements (compile-time constant).
    enum { tile = 8 };

#pragma omp parallel for
    for (int j = 0; j < src_w; j += tile)
    {
        // Clip the tile to the image width. Computed once per column strip
        // instead of once per tile; spelled out as a conditional so the code
        // does not depend on the `min` macro from <windows.h>.
        const int nsize = (j + tile < src_w) ? (j + tile) : src_w;

        for (int i = 0; i < src_h; i += tile)
        {
            // Clip the tile to the image height.
            const int msize = (i + tile < src_h) ? (i + tile) : src_h;

            for (int n = j; n < nsize; ++n)
            {
                for (int m = i; m < msize; ++m)
                {
                    dst_img[n * src_h + m] = src_img[m * src_w + n];
                }
            }
        }
    }
}

// Program entry point.
//
// 1. Benchmark: times 50 round trips (transpose and transpose back) of a
//    1280x720 image with img_transpose and then with img_transpose_block,
//    printing the elapsed milliseconds for each.
// 2. Validation: tags every element of a 1080x1920 image with its (x, y)
//    position, runs img_transpose_block once, and checks that every element
//    landed at the swapped position.
//
// Returns 0 when validation passes and 1 when it fails.
int main(int argc, char *argv[])
{
    //// performance benchmark ////

    const int w = 1280;
    const int h = 720;
    const int rounds = 50;

    // Value-initialise the buffers so the benchmark never reads indeterminate
    // memory and every element is written once before the first timed loop
    // (intended to keep first-use cost out of the first measurement).
    Vec *a = new Vec [w * h]();
    Vec *b = new Vec [w * h]();
    value_t start_time, end_time;


    // Conversion factor from performance-counter ticks to milliseconds.
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    double ms_per_tick = 1000.0 / (double)freq.QuadPart;



    start_time = now();

    for (int t = 0; t < rounds; ++t)
    {
        img_transpose(b, a, w, h);
        img_transpose(a, b, h, w);
    }

    end_time = now();
    printf("img_transpose:          %f ms\n", (double)(end_time - start_time) * ms_per_tick);



    start_time = now();

    for (int t = 0; t < rounds; ++t)
    {
        img_transpose_block(b, a, w, h);
        img_transpose_block(a, b, h, w);
    }

    end_time = now();
    printf("img_transpose_block:   %f ms\n", (double)(end_time - start_time) * ms_per_tick);


    delete [] a;
    delete [] b;


    //// algorithm validation ////
    const int width = 1080;
    const int height = 1920;
    Vec *src_img = new Vec [width * height]();
    Vec *dst_img = new Vec [height * width]();

    // Tag each source element with its own coordinates: lane 0 = x, lane 1 = y.
    for (int j = 0; j < height; ++j)
    {
        for (int i = 0; i < width; ++i)
        {
            src_img[j * width + i].m128_i32[0] = i;
            src_img[j * width + i].m128_i32[1] = j;
        }
    }

    img_transpose_block(dst_img, src_img, width, height);

    // After the transpose, the element at row j / column i of the destination
    // must carry the tag (x = j, y = i). Stop at the first mismatch.
    bool ok = true;

    for (int j = 0; ok && j < width; ++j)
    {
        for (int i = 0; i < height; ++i)
        {
            int pi = dst_img[j * height + i].m128_i32[0];
            int pj = dst_img[j * height + i].m128_i32[1];

            if (pi != j || pj != i)
            {
                printf("Algorithm is wrong!!!\n");
                ok = false;
                break;
            }
        }
    }

    // Release the validation buffers on both the success and failure paths.
    delete [] src_img;
    delete [] dst_img;

    printf("All done\n");


    // Report a validation failure through the exit status as well.
    return ok ? 0 : 1;
}

 

posted on 2017-10-22 21:00  Len3d  阅读(274)  评论(0)  编辑  收藏  举报