高性能计算-NEON-图像旋转(18)

1. 目标:使用 NEON intrinsic 函数,对512*512 png 四通道图像顺时针旋转90度。

思路:将图像按 4×4 像素分块,对每个块内的像素做转置,再对结果做水平镜像(转置 + 水平镜像 = 顺时针旋转 90 度)。图像读写使用 stb_image / stb_image_write 库。

2. 代码

#include <stdio.h>
#include <arm_neon.h>

#include <stdlib.h>
#define STB_IMAGE_IMPLEMENTATION
#include "./stb/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "./stb/stb_image_write.h"

// #define DEBUG

/*
 * Scalar fallback: copy every pixel of the source rectangle
 * rows [y0, y1) x columns [x0, x1) to its position in the image rotated
 * 90 degrees clockwise. Source pixel (y, x) lands at destination row x,
 * column h-1-y; the destination image is h pixels wide.
 */
static void rotate_region_scalar(const uint8_t *src, uint8_t *dst,
                                 int w, int h, int c,
                                 int y0, int y1, int x0, int x1)
{
    for(int y=y0;y<y1;y++)
    {
        for(int x=x0;x<x1;x++)
        {
            const uint8_t *in = src+((size_t)y*w+x)*c;
            uint8_t *out = dst+((size_t)x*h+(h-1-y))*c;
            for(int k=0;k<c;k++)
                out[k] = in[k];
        }
    }
}

/*
 * Rotate ./pic.png by 90 degrees clockwise and write the result to pic1.png.
 *
 * The image is processed in 4x4 pixel tiles: each tile is transposed in NEON
 * registers, its rows are reversed (horizontal mirror), and the rows are
 * stored at the mirrored tile position of the destination. Pixels that do not
 * fit in a full tile (width/height not a multiple of 4) go through the scalar
 * fallback. With DEBUG defined, a synthetic 8x8 image is rotated and both the
 * input and the output are printed instead of touching any file.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if loading, allocating or
 * writing fails.
 */
int main()
{
    int w,h,c;
    int rc = EXIT_FAILURE;
    uint8_t *dst = NULL;
#ifdef DEBUG
    w=h=8;c=4;
    uint8_t* src = (uint8_t*)calloc((size_t)w*h*c,1);
    if(!src)
    {
        printf("alloc src failed.\n");
        return EXIT_FAILURE;
    }
    // Fill every row with 0,1,2,... so the rotation is easy to check by eye.
    for(int i=0;i<h;i++)
    {
        for(int j=0;j<w*c;j++)
            src[i*w*c+j] = j;
    }
    // Dump the source image, one pixel (4 bytes) per column.
    for(int i=0;i<h;i++)
    {
        for(int j=0;j<w*c;j+=4)
            printf("%u%u%u%u ",*(src+i*w*c+j),*(src+i*w*c+j+1),*(src+i*w*c+j+2),*(src+i*w*c+j+3));
        printf("\n");
    }
    printf("======\n");
#else
    // Request 4 channels: the NEON path treats every pixel as one 32-bit lane.
    uint8_t *src = stbi_load("./pic.png",&w,&h,&c,4);
    if(!src)
    {
        printf("load img failed.\n");
        return EXIT_FAILURE;
    }
    printf("int w %d h %d c %d\n",w,h,c);//512 512 4
    // stbi_load reports the file's channel count in c; the buffer itself holds 4.
    c = 4;
#endif

    // 4 pixels * 4 bytes = 16 bytes = one 128-bit NEON register per tile row.
    const int blockSize = 4;
    // Largest extents that are covered by complete 4x4 tiles.
    const int hBlocks = h - h%blockSize;
    const int wBlocks = w - w%blockSize;

    dst = (uint8_t*)calloc((size_t)w*h,(size_t)c);
    if(!dst)
    {
        printf("alloc dst failed.\n");
        goto cleanup;
    }

    for(int i=0;i<hBlocks;i+=blockSize)
    {
        for(int j=0;j<wBlocks;j+=blockSize)
        {
            uint32x4x4_t block;
            uint32x4x2_t blockTemp;
            // Mapping: src[i+m][j] -> transpose -> [j][i+m] -> mirror -> [j][h-(i+m)-1]
            // Load the four rows of the tile, one pixel per 32-bit lane.
            for(int m=0;m<blockSize;m++)
                block.val[m] = vreinterpretq_u32_u8(vld1q_u8(src+((size_t)(i+m)*w+j)*c));
            // Step 1: transpose the 2x2 sub-tiles of 32-bit lanes.
            blockTemp = vtrnq_u32(block.val[0],block.val[1]);
            block.val[0] = blockTemp.val[0];
            block.val[1] = blockTemp.val[1];
            blockTemp = vtrnq_u32(block.val[2],block.val[3]);
            block.val[2] = blockTemp.val[0];
            block.val[3] = blockTemp.val[1];
            // Step 2: swap the 64-bit halves between rows 0/2 and 1/3.
            // vtrn1q_u64 / vtrn2q_u64 are AArch64 intrinsics.
            blockTemp.val[0] = vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(block.val[0]),vreinterpretq_u64_u32(block.val[2])));
            blockTemp.val[1] = vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(block.val[0]),vreinterpretq_u64_u32(block.val[2])));
            block.val[0] = blockTemp.val[0];
            block.val[2] = blockTemp.val[1];

            blockTemp.val[0] = vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(block.val[1]),vreinterpretq_u64_u32(block.val[3])));
            blockTemp.val[1] = vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(block.val[1]),vreinterpretq_u64_u32(block.val[3])));
            block.val[1] = blockTemp.val[0];
            block.val[3] = blockTemp.val[1];

            for(int m=0;m<blockSize;m++)
            {
                // Reverse the four pixels of the row: swap within each 64-bit
                // half, then swap the two halves.
                block.val[m] = vrev64q_u32(block.val[m]);
                block.val[m] = vcombine_u32(vget_high_u32(block.val[m]),vget_low_u32(block.val[m]));
                // Store at destination row j+m, starting at column h-i-blockSize.
                vst1q_u8(dst+((size_t)(j+m)*h+(h-i-blockSize))*c,vreinterpretq_u8_u32(block.val[m]));
            }
        }
    }

    // Pixels outside the full tiles: leftover bottom rows, then leftover
    // right-hand columns of the tiled rows. Both are no-ops for sizes that
    // are multiples of 4.
    rotate_region_scalar(src,dst,w,h,c,hBlocks,h,0,w);
    rotate_region_scalar(src,dst,w,h,c,0,hBlocks,wBlocks,w);

#ifdef DEBUG
    // Dump the rotated image: w rows of h pixels.
    for(int i=0;i<w;i++)
    {
        for(int j=0;j<h*c;j+=4)
            printf("%u%u%u%u ",*(dst+i*h*c+j),*(dst+i*h*c+j+1),*(dst+i*h*c+j+2),*(dst+i*h*c+j+3));
        printf("\n");
    }
    rc = EXIT_SUCCESS;
#else
    // The rotated image is h pixels wide and w pixels tall.
    if(!stbi_write_png("pic1.png",h,w,c,dst,h*c))
        printf("write img failed.\n");
    else
        rc = EXIT_SUCCESS;
#endif

cleanup:
#ifdef DEBUG
    free(src);
#else
    stbi_image_free(src);
#endif
    free(dst);
    return rc;
}

3. 测试结果

原图

image

旋转后图像

image

posted @ 2024-12-04 22:05  安洛8  阅读(109)  评论(0)    收藏  举报