【Neon指令优化】Neon实现快速压缩

需要将大分辨率的二值化图像按每8个字节压缩成1个字节（每个字节对应1个bit），这里使用Neon指令实现快速压缩。

#include <string>
#include <vector>
#include <unordered_map>
#include <arm_neon.h>


    // Packs eight byte-wide flags into a single byte using NEON.
    //
    // Lane i of `bits` supplies bit i of the result (lane 0 is the least
    // significant bit). A lane counts as "set" when it holds ANY non-zero
    // value, so 1, 2, 128 and 255 are all treated the same way. Testing only
    // the lowest bit would silently drop even values such as 2 or 254.
    //
    // `bits` is an in/out argument: on return every lane has been normalised
    // to 0 (was zero) or 1 (was non-zero).
    //
    // Returns the packed byte.
    uint8_t PackBytes8To1ByNeon(uint8x8_t& bits)
    {
        // vtst yields 0xFF in each lane where (bits & bits) != 0, else 0x00.
        const uint8x8_t nonZeroMask = vtst_u8(bits, bits);

        // Write the normalised 0/1 flags back through the reference.
        bits = vand_u8(nonZeroMask, vdup_n_u8(0x01));

        // Per-lane bit weights, loaded from memory rather than brace-initialised
        // because vector brace-initialisation is a compiler extension.
        static const uint8_t kBitWeights[8] = {1, 2, 4, 8, 16, 32, 64, 128};
        const uint8x8_t weighted = vand_u8(nonZeroMask, vld1_u8(kBitWeights));

        // Horizontal add via widening pairwise adds: 8x8 -> 4x16 -> 2x32 -> 1x64.
        // The total is at most 255, so it fits in the low byte.
        const uint16x4_t sum16 = vpaddl_u8(weighted);
        const uint32x2_t sum32 = vpaddl_u16(sum16);
        const uint64x1_t sum64 = vpaddl_u32(sum32);

        // Read the 64-bit lane directly; this does not depend on byte order.
        return static_cast<uint8_t>(vget_lane_u64(sum64, 0));
    }

    // Compresses a byte-per-pixel binary image into a bit-per-pixel buffer
    // using NEON.
    //
    // Source byte k maps to bit (k % 8) of dst[k / 8]; a source byte counts as
    // set when it is non-zero. Every output byte in [0, ceil(srcLen / 8)) is
    // fully overwritten, so `dst` does not need to be zeroed beforehand.
    //
    // Parameters:
    //   src    - input bytes, srcLen elements
    //   srcLen - number of input bytes, must be > 0
    //   dst    - output buffer
    //   dstLen - capacity of dst in bytes, must be >= ceil(srcLen / 8)
    //
    // Returns 0 on success, -1 on a null pointer, a non-positive length, or an
    // output buffer that is too small.
    //
    // No try/catch is used: nothing in this function throws a C++ exception.
    int CompressBytes8To1ByNeon(uint8_t* src, int srcLen, uint8_t* dst, int dstLen)
    {
        if (src == nullptr || dst == nullptr || srcLen <= 0 || dstLen <= 0) {
            return -1;
        }

        // ceil(srcLen / 8), computed without `srcLen + 7`, which could overflow.
        const int fullGroups = srcLen / 8;
        const int tailCount = srcLen % 8;
        const int reqDstLen = fullGroups + (tailCount != 0 ? 1 : 0);
        if (dstLen < reqDstLen) {
            return -1;
        }

        for (int group = 0; group < fullGroups; ++group) {
            uint8x8_t vData = vld1_u8(src + group * 8);
            if (vminv_u8(vData) > 0) {
                // Fast path: all eight bytes are non-zero.
                dst[group] = 0xFF;
            } else if (vmaxv_u8(vData) == 0) {
                // Fast path: all eight bytes are zero. Written explicitly so
                // the result never depends on stale contents of dst.
                dst[group] = 0;
            } else {
                // Normalise to 0xFF / 0x00 first so that any non-zero byte
                // counts as set, matching the fast paths and the tail loop.
                vData = vtst_u8(vData, vData);
                dst[group] = PackBytes8To1ByNeon(vData);
            }
        }

        // Fewer than eight trailing bytes: pack them into one more output byte.
        if (tailCount > 0) {
            const uint8_t* tail = src + fullGroups * 8;
            uint8_t packed = 0;
            for (int bit = 0; bit < tailCount; ++bit) {
                if (tail[bit] != 0) {
                    packed |= static_cast<uint8_t>(1u << bit);
                }
            }
            dst[fullGroups] = packed;
        }

        return 0;
    }

    // Scalar reference implementation: compresses a byte-per-pixel binary
    // image into a bit-per-pixel buffer.
    //
    // Source byte k maps to bit (k % 8) of dst[k / 8]; a source byte counts as
    // set when it is non-zero. Every output byte in [0, ceil(srcLen / 8)) is
    // fully overwritten, so `dst` does not need to be zeroed beforehand.
    //
    // Parameters:
    //   src    - input bytes, srcLen elements
    //   srcLen - number of input bytes, must be > 0
    //   dst    - output buffer
    //   dstLen - capacity of dst in bytes, must be >= ceil(srcLen / 8)
    //
    // Returns 0 on success, -1 on a null pointer, a non-positive length, or an
    // output buffer that is too small.
    //
    // No try/catch is used: nothing in this function throws a C++ exception.
    int CompressBytes8To1ByNormal(uint8_t* src, int srcLen, uint8_t* dst, int dstLen)
    {
        if (src == nullptr || dst == nullptr || srcLen <= 0 || dstLen <= 0) {
            return -1;
        }

        // ceil(srcLen / 8), computed without `srcLen + 7`, which could overflow.
        const int reqDstLen = srcLen / 8 + (srcLen % 8 != 0 ? 1 : 0);
        if (dstLen < reqDstLen) {
            return -1;
        }

        for (int outIdx = 0; outIdx < reqDstLen; ++outIdx) {
            const int base = outIdx * 8;
            const int left = srcLen - base;
            const int count = left < 8 ? left : 8;  // the last group may be short

            // Build the byte locally and assign it, rather than OR-ing into
            // whatever dst already holds.
            uint8_t packed = 0;
            for (int bit = 0; bit < count; ++bit) {
                if (src[base + bit] != 0) {
                    packed |= static_cast<uint8_t>(1u << bit);
                }
            }
            dst[outIdx] = packed;
        }

        return 0;
    }
}

#include <future>
#include <thread>
namespace SuperVision::Algorithm{

// Packs eight byte-wide flags into a single byte using NEON.
//
// Lane i of `bits` supplies bit i of the result (lane 0 is the least
// significant bit). A lane counts as "set" when it holds ANY non-zero value.
// Testing only the lowest bit would silently drop even values such as 2 or 254.
//
// `bits` is an in/out argument: on return every lane has been normalised to
// 0 (was zero) or 1 (was non-zero).
//
// Returns the packed byte.
uint8_t AlgorUtilsNeon::PackBytes8To1ByNeon(uint8x8_t& bits)
{
    // vtst yields 0xFF in each lane where (bits & bits) != 0, else 0x00.
    const uint8x8_t nonZeroMask = vtst_u8(bits, bits);

    // Write the normalised 0/1 flags back through the reference.
    bits = vand_u8(nonZeroMask, vdup_n_u8(0x01));

    // Per-lane bit weights, loaded from memory rather than brace-initialised
    // because vector brace-initialisation is a compiler extension.
    static const uint8_t kBitWeights[8] = {1, 2, 4, 8, 16, 32, 64, 128};
    const uint8x8_t weighted = vand_u8(nonZeroMask, vld1_u8(kBitWeights));

    // Horizontal add via widening pairwise adds: 8x8 -> 4x16 -> 2x32 -> 1x64.
    // The total is at most 255, so it fits in the low byte.
    const uint16x4_t sum16 = vpaddl_u8(weighted);
    const uint32x2_t sum32 = vpaddl_u16(sum16);
    const uint64x1_t sum64 = vpaddl_u32(sum32);

    // Read the 64-bit lane directly; this does not depend on byte order.
    return static_cast<uint8_t>(vget_lane_u64(sum64, 0));
}

// Compresses a byte-per-pixel binary image into a bit-per-pixel buffer using
// NEON.
//
// Source byte k maps to bit (k % 8) of dst[k / 8]; a source byte counts as set
// when it is non-zero. Every output byte in [0, ceil(srcLen / 8)) is fully
// overwritten, so `dst` does not need to be zeroed beforehand.
//
// Returns 0 on success, -1 on a null pointer, a non-positive length, or an
// output buffer smaller than ceil(srcLen / 8) bytes.
//
// No try/catch is used: nothing in this function throws a C++ exception.
int AlgorUtilsNeon::CompressBytes8To1ByNeon(uint8_t* src, int srcLen, uint8_t* dst, int dstLen)
{
    if (src == nullptr || dst == nullptr || srcLen <= 0 || dstLen <= 0) {
        return -1;
    }

    // ceil(srcLen / 8), computed without `srcLen + 7`, which could overflow.
    const int fullGroups = srcLen / 8;
    const int tailCount = srcLen % 8;
    const int reqDstLen = fullGroups + (tailCount != 0 ? 1 : 0);
    if (dstLen < reqDstLen) {
        return -1;
    }

    for (int group = 0; group < fullGroups; ++group) {
        uint8x8_t vData = vld1_u8(src + group * 8);
        if (vminv_u8(vData) > 0) {
            // Fast path: all eight bytes are non-zero.
            dst[group] = 0xFF;
        } else if (vmaxv_u8(vData) == 0) {
            // Fast path: all eight bytes are zero. Written explicitly so the
            // result never depends on stale contents of dst.
            dst[group] = 0;
        } else {
            // Normalise to 0xFF / 0x00 first so that any non-zero byte counts
            // as set, matching the fast paths and the tail loop.
            vData = vtst_u8(vData, vData);
            dst[group] = PackBytes8To1ByNeon(vData);
        }
    }

    // Fewer than eight trailing bytes: pack them into one more output byte.
    // The tail is read from src + fullGroups * 8, not from the start of src.
    if (tailCount > 0) {
        const uint8_t* tail = src + fullGroups * 8;
        uint8_t packed = 0;
        for (int bit = 0; bit < tailCount; ++bit) {
            if (tail[bit] != 0) {
                packed |= static_cast<uint8_t>(1u << bit);
            }
        }
        dst[fullGroups] = packed;
    }

    return 0;
}

// Scalar reference implementation with the same contract as
// CompressBytes8To1ByNeon: source byte k maps to bit (k % 8) of dst[k / 8],
// non-zero means set, and every output byte is fully overwritten.
//
// Returns 0 on success, -1 on a null pointer, a non-positive length, or an
// output buffer smaller than ceil(srcLen / 8) bytes.
int AlgorUtilsNeon::CompressBytes8To1ByNormal(uint8_t* src, int srcLen, uint8_t* dst, int dstLen)
{
    if (src == nullptr || dst == nullptr || srcLen <= 0 || dstLen <= 0) {
        return -1;
    }

    // ceil(srcLen / 8), computed without `srcLen + 7`, which could overflow.
    const int reqDstLen = srcLen / 8 + (srcLen % 8 != 0 ? 1 : 0);
    if (dstLen < reqDstLen) {
        return -1;
    }

    for (int outIdx = 0; outIdx < reqDstLen; ++outIdx) {
        const int base = outIdx * 8;
        const int left = srcLen - base;
        const int count = left < 8 ? left : 8;  // the last group may be short

        // Build the byte locally and assign it, rather than OR-ing into
        // whatever dst already holds.
        uint8_t packed = 0;
        for (int bit = 0; bit < count; ++bit) {
            if (src[base + bit] != 0) {
                packed |= static_cast<uint8_t>(1u << bit);
            }
        }
        dst[outIdx] = packed;
    }

    return 0;
}

}  // namespace SuperVision::Algorithm

posted @ 2025-12-17 10:39 我自逍遥笑阅读(3) 评论(0) 收藏举报

刷新页面返回顶部

我自逍遥笑

言宜慢、心宜善！

【Neon指令优化】Neon实现快速压缩

公告