【Neon指令优化】Neon实现快速压缩
需要将大分辨率的二值化图像按每 8 字节压缩成 1 字节的数据(每个源字节对应输出中的 1 位),这里使用 Neon 指令实现快速压缩。
#include <string> #include <vector> #include <unordered_map> #include <arm_neon.h> // 高效的8位打包, 使用Neon uint8_t PackBytes8To1ByNeon(uint8x8_t& bits) { // 使用位移和乘法(避免条件分支) bits = vand_u8(bits, vdup_n_u8(0x01)); // 确保只有最低位有效 // 创建位权重:1, 2, 4, 8, 16, 32, 64, 128 uint8x8_t weights = {1, 2, 4, 8, 16, 32, 64, 128}; // uint8x8_t weights = {128, 64, 32, 16, 8, 4, 2, 1}; // 位与权重相乘 uint8x8_t weighted = vmul_u8(bits, weights); // 水平相加(向量对加并扩展) uint16x4_t sum16 = vpaddl_u8(weighted); // 相邻两个8位元素相加并扩展到16位 uint32x2_t sum32 = vpaddl_u16(sum16); uint64x1_t sum64 = vpaddl_u32(sum32); // 从无符号64位到无符号8位, 提取得到无符号uint8x8_t return vget_lane_u8(vreinterpret_u8_u64(sum64), 0); } int CompressBytes8To1ByNeon(uint8_t* src, int srcLen, uint8_t* dst, int dstLen) { int ret = 0; int dstIndex = 0; int srcIndex = 0; try { // 参数检查 if (src == NULL || dst == NULL || srcLen <= 0 || dstLen <= 0) { return -1; } // 判断dst缓冲区大小 int reqDstLen = (srcLen + 7) / 8; if (dstLen < reqDstLen) { return -1; } for (srcIndex = 0; srcIndex + 8 <= srcLen; srcIndex += 8) { uint8x8_t vData = vld1_u8(src + srcIndex); if (vminv_u8(vData) > 0) { dst[dstIndex] = 255; } else if (vmaxv_u8(vData) != 0) { dst[dstIndex] = PackBytes8To1ByNeon(vData); } dstIndex++; } // 剩余数据(<8) int remaining = srcLen - srcIndex; if (remaining > 0) { for (int i = 0; i < remaining; i ++) { if (src[i] > 0) { dst[dstIndex + i >> 3] |= (1 << (i & 0x07)); } } } } catch(const std::exception& e) { LogError("{} exception, err: {}", __func__, e.what()); ret = -1; } return ret; } int CompressBytes8To1ByNormal(uint8_t* src, int srcLen, uint8_t* dst, int dstLen) { int ret = 0; try { // 参数检查 if (src == NULL || dst == NULL || srcLen <= 0 || dstLen <= 0) { return -1; } // 判断dst缓冲区大小 int reqDstLen = (srcLen + 7) / 8; if (dstLen < reqDstLen) { return -1; } for (int i = 0; i < srcLen; i ++) { if (src[i] > 0) { dst[i >> 3] |= (1 << (i & 0x07)); } } } catch(const std::exception& e) { LogError("{} exception, err: {}", __func__, e.what()); ret = -1; } return ret; } }
#include <future>#include <thread>
namespace SuperVision::Algorithm {

/// @brief Packs eight bytes into one byte, one bit per source byte.
///
/// Lane i of @p bits contributes bit i of the result (lane 0 is the least
/// significant bit). Any non-zero lane counts as a set bit.
///
/// @param bits In/out. On return every lane has been normalised to 0 or 1.
/// @return The packed byte.
uint8_t AlgorUtilsNeon::PackBytes8To1ByNeon(uint8x8_t& bits)
{
    // vtst yields 0xFF for every lane that has any bit set, so masking with 1
    // gives a clean 0/1 per lane regardless of the non-zero value used
    // (1, 255, ...). A plain "& 1" would drop even non-zero values such as 254.
    bits = vand_u8(vtst_u8(bits, bits), vdup_n_u8(0x01));

    // Per-lane bit weights: lane i is worth 2^i. Loaded from memory instead of
    // brace-initialising the vector, which is a compiler extension.
    static const uint8_t kBitWeights[8] = {1, 2, 4, 8, 16, 32, 64, 128};
    const uint8x8_t weights = vld1_u8(kBitWeights);

    // Branch-free selection of the weights whose lane is set.
    const uint8x8_t weighted = vmul_u8(bits, weights);

    // Horizontal sum by pairwise widening adds: 8 x u8 -> 4 x u16 -> 2 x u32 -> 1 x u64.
    // The total is at most 1 + 2 + ... + 128 = 255, so it fits in one byte.
    const uint16x4_t sum16 = vpaddl_u8(weighted);
    const uint32x2_t sum32 = vpaddl_u16(sum16);
    const uint64x1_t sum64 = vpaddl_u32(sum32);

    // NOTE(review): relies on lane 0 of the byte view being the least
    // significant byte of the 64-bit sum (little-endian lane layout) - confirm
    // if a big-endian target is ever supported.
    return vget_lane_u8(vreinterpret_u8_u64(sum64), 0);
}

/// @brief Compresses a byte-per-pixel buffer to a bit-per-pixel buffer using Neon.
///
/// Source byte i maps to bit (i % 8) of dst[i / 8]; a source byte is treated
/// as set when it is non-zero. Every one of the (srcLen + 7) / 8 output bytes
/// is fully overwritten, so @p dst does not need to be cleared beforehand.
///
/// @param src    Input buffer of @p srcLen bytes.
/// @param srcLen Number of input bytes; must be > 0.
/// @param dst    Output buffer.
/// @param dstLen Capacity of @p dst in bytes; must be at least (srcLen + 7) / 8.
/// @return 0 on success, -1 on invalid arguments, a too-small @p dst, or if a
///         std::exception is caught.
int AlgorUtilsNeon::CompressBytes8To1ByNeon(uint8_t* src, int srcLen, uint8_t* dst, int dstLen)
{
    int ret = 0;
    int dstIndex = 0;
    int srcIndex = 0;

    try {
        // Argument validation.
        if (src == nullptr || dst == nullptr || srcLen <= 0 || dstLen <= 0) {
            return -1;
        }

        // Required output size, rounded up; written so that it cannot overflow
        // int even when srcLen is close to INT_MAX.
        const int reqDstLen = srcLen / 8 + ((srcLen % 8) != 0 ? 1 : 0);
        if (dstLen < reqDstLen) {
            return -1;
        }

        // Full groups of eight source bytes. "srcIndex <= srcLen - 8" is the
        // overflow-safe form of "srcIndex + 8 <= srcLen".
        for (srcIndex = 0; srcIndex <= srcLen - 8; srcIndex += 8) {
            const uint8x8_t vData = vld1_u8(src + srcIndex);

            if (vminv_u8(vData) > 0) {
                // Smallest lane is non-zero, so all eight bits are set.
                dst[dstIndex] = 255;
            } else if (vmaxv_u8(vData) != 0) {
                // Mixed group: turn every non-zero lane into 0xFF first so the
                // packed result follows the same "non-zero means set" rule as
                // the shortcut above and the scalar tail below.
                uint8x8_t vMask = vtst_u8(vData, vData);
                dst[dstIndex] = PackBytes8To1ByNeon(vMask);
            } else {
                // All-zero group: write the zero explicitly instead of relying
                // on the caller having cleared dst.
                dst[dstIndex] = 0;
            }
            dstIndex++;
        }

        // Remaining tail of fewer than eight source bytes.
        const int remaining = srcLen - srcIndex;
        if (remaining > 0) {
            uint8_t tail = 0;
            for (int i = 0; i < remaining; i++) {
                // The tail starts at srcIndex, not at the start of src.
                if (src[srcIndex + i] > 0) {
                    tail = static_cast<uint8_t>(tail | (1u << i));
                }
            }
            // All tail bits belong to the single output byte at dstIndex.
            dst[dstIndex] = tail;
        }
    } catch (const std::exception& e) {
        LogError("{} exception, err: {}", __func__, e.what());
        ret = -1;
    }

    return ret;
}

/// @brief Scalar reference implementation of the 8-bytes-to-1-byte compression.
///
/// Source byte i maps to bit (i % 8) of dst[i / 8]; a source byte is treated
/// as set when it is non-zero. Every one of the (srcLen + 7) / 8 output bytes
/// is fully overwritten, so @p dst does not need to be cleared beforehand.
///
/// @param src    Input buffer of @p srcLen bytes.
/// @param srcLen Number of input bytes; must be > 0.
/// @param dst    Output buffer.
/// @param dstLen Capacity of @p dst in bytes; must be at least (srcLen + 7) / 8.
/// @return 0 on success, -1 on invalid arguments, a too-small @p dst, or if a
///         std::exception is caught.
int AlgorUtilsNeon::CompressBytes8To1ByNormal(uint8_t* src, int srcLen, uint8_t* dst, int dstLen)
{
    int ret = 0;

    try {
        // Argument validation.
        if (src == nullptr || dst == nullptr || srcLen <= 0 || dstLen <= 0) {
            return -1;
        }

        // Required output size, rounded up without risking int overflow.
        const int reqDstLen = srcLen / 8 + ((srcLen % 8) != 0 ? 1 : 0);
        if (dstLen < reqDstLen) {
            return -1;
        }

        // Build each output byte in a local and store it once, so stale
        // contents of dst never leak into the result.
        uint8_t packed = 0;
        for (int i = 0; i < srcLen; i++) {
            const int bit = i & 0x07;
            if (src[i] > 0) {
                packed = static_cast<uint8_t>(packed | (1u << bit));
            }
            // Flush at the end of each group of eight and at the end of input.
            if (bit == 7 || i == srcLen - 1) {
                dst[i >> 3] = packed;
                packed = 0;
            }
        }
    } catch (const std::exception& e) {
        LogError("{} exception, err: {}", __func__, e.what());
        ret = -1;
    }

    return ret;
}

}  // namespace SuperVision::Algorithm

浙公网安备 33010602011771号