NLM去噪
一、环境搭建
1.1 安装依赖
1.1.1 VS Code
1.1.2 安装opencv
参考《Rockchip RK3588 - Mali-G610 GPU驱动》4.6.3.1节。
1.1.3 安装依赖
sudo apt install libopencv-dev opencl-headers ocl-icd-opencl-dev
1.2 CPU实现
1.2.1 NlmCPU.h
// NlmCPU.h
#ifndef NLM_CPU_H
#define NLM_CPU_H
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include <memory>
class NlmCPUProcessor {
public:
// 构造函数
NlmCPUProcessor(int rows = 1600, int cols = 1200, double maxVal = 1023.0);
// 主处理函数 - 改为小写开头
cv::Mat denoiseRaw9Ch(const std::string& path);
// 获取中间结果
cv::Mat getNoisyImage() const { return m_noisyImage; } // 成员变量加m_前缀
cv::Mat getMeanImage() const { return m_meanImage; }
cv::Mat getStructureClean() const { return m_structureClean; }
std::vector<cv::Mat> getStack() const { return m_stack; }
// 获取统计数据
double getSigmaEst() const { return m_sigmaEst; }
double getHFinal() const { return m_hFinal; }
std::vector<double> getTimings() const { return m_timings; }
// 设置参数
void setRows(int rows) { m_rows = rows; }
void setCols(int cols) { m_cols = cols; }
void setMaxVal(double maxVal) { m_maxVal = maxVal; }
void setPatchSize(int patchSize) { m_patchSize = patchSize; }
void setSigmaEstimationPatchSize(int patchSize) { m_sigmaEstPatchSize = patchSize; }
void setHFactor(double hFactor) { m_hFactor = hFactor; }
void setDiffSigma(double diffSigma) { m_diffSigma = diffSigma; }
// 噪声估计函数
double noiseEstimation(const cv::Mat& noisyImage, int patchSize = -1);
// 保存中间结果
void saveIntermediateResults(const std::string& prefix = "cpu") const;
// 打印统计信息
void printStatistics() const;
private:
// 核心处理函数
void readRawData(const std::string& path);
void splitChannels();
void computeMean();
void estimateNoise();
void applyNlmDenoising();
cv::Mat reconstructImage();
// 工具函数
void clearResults();
void updateTiming(double time);
private:
// 输入参数
int m_rows;
int m_cols;
double m_maxVal;
int m_patchSize;
int m_sigmaEstPatchSize;
double m_hFactor;
double m_diffSigma;
// 中间结果
cv::Mat m_rawBuffer;
cv::Mat m_noisyImage;
std::vector<cv::Mat> m_stack;
cv::Mat m_meanImage;
cv::Mat m_meanImage8u;
cv::Mat m_structureClean;
// 统计信息
double m_sigmaEst;
double m_hFinal;
std::vector<double> m_timings; // 各个阶段的时间消耗
// 临时变量
double m_currentTimer;
// 常量
static const int DEFAULT_ROWS = 1600;
static const int DEFAULT_COLS = 1200;
static const double DEFAULT_MAX_VAL;
static const int DEFAULT_PATCH_SIZE = 7;
static const int DEFAULT_SIGMA_PATCH_SIZE = 8;
static const double DEFAULT_H_FACTOR;
static const double DEFAULT_DIFF_SIGMA;
};
#endif // NLM_CPU_H
1.2.2 NlmCPU.cpp
// NlmCPU.cpp
#include "NlmCPU.h"
#include <iostream>
#include <fstream>
#include <iomanip>
#include <chrono>
const double NlmCPUProcessor::DEFAULT_MAX_VAL = 1023.0;
const double NlmCPUProcessor::DEFAULT_H_FACTOR = 0.8;
const double NlmCPUProcessor::DEFAULT_DIFF_SIGMA = 2.5;
// 构造函数
NlmCPUProcessor::NlmCPUProcessor(int rows, int cols, double maxVal)
: m_rows(rows), m_cols(cols), m_maxVal(maxVal),
m_patchSize(DEFAULT_PATCH_SIZE),
m_sigmaEstPatchSize(DEFAULT_SIGMA_PATCH_SIZE),
m_hFactor(DEFAULT_H_FACTOR),
m_diffSigma(DEFAULT_DIFF_SIGMA),
m_sigmaEst(0.0), m_hFinal(0.0), m_currentTimer(0.0) {
m_timings.resize(6, 0.0);
}
// 清空结果
void NlmCPUProcessor::clearResults() {
m_rawBuffer.release();
m_noisyImage.release();
m_stack.clear();
m_meanImage.release();
m_meanImage8u.release();
m_structureClean.release();
m_sigmaEst = 0.0;
m_hFinal = 0.0;
std::fill(m_timings.begin(), m_timings.end(), 0.0);
}
// 更新时间统计
void NlmCPUProcessor::updateTiming(double time) {
m_timings.push_back(time);
}
// 主处理函数
cv::Mat NlmCPUProcessor::denoiseRaw9Ch(const std::string& path) {
clearResults();
double totalTimeStart = (double)cv::getTickCount();
// 1. 读取数据
double stageTimeStart = (double)cv::getTickCount();
readRawData(path);
double readTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[0] = readTime;
// 2. 通道拆分
stageTimeStart = (double)cv::getTickCount();
splitChannels();
double splitTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[1] = splitTime;
// 3. 计算均值
stageTimeStart = (double)cv::getTickCount();
computeMean();
double meanTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[2] = meanTime;
// 4. 噪声估计
stageTimeStart = (double)cv::getTickCount();
estimateNoise();
double noiseTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[3] = noiseTime;
// 5. NLM去噪
stageTimeStart = (double)cv::getTickCount();
applyNlmDenoising();
double nlmTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[4] = nlmTime;
// 总时间
double totalTime = ((double)cv::getTickCount() - totalTimeStart) / cv::getTickFrequency();
m_timings[5] = totalTime;
return m_structureClean;
}
// 读取RAW数据
void NlmCPUProcessor::readRawData(const std::string& path) {
std::ifstream fileStream(path, std::ios::binary);
if (!fileStream) {
throw std::runtime_error("Cannot open file: " + path);
}
// 读取原始数据
std::vector<uint16_t> buffer(m_rows * m_cols);
fileStream.read((char*)buffer.data(), m_rows * m_cols * sizeof(uint16_t));
fileStream.close();
// 存储原始缓冲区(用于调试)
m_rawBuffer = cv::Mat(m_rows * m_cols, 1, CV_16UC1, buffer.data()).clone();
// 修正读取映射逻辑
m_noisyImage = cv::Mat::zeros(m_rows, m_cols, CV_64F);
for (int col = 0; col < m_cols; col++) {
for (int row = 0; row < m_rows; row++) {
m_noisyImage.at<double>(row, col) = (double)buffer[col * m_rows + row];
}
}
}
// 通道拆分
void NlmCPUProcessor::splitChannels() {
int subHeight = m_rows / 3;
int subWidth = m_cols / 3;
m_stack.resize(9);
int index = 0;
for (int rowGroup = 0; rowGroup < 3; rowGroup++) {
for (int colGroup = 0; colGroup < 3; colGroup++) {
cv::Mat subImage(subHeight, subWidth, CV_64F);
for (int i = 0; i < subHeight; i++) {
for (int j = 0; j < subWidth; j++) {
subImage.at<double>(i, j) =
m_noisyImage.at<double>(rowGroup + i * 3, colGroup + j * 3);
}
}
m_stack[index++] = subImage;
}
}
}
// 计算均值
void NlmCPUProcessor::computeMean() {
int subHeight = m_rows / 3;
int subWidth = m_cols / 3;
m_meanImage = cv::Mat::zeros(subHeight, subWidth, CV_64F);
for (int i = 0; i < 9; i++) {
m_meanImage += m_stack[i];
}
m_meanImage /= 9.0;
}
// 噪声估计
void NlmCPUProcessor::estimateNoise() {
m_sigmaEst = noiseEstimation(m_meanImage, m_sigmaEstPatchSize);
m_hFinal = m_hFactor * m_sigmaEst * (m_maxVal / 1023.0);
}
// NLM去噪
void NlmCPUProcessor::applyNlmDenoising() {
double scale = 255.0 / m_maxVal;
m_meanImage.convertTo(m_meanImage8u, CV_8U, scale);
cv::Mat clean8u;
cv::fastNlMeansDenoising(m_meanImage8u, clean8u,
(float)(m_hFinal * scale),
m_patchSize, m_patchSize);
clean8u.convertTo(m_structureClean, CV_64F, 1.0 / scale);
}
// 噪声估计算法
double NlmCPUProcessor::noiseEstimation(const cv::Mat& noisyImage, int patchSize) {
if (patchSize <= 0) {
patchSize = m_sigmaEstPatchSize;
}
int height = noisyImage.rows;
int width = noisyImage.cols;
// 中心区域裁剪
int centerHeight = cvRound(height / 2.0);
int centerWidth = cvRound(width / 2.0);
int cropRadius = std::min(200, cvRound(std::min(height, width) / 4.0));
int rowStart = std::max(0, centerHeight - cropRadius - 1);
int rowEnd = std::min(height - 1, centerHeight + cropRadius - 1);
int colStart = std::max(0, centerWidth - cropRadius - 1);
int colEnd = std::min(width - 1, centerWidth + cropRadius - 1);
cv::Mat croppedImage = noisyImage(cv::Range(rowStart, rowEnd + 1),
cv::Range(colStart, colEnd + 1));
// 模拟 im2col sliding
cv::Mat patches;
for (int col = 0; col <= croppedImage.cols - patchSize; col++) {
for (int row = 0; row <= croppedImage.rows - patchSize; row++) {
cv::Mat patch = croppedImage(cv::Rect(col, row, patchSize, patchSize)).clone();
patches.push_back(patch.reshape(1, 1)); // 展平为行向量
}
}
patches = patches.t(); // [PatchSize*PatchSize, NumPatches]
// 降采样 1:5:end
cv::Mat subSampledPatches;
for (int col = 0; col < patches.cols; col += 5) {
subSampledPatches.push_back(patches.col(col).t());
}
subSampledPatches = subSampledPatches.t();
// 去均值
cv::Mat columnMean;
cv::reduce(subSampledPatches, columnMean, 1, cv::REDUCE_AVG);
for (int col = 0; col < subSampledPatches.cols; col++) {
subSampledPatches.col(col) -= columnMean;
}
// 协方差与特征值
cv::Mat covariance = (subSampledPatches * subSampledPatches.t()) /
(double)(subSampledPatches.cols - 1);
cv::Mat eigenvalues;
cv::eigen(covariance, eigenvalues); // OpenCV eigen 返回降序排列的特征值
// 取最小的 10%
int numEigenvalues = eigenvalues.rows;
int count = std::max(1, cvRound(0.1 * numEigenvalues));
double sumMinEigenvalues = 0;
for (int i = numEigenvalues - count; i < numEigenvalues; i++) {
sumMinEigenvalues += eigenvalues.at<double>(i);
}
return sqrt(std::max(sumMinEigenvalues / count, 0.0));
}
// 重建图像
cv::Mat NlmCPUProcessor::reconstructImage() {
cv::Mat denoisedImage = m_noisyImage.clone();
int subHeight = m_rows / 3;
int subWidth = m_cols / 3;
int index = 0;
for (int rowGroup = 0; rowGroup < 3; rowGroup++) {
for (int colGroup = 0; colGroup < 3; colGroup++) {
cv::Mat diffMap = m_stack[index] - m_meanImage;
cv::Mat cleanDiffMap;
cv::GaussianBlur(diffMap, cleanDiffMap, cv::Size(0, 0), m_diffSigma);
cv::Mat recovered = m_structureClean + cleanDiffMap;
for (int i = 0; i < subHeight; i++) {
for (int j = 0; j < subWidth; j++) {
denoisedImage.at<double>(rowGroup + i * 3, colGroup + j * 3) =
recovered.at<double>(i, j);
}
}
index++;
}
}
cv::threshold(denoisedImage, denoisedImage, m_maxVal, m_maxVal, cv::THRESH_TRUNC);
cv::threshold(denoisedImage, denoisedImage, 0, 0, cv::THRESH_TOZERO);
return denoisedImage;
}
// 保存中间结果
void NlmCPUProcessor::saveIntermediateResults(const std::string& prefix) const {
std::cout << "Saving intermediate results with prefix: " << prefix << std::endl;
if (!m_noisyImage.empty()) {
cv::Mat noisyNormalized;
cv::normalize(m_noisyImage, noisyNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_noisy.png", noisyNormalized);
}
if (!m_meanImage.empty()) {
cv::Mat meanNormalized;
cv::normalize(m_meanImage, meanNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_mean.png", meanNormalized);
}
if (!m_meanImage8u.empty()) {
cv::imwrite("./images/" + prefix + "_mean_8u.png", m_meanImage8u);
}
if (!m_structureClean.empty()) {
cv::Mat structureNormalized;
cv::normalize(m_structureClean, structureNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_structure_clean.png", structureNormalized);
}
// 保存堆栈中的每个通道
for (size_t i = 0; i < m_stack.size(); i++) {
if (!m_stack[i].empty()) {
cv::Mat channelNormalized;
cv::normalize(m_stack[i], channelNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_stack_" + std::to_string(i) + ".png",
channelNormalized);
}
}
// 保存统计数据
std::ofstream statsFile(prefix + "_stats.txt");
if (statsFile.is_open()) {
statsFile << "CPU Processing Statistics\n";
statsFile << "=======================\n\n";
statsFile << "Parameters:\n";
statsFile << " Rows: " << m_rows << "\n";
statsFile << " Cols: " << m_cols << "\n";
statsFile << " MaxVal: " << m_maxVal << "\n";
statsFile << " PatchSize: " << m_patchSize << "\n";
statsFile << " SigmaEstPatchSize: " << m_sigmaEstPatchSize << "\n";
statsFile << " HFactor: " << m_hFactor << "\n";
statsFile << " DiffSigma: " << m_diffSigma << "\n\n";
statsFile << "Results:\n";
statsFile << " SigmaEst: " << m_sigmaEst << "\n";
statsFile << " HFinal: " << m_hFinal << "\n\n";
statsFile << "Timings (seconds):\n";
const char* stageNames[] = {
"Read Data", "Split Channels", "Compute Mean", "Noise Estimation",
"NLM Denoising", "Reconstruction", "Truncation", "Total"
};
for (size_t i = 0; i < std::min(m_timings.size(),
sizeof(stageNames)/sizeof(stageNames[0])); i++) {
statsFile << " " << stageNames[i] << ": "
<< std::fixed << std::setprecision(6) << m_timings[i] << "\n";
}
statsFile.close();
}
}
// 打印统计信息
void NlmCPUProcessor::printStatistics() const {
std::cout << "\n=== CPU Processing Statistics ===" << std::endl;
std::cout << "Parameters:" << std::endl;
std::cout << " Rows: " << m_rows << std::endl;
std::cout << " Cols: " << m_cols << std::endl;
std::cout << " MaxVal: " << m_maxVal << std::endl;
std::cout << " PatchSize: " << m_patchSize << std::endl;
std::cout << " SigmaEstPatchSize: " << m_sigmaEstPatchSize << std::endl;
std::cout << " HFactor: " << m_hFactor << std::endl;
std::cout << " DiffSigma: " << m_diffSigma << std::endl;
std::cout << "\nResults:" << std::endl;
std::cout << " SigmaEst: " << m_sigmaEst << std::endl;
std::cout << " HFinal: " << m_hFinal << std::endl;
std::cout << "\nTimings (seconds):" << std::endl;
const char* stageNames[] = {
"1. Read Data", "2. Split Channels", "3. Compute Mean", "4. Noise Estimation",
"5. NLM Denoising", "6. Total"
};
double sumStageTime = 0;
for (size_t i = 0; i < m_timings.size() - 1; i++) {
std::cout << " " << stageNames[i] << ": "
<< std::fixed << std::setprecision(6) << m_timings[i] << std::endl;
if (i < m_timings.size() - 1) {
sumStageTime += m_timings[i];
}
}
std::cout << " " << stageNames[m_timings.size() - 1] << ": "
<< std::fixed << std::setprecision(6) << m_timings.back() << std::endl;
if (m_timings.size() > 1) {
std::cout << " Sum of stages: " << sumStageTime << std::endl;
}
std::cout << "=====================================================" << std::endl;
}
1.3 GPU实现
1.3.1 NlmGPU.h
// NlmGPU.h
#ifndef NLM_GPU_H
#define NLM_GPU_H
#include <opencv2/opencv.hpp>
#include <opencv2/core/ocl.hpp>
#include <string>
#include <vector>
class NlmGPUProcessor {
public:
// 构造函数
NlmGPUProcessor(int rows = 1600, int cols = 1200, float maxVal = 1023.0f);
// 主处理函数 - 改为小写开头
cv::UMat denoiseRaw9Ch(const std::string& path);
// 获取中间结果 - 方法名修改
cv::UMat getNoisyImageGpu() const { return m_noisyImageGpu; }
cv::UMat getMeanImageGpu() const { return m_meanImageGpu; }
cv::UMat getStructureCleanGpu() const { return m_structureCleanGpu; }
std::vector<cv::UMat> getStackGpu() const { return m_stackGpu; }
// 获取统计数据
float getSigmaEst() const { return m_sigmaEst; }
float getHFinal() const { return m_hFinal; }
bool isOpenCLEnabled() const { return m_openclEnabled; }
// 设置参数
void setRows(int rows) { m_rows = rows; }
void setCols(int cols) { m_cols = cols; }
void setMaxVal(float maxVal) { m_maxVal = maxVal; }
void setPatchSize(int patchSize) { m_patchSize = patchSize; }
void setSigmaEstimationPatchSize(int patchSize) { m_sigmaEstPatchSize = patchSize; }
void setHFactor(double hFactor) { m_hFactor = hFactor; }
void setDiffSigma(double diffSigma) { m_diffSigma = diffSigma; }
// 噪声估计函数
double noiseEstimation(const cv::Mat& noisyImage, int patchSize = -1);
// 保存中间结果
void saveIntermediateResults(const std::string& prefix = "gpu") const;
// 打印统计信息
void printStatistics() const;
private:
// 核心处理函数
void readRawData(const std::string& path);
void splitChannelsGpu(); // 函数名修改
void computeMeanGpu(); // 函数名修改
void estimateNoise();
void applyNlmDenoisingGpu(); // 函数名修改
// 工具函数
void clearResults();
void updateTiming(double time);
private:
// 输入参数
int m_rows;
int m_cols;
float m_maxVal;
int m_patchSize;
int m_sigmaEstPatchSize;
double m_hFactor;
double m_diffSigma;
bool m_openclEnabled;
// GPU中间结果
cv::UMat m_noisyImageGpu;
std::vector<cv::UMat> m_stackGpu;
cv::UMat m_meanImageGpu;
cv::UMat m_meanImage8uGpu;
cv::UMat m_structureCleanGpu;
// 统计信息
double m_sigmaEst;
double m_hFinal;
std::vector<double> m_timings; // 各个阶段的时间消耗
// 临时变量
double m_currentTimer;
// 常量
static const int DEFAULT_ROWS = 1600;
static const int DEFAULT_COLS = 1200;
static const double DEFAULT_MAX_VAL;
static const int DEFAULT_PATCH_SIZE = 7;
static const int DEFAULT_SIGMA_PATCH_SIZE = 8;
static const double DEFAULT_H_FACTOR;
static const double DEFAULT_DIFF_SIGMA;
};
#endif // NLM_GPU_H
1.3.2 NlmGPU.cpp
// NlmGPU.cpp
#include "NlmGPU.h"
#include <fstream>
#include <iostream>
const double NlmGPUProcessor::DEFAULT_MAX_VAL = 1023.0;
const double NlmGPUProcessor::DEFAULT_H_FACTOR = 0.8;
const double NlmGPUProcessor::DEFAULT_DIFF_SIGMA = 2.5;
// 构造函数
NlmGPUProcessor::NlmGPUProcessor(int rows, int cols, float maxVal)
: m_rows(rows), m_cols(cols), m_maxVal(maxVal),
m_patchSize(DEFAULT_PATCH_SIZE),
m_sigmaEstPatchSize(DEFAULT_SIGMA_PATCH_SIZE),
m_hFactor(DEFAULT_H_FACTOR),
m_diffSigma(DEFAULT_DIFF_SIGMA),
m_sigmaEst(0.0f), m_hFinal(0.0f),
m_currentTimer(0.0), m_openclEnabled(false) {
cv::ocl::setUseOpenCL(true);
m_openclEnabled = cv::ocl::useOpenCL();
m_timings.resize(6, 0.0);
}
// 清空结果
void NlmGPUProcessor::clearResults() {
m_noisyImageGpu.release();
m_stackGpu.clear();
m_meanImageGpu.release();
m_meanImage8uGpu.release();
m_structureCleanGpu.release();
m_sigmaEst = 0.0;
m_hFinal = 0.0;
std::fill(m_timings.begin(), m_timings.end(), 0.0);
}
// 更新时间统计
void NlmGPUProcessor::updateTiming(double time) {
m_timings.push_back(time);
}
// 主处理函数
cv::UMat NlmGPUProcessor::denoiseRaw9Ch(const std::string& path) {
clearResults();
double totalTimeStart = (double)cv::getTickCount();
if (!m_openclEnabled) {
std::cerr << "Error: GPU version returned empty result" << std::endl;
return cv::UMat();
}
// 1. 读取数据
double stageTimeStart = (double)cv::getTickCount();
readRawData(path);
double readTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[0] = readTime;
// 2. 通道拆分
stageTimeStart = (double)cv::getTickCount();
splitChannelsGpu();
double splitTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[1] = splitTime;
// 3. 计算均值
stageTimeStart = (double)cv::getTickCount();
computeMeanGpu();
double meanTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[2] = meanTime;
// 4. 噪声估计
stageTimeStart = (double)cv::getTickCount();
estimateNoise();
double noiseTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[3] = noiseTime;
// 5. NLM去噪
stageTimeStart = (double)cv::getTickCount();
applyNlmDenoisingGpu();
double nlmTime = ((double)cv::getTickCount() - stageTimeStart) / cv::getTickFrequency();
m_timings[4] = nlmTime;
// 总时间
double totalTime = ((double)cv::getTickCount() - totalTimeStart) / cv::getTickFrequency();
m_timings[5] = totalTime;
return m_structureCleanGpu;
}
// 读取RAW数据
void NlmGPUProcessor::readRawData(const std::string& path) {
std::ifstream fileStream(path, std::ios::binary);
if (!fileStream) {
throw std::runtime_error("Cannot open file: " + path);
}
std::vector<uint16_t> buffer(m_rows * m_cols);
fileStream.read(reinterpret_cast<char*>(buffer.data()),
m_rows * m_cols * sizeof(uint16_t));
fileStream.close();
cv::Mat tempMat(m_cols, m_rows, CV_16UC1, buffer.data());
cv::UMat tempGpu, transposedGpu;
tempMat.copyTo(tempGpu);
cv::transpose(tempGpu, transposedGpu);
transposedGpu.convertTo(m_noisyImageGpu, CV_32F);
}
// 通道拆分
void NlmGPUProcessor::splitChannelsGpu()
{
CV_Assert(m_noisyImageGpu.type() == CV_32F);
const int subHeight = m_rows / 3;
const int subWidth = m_cols / 3;
// 创建输出
m_stackGpu.resize(9);
for (int i = 0; i < 9; ++i) {
m_stackGpu[i].create(subHeight, subWidth, CV_32F);
}
m_meanImageGpu.create(subHeight, subWidth, CV_32F);
// OpenCL kernel
static const char* kernelSource = R"(
#define WG_X 8
#define WG_Y 8
__kernel void split_and_mean_tile(
__global const float* src,
int src_step,
__global float* dst0,
__global float* dst1,
__global float* dst2,
__global float* dst3,
__global float* dst4,
__global float* dst5,
__global float* dst6,
__global float* dst7,
__global float* dst8,
int dst_step,
__global float* mean,
int mean_step,
int rows,
int cols
)
{
const int localX = get_local_id(1);
const int localY = get_local_id(0);
const int groupX = get_group_id(1) * WG_X;
const int groupY = get_group_id(0) * WG_Y;
__local float tile[WG_Y*3][WG_X*3];
// 加载 tile
for (int dy = localY; dy < WG_Y*3; dy += WG_Y) {
int iy = groupY*3 + dy;
if (iy < rows) {
__global const float* srcRow =
(__global const float*)((__global const char*)src + iy * src_step);
for (int dx = localX; dx < WG_X*3; dx += WG_X) {
int ix = groupX*3 + dx;
if (ix < cols)
tile[dy][dx] = srcRow[ix];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// 存储结果
int y = groupY + localY;
int x = groupX + localX;
if (localY < WG_Y && localX < WG_X &&
(y*3 + 2) < rows && (x*3 + 2) < cols)
{
float v0 = tile[localY*3+0][localX*3+0];
float v1 = tile[localY*3+0][localX*3+1];
float v2 = tile[localY*3+0][localX*3+2];
float v3 = tile[localY*3+1][localX*3+0];
float v4 = tile[localY*3+1][localX*3+1];
float v5 = tile[localY*3+1][localX*3+2];
float v6 = tile[localY*3+2][localX*3+0];
float v7 = tile[localY*3+2][localX*3+1];
float v8 = tile[localY*3+2][localX*3+2];
__global float* d0 = (__global float*)((__global char*)dst0 + y * dst_step);
__global float* d1 = (__global float*)((__global char*)dst1 + y * dst_step);
__global float* d2 = (__global float*)((__global char*)dst2 + y * dst_step);
__global float* d3 = (__global float*)((__global char*)dst3 + y * dst_step);
__global float* d4 = (__global float*)((__global char*)dst4 + y * dst_step);
__global float* d5 = (__global float*)((__global char*)dst5 + y * dst_step);
__global float* d6 = (__global float*)((__global char*)dst6 + y * dst_step);
__global float* d7 = (__global float*)((__global char*)dst7 + y * dst_step);
__global float* d8 = (__global float*)((__global char*)dst8 + y * dst_step);
d0[x] = v0; d1[x] = v1; d2[x] = v2;
d3[x] = v3; d4[x] = v4; d5[x] = v5;
d6[x] = v6; d7[x] = v7; d8[x] = v8;
__global float* m =
(__global float*)((__global char*)mean + y * mean_step);
m[x] = (v0+v1+v2+v3+v4+v5+v6+v7+v8) * (1.0f/9.0f);
}
}
)";
// 构建程序
cv::ocl::ProgramSource programSource(kernelSource);
cv::String buildError;
cv::ocl::Program program =
cv::ocl::Context::getDefault().getProg(programSource, "", buildError);
CV_Assert(program.ptr());
cv::ocl::Kernel kernel("split_and_mean_tile", program);
kernel.args(
cv::ocl::KernelArg::PtrReadOnly(m_noisyImageGpu),
(int)m_noisyImageGpu.step,
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[0]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[1]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[2]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[3]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[4]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[5]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[6]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[7]),
cv::ocl::KernelArg::PtrWriteOnly(m_stackGpu[8]),
(int)m_stackGpu[0].step,
cv::ocl::KernelArg::PtrWriteOnly(m_meanImageGpu),
(int)m_meanImageGpu.step,
m_rows,
m_cols
);
// 启动 kernel
size_t localSize[2] = { 8, 8 };
size_t globalSize[2] = {
(size_t)((subHeight + 7) / 8) * 8,
(size_t)((subWidth + 7) / 8) * 8
};
kernel.run(2, globalSize, localSize, false);
}
// 计算均值
void NlmGPUProcessor::computeMeanGpu() {
cv::UMat sum;
m_stackGpu[0].copyTo(sum);
for (int i = 1; i < 9; i++) {
cv::add(sum, m_stackGpu[i], sum);
}
cv::multiply(sum, 1.0f/9.0f, m_meanImageGpu);
}
// 噪声估计
void NlmGPUProcessor::estimateNoise() {
cv::Mat meanCpu;
m_meanImageGpu.copyTo(meanCpu);
m_sigmaEst = noiseEstimation(meanCpu, m_sigmaEstPatchSize);
m_hFinal = m_hFactor * m_sigmaEst * (m_maxVal / 1023.0f);
}
// NLM去噪
void NlmGPUProcessor::applyNlmDenoisingGpu() {
float scale = 255.0f / m_maxVal;
m_meanImageGpu.convertTo(m_meanImage8uGpu, CV_8U, scale);
cv::UMat clean8uGpu;
cv::fastNlMeansDenoising(m_meanImage8uGpu, clean8uGpu,
(float)(m_hFinal * scale),
m_patchSize, m_patchSize);
clean8uGpu.convertTo(m_structureCleanGpu, CV_32F, 1.0f / scale);
}
// 噪声估计算法
double NlmGPUProcessor::noiseEstimation(const cv::Mat& noisyImage, int patchSize) {
if (patchSize <= 0) {
patchSize = m_sigmaEstPatchSize;
}
int height = noisyImage.rows;
int width = noisyImage.cols;
// 中心区域裁剪
int centerHeight = cvRound(height / 2.0);
int centerWidth = cvRound(width / 2.0);
int cropRadius = std::min(200, cvRound(std::min(height, width) / 4.0));
int rowStart = std::max(0, centerHeight - cropRadius - 1);
int rowEnd = std::min(height - 1, centerHeight + cropRadius - 1);
int colStart = std::max(0, centerWidth - cropRadius - 1);
int colEnd = std::min(width - 1, centerWidth + cropRadius - 1);
cv::Mat croppedImage = noisyImage(cv::Range(rowStart, rowEnd + 1),
cv::Range(colStart, colEnd + 1));
// 模拟 im2col sliding
cv::Mat patches;
for (int col = 0; col <= croppedImage.cols - patchSize; col++) {
for (int row = 0; row <= croppedImage.rows - patchSize; row++) {
cv::Mat patch = croppedImage(cv::Rect(col, row, patchSize, patchSize)).clone();
patches.push_back(patch.reshape(1, 1)); // 展平为行向量
}
}
patches = patches.t(); // [PatchSize*PatchSize, NumPatches]
// 降采样 1:5:end
cv::Mat subSampledPatches;
for (int col = 0; col < patches.cols; col += 5) {
subSampledPatches.push_back(patches.col(col).t());
}
subSampledPatches = subSampledPatches.t();
// 去均值
cv::Mat columnMean;
cv::reduce(subSampledPatches, columnMean, 1, cv::REDUCE_AVG);
for (int col = 0; col < subSampledPatches.cols; col++) {
subSampledPatches.col(col) -= columnMean;
}
// 协方差与特征值
cv::Mat covariance = (subSampledPatches * subSampledPatches.t()) /
(double)(subSampledPatches.cols - 1);
cv::Mat eigenvalues;
cv::eigen(covariance, eigenvalues); // OpenCV eigen 返回降序排列的特征值
// 取最小的 10%
int numEigenvalues = eigenvalues.rows;
int count = std::max(1, cvRound(0.1 * numEigenvalues));
double sumMinEigenvalues = 0;
for (int i = numEigenvalues - count; i < numEigenvalues; i++) {
sumMinEigenvalues += eigenvalues.at<double>(i);
}
return sqrt(std::max(sumMinEigenvalues / count, 0.0));
}
// 保存中间结果
void NlmGPUProcessor::saveIntermediateResults(const std::string& prefix) const {
std::cout << "Saving intermediate results with prefix: " << prefix << std::endl;
if (!m_noisyImageGpu.empty()) {
cv::Mat noisyNormalized;
cv::normalize(m_noisyImageGpu, noisyNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_noisy.png", noisyNormalized);
}
if (!m_meanImageGpu.empty()) {
cv::Mat meanNormalized;
cv::normalize(m_meanImageGpu, meanNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_mean.png", meanNormalized);
}
if (!m_meanImage8uGpu.empty()) {
cv::imwrite("./images/" + prefix + "_mean_8u.png", m_meanImage8uGpu);
}
if (!m_structureCleanGpu.empty()) {
cv::Mat structureNormalized;
cv::normalize(m_structureCleanGpu, structureNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_structure_clean.png", structureNormalized);
}
// 保存堆栈中的每个通道
for (size_t i = 0; i < m_stackGpu.size(); i++) {
if (!m_stackGpu[i].empty()) {
cv::Mat channelNormalized;
cv::normalize(m_stackGpu[i], channelNormalized, 0, 255, cv::NORM_MINMAX, CV_8U);
cv::imwrite("./images/" + prefix + "_stack_" + std::to_string(i) + ".png",
channelNormalized);
}
}
// 保存统计数据
std::ofstream statsFile(prefix + "_stats_gpu.txt");
if (statsFile.is_open()) {
statsFile << "GPU Processing Statistics\n";
statsFile << "=======================\n\n";
statsFile << "Parameters:\n";
statsFile << " Rows: " << m_rows << "\n";
statsFile << " Cols: " << m_cols << "\n";
statsFile << " MaxVal: " << m_maxVal << "\n";
statsFile << " PatchSize: " << m_patchSize << "\n";
statsFile << " SigmaEstPatchSize: " << m_sigmaEstPatchSize << "\n";
statsFile << " HFactor: " << m_hFactor << "\n";
statsFile << " DiffSigma: " << m_diffSigma << "\n\n";
statsFile << "Results:\n";
statsFile << " SigmaEst: " << m_sigmaEst << "\n";
statsFile << " HFinal: " << m_hFinal << "\n\n";
statsFile << "Timings (seconds):\n";
const char* stageNames[] = {
"Read Data", "Split Channels", "Compute Mean", "Noise Estimation",
"NLM Denoising", "Total"
};
for (size_t i = 0; i < std::min(m_timings.size(),
sizeof(stageNames)/sizeof(stageNames[0])); i++) {
statsFile << " " << stageNames[i] << ": "
<< std::fixed << std::setprecision(6) << m_timings[i] << "\n";
}
statsFile.close();
}
}
// 打印统计信息
void NlmGPUProcessor::printStatistics() const {
std::cout << "\n=== GPU Processing Statistics ===" << std::endl;
std::cout << "Parameters:" << std::endl;
std::cout << " Rows: " << m_rows << std::endl;
std::cout << " Cols: " << m_cols << std::endl;
std::cout << " MaxVal: " << m_maxVal << std::endl;
std::cout << " PatchSize: " << m_patchSize << std::endl;
std::cout << " SigmaEstPatchSize: " << m_sigmaEstPatchSize << std::endl;
std::cout << " HFactor: " << m_hFactor << std::endl;
std::cout << " DiffSigma: " << m_diffSigma << std::endl;
std::cout << "\nResults:" << std::endl;
std::cout << " SigmaEst: " << m_sigmaEst << std::endl;
std::cout << " HFinal: " << m_hFinal << std::endl;
std::cout << "\nTimings (seconds):" << std::endl;
const char* stageNames[] = {
"1. Read Data", "2. Split Channels", "3. Compute Mean", "4. Noise Estimation",
"5. NLM Denoising", "6. Total"
};
double sumStageTime = 0;
for (size_t i = 0; i < m_timings.size() - 1; i++) {
std::cout << " " << stageNames[i] << ": "
<< std::fixed << std::setprecision(6) << m_timings[i] << std::endl;
if (i < m_timings.size() - 1) {
sumStageTime += m_timings[i];
}
}
std::cout << " " << stageNames[m_timings.size() - 1] << ": "
<< std::fixed << std::setprecision(6) << m_timings.back() << std::endl;
if (m_timings.size() > 1) {
std::cout << " Sum of stages: " << sumStageTime << std::endl;
}
std::cout << "=====================================================" << std::endl;
}
1.4 Compare
1.4.1 Compare.h
// Compare.h
#ifndef COMPARE_H
#define COMPARE_H
#include <opencv2/opencv.hpp>
#include <string>
#include <cmath>
/**
* @brief 比较CPU和GPU处理结果的函数
*
* @param cpuMat CPU处理结果(cv::Mat)
* @param gpuUmat GPU处理结果(cv::UMat)
* @param name 比较对象的名称(用于输出标识)
* @param tolerance 容差值,默认1e-5
*/
void compareCPUvsGPU(const cv::Mat& cpuMat, const cv::UMat& gpuUmat,
const std::string& name, float tolerance = 1e-5f);
#endif // COMPARE_H
1.4.2 Compare.cpp
// Compare.cpp
#include "Compare.h"
#include <iostream>
#include <iomanip>
// 增强版的比较函数,自动处理类型转换
void compareCPUvsGPU(const cv::Mat& cpuMat, const cv::UMat& gpuUmat,
const std::string& name, float tolerance) {
std::cout << "\n=== Comparing CPU vs GPU: " << name << " ===" << std::endl;
// 下载GPU数据
cv::Mat gpuMat;
gpuUmat.copyTo(gpuMat);
// 检查尺寸
if (cpuMat.size() != gpuMat.size()) {
std::cout << "ERROR: Size mismatch!" << std::endl;
std::cout << " CPU: " << cpuMat.rows << "x" << cpuMat.cols << std::endl;
std::cout << " GPU: " << gpuMat.rows << "x" << gpuMat.cols << std::endl;
return;
}
// 输出类型信息
std::cout << "CPU type: " << cpuMat.type()
<< " (CV_64F=" << CV_64F << ", CV_32F=" << CV_32F
<< ", CV_8U=" << CV_8U << ")" << std::endl;
std::cout << "GPU type: " << gpuMat.type() << std::endl;
// 处理类型不匹配:统一转换为float进行比较
cv::Mat cpuFloatMat, gpuFloatMat;
if (cpuMat.type() != gpuMat.type()) {
std::cout << "INFO: Type mismatch, converting both to CV_32F for comparison" << std::endl;
// 转换CPU数据
if (cpuMat.depth() == CV_64F) {
cpuMat.convertTo(cpuFloatMat, CV_32F);
} else if (cpuMat.depth() == CV_8U) {
cpuMat.convertTo(cpuFloatMat, CV_32F);
} else {
cpuFloatMat = cpuMat.clone();
}
// 转换GPU数据
if (gpuMat.depth() == CV_64F) {
gpuMat.convertTo(gpuFloatMat, CV_32F);
} else if (gpuMat.depth() == CV_8U) {
gpuMat.convertTo(gpuFloatMat, CV_32F);
} else {
gpuFloatMat = gpuMat.clone();
}
// 再次检查
if (cpuFloatMat.type() != gpuFloatMat.type()) {
std::cout << "ERROR: Failed to convert to same type!" << std::endl;
return;
}
} else {
// 类型相同,直接使用
if (cpuMat.type() == CV_64F) {
cpuMat.convertTo(cpuFloatMat, CV_32F);
gpuMat.convertTo(gpuFloatMat, CV_32F);
} else {
cpuFloatMat = cpuMat.clone();
gpuFloatMat = gpuMat.clone();
}
}
std::cout << "After conversion: CPU type=" << cpuFloatMat.type()
<< ", GPU type=" << gpuFloatMat.type() << std::endl;
// 逐元素比较
int mismatchCount = 0;
float maxDiff = 0.0f;
int maxDiffRow = -1, maxDiffCol = -1;
float cpuMaxDiffVal = 0.0f, gpuMaxDiffVal = 0.0f;
for (int row = 0; row < cpuFloatMat.rows; row++) {
const float* cpuRow = cpuFloatMat.ptr<float>(row);
const float* gpuRow = gpuFloatMat.ptr<float>(row);
for (int col = 0; col < cpuFloatMat.cols; col++) {
float diff = std::abs(cpuRow[col] - gpuRow[col]);
if (diff > maxDiff) {
maxDiff = diff;
maxDiffRow = row;
maxDiffCol = col;
cpuMaxDiffVal = cpuRow[col];
gpuMaxDiffVal = gpuRow[col];
}
if (diff > tolerance) {
mismatchCount++;
if (mismatchCount <= 5) { // 只打印前5个不匹配
std::cout << " Mismatch at [" << row << "," << col << "]: "
<< "CPU=" << cpuRow[col] << ", GPU=" << gpuRow[col]
<< ", diff=" << diff << std::endl;
}
}
}
}
// 输出统计信息
std::cout << "\nComparison Summary:" << std::endl;
std::cout << " Total elements: " << cpuFloatMat.total() << std::endl;
std::cout << " Mismatch count: " << mismatchCount << std::endl;
std::cout << " Mismatch percentage: "
<< std::fixed << std::setprecision(4)
<< (100.0f * mismatchCount / cpuFloatMat.total()) << "%" << std::endl;
std::cout << " Max difference: " << maxDiff << std::endl;
std::cout << " at [" << maxDiffRow << "," << maxDiffCol << "]" << std::endl;
std::cout << " CPU: " << cpuMaxDiffVal << std::endl;
std::cout << " GPU: " << gpuMaxDiffVal << std::endl;
// 计算整体差异
cv::Mat diffMat;
cv::absdiff(cpuFloatMat, gpuFloatMat, diffMat);
double avgDiff = cv::mean(diffMat)[0];
std::cout << " Average difference: " << avgDiff << std::endl;
if (mismatchCount == 0) {
std::cout << "✓ CPU and GPU results match within tolerance!" << std::endl;
} else if (mismatchCount < cpuFloatMat.total() * 0.01) {
std::cout << "⚠ Minor differences (<1%) between CPU and GPU" << std::endl;
} else {
std::cout << "✗ Significant differences between CPU and GPU!" << std::endl;
}
std::cout << "===================================\n" << std::endl;
}
1.5 main.cpp
// main.cpp
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/core/ocl.hpp>
#include "NlmCPU.h"
#include "NlmGPU.h"
#include "Compare.h"
#define DEBUG 0
// --- 主要处理函数 ---
int main() {
std::vector<cv::ocl::PlatformInfo> platforms;
cv::ocl::getPlatfomsInfo(platforms); // 注意:原代码拼写错误应为getPlatformsInfo
if (platforms.empty()) {
std::cerr << "No OpenCL platforms found!" << std::endl;
return -1;
}
const cv::ocl::PlatformInfo* platform = &platforms[0];
std::cout << "Platform Name: " << platform->name().c_str() << std::endl;
cv::ocl::Device device;
platform->getDevice(device, 0);
std::cout << "Device name: " << device.name().c_str() << std::endl;
cv::ocl::setUseOpenCL(true);
std::cout << "Use OpenCL Device? " << cv::ocl::useOpenCL() << std::endl;
cv::UMat displayFrame;
// 创建处理器实例
NlmGPUProcessor gpuProcessor; // 改为驼峰命名
gpuProcessor.setRows(1600);
gpuProcessor.setCols(1200);
NlmCPUProcessor cpuProcessor; // 改为驼峰命名
cpuProcessor.setRows(1600);
cpuProcessor.setCols(1200);
std::string path = "bai2.raw";
while (true) {
// CPU处理
cv::Mat denoisedCpu = cpuProcessor.denoiseRaw9Ch(path); // 函数名改为小写开头
cpuProcessor.saveIntermediateResults();
cpuProcessor.printStatistics();
// GPU处理
cv::UMat denoisedGpu = gpuProcessor.denoiseRaw9Ch(path); // 函数名改为小写开头
gpuProcessor.saveIntermediateResults();
gpuProcessor.printStatistics();
#if DEBUG
compareCPUvsGPU(cpuProcessor.getNoisyImage(),gpuProcessor.getNoisyImageGPU(), "读取数据");
compareCPUvsGPU(cpuProcessor.getStack().at(0),gpuProcessor.getStackGPU().at(0), "通道0");
compareCPUvsGPU(cpuProcessor.getStack().at(1),gpuProcessor.getStackGPU().at(1), "通道1");
compareCPUvsGPU(cpuProcessor.getStack().at(2),gpuProcessor.getStackGPU().at(2), "通道2");
compareCPUvsGPU(cpuProcessor.getMeanImage(),gpuProcessor.getMeanImageGPU(), "均值");
compareCPUvsGPU(cpuProcessor.getStructureClean(),gpuProcessor.getStructureCleanGPU(), "NLM");
#endif
if (denoisedGpu.empty()) {
std::cerr << "无法读取文件或处理失败。" << std::endl;
} else {
// 转换回8U用于显示
denoisedGpu.convertTo(displayFrame, CV_8U);
// 显示结果
cv::imshow("Denoised Frame", displayFrame);
}
int key = cv::waitKey(30);
if (key == 27 || key == 'q' || key == 'Q') {
break;
}
}
// 关闭所有窗口
cv::destroyAllWindows();
return 0;
}
1.6 编译运行
pi@NanoPC-T6:/opt/opencl-project/nlm_project$ ./build/nlm_project
arm_release_ver: g13p0-01eac0, rk_so_ver: 10
Platform Name: ARM Platform
Device name: Mali-G610 r0p0
Use OpenCL Device? 1
Saving intermediate results with prefix: cpu
=== CPU Processing Statistics ===
Parameters:
Rows: 1600
Cols: 1200
MaxVal: 1023
PatchSize: 7
SigmaEstPatchSize: 8
HFactor: 0.8
DiffSigma: 2.5
Results:
SigmaEst: 2.42909
HFinal: 1.94327
Timings (seconds):
1. Read Data: 0.053899
2. Split Channels: 0.036228
3. Compute Mean: 0.003276
4. Noise Estimation: 0.106404
5. NLM Denoising: 0.066914
6. Total: 0.266724
Sum of stages: 0.266722
=====================================================
Saving intermediate results with prefix: gpu
=== GPU Processing Statistics ===
Parameters:
Rows: 1600
Cols: 1200
MaxVal: 1023.000000
PatchSize: 7
SigmaEstPatchSize: 8
HFactor: 0.800000
DiffSigma: 2.500000
Results:
SigmaEst: 0.000000
HFinal: 0.000000
Timings (seconds):
1. Read Data: 0.011386
2. Split Channels: 0.042756
3. Compute Mean: 0.000962
4. Noise Estimation: 0.101066
5. NLM Denoising: 0.001585
6. Total: 0.157755
Sum of stages: 0.157754
=====================================================
Gtk-Message: 16:07:40.455: Failed to load module "canberra-gtk-module"
Saving intermediate results with prefix: cpu
=== CPU Processing Statistics ===
Parameters:
Rows: 1600
Cols: 1200
MaxVal: 1023.000000
PatchSize: 7
SigmaEstPatchSize: 8
HFactor: 0.800000
DiffSigma: 2.500000
Results:
SigmaEst: 2.429091
HFinal: 1.943273
Timings (seconds):
1. Read Data: 0.046851
2. Split Channels: 0.028041
3. Compute Mean: 0.004390
4. Noise Estimation: 0.102197
5. NLM Denoising: 0.045189
6. Total: 0.226671
Sum of stages: 0.226668
=====================================================
Saving intermediate results with prefix: gpu
=== GPU Processing Statistics ===
Parameters:
Rows: 1600
Cols: 1200
MaxVal: 1023.000000
PatchSize: 7
SigmaEstPatchSize: 8
HFactor: 0.800000
DiffSigma: 2.500000
Results:
SigmaEst: 0.000000
HFinal: 0.000000
Timings (seconds):
1. Read Data: 0.011834
2. Split Channels: 0.000097
3. Compute Mean: 0.000463
4. Noise Estimation: 0.087953
5. NLM Denoising: 0.000411
6. Total: 0.100760
Sum of stages: 0.100759





浙公网安备 33010602011771号