完整代码@折腾笔记[49]-cuda的SIFT特征匹配

摘要

使用CUDA在GPU上加速SIFT特征提取与匹配算法, 并封装为CSharp库, 支持图像相似度评分和模板缓存。

声明

本文人类为第一作者, 龙虾为通讯作者.本文有AI生成内容.

关键信息

CUDA Toolkit 12.4+
Linux amd64 / Windows x64
C# / .NET Framework 4.7.2+

简介

SIFT算法简介

SIFT（Scale-Invariant Feature Transform，尺度不变特征变换）是由 David Lowe 于 1999 年提出、2004 年完善的经典局部特征描述算法。它能够在不同尺度、旋转、光照条件下提取稳定的图像特征点，广泛应用于图像匹配、目标识别、全景拼接、三维重建等领域。

主要特点：

尺度不变性：通过高斯金字塔和 DoG（Difference of Gaussian）空间检测不同尺度的特征点
旋转不变性：基于梯度方向直方图确定特征点主方向
光照鲁棒性：描述子经归一化处理，对光照变化不敏感
高区分度：128维特征描述子，具有极强的特征区分能力

SIFT算法流程

尺度空间极值检测：构建高斯金字塔（Gaussian Pyramid）和 DoG 金字塔，检测局部极值点作为候选特征点
关键点精确定位：通过三维二次函数拟合剔除低对比度和边缘响应点
方向赋值：计算关键点邻域梯度方向直方图，确定主方向以实现旋转不变性
描述子生成：在关键点周围 4×4 子区域计算 8 方向梯度直方图，生成 128 维特征向量

Lowe比率测试

特征匹配阶段采用 Lowe 提出的比率测试（Ratio Test）：对于每个特征点，找到最近邻和次近邻匹配点，只有当最近邻距离与次近邻距离的比值小于阈值（通常 0.75）时才视为有效匹配。该策略能有效剔除模糊匹配，提高匹配精度。

参考链接: SIFT - Wikipedia
参考论文: Lowe, D. G. (2004). Distinctive image features from scale-invariant keypoints. International Journal of Computer Vision, 60(2), 91-110.

工程目录结构

CudaSharp/
├── CudaSharpNative.cu      # CUDA DLL 导出层（二值化 + SIFT）
├── CudaBinarizeLib.cs      # C# P/Invoke 封装层
├── CudaBinarizeLib.csproj  # .NET 类库工程
├── build.py                # CUDA DLL 编译脚本
├── Program.cs              # C# 示例程序
├── gray_image.c/h          # 灰度图像数据结构
├── image_ops.cu/h          # CUDA 图像操作（高斯模糊、下采样、差分）
├── sift_types.c/h          # SIFT Feature 类型定义
├── sift_algorithm.cu/h     # SIFT 特征提取算法
├── sift_matcher.cu/h       # CUDA 加速 SIFT 特征匹配
├── image_similarity.cu/h   # 图像相似度评估器（含模板缓存）
├── test_sift.cpp           # SIFT 命令行测试程序
└── benchmark_sift.py       # SIFT 性能基准测试脚本

完整代码

gray_image.h

#ifndef GRAY_IMAGE_H
#define GRAY_IMAGE_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stddef.h>

typedef struct {
    int width;
    int height;
    float* data; // row-major: data[y * width + x]
} GrayImage;

GrayImage* gray_image_create(int width, int height);
GrayImage* gray_image_create_from_data(int width, int height, const float* data);
void gray_image_free(GrayImage* img);
GrayImage* gray_image_clone(const GrayImage* img);
GrayImage* gray_image_load(const char* filename);

static inline float gray_image_get(const GrayImage* img, int row, int col) {
    return img->data[row * img->width + col];
}

static inline void gray_image_set(GrayImage* img, int row, int col, float val) {
    img->data[row * img->width + col] = val;
}

#ifdef __cplusplus
}
#endif

#endif

gray_image.c

#include "gray_image.h"
#include "stb_image.h"
#include <stdlib.h>
#include <string.h>
#include <math.h>

GrayImage* gray_image_create(int width, int height) {
    GrayImage* img = (GrayImage*)malloc(sizeof(GrayImage));
    if (!img) return NULL;
    img->width = width;
    img->height = height;
    img->data = (float*)calloc(width * height, sizeof(float));
    if (!img->data) {
        free(img);
        return NULL;
    }
    return img;
}

GrayImage* gray_image_create_from_data(int width, int height, const float* data) {
    GrayImage* img = gray_image_create(width, height);
    if (!img) return NULL;
    memcpy(img->data, data, width * height * sizeof(float));
    return img;
}

void gray_image_free(GrayImage* img) {
    if (img) {
        free(img->data);
        free(img);
    }
}

GrayImage* gray_image_clone(const GrayImage* img) {
    if (!img) return NULL;
    return gray_image_create_from_data(img->width, img->height, img->data);
}

GrayImage* gray_image_load(const char* filename) {
    int w, h, channels;
    unsigned char* pixels = stbi_load(filename, &w, &h, &channels, 0);
    if (!pixels) return NULL;

    GrayImage* img = gray_image_create(w, h);
    if (!img) {
        stbi_image_free(pixels);
        return NULL;
    }

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int idx = (y * w + x) * channels;
            float r, g, b;
            if (channels >= 3) {
                r = pixels[idx + 0] / 255.0f;
                g = pixels[idx + 1] / 255.0f;
                b = pixels[idx + 2] / 255.0f;
            } else if (channels == 1) {
                r = g = b = pixels[idx] / 255.0f;
            } else {
                r = g = b = 0;
            }
            float gray = r * 0.299f + g * 0.587f + b * 0.114f;
            gray_image_set(img, y, x, gray);
        }
    }

    stbi_image_free(pixels);
    return img;
}

sift_types.h

#ifndef SIFT_TYPES_H
#define SIFT_TYPES_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stddef.h>

typedef enum {
    FEATURE_OXFD,
    FEATURE_LOWE
} FeatureType;

typedef struct {
    int r;
    int c;
    int octv;
    int intvl;
    double subintvl;
    double scl_octv;
} DetectionData;

#define FEATURE_MAX_D 128

typedef struct Feature {
    double x;
    double y;
    double a;
    double b;
    double c;
    double scl;
    double ori;
    int d;
    double descr[FEATURE_MAX_D];
    FeatureType type;
    int category;
    struct Feature* fwd_match;
    struct Feature* bck_match;
    struct Feature* mdl_match;
    float img_pt_x;
    float img_pt_y;
    DetectionData feature_data;
} Feature;

Feature* feature_new(void);
void feature_free(Feature* feat);
Feature* feature_clone(const Feature* feat);

#ifdef __cplusplus
}
#endif

#endif

sift_types.c

#include "sift_types.h"
#include <stdlib.h>
#include <string.h>

Feature* feature_new(void) {
    Feature* feat = (Feature*)calloc(1, sizeof(Feature));
    if (feat) {
        memset(feat->descr, 0, sizeof(feat->descr));
        feat->type = FEATURE_LOWE;
    }
    return feat;
}

void feature_free(Feature* feat) {
    free(feat);
}

Feature* feature_clone(const Feature* feat) {
    if (!feat) return NULL;
    Feature* new_feat = feature_new();
    if (!new_feat) return NULL;
    new_feat->x = feat->x;
    new_feat->y = feat->y;
    new_feat->a = feat->a;
    new_feat->b = feat->b;
    new_feat->c = feat->c;
    new_feat->scl = feat->scl;
    new_feat->ori = feat->ori;
    new_feat->d = feat->d;
    new_feat->type = feat->type;
    new_feat->category = feat->category;
    new_feat->img_pt_x = feat->img_pt_x;
    new_feat->img_pt_y = feat->img_pt_y;
    new_feat->feature_data = feat->feature_data;
    memcpy(new_feat->descr, feat->descr, sizeof(feat->descr));
    return new_feat;
}

image_ops.h

#ifndef IMAGE_OPS_H
#define IMAGE_OPS_H

#ifdef __cplusplus
extern "C" {
#endif

#include "gray_image.h"

GrayImage* image_ops_gaussian_blur(const GrayImage* src, double sigma);
GrayImage* image_ops_downsample(const GrayImage* src);
GrayImage* image_ops_resize_cubic(const GrayImage* src, int newW, int newH);
GrayImage* image_ops_subtract(const GrayImage* a, const GrayImage* b);

#ifdef __cplusplus
}
#endif

#endif

image_ops.cu

#include "image_ops.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

__global__ void gaussian_blur_h_kernel(const float* src, float* dst, int width, int height,
                                       const float* kernel, int kernel_size, int radius) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float val = 0.0f;
    for (int k = 0; k < kernel_size; k++) {
        int px = x + k - radius;
        if (px < 0) px = 0;
        if (px >= width) px = width - 1;
        val += src[y * width + px] * kernel[k];
    }
    dst[y * width + x] = val;
}

__global__ void gaussian_blur_v_kernel(const float* src, float* dst, int width, int height,
                                       const float* kernel, int kernel_size, int radius) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float val = 0.0f;
    for (int k = 0; k < kernel_size; k++) {
        int py = y + k - radius;
        if (py < 0) py = 0;
        if (py >= height) py = height - 1;
        val += src[py * width + x] * kernel[k];
    }
    dst[y * width + x] = val;
}

__global__ void downsample_kernel(const float* src, float* dst, int srcW, int srcH, int dstW, int dstH) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dstW || y >= dstH) return;
    dst[y * dstW + x] = src[(y * 2) * srcW + (x * 2)];
}

__global__ void subtract_kernel(const float* a, const float* b, float* dst, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    int idx = y * width + x;
    dst[idx] = a[idx] - b[idx];
}

static void check_cuda(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(err));
        exit(1);
    }
}

GrayImage* image_ops_gaussian_blur(const GrayImage* src, double sigma) {
    if (sigma <= 0.01) {
        return gray_image_clone(src);
    }

    int size = (int)ceil(sigma * 6.0);
    if (size % 2 == 0) size++;
    int radius = size / 2;

    float* h_kernel = (float*)malloc(size * sizeof(float));
    double sum = 0.0;
    for (int i = 0; i < size; i++) {
        double x = i - radius;
        h_kernel[i] = (float)exp(-(x * x) / (2.0 * sigma * sigma));
        sum += h_kernel[i];
    }
    for (int i = 0; i < size; i++) {
        h_kernel[i] /= (float)sum;
    }

    int w = src->width;
    int h = src->height;
    int num_pixels = w * h;

    float *d_src, *d_tmp, *d_dst, *d_kernel;
    check_cuda(cudaMalloc(&d_src, num_pixels * sizeof(float)), "cudaMalloc d_src");
    check_cuda(cudaMalloc(&d_tmp, num_pixels * sizeof(float)), "cudaMalloc d_tmp");
    check_cuda(cudaMalloc(&d_dst, num_pixels * sizeof(float)), "cudaMalloc d_dst");
    check_cuda(cudaMalloc(&d_kernel, size * sizeof(float)), "cudaMalloc d_kernel");

    check_cuda(cudaMemcpy(d_src, src->data, num_pixels * sizeof(float), cudaMemcpyHostToDevice), "memcpy src");
    check_cuda(cudaMemcpy(d_kernel, h_kernel, size * sizeof(float), cudaMemcpyHostToDevice), "memcpy kernel");

    dim3 block(16, 16);
    dim3 grid((w + block.x - 1) / block.x, (h + block.y - 1) / block.y);

    gaussian_blur_h_kernel<<<grid, block>>>(d_src, d_tmp, w, h, d_kernel, size, radius);
    check_cuda(cudaGetLastError(), "gaussian_blur_h_kernel");
    gaussian_blur_v_kernel<<<grid, block>>>(d_tmp, d_dst, w, h, d_kernel, size, radius);
    check_cuda(cudaGetLastError(), "gaussian_blur_v_kernel");

    GrayImage* dst = gray_image_create(w, h);
    check_cuda(cudaMemcpy(dst->data, d_dst, num_pixels * sizeof(float), cudaMemcpyDeviceToHost), "memcpy dst");

    cudaFree(d_src);
    cudaFree(d_tmp);
    cudaFree(d_dst);
    cudaFree(d_kernel);
    free(h_kernel);

    return dst;
}

GrayImage* image_ops_downsample(const GrayImage* src) {
    int newW = src->width / 2;
    int newH = src->height / 2;
    GrayImage* dst = gray_image_create(newW, newH);

    float *d_src, *d_dst;
    check_cuda(cudaMalloc(&d_src, src->width * src->height * sizeof(float)), "cudaMalloc d_src");
    check_cuda(cudaMalloc(&d_dst, newW * newH * sizeof(float)), "cudaMalloc d_dst");

    check_cuda(cudaMemcpy(d_src, src->data, src->width * src->height * sizeof(float), cudaMemcpyHostToDevice), "memcpy src");

    dim3 block(16, 16);
    dim3 grid((newW + block.x - 1) / block.x, (newH + block.y - 1) / block.y);
    downsample_kernel<<<grid, block>>>(d_src, d_dst, src->width, src->height, newW, newH);
    check_cuda(cudaGetLastError(), "downsample_kernel");

    check_cuda(cudaMemcpy(dst->data, d_dst, newW * newH * sizeof(float), cudaMemcpyDeviceToHost), "memcpy dst");

    cudaFree(d_src);
    cudaFree(d_dst);
    return dst;
}

GrayImage* image_ops_subtract(const GrayImage* a, const GrayImage* b) {
    int w = a->width;
    int h = a->height;
    GrayImage* dst = gray_image_create(w, h);

    float *d_a, *d_b, *d_dst;
    check_cuda(cudaMalloc(&d_a, w * h * sizeof(float)), "cudaMalloc d_a");
    check_cuda(cudaMalloc(&d_b, w * h * sizeof(float)), "cudaMalloc d_b");
    check_cuda(cudaMalloc(&d_dst, w * h * sizeof(float)), "cudaMalloc d_dst");

    check_cuda(cudaMemcpy(d_a, a->data, w * h * sizeof(float), cudaMemcpyHostToDevice), "memcpy a");
    check_cuda(cudaMemcpy(d_b, b->data, w * h * sizeof(float), cudaMemcpyHostToDevice), "memcpy b");

    dim3 block(16, 16);
    dim3 grid((w + block.x - 1) / block.x, (h + block.y - 1) / block.y);
    subtract_kernel<<<grid, block>>>(d_a, d_b, d_dst, w, h);
    check_cuda(cudaGetLastError(), "subtract_kernel");

    check_cuda(cudaMemcpy(dst->data, d_dst, w * h * sizeof(float), cudaMemcpyDeviceToHost), "memcpy dst");

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_dst);
    return dst;
}

static double cubic_kernel(double x) {
    x = fabs(x);
    if (x <= 1.0)
        return 1.5 * x * x * x - 2.5 * x * x + 1.0;
    if (x <= 2.0)
        return -0.5 * x * x * x + 2.5 * x * x - 4.0 * x + 2.0;
    return 0.0;
}

static int clamp(int v, int min, int max) {
    return v < min ? min : (v > max ? max : v);
}

static double bicubic_interpolate(const GrayImage* img, double x, double y) {
    int ix = (int)floor(x);
    int iy = (int)floor(y);
    double dx = x - ix;
    double dy = y - iy;

    double sum = 0.0;
    double wsum = 0.0;
    for (int j = -1; j <= 2; j++) {
        for (int i = -1; i <= 2; i++) {
            int px = clamp(ix + i, 0, img->width - 1);
            int py = clamp(iy + j, 0, img->height - 1);
            double wx = cubic_kernel(dx - i);
            double wy = cubic_kernel(dy - j);
            double w = wx * wy;
            sum += gray_image_get(img, py, px) * w;
            wsum += w;
        }
    }
    return wsum > 0.0 ? sum / wsum : 0.0;
}

GrayImage* image_ops_resize_cubic(const GrayImage* src, int newW, int newH) {
    GrayImage* dst = gray_image_create(newW, newH);
    double scaleX = (double)src->width / newW;
    double scaleY = (double)src->height / newH;

    for (int y = 0; y < newH; y++) {
        for (int x = 0; x < newW; x++) {
            double sx = (x + 0.5) * scaleX - 0.5;
            double sy = (y + 0.5) * scaleY - 0.5;
            gray_image_set(dst, y, x, (float)bicubic_interpolate(src, sx, sy));
        }
    }
    return dst;
}

sift_algorithm.h

#ifndef SIFT_ALGORITHM_H
#define SIFT_ALGORITHM_H

#ifdef __cplusplus
extern "C" {
#endif

#include "gray_image.h"
#include "sift_types.h"

typedef struct {
    Feature** data;
    int size;
    int capacity;
} FeatureArray;

FeatureArray* feature_array_new(void);
void feature_array_free(FeatureArray* arr);
void feature_array_push(FeatureArray* arr, Feature* feat);
void feature_array_sort_by_scale(FeatureArray* arr);

FeatureArray* sift_extract_features(const GrayImage* img);

#ifdef __cplusplus
}
#endif

#endif

sift_algorithm.cu

#include "sift_algorithm.h"
#include "image_ops.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#define SIFT_INIT_SIGMA 0.5
#define SIFT_IMG_BORDER 5
#define SIFT_MAX_INTERP_STEPS 5
#define SIFT_ORI_HIST_BINS 36
#define SIFT_ORI_SIG_FCTR 1.5
#define SIFT_ORI_RADIUS (3.0 * SIFT_ORI_SIG_FCTR)
#define SIFT_ORI_SMOOTH_PASSES 2
#define SIFT_ORI_PEAK_RATIO 0.8
#define SIFT_DESCR_SCL_FCTR 3.0
#define SIFT_DESCR_MAG_THR 0.2
#define SIFT_INT_DESCR_FCTR 512.0

#define SIFT_INTVLS 3
#define SIFT_SIGMA 1.6
#define SIFT_CONTR_THR 0.04
#define SIFT_CURV_THR 10
#define SIFT_IMG_DBL 1
#define SIFT_DESCR_WIDTH 4
#define SIFT_DESCR_HIST_BINS 8

FeatureArray* feature_array_new(void) {
    FeatureArray* arr = (FeatureArray*)malloc(sizeof(FeatureArray));
    arr->data = NULL;
    arr->size = 0;
    arr->capacity = 0;
    return arr;
}

void feature_array_free(FeatureArray* arr) {
    if (!arr) return;
    for (int i = 0; i < arr->size; i++) {
        feature_free(arr->data[i]);
    }
    free(arr->data);
    free(arr);
}

void feature_array_push(FeatureArray* arr, Feature* feat) {
    if (arr->size >= arr->capacity) {
        arr->capacity = arr->capacity == 0 ? 16 : arr->capacity * 2;
        arr->data = (Feature**)realloc(arr->data, arr->capacity * sizeof(Feature*));
    }
    arr->data[arr->size++] = feat;
}

static int compare_feature_scale(const void* a, const void* b) {
    Feature* fa = *(Feature**)a;
    Feature* fb = *(Feature**)b;
    if (fa->scl < fb->scl) return 1;
    if (fa->scl > fb->scl) return -1;
    return 0;
}

void feature_array_sort_by_scale(FeatureArray* arr) {
    if (arr->size > 1) {
        qsort(arr->data, arr->size, sizeof(Feature*), compare_feature_scale);
    }
}

static void invert3x3(double H[3][3], double H_inv[3][3]) {
    double aug[3][6];
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            aug[i][j] = H[i][j];
            aug[i][j + 3] = (i == j) ? 1.0 : 0.0;
        }
    }
    for (int i = 0; i < 3; i++) {
        double pivot = aug[i][i];
        if (fabs(pivot) < 1e-12) {
            for (int k = i + 1; k < 3; k++) {
                if (fabs(aug[k][i]) > fabs(pivot)) {
                    for (int j = 0; j < 6; j++) {
                        double tmp = aug[i][j];
                        aug[i][j] = aug[k][j];
                        aug[k][j] = tmp;
                    }
                    pivot = aug[i][i];
                    break;
                }
            }
        }
        for (int j = 0; j < 6; j++) aug[i][j] /= pivot;
        for (int k = 0; k < 3; k++) {
            if (k != i) {
                double factor = aug[k][i];
                for (int j = 0; j < 6; j++)
                    aug[k][j] -= factor * aug[i][j];
            }
        }
    }
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++)
            H_inv[i][j] = aug[i][j + 3];
}

static GrayImage* create_init_img(const GrayImage* img, int img_dbl, double sigma) {
    GrayImage* gray = gray_image_clone(img);
    if (img_dbl != 0) {
        float sig_diff = (float)sqrt(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4.0);
        GrayImage* dbl = image_ops_resize_cubic(gray, img->width * 2, img->height * 2);
        gray_image_free(gray);
        GrayImage* blurred = image_ops_gaussian_blur(dbl, sig_diff);
        gray_image_free(dbl);
        return blurred;
    } else {
        float sig_diff = (float)sqrt(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA);
        GrayImage* blurred = image_ops_gaussian_blur(gray, sig_diff);
        gray_image_free(gray);
        return blurred;
    }
}

static GrayImage*** build_gauss_pyr(GrayImage* basepic, int octvs, int intvls, double sigma) {
    GrayImage*** gauss_pyr = (GrayImage***)malloc(octvs * sizeof(GrayImage**));
    for (int o = 0; o < octvs; o++) {
        gauss_pyr[o] = (GrayImage**)malloc((intvls + 3) * sizeof(GrayImage*));
    }
    double* sig = (double*)malloc((intvls + 3) * sizeof(double));
    double k = pow(2.0, 1.0 / intvls);
    sig[0] = sigma;
    for (int i = 1; i < intvls + 3; i++) {
        double sig_prev = pow(k, i - 1) * sigma;
        double sig_total = sig_prev * k;
        sig[i] = sqrt(sig_total * sig_total - sig_prev * sig_prev);
    }
    for (int o = 0; o < octvs; o++) {
        for (int i = 0; i < intvls + 3; i++) {
            if (o == 0 && i == 0) {
                gauss_pyr[o][i] = gray_image_clone(basepic);
            } else if (i == 0) {
                gauss_pyr[o][i] = image_ops_downsample(gauss_pyr[o - 1][intvls]);
            } else {
                gauss_pyr[o][i] = image_ops_gaussian_blur(gauss_pyr[o][i - 1], sig[i]);
            }
        }
    }
    free(sig);
    return gauss_pyr;
}

static GrayImage*** build_dog_pyr(GrayImage*** gauss_pyr, int octvs, int intvls) {
    GrayImage*** dog_pyr = (GrayImage***)malloc(octvs * sizeof(GrayImage**));
    for (int o = 0; o < octvs; o++) {
        dog_pyr[o] = (GrayImage**)malloc((intvls + 2) * sizeof(GrayImage*));
        for (int i = 0; i < intvls + 2; i++) {
            dog_pyr[o][i] = image_ops_subtract(gauss_pyr[o][i + 1], gauss_pyr[o][i]);
        }
    }
    return dog_pyr;
}

static void free_pyr(GrayImage*** pyr, int octvs, int layers) {
    for (int o = 0; o < octvs; o++) {
        for (int i = 0; i < layers; i++) {
            gray_image_free(pyr[o][i]);
        }
        free(pyr[o]);
    }
    free(pyr);
}

static int calc_grad_mag_ori(const GrayImage* img, int r, int c, double* mag, double* ori) {
    if (r > 0 && r < img->height - 1 && c > 0 && c < img->width - 1) {
        double dx = gray_image_get(img, r, c + 1) - gray_image_get(img, r, c - 1);
        double dy = gray_image_get(img, r - 1, c) - gray_image_get(img, r + 1, c);
        *mag = sqrt(dx * dx + dy * dy);
        *ori = atan2(dy, dx);
        return 1;
    }
    *mag = 0;
    *ori = 0;
    return 0;
}

static double* ori_hist(const GrayImage* img, int r, int c, int n, int rad, double sigma) {
    double* hist = (double*)calloc(n, sizeof(double));
    double exp_denom = 2.0 * sigma * sigma;
    double PI2 = M_PI * 2.0;
    for (int i = -rad; i <= rad; i++) {
        for (int j = -rad; j <= rad; j++) {
            double mag, ori;
            if (calc_grad_mag_ori(img, r + i, c + j, &mag, &ori) == 1) {
                double w = exp(-(i * i + j * j) / exp_denom);
                int bin = (int)round(n * (ori + M_PI) / PI2);
                bin = (bin < n) ? bin : 0;
                hist[bin] += w * mag;
            }
        }
    }
    return hist;
}

static void smooth_ori_hist(double* hist, int n) {
    double h0 = hist[0];
    double prev = hist[n - 1];
    for (int i = 0; i < n; i++) {
        double tmp = hist[i];
        hist[i] = 0.25 * prev + 0.5 * hist[i] + 0.25 * ((i + 1 == n) ? h0 : hist[i + 1]);
        prev = tmp;
    }
}

static double dominant_ori(double* hist, int n) {
    double omax = hist[0];
    for (int i = 1; i < n; i++)
        if (hist[i] > omax) omax = hist[i];
    return omax;
}

static double interp_hist_peak(double l, double c, double r) {
    return 0.5 * (l - r) / (l - 2.0 * c + r);
}

static void add_good_ori_features(FeatureArray* features, double* hist, int n,
                                   double mag_thr, Feature* feat) {
    double PI2 = M_PI * 2.0;
    for (int i = 0; i < n; i++) {
        int l = (i == 0) ? n - 1 : i - 1;
        int r_idx = (i + 1) % n;
        if (hist[i] > hist[l] && hist[i] > hist[r_idx] && hist[i] >= mag_thr) {
            double bin = i + interp_hist_peak(hist[l], hist[i], hist[r_idx]);
            bin = (bin < 0) ? n + bin : (bin >= n) ? bin - n : bin;
            Feature* new_feat = feature_clone(feat);
            new_feat->ori = ((PI2 * bin) / n) - M_PI;
            feature_array_push(features, new_feat);
        }
    }
}

static void calc_feature_oris(FeatureArray* features, GrayImage*** gauss_pyr) {
    int n = features->size;
    for (int i = 0; i < n; i++) {
        Feature* feat = features->data[0];
        for (int j = 0; j < features->size - 1; j++) {
            features->data[j] = features->data[j + 1];
        }
        features->size--;
        DetectionData ddata = feat->feature_data;
        double* hist = ori_hist(gauss_pyr[ddata.octv][ddata.intvl],
                                ddata.r, ddata.c, SIFT_ORI_HIST_BINS,
                                (int)round(SIFT_ORI_RADIUS * ddata.scl_octv),
                                SIFT_ORI_SIG_FCTR * ddata.scl_octv);
        for (int j = 0; j < SIFT_ORI_SMOOTH_PASSES; j++)
            smooth_ori_hist(hist, SIFT_ORI_HIST_BINS);
        double omax = dominant_ori(hist, SIFT_ORI_HIST_BINS);
        add_good_ori_features(features, hist, SIFT_ORI_HIST_BINS,
                              omax * SIFT_ORI_PEAK_RATIO, feat);
        free(hist);
        feature_free(feat);
    }
}

static void calc_feature_scales(FeatureArray* features, double sigma, int intvls) {
    for (int i = 0; i < features->size; i++) {
        Feature* feat = features->data[i];
        double intvl = feat->feature_data.intvl + feat->feature_data.subintvl;
        feat->scl = sigma * pow(2.0, feat->feature_data.octv + intvl / intvls);
        feat->feature_data.scl_octv = sigma * pow(2.0, intvl / intvls);
    }
}

static void adjust_for_img_dbl(FeatureArray* features) {
    for (int i = 0; i < features->size; i++) {
        Feature* feat = features->data[i];
        feat->x /= 2.0;
        feat->y /= 2.0;
        feat->scl /= 2.0;
        feat->img_pt_x = (float)(feat->img_pt_x / 2.0f);
        feat->img_pt_y = (float)(feat->img_pt_y / 2.0f);
    }
}

static void interp_hist_entry(float* hist, int d, int n,
                               double rbin, double cbin, double obin, double mag) {
    int r0 = (int)floor(rbin);
    int c0 = (int)floor(cbin);
    int o0 = (int)floor(obin);
    float d_r = (float)(rbin - r0);
    float d_c = (float)(cbin - c0);
    float d_o = (float)(obin - o0);
    for (int r = 0; r <= 1; r++) {
        int rb = r0 + r;
        if (rb >= 0 && rb < d) {
            float v_r = (float)mag * ((r == 0) ? 1.0f - d_r : d_r);
            for (int c = 0; c <= 1; c++) {
                int cb = c0 + c;
                if (cb >= 0 && cb < d) {
                    float v_c = v_r * ((c == 0) ? 1.0f - d_c : d_c);
                    for (int o = 0; o <= 1; o++) {
                        int ob = (o0 + o) % n;
                        float v_o = v_c * ((o == 0) ? 1.0f - d_o : d_o);
                        hist[((rb * d) + cb) * n + ob] += v_o;
                    }
                }
            }
        }
    }
}

static float* descr_hist(const GrayImage* img, int r, int c, double ori,
                          double scl, int d, int n) {
    float* hist = (float*)calloc(d * d * n, sizeof(float));
    double cos_t = cos(ori);
    double sin_t = sin(ori);
    double bins_per_rad = n / (2.0 * M_PI);
    double exp_denom = d * d * 0.5;
    double hist_width = SIFT_DESCR_SCL_FCTR * scl;
    int radius = (int)(hist_width * sqrt(2.0) * (d + 1.0) * 0.5 + 0.5);
    for (int i = -radius; i <= radius; i++) {
        for (int j = -radius; j <= radius; j++) {
            double c_rot = (j * cos_t - i * sin_t) / hist_width;
            double r_rot = (j * sin_t + i * cos_t) / hist_width;
            double rbin = r_rot + d / 2.0 - 0.5;
            double cbin = c_rot + d / 2.0 - 0.5;
            if (rbin > -1.0 && rbin < d && cbin > -1.0 && cbin < d) {
                double grad_mag, grad_ori;
                if (calc_grad_mag_ori(img, r + i, c + j, &grad_mag, &grad_ori) != 0) {
                    grad_ori -= ori;
                    while (grad_ori < 0.0) grad_ori += 2.0 * M_PI;
                    while (grad_ori >= 2.0 * M_PI) grad_ori -= 2.0 * M_PI;
                    double obin = grad_ori * bins_per_rad;
                    double w = exp(-(c_rot * c_rot + r_rot * r_rot) / exp_denom);
                    interp_hist_entry(hist, d, n, rbin, cbin, obin, grad_mag * w);
                }
            }
        }
    }
    return hist;
}

static void normalize_descr(Feature* feat) {
    double len_sq = 0.0;
    for (int i = 0; i < feat->d; i++) {
        double cur = feat->descr[i];
        len_sq += cur * cur;
    }
    double len_inv = 1.0 / sqrt(len_sq);
    for (int i = 0; i < feat->d; i++)
        feat->descr[i] *= len_inv;
}

static void hist_to_descr(float* hist, int d, int n, Feature* feat) {
    int k = 0;
    for (int r = 0; r < d; r++)
        for (int c = 0; c < d; c++)
            for (int o = 0; o < n; o++)
                feat->descr[k++] = hist[((r * d) + c) * n + o];
    feat->d = k;
    normalize_descr(feat);
    for (int i = 0; i < k; i++)
        if (feat->descr[i] > SIFT_DESCR_MAG_THR)
            feat->descr[i] = SIFT_DESCR_MAG_THR;
    normalize_descr(feat);
    for (int i = 0; i < k; i++) {
        int int_val = (int)(SIFT_INT_DESCR_FCTR * feat->descr[i]);
        feat->descr[i] = int_val < 255 ? int_val : 255;
    }
}

static void compute_descriptors(FeatureArray* features, GrayImage*** gauss_pyr, int d, int n) {
    for (int i = 0; i < features->size; i++) {
        Feature* feat = features->data[i];
        DetectionData ddata = feat->feature_data;
        float* hist = descr_hist(gauss_pyr[ddata.octv][ddata.intvl], ddata.r,
                                 ddata.c, feat->ori, ddata.scl_octv, d, n);
        hist_to_descr(hist, d, n, feat);
        free(hist);
    }
}

static int is_extremum(GrayImage*** dog_pyr, int octv, int intvl, int r, int c) {
    float val = gray_image_get(dog_pyr[octv][intvl], r, c);
    if (val > 0) {
        for (int i = -1; i <= 1; i++)
            for (int j = -1; j <= 1; j++)
                for (int k = -1; k <= 1; k++)
                    if (val < gray_image_get(dog_pyr[octv][intvl + i], r + j, c + k))
                        return 0;
    } else {
        for (int i = -1; i <= 1; i++)
            for (int j = -1; j <= 1; j++)
                for (int k = -1; k <= 1; k++)
                    if (val > gray_image_get(dog_pyr[octv][intvl + i], r + j, c + k))
                        return 0;
    }
    return 1;
}

static void deriv_3D(GrayImage*** dog_pyr, int octv, int intvl, int r, int c, double dI[3]) {
    dI[0] = (gray_image_get(dog_pyr[octv][intvl], r, c + 1) -
             gray_image_get(dog_pyr[octv][intvl], r, c - 1)) / 2.0;
    dI[1] = (gray_image_get(dog_pyr[octv][intvl], r + 1, c) -
             gray_image_get(dog_pyr[octv][intvl], r - 1, c)) / 2.0;
    dI[2] = (gray_image_get(dog_pyr[octv][intvl + 1], r, c) -
             gray_image_get(dog_pyr[octv][intvl - 1], r, c)) / 2.0;
}

static void hessian_3D(GrayImage*** dog_pyr, int octv, int intvl, int r, int c, double H[3][3]) {
    double v = gray_image_get(dog_pyr[octv][intvl], r, c);
    double dxx = gray_image_get(dog_pyr[octv][intvl], r, c + 1) +
                 gray_image_get(dog_pyr[octv][intvl], r, c - 1) - 2 * v;
    double dyy = gray_image_get(dog_pyr[octv][intvl], r + 1, c) +
                 gray_image_get(dog_pyr[octv][intvl], r - 1, c) - 2 * v;
    double dss = gray_image_get(dog_pyr[octv][intvl + 1], r, c) +
                 gray_image_get(dog_pyr[octv][intvl - 1], r, c) - 2 * v;
    double dxy = (gray_image_get(dog_pyr[octv][intvl], r + 1, c + 1) -
                  gray_image_get(dog_pyr[octv][intvl], r + 1, c - 1) -
                  gray_image_get(dog_pyr[octv][intvl], r - 1, c + 1) +
                  gray_image_get(dog_pyr[octv][intvl], r - 1, c - 1)) / 4.0;
    double dxs = (gray_image_get(dog_pyr[octv][intvl + 1], r, c + 1) -
                  gray_image_get(dog_pyr[octv][intvl + 1], r, c - 1) -
                  gray_image_get(dog_pyr[octv][intvl - 1], r, c + 1) +
                  gray_image_get(dog_pyr[octv][intvl - 1], r, c - 1)) / 4.0;
    double dys = (gray_image_get(dog_pyr[octv][intvl + 1], r + 1, c) -
                  gray_image_get(dog_pyr[octv][intvl + 1], r - 1, c) -
                  gray_image_get(dog_pyr[octv][intvl - 1], r + 1, c) +
                  gray_image_get(dog_pyr[octv][intvl - 1], r - 1, c)) / 4.0;
    H[0][0] = dxx; H[0][1] = dxy; H[0][2] = dxs;
    H[1][0] = dxy; H[1][1] = dyy; H[1][2] = dys;
    H[2][0] = dxs; H[2][1] = dys; H[2][2] = dss;
}

static void interp_step(GrayImage*** dog_pyr, int octv, int intvl, int r, int c,
                        double* xi, double* xr, double* xc) {
    double dD[3];
    deriv_3D(dog_pyr, octv, intvl, r, c, dD);
    double H[3][3];
    hessian_3D(dog_pyr, octv, intvl, r, c, H);
    double H_inv[3][3];
    invert3x3(H, H_inv);
    double x[3] = {0, 0, 0};
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++)
            x[i] -= H_inv[i][j] * dD[j];
    }
    *xc = x[0];
    *xr = x[1];
    *xi = x[2];
}

static double interp_contr(GrayImage*** dog_pyr, int octv, int intvl, int r,
                           int c, double xi, double xr, double xc) {
    double dD[3];
    deriv_3D(dog_pyr, octv, intvl, r, c, dD);
    double t = dD[0] * xc + dD[1] * xr + dD[2] * xi;
    return gray_image_get(dog_pyr[octv][intvl], r, c) + t * 0.5;
}

static Feature* new_feature(void) {
    Feature* feat = feature_new();
    feat->type = FEATURE_LOWE;
    return feat;
}

static Feature* interp_extremum(GrayImage*** dog_pyr, int octv, int intvl,
                                int r, int c, int intvls, double contr_thr) {
    double xi = 0, xr = 0, xc = 0, contr;
    int i = 0;
    while (i < SIFT_MAX_INTERP_STEPS) {
        interp_step(dog_pyr, octv, intvl, r, c, &xi, &xr, &xc);
        if (fabs(xi) < 0.5 && fabs(xr) < 0.5 && fabs(xc) < 0.5)
            break;
        c += (int)round(xc);
        r += (int)round(xr);
        intvl += (int)round(xi);
        if (intvl < 1 || intvl > intvls ||
            c < SIFT_IMG_BORDER || r < SIFT_IMG_BORDER ||
            c >= dog_pyr[octv][0]->width - SIFT_IMG_BORDER ||
            r >= dog_pyr[octv][0]->height - SIFT_IMG_BORDER) {
            return NULL;
        }
        i++;
    }
    if (i >= SIFT_MAX_INTERP_STEPS)
        return NULL;
    contr = interp_contr(dog_pyr, octv, intvl, r, c, xi, xr, xc);
    if (fabs(contr) < contr_thr / intvls)
        return NULL;
    Feature* feat = new_feature();
    DetectionData ddata = feat->feature_data;
    feat->img_pt_x = (float)((feat->x = (c + xc) * pow(2.0, octv)));
    feat->img_pt_y = (float)((feat->y = (r + xr) * pow(2.0, octv)));
    ddata.r = r;
    ddata.c = c;
    ddata.octv = octv;
    ddata.intvl = intvl;
    ddata.subintvl = xi;
    feat->feature_data = ddata;
    return feat;
}

static int is_too_edge_like(const GrayImage* dog_img, int r, int c, int curv_thr) {
    if (c == 0 || r == 0) return 1;
    double d = gray_image_get(dog_img, r, c);
    double dxx = gray_image_get(dog_img, r, c + 1) + gray_image_get(dog_img, r, c - 1) - 2 * d;
    double dyy = gray_image_get(dog_img, r + 1, c) + gray_image_get(dog_img, r - 1, c) - 2 * d;
    double dxy = (gray_image_get(dog_img, r + 1, c + 1) - gray_image_get(dog_img, r + 1, c - 1) -
                  gray_image_get(dog_img, r - 1, c + 1) + gray_image_get(dog_img, r - 1, c - 1)) / 4.0;
    double tr = dxx + dyy;
    double det = dxx * dyy - dxy * dxy;
    if (det <= 0) return 1;
    if (tr * tr / det < (curv_thr + 1.0) * (curv_thr + 1.0) / curv_thr)
        return 0;
    return 1;
}

static FeatureArray* scale_space_extrema(GrayImage*** dog_pyr, int octvs, int intvls,
                                          double contr_thr, int curv_thr) {
    FeatureArray* features = feature_array_new();
    double prelim_contr_thr = 0.5 * contr_thr / intvls;
    for (int o = 0; o < octvs; o++) {
        for (int i = 1; i <= intvls; i++) {
            for (int r = SIFT_IMG_BORDER; r < dog_pyr[o][0]->height - SIFT_IMG_BORDER; r++) {
                for (int c = SIFT_IMG_BORDER; c < dog_pyr[o][0]->width - SIFT_IMG_BORDER; c++) {
                    if (fabs(gray_image_get(dog_pyr[o][i], r, c)) > prelim_contr_thr) {
                        if (is_extremum(dog_pyr, o, i, r, c) == 1) {
                            Feature* feat = interp_extremum(dog_pyr, o, i, r, c, intvls, contr_thr);
                            if (feat != NULL) {
                                DetectionData ddata = feat->feature_data;
                                if (is_too_edge_like(dog_pyr[ddata.octv][ddata.intvl],
                                                     ddata.r, ddata.c, curv_thr) == 0) {
                                    feature_array_push(features, feat);
                                } else {
                                    feature_free(feat);
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return features;
}

FeatureArray* sift_extract_features(const GrayImage* img) {
    GrayImage* init_img = create_init_img(img, SIFT_IMG_DBL, SIFT_SIGMA);
    int octvs = (int)(log(fmin(init_img->width, init_img->height)) / log(2.0) - 2);
    if (octvs < 1) octvs = 1;

    GrayImage*** gauss_pyr = build_gauss_pyr(init_img, octvs, SIFT_INTVLS, SIFT_SIGMA);
    GrayImage*** dog_pyr = build_dog_pyr(gauss_pyr, octvs, SIFT_INTVLS);

    FeatureArray* features = scale_space_extrema(dog_pyr, octvs, SIFT_INTVLS, SIFT_CONTR_THR, SIFT_CURV_THR);
    calc_feature_scales(features, SIFT_SIGMA, SIFT_INTVLS);
    if (SIFT_IMG_DBL != 0)
        adjust_for_img_dbl(features);

    calc_feature_oris(features, gauss_pyr);
    compute_descriptors(features, gauss_pyr, SIFT_DESCR_WIDTH, SIFT_DESCR_HIST_BINS);
    feature_array_sort_by_scale(features);

    gray_image_free(init_img);
    free_pyr(gauss_pyr, octvs, SIFT_INTVLS + 3);
    free_pyr(dog_pyr, octvs, SIFT_INTVLS + 2);

    return features;
}

sift_matcher.h

#ifndef SIFT_MATCHER_H
#define SIFT_MATCHER_H

#ifdef __cplusplus
extern "C" {
#endif

#include "sift_types.h"
#include "sift_algorithm.h"

double sift_matcher_compute_similarity(const FeatureArray* features1, const FeatureArray* features2);
int sift_matcher_count_matches(const FeatureArray* features1, const FeatureArray* features2);
double sift_matcher_descriptor_distance(const Feature* a, const Feature* b);

#ifdef __cplusplus
}
#endif

#endif

sift_matcher.cu

#include "sift_matcher.h"
#include <cuda_runtime.h>
#include <math.h>
#include <float.h>
#include <stdlib.h>
#include <stdio.h>

__global__ void descriptor_distances_kernel(const double* descr1, const double* descr2,
                                            double* distances, int n1, int n2, int d) {
    int idx1 = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx1 >= n1) return;

    for (int idx2 = 0; idx2 < n2; idx2++) {
        double sum = 0.0;
        for (int i = 0; i < d; i++) {
            double diff = descr1[idx1 * d + i] - descr2[idx2 * d + i];
            sum += diff * diff;
        }
        distances[idx1 * n2 + idx2] = sqrt(sum);
    }
}

double sift_matcher_descriptor_distance(const Feature* a, const Feature* b) {
    int len = (a->d < b->d) ? a->d : b->d;
    if (len == 0) {
        int la = FEATURE_MAX_D;
        int lb = FEATURE_MAX_D;
        len = (la < lb) ? la : lb;
    }
    double sum = 0.0;
    for (int i = 0; i < len; i++) {
        double diff = a->descr[i] - b->descr[i];
        sum += diff * diff;
    }
    return sqrt(sum);
}

static void check_cuda(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(err));
        exit(1);
    }
}

int sift_matcher_count_matches(const FeatureArray* features1, const FeatureArray* features2) {
    if (!features1 || !features2 || features1->size == 0 || features2->size == 0)
        return 0;

    int n1 = features1->size;
    int n2 = features2->size;
    int d = FEATURE_MAX_D;

    double* h_descr1 = (double*)malloc(n1 * d * sizeof(double));
    double* h_descr2 = (double*)malloc(n2 * d * sizeof(double));
    for (int i = 0; i < n1; i++) {
        memcpy(h_descr1 + i * d, features1->data[i]->descr, d * sizeof(double));
    }
    for (int i = 0; i < n2; i++) {
        memcpy(h_descr2 + i * d, features2->data[i]->descr, d * sizeof(double));
    }

    double *d_descr1, *d_descr2, *d_distances;
    check_cuda(cudaMalloc(&d_descr1, n1 * d * sizeof(double)), "cudaMalloc d_descr1");
    check_cuda(cudaMalloc(&d_descr2, n2 * d * sizeof(double)), "cudaMalloc d_descr2");
    check_cuda(cudaMalloc(&d_distances, n1 * n2 * sizeof(double)), "cudaMalloc d_distances");

    check_cuda(cudaMemcpy(d_descr1, h_descr1, n1 * d * sizeof(double), cudaMemcpyHostToDevice), "memcpy descr1");
    check_cuda(cudaMemcpy(d_descr2, h_descr2, n2 * d * sizeof(double), cudaMemcpyHostToDevice), "memcpy descr2");

    int threadsPerBlock = 256;
    int blocksPerGrid = (n1 + threadsPerBlock - 1) / threadsPerBlock;
    descriptor_distances_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_descr1, d_descr2, d_distances, n1, n2, d);
    check_cuda(cudaGetLastError(), "descriptor_distances_kernel");

    double* h_distances = (double*)malloc(n1 * n2 * sizeof(double));
    check_cuda(cudaMemcpy(h_distances, d_distances, n1 * n2 * sizeof(double), cudaMemcpyDeviceToHost), "memcpy distances");

    int matchCount = 0;
    double ratioThresh = 0.75;

    for (int i = 0; i < n1; i++) {
        double bestDist = DBL_MAX;
        double secondBestDist = DBL_MAX;
        for (int j = 0; j < n2; j++) {
            double dist = h_distances[i * n2 + j];
            if (dist < bestDist) {
                secondBestDist = bestDist;
                bestDist = dist;
            } else if (dist < secondBestDist) {
                secondBestDist = dist;
            }
        }
        if (secondBestDist > 0 && bestDist / secondBestDist < ratioThresh) {
            matchCount++;
        }
    }

    free(h_descr1);
    free(h_descr2);
    free(h_distances);
    cudaFree(d_descr1);
    cudaFree(d_descr2);
    cudaFree(d_distances);

    return matchCount;
}

double sift_matcher_compute_similarity(const FeatureArray* features1, const FeatureArray* features2) {
    if (!features1 || !features2 || features1->size == 0 || features2->size == 0)
        return 0.0;

    int matches = sift_matcher_count_matches(features1, features2);
    int minFeatures = (features1->size < features2->size) ? features1->size : features2->size;
    if (minFeatures == 0)
        return 0.0;

    double ratio = (double)matches / minFeatures;
    double score = 1.0 - exp(-3.0 * ratio);
    if (score < 0.0) score = 0.0;
    if (score > 1.0) score = 1.0;
    return score;
}

image_similarity.h

#ifndef IMAGE_SIMILARITY_H
#define IMAGE_SIMILARITY_H

#ifdef __cplusplus
extern "C" {
#endif

#include "gray_image.h"
#include "sift_algorithm.h"

typedef struct TemplateCacheEntry {
    char* label;
    FeatureArray* features;
    struct TemplateCacheEntry* next;
} TemplateCacheEntry;

typedef struct {
    TemplateCacheEntry** buckets;
    int bucket_count;
} TemplateCache;

typedef struct {
    TemplateCache* cache;
} ImageSimilarityEvaluator;

ImageSimilarityEvaluator* evaluator_new(void);
void evaluator_free(ImageSimilarityEvaluator* eval);

double evaluator_evaluate(ImageSimilarityEvaluator* eval, const GrayImage* currentImage,
                          const char* label, const GrayImage* correctImage);

void evaluator_preload_template(ImageSimilarityEvaluator* eval, const char* label,
                                const GrayImage* correctImage);
void evaluator_clear_cache(ImageSimilarityEvaluator* eval, const char* label);
void evaluator_clear_all_cache(ImageSimilarityEvaluator* eval);

#ifdef __cplusplus
}
#endif

#endif

image_similarity.cu

#include "image_similarity.h"
#include "sift_matcher.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#ifdef _WIN32
#include <string.h>
#define strcasecmp _stricmp
#define strdup _strdup
#else
#include <strings.h>
#endif

static unsigned int hash_string(const char* str) {
    unsigned int hash = 5381;
    int c;
    while ((c = *str++))
        hash = ((hash << 5) + hash) + c;
    return hash;
}

static TemplateCache* template_cache_new(void) {
    TemplateCache* cache = (TemplateCache*)malloc(sizeof(TemplateCache));
    cache->bucket_count = 64;
    cache->buckets = (TemplateCacheEntry**)calloc(cache->bucket_count, sizeof(TemplateCacheEntry*));
    return cache;
}

static void template_cache_free(TemplateCache* cache) {
    if (!cache) return;
    for (int i = 0; i < cache->bucket_count; i++) {
        TemplateCacheEntry* entry = cache->buckets[i];
        while (entry) {
            TemplateCacheEntry* next = entry->next;
            free(entry->label);
            feature_array_free(entry->features);
            free(entry);
            entry = next;
        }
    }
    free(cache->buckets);
    free(cache);
}

static FeatureArray* template_cache_get(TemplateCache* cache, const char* label) {
    unsigned int h = hash_string(label) % cache->bucket_count;
    TemplateCacheEntry* entry = cache->buckets[h];
    while (entry) {
        if (strcasecmp(entry->label, label) == 0)
            return entry->features;
        entry = entry->next;
    }
    return NULL;
}

static void template_cache_set(TemplateCache* cache, const char* label, FeatureArray* features) {
    unsigned int h = hash_string(label) % cache->bucket_count;
    TemplateCacheEntry* entry = cache->buckets[h];
    while (entry) {
        if (strcasecmp(entry->label, label) == 0) {
            feature_array_free(entry->features);
            entry->features = features;
            return;
        }
        entry = entry->next;
    }
    entry = (TemplateCacheEntry*)malloc(sizeof(TemplateCacheEntry));
    entry->label = strdup(label);
    entry->features = features;
    entry->next = cache->buckets[h];
    cache->buckets[h] = entry;
}

static void template_cache_remove(TemplateCache* cache, const char* label) {
    unsigned int h = hash_string(label) % cache->bucket_count;
    TemplateCacheEntry** p = &cache->buckets[h];
    while (*p) {
        if (strcasecmp((*p)->label, label) == 0) {
            TemplateCacheEntry* to_remove = *p;
            *p = (*p)->next;
            free(to_remove->label);
            feature_array_free(to_remove->features);
            free(to_remove);
            return;
        }
        p = &(*p)->next;
    }
}

static FeatureArray* extract_features(const GrayImage* img) {
    return sift_extract_features(img);
}

ImageSimilarityEvaluator* evaluator_new(void) {
    ImageSimilarityEvaluator* eval = (ImageSimilarityEvaluator*)malloc(sizeof(ImageSimilarityEvaluator));
    eval->cache = template_cache_new();
    return eval;
}

void evaluator_free(ImageSimilarityEvaluator* eval) {
    if (!eval) return;
    template_cache_free(eval->cache);
    free(eval);
}

double evaluator_evaluate(ImageSimilarityEvaluator* eval, const GrayImage* currentImage,
                          const char* label, const GrayImage* correctImage) {
    if (!currentImage || !label || strlen(label) == 0)
        return 0.0;

    FeatureArray* templateFeatures = NULL;
    if (correctImage != NULL) {
        templateFeatures = extract_features(correctImage);
        template_cache_set(eval->cache, label, templateFeatures);
    } else {
        templateFeatures = template_cache_get(eval->cache, label);
    }

    if (!templateFeatures)
        return 0.0;

    FeatureArray* currentFeatures = extract_features(currentImage);

    if (currentFeatures->size == 0 || templateFeatures->size == 0) {
        feature_array_free(currentFeatures);
        return 0.0;
    }

    double similarity = sift_matcher_compute_similarity(currentFeatures, templateFeatures);
    feature_array_free(currentFeatures);
    return similarity;
}

void evaluator_preload_template(ImageSimilarityEvaluator* eval, const char* label,
                                const GrayImage* correctImage) {
    if (!correctImage || !label || strlen(label) == 0)
        return;

    FeatureArray* features = extract_features(correctImage);
    template_cache_set(eval->cache, label, features);
}

void evaluator_clear_cache(ImageSimilarityEvaluator* eval, const char* label) {
    if (!label) return;
    template_cache_remove(eval->cache, label);
}

void evaluator_clear_all_cache(ImageSimilarityEvaluator* eval) {
    template_cache_free(eval->cache);
    eval->cache = template_cache_new();
}

test_sift.cpp

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image/stb_image_write.h"

#include "gray_image.h"
#include "sift_algorithm.h"
#include "sift_matcher.h"

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <float.h>

#define FEATURE_MAX_D 128

typedef struct {
    int idx1;
    int idx2;
    double dist;
} MatchPair;

static GrayImage* load_gray_image(const char* path) {
    int w, h, channels;
    unsigned char* pixels = stbi_load(path, &w, &h, &channels, 0);
    if (!pixels) {
        fprintf(stderr, "Failed to load image: %s\n", path);
        return NULL;
    }
    GrayImage* img = gray_image_create(w, h);
    if (!img) {
        stbi_image_free(pixels);
        return NULL;
    }
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int idx = (y * w + x) * channels;
            float gray = 0.0f;
            if (channels >= 3) {
                float r = pixels[idx + 0] / 255.0f;
                float g = pixels[idx + 1] / 255.0f;
                float b = pixels[idx + 2] / 255.0f;
                gray = r * 0.299f + g * 0.587f + b * 0.114f;
            } else if (channels == 1) {
                gray = pixels[idx] / 255.0f;
            }
            gray_image_set(img, y, x, gray);
        }
    }
    stbi_image_free(pixels);
    return img;
}

static double descriptor_distance(const Feature* a, const Feature* b) {
    int len = FEATURE_MAX_D;
    double sum = 0.0;
    for (int i = 0; i < len; i++) {
        double diff = a->descr[i] - b->descr[i];
        sum += diff * diff;
    }
    return sqrt(sum);
}

static int match_features(const FeatureArray* f1, const FeatureArray* f2, MatchPair** out_matches) {
    if (!f1 || !f2 || f1->size == 0 || f2->size == 0) return 0;

    int n1 = f1->size;
    int n2 = f2->size;
    double ratioThresh = 0.75;

    MatchPair* matches = (MatchPair*)malloc(n1 * sizeof(MatchPair));
    int match_count = 0;

    for (int i = 0; i < n1; i++) {
        double bestDist = DBL_MAX;
        double secondBestDist = DBL_MAX;
        int bestIdx = -1;

        for (int j = 0; j < n2; j++) {
            double dist = descriptor_distance(f1->data[i], f2->data[j]);
            if (dist < bestDist) {
                secondBestDist = bestDist;
                bestDist = dist;
                bestIdx = j;
            } else if (dist < secondBestDist) {
                secondBestDist = dist;
            }
        }

        if (secondBestDist > 0 && bestDist / secondBestDist < ratioThresh && bestIdx >= 0) {
            matches[match_count].idx1 = i;
            matches[match_count].idx2 = bestIdx;
            matches[match_count].dist = bestDist;
            match_count++;
        }
    }

    *out_matches = matches;
    return match_count;
}

static void set_pixel(unsigned char* img, int w, int h, int x, int y, unsigned char r, unsigned char g, unsigned char b) {
    if (x < 0 || x >= w || y < 0 || y >= h) return;
    int idx = (y * w + x) * 3;
    img[idx + 0] = r;
    img[idx + 1] = g;
    img[idx + 2] = b;
}

static void draw_line(unsigned char* img, int w, int h, int x0, int y0, int x1, int y1, unsigned char r, unsigned char g, unsigned char b) {
    int dx = abs(x1 - x0);
    int dy = abs(y1 - y0);
    int sx = (x0 < x1) ? 1 : -1;
    int sy = (y0 < y1) ? 1 : -1;
    int err = dx - dy;
    while (1) {
        set_pixel(img, w, h, x0, y0, r, g, b);
        if (x0 == x1 && y0 == y1) break;
        int e2 = 2 * err;
        if (e2 > -dy) {
            err -= dy;
            x0 += sx;
        }
        if (e2 < dx) {
            err += dx;
            y0 += sy;
        }
    }
}

static void draw_circle(unsigned char* img, int w, int h, int cx, int cy, int radius, unsigned char r, unsigned char g, unsigned char b) {
    int x = radius;
    int y = 0;
    int err = 0;
    while (x >= y) {
        set_pixel(img, w, h, cx + x, cy + y, r, g, b);
        set_pixel(img, w, h, cx + y, cy + x, r, g, b);
        set_pixel(img, w, h, cx - y, cy + x, r, g, b);
        set_pixel(img, w, h, cx - x, cy + y, r, g, b);
        set_pixel(img, w, h, cx - x, cy - y, r, g, b);
        set_pixel(img, w, h, cx - y, cy - x, r, g, b);
        set_pixel(img, w, h, cx + y, cy - x, r, g, b);
        set_pixel(img, w, h, cx + x, cy - y, r, g, b);
        if (err <= 0) {
            y += 1;
            err += 2 * y + 1;
        }
        if (err > 0) {
            x -= 1;
            err -= 2 * x + 1;
        }
    }
}

static unsigned char* create_side_by_side(const unsigned char* img1, int w1, int h1,
                                          const unsigned char* img2, int w2, int h2,
                                          int* out_w, int* out_h) {
    int max_h = (h1 > h2) ? h1 : h2;
    int out_w_val = w1 + w2;
    int out_h_val = max_h;
    *out_w = out_w_val;
    *out_h = out_h_val;

    unsigned char* out = (unsigned char*)calloc(out_w_val * out_h_val * 3, sizeof(unsigned char));
    if (!out) return NULL;

    for (int i = 0; i < out_w_val * out_h_val * 3; i++) {
        out[i] = 30;
    }

    for (int y = 0; y < h1; y++) {
        for (int x = 0; x < w1; x++) {
            int src_idx = (y * w1 + x) * 3;
            int dst_idx = (y * out_w_val + x) * 3;
            out[dst_idx + 0] = img1[src_idx + 0];
            out[dst_idx + 1] = img1[src_idx + 1];
            out[dst_idx + 2] = img1[src_idx + 2];
        }
    }

    for (int y = 0; y < h2; y++) {
        for (int x = 0; x < w2; x++) {
            int src_idx = (y * w2 + x) * 3;
            int dst_idx = (y * out_w_val + (x + w1)) * 3;
            out[dst_idx + 0] = img2[src_idx + 0];
            out[dst_idx + 1] = img2[src_idx + 1];
            out[dst_idx + 2] = img2[src_idx + 2];
        }
    }

    return out;
}

static unsigned char* rgb_from_gray(const GrayImage* gray) {
    int w = gray->width;
    int h = gray->height;
    unsigned char* rgb = (unsigned char*)malloc(w * h * 3);
    if (!rgb) return NULL;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            unsigned char val = (unsigned char)(gray_image_get(gray, y, x) * 255.0f);
            int idx = (y * w + x) * 3;
            rgb[idx + 0] = val;
            rgb[idx + 1] = val;
            rgb[idx + 2] = val;
        }
    }
    return rgb;
}

int main(int argc, char** argv) {
    if (argc < 3) {
        printf("Usage: %s <image1> <image2> [output.jpg]\n", argv[0]);
        return 1;
    }

    const char* img1_path = argv[1];
    const char* img2_path = argv[2];
    const char* output_path = (argc >= 4) ? argv[3] : "sift_match_result.jpg";

    printf("Loading images...\n");
    GrayImage* gray1 = load_gray_image(img1_path);
    GrayImage* gray2 = load_gray_image(img2_path);
    if (!gray1 || !gray2) {
        fprintf(stderr, "Failed to load images\n");
        return 1;
    }

    printf("Image1: %dx%d\n", gray1->width, gray1->height);
    printf("Image2: %dx%d\n", gray2->width, gray2->height);

    printf("Extracting SIFT features from image1...\n");
    FeatureArray* f1 = sift_extract_features(gray1);
    printf("Features1: %d\n", f1 ? f1->size : 0);

    printf("Extracting SIFT features from image2...\n");
    FeatureArray* f2 = sift_extract_features(gray2);
    printf("Features2: %d\n", f2 ? f2->size : 0);

    if (!f1 || !f2 || f1->size == 0 || f2->size == 0) {
        fprintf(stderr, "No features found\n");
        return 1;
    }

    printf("Matching features...\n");
    MatchPair* matches = NULL;
    int match_count = match_features(f1, f2, &matches);
    printf("Matches: %d\n", match_count);

    unsigned char* rgb1 = rgb_from_gray(gray1);
    unsigned char* rgb2 = rgb_from_gray(gray2);
    if (!rgb1 || !rgb2) {
        fprintf(stderr, "Memory allocation failed\n");
        return 1;
    }

    int out_w, out_h;
    unsigned char* out = create_side_by_side(rgb1, gray1->width, gray1->height,
                                             rgb2, gray2->width, gray2->height,
                                             &out_w, &out_h);
    if (!out) {
        fprintf(stderr, "Memory allocation failed\n");
        return 1;
    }

    unsigned char colors[][3] = {
        {255, 0, 0}, {0, 255, 0}, {0, 0, 255},
        {255, 255, 0}, {255, 0, 255}, {0, 255, 255},
        {255, 128, 0}, {128, 255, 0}, {0, 128, 255},
        {255, 0, 128}, {128, 0, 255}, {0, 255, 128}
    };
    int color_count = sizeof(colors) / sizeof(colors[0]);
    int draw_count = (match_count > 50) ? 50 : match_count;

    for (int i = 0; i < draw_count; i++) {
        Feature* feat1 = f1->data[matches[i].idx1];
        Feature* feat2 = f2->data[matches[i].idx2];
        int x1 = (int)feat1->x;
        int y1 = (int)feat1->y;
        int x2 = (int)feat2->x + gray1->width;
        int y2 = (int)feat2->y;
        unsigned char* c = colors[i % color_count];
        draw_line(out, out_w, out_h, x1, y1, x2, y2, c[0], c[1], c[2]);
        draw_circle(out, out_w, out_h, x1, y1, 3, c[0], c[1], c[2]);
        draw_circle(out, out_w, out_h, x2, y2, 3, c[0], c[1], c[2]);
    }

    printf("Saving result to %s...\n", output_path);
    int success = stbi_write_jpg(output_path, out_w, out_h, 3, out, 95);
    if (!success) {
        fprintf(stderr, "Failed to write output image\n");
        return 1;
    }

    printf("Done. Result saved to %s\n", output_path);

    free(matches);
    free(rgb1);
    free(rgb2);
    free(out);
    feature_array_free(f1);
    feature_array_free(f2);
    gray_image_free(gray1);
    gray_image_free(gray2);

    return 0;
}

CudaSharpNative.cu (SIFT导出层节选)

// ==================== SIFT 模块导出接口 ====================

#include "gray_image.h"
#include "sift_algorithm.h"
#include "sift_matcher.h"
#include "image_similarity.h"

static GrayImage* load_gray_from_memory(const unsigned char* data, int size) {
    int w, h, channels;
    unsigned char* pixels = stbi_load_from_memory(data, size, &w, &h, &channels, 0);
    if (!pixels) return nullptr;
    GrayImage* img = gray_image_create(w, h);
    if (!img) { stbi_image_free(pixels); return nullptr; }
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int idx = (y * w + x) * channels;
            float gray = 0.0f;
            if (channels >= 3) {
                float r = pixels[idx + 0] / 255.0f;
                float g = pixels[idx + 1] / 255.0f;
                float b = pixels[idx + 2] / 255.0f;
                gray = r * 0.299f + g * 0.587f + b * 0.114f;
            } else if (channels == 1) {
                gray = pixels[idx] / 255.0f;
            }
            gray_image_set(img, y, x, gray);
        }
    }
    stbi_image_free(pixels);
    return img;
}

DLLEXPORT double CompareImagesSift(const unsigned char* data1, int size1,
                                    const unsigned char* data2, int size2) {
    if (!data1 || size1 <= 0 || !data2 || size2 <= 0) {
        lastError = "Invalid arguments";
        return -1.0;
    }
    GrayImage* img1 = load_gray_from_memory(data1, size1);
    if (!img1) { lastError = std::string("Failed to decode image 1: ") + stbi_failure_reason(); return -1.0; }
    GrayImage* img2 = load_gray_from_memory(data2, size2);
    if (!img2) { lastError = std::string("Failed to decode image 2: ") + stbi_failure_reason(); gray_image_free(img1); return -1.0; }

    FeatureArray* f1 = sift_extract_features(img1);
    FeatureArray* f2 = sift_extract_features(img2);
    double similarity = sift_matcher_compute_similarity(f1, f2);

    feature_array_free(f1);
    feature_array_free(f2);
    gray_image_free(img1);
    gray_image_free(img2);
    return similarity;
}

DLLEXPORT int CountMatchesSift(const unsigned char* data1, int size1,
                                const unsigned char* data2, int size2) {
    if (!data1 || size1 <= 0 || !data2 || size2 <= 0) {
        lastError = "Invalid arguments";
        return -1;
    }
    GrayImage* img1 = load_gray_from_memory(data1, size1);
    if (!img1) { lastError = std::string("Failed to decode image 1: ") + stbi_failure_reason(); return -1; }
    GrayImage* img2 = load_gray_from_memory(data2, size2);
    if (!img2) { lastError = std::string("Failed to decode image 2: ") + stbi_failure_reason(); gray_image_free(img1); return -1; }

    FeatureArray* f1 = sift_extract_features(img1);
    FeatureArray* f2 = sift_extract_features(img2);
    int matches = sift_matcher_count_matches(f1, f2);

    feature_array_free(f1);
    feature_array_free(f2);
    gray_image_free(img1);
    gray_image_free(img2);
    return matches;
}

DLLEXPORT void* EvaluatorNew() {
    return evaluator_new();
}

DLLEXPORT void EvaluatorFree(void* handle) {
    if (handle) evaluator_free(static_cast<ImageSimilarityEvaluator*>(handle));
}

DLLEXPORT double EvaluatorEvaluate(void* handle,
                                    const unsigned char* currentData, int currentSize,
                                    const char* label,
                                    const unsigned char* correctData, int correctSize) {
    if (!handle || !currentData || currentSize <= 0 || !label) {
        lastError = "Invalid arguments";
        return -1.0;
    }
    GrayImage* currentImg = load_gray_from_memory(currentData, currentSize);
    if (!currentImg) { lastError = std::string("Failed to decode current image: ") + stbi_failure_reason(); return -1.0; }

    GrayImage* correctImg = nullptr;
    if (correctData && correctSize > 0) {
        correctImg = load_gray_from_memory(correctData, correctSize);
        if (!correctImg) { lastError = std::string("Failed to decode correct image: ") + stbi_failure_reason(); gray_image_free(currentImg); return -1.0; }
    }

    double result = evaluator_evaluate(static_cast<ImageSimilarityEvaluator*>(handle),
                                        currentImg, label, correctImg);
    gray_image_free(currentImg);
    gray_image_free(correctImg);
    return result;
}

DLLEXPORT void EvaluatorPreloadTemplate(void* handle, const char* label,
                                         const unsigned char* correctData, int correctSize) {
    if (!handle || !label || !correctData || correctSize <= 0) return;
    GrayImage* correctImg = load_gray_from_memory(correctData, correctSize);
    if (!correctImg) return;
    evaluator_preload_template(static_cast<ImageSimilarityEvaluator*>(handle), label, correctImg);
    gray_image_free(correctImg);
}

CudaBinarizeLib.cs (SIFT部分)

using System;
using System.Runtime.InteropServices;

namespace CudaSharp
{
    public class CudaImageSimilarityEvaluator : IDisposable
    {
        private const string DllName = "libCudaSharpNative.so";
        private IntPtr _handle;

        [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
        private static extern IntPtr EvaluatorNew();

        [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
        private static extern void EvaluatorFree(IntPtr handle);

        [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
        private static extern double EvaluatorEvaluate(IntPtr handle,
            byte[] currentData, int currentSize, string label,
            byte[] correctData, int correctSize);

        [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
        private static extern void EvaluatorPreloadTemplate(IntPtr handle,
            string label, byte[] correctData, int correctSize);

        public CudaImageSimilarityEvaluator()
        {
            _handle = EvaluatorNew();
        }

        public double Evaluate(byte[] currentImage, string label)
        {
            return EvaluatorEvaluate(_handle, currentImage, currentImage.Length,
                                     label, null, 0);
        }

        public double Evaluate(byte[] currentImage, string label, byte[] templateImage)
        {
            return EvaluatorEvaluate(_handle, currentImage, currentImage.Length,
                                     label, templateImage, templateImage.Length);
        }

        public void PreloadTemplate(string label, byte[] templateImage)
        {
            EvaluatorPreloadTemplate(_handle, label, templateImage, templateImage.Length);
        }

        public void Dispose()
        {
            if (_handle != IntPtr.Zero)
            {
                EvaluatorFree(_handle);
                _handle = IntPtr.Zero;
            }
        }
    }

    public static class CudaSift
    {
        private const string DllName = "libCudaSharpNative.so";

        [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
        private static extern double CompareImagesSift(byte[] data1, int size1,
                                                        byte[] data2, int size2);

        [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
        private static extern int CountMatchesSift(byte[] data1, int size1,
                                                    byte[] data2, int size2);

        public static double CompareImages(byte[] image1, byte[] image2)
            => CompareImagesSift(image1, image1.Length, image2, image2.Length);

        public static int CountMatches(byte[] image1, byte[] image2)
            => CountMatchesSift(image1, image1.Length, image2, image2.Length);
    }
}

build.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CudaSharp - CUDA 动态阈值二值化 + SIFT DLL 构建脚本
"""

import os
import sys
import subprocess
from pathlib import Path

os.environ['PYTHONIOENCODING'] = 'utf-8'

CUDA_ARCHS = [
    ("61", "sm_61"),
    ("75", "sm_75"),
    ("86", "sm_86"),
    ("89", "sm_89"),
]

SOURCE_FILES = [
    "CudaSharpNative.cu",
    "gray_image.c",
    "image_ops.cu",
    "sift_types.c",
    "sift_algorithm.cu",
    "sift_matcher.cu",
    "image_similarity.cu",
]
OUTPUT_NAME = "CudaSharpNative.dll"
BUILD_DIR = "build"


def print_header():
    print("=" * 50)
    print("CudaSharp - CUDA DLL 构建脚本")
    print("=" * 50)
    print()


def find_nvcc_linux():
    nvcc = "nvcc"
    try:
        result = subprocess.run(["which", nvcc], capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout.strip()
    except Exception:
        pass

    cuda_path = os.environ.get("CUDA_PATH", "")
    if cuda_path:
        nvcc_path = os.path.join(cuda_path, "bin", "nvcc")
        if os.path.exists(nvcc_path):
            return nvcc_path

    common_paths = [
        "/usr/local/cuda/bin/nvcc",
        "/usr/local/cuda-12.6/bin/nvcc",
        "/usr/local/cuda-12.4/bin/nvcc",
        "/usr/local/cuda-12.0/bin/nvcc",
        "/usr/local/cuda-11.8/bin/nvcc",
    ]
    for p in common_paths:
        if os.path.exists(p):
            return p

    print("[错误] 找不到 nvcc，请确保 CUDA Toolkit 已安装并加入 PATH")
    sys.exit(1)


def build_linux():
    build_path = Path(BUILD_DIR)
    build_path.mkdir(exist_ok=True)

    nvcc = find_nvcc_linux()
    print(f"[信息] 使用 NVCC: {nvcc}")
    try:
        result = subprocess.run([nvcc, "--version"], capture_output=True, text=True)
        print("[信息] NVCC 版本:")
        print(result.stdout)
    except Exception as e:
        print(f"[错误] 无法运行 nvcc: {e}")
        sys.exit(1)

    output_name = "libCudaSharpNative.so"

    old_so = build_path / output_name
    if old_so.exists():
        old_so.unlink()

    gencode_args = ["-arch=native"]

    cmd = [
        nvcc,
        *SOURCE_FILES,
        "--shared",
        "-o", str(build_path / output_name),
        "-O3",
        *gencode_args,
        "--use_fast_math",
        "-Xcompiler", "-fPIC",
        "-I.",
        "-Istb_image",
        "-DNDEBUG"
    ]

    print("[信息] 开始编译 CUDA 共享库...")
    print(f"[信息] 命令: {' '.join(cmd)}")
    print()

    log_path = build_path / "build.log"
    try:
        with open(log_path, "w", encoding="utf-8") as log_file:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding="utf-8",
                errors="ignore"
            )
            log_file.write(result.stdout)
            print(result.stdout, end="")

        if result.returncode != 0:
            print()
            print(f"[错误] 编译失败！详细日志见: {log_path}")
            sys.exit(1)
        else:
            print(f"[信息] 编译日志已保存到: {log_path}")
    except Exception as e:
        print(f"[错误] 编译过程出错: {e}")
        sys.exit(1)

    print()
    print("=" * 50)
    print(f"[成功] CUDA 共享库编译完成！")
    print(f"输出文件: {BUILD_DIR}/{output_name}")
    print("=" * 50)
    print()

    # 编译 SIFT 测试程序
    test_cpp = Path("test_sift.cpp")
    if test_cpp.exists():
        print("[信息] 编译 SIFT 测试程序...")
        test_out = build_path / "test_sift"
        test_sources = [src for src in SOURCE_FILES if src != "CudaSharpNative.cu"]
        test_cmd = [
            nvcc,
            str(test_cpp),
            *test_sources,
            "-o", str(test_out),
            "-O3",
            *gencode_args,
            "--use_fast_math",
            "-Xcompiler", "-fPIC",
            "-I.",
            "-Istb_image",
            "-DNDEBUG"
        ]
        print(f"[信息] 命令: {' '.join(test_cmd)}")
        try:
            result = subprocess.run(
                test_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding="utf-8",
                errors="ignore"
            )
            print(result.stdout, end="")
            if result.returncode == 0:
                print(f"[成功] 测试程序编译完成: {test_out}")
            else:
                print("[警告] 测试程序编译失败")
        except Exception as e:
            print(f"[警告] 测试程序编译出错: {e}")
        print()


def main():
    print_header()
    if sys.platform == "win32":
        print("[信息] 检测到 Windows 平台")
    else:
        print("[信息] 检测到 Linux 平台，使用 Linux 编译路径")
        print()
        build_linux()


if __name__ == "__main__":
    main()

benchmark_sift.py

#!/usr/bin/env python3
"""
SIFT 性能基准测试：连续运行 10 次，计算平均耗时
"""

import subprocess
import time
import os
import sys

TEST_PAIRS = [
    ("1.jpg", "2.jpg"),
    ("3.jpg", "4.jpg"),
]

RUNS = 10
TEST_SIFT_PATH = "./build/test_sift"


def run_single(pair_idx, img1, img2, run_idx):
    """运行一次 SIFT 测试，返回耗时（秒）"""
    output_file = f"bench_{pair_idx}_{run_idx}.jpg"
    cmd = [TEST_SIFT_PATH, img1, img2, output_file]
    
    start = time.perf_counter()
    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    elapsed = time.perf_counter() - start
    
    if result.returncode != 0:
        print(f"[错误] 运行失败: {result.stderr}")
        return None
    
    matches = 0
    for line in result.stdout.splitlines():
        if "Matches:" in line:
            try:
                matches = int(line.split(":")[1].strip())
            except:
                pass
    
    return elapsed, matches, output_file


def benchmark_pair(pair_idx, img1, img2):
    """对一对图像运行 10 次测试"""
    print(f"\n{'='*50}")
    print(f"测试组合 {pair_idx + 1}: {img1} & {img2}")
    print(f"{'='*50}")
    
    times = []
    matches_list = []
    last_output = None
    
    for i in range(RUNS):
        print(f"  第 {i+1}/{RUNS} 次运行...", end=" ", flush=True)
        result = run_single(pair_idx, img1, img2, i)
        if result is None:
            continue
        elapsed, matches, output_file = result
        times.append(elapsed)
        matches_list.append(matches)
        last_output = output_file
        print(f"耗时: {elapsed*1000:.2f} ms, 匹配: {matches}")
    
    if not times:
        return None
    
    avg_ms = sum(times) / len(times) * 1000
    min_ms = min(times) * 1000
    max_ms = max(times) * 1000
    avg_matches = sum(matches_list) / len(matches_list)
    
    print(f"\n  统计结果:")
    print(f"    平均耗时: {avg_ms:.2f} ms")
    print(f"    最小耗时: {min_ms:.2f} ms")
    print(f"    最大耗时: {max_ms:.2f} ms")
    print(f"    平均匹配: {avg_matches:.1f}")
    
    return {
        "pair": f"{img1} & {img2}",
        "avg_ms": avg_ms,
        "min_ms": min_ms,
        "max_ms": max_ms,
        "avg_matches": avg_matches,
        "runs": len(times),
        "last_output": last_output
    }


def main():
    print("="*50)
    print("SIFT 性能基准测试")
    print(f"运行次数: {RUNS} 次/组合")
    print(f"测试程序: {TEST_SIFT_PATH}")
    print("="*50)
    
    if not os.path.exists(TEST_SIFT_PATH):
        print(f"[错误] 找不到测试程序: {TEST_SIFT_PATH}")
        sys.exit(1)
    
    results = []
    for idx, (img1, img2) in enumerate(TEST_PAIRS):
        result = benchmark_pair(idx, img1, img2)
        if result:
            results.append(result)
    
    # 清理临时文件
    for idx, (img1, img2) in enumerate(TEST_PAIRS):
        for i in range(RUNS):
            f = f"bench_{idx}_{i}.jpg"
            if os.path.exists(f) and f != results[idx]["last_output"]:
                os.remove(f)
    
    # 生成报告
    print("\n" + "="*50)
    print("测试完成！生成报告...")
    print("="*50)
    
    report_lines = [
        "【SIFT 性能基准测试报告】",
        "",
        f"测试配置：连续运行 {RUNS} 次/组合",
        "",
    ]
    
    for r in results:
        report_lines.append(f"组合: {r['pair']}")
        report_lines.append(f"  平均耗时: {r['avg_ms']:.2f} ms")
        report_lines.append(f"  最小耗时: {r['min_ms']:.2f} ms")
        report_lines.append(f"  最大耗时: {r['max_ms']:.2f} ms")
        report_lines.append(f"  平均匹配: {r['avg_matches']:.1f}")
        report_lines.append("")
    
    report = "\n".join(report_lines)
    print(report)
    
    with open("benchmark_report.txt", "w", encoding="utf-8") as f:
        f.write(report)
    
    print("\n报告已保存到 benchmark_report.txt")
    print("结果图:")
    for r in results:
        print(f"  {r['last_output']}")


if __name__ == "__main__":
    main()

编译及测试

# 完整构建流程
python3 build.py

# SIFT 特征匹配测试
./build/test_sift <image1> <image2> [output.jpg]

# 示例
./build/test_sift 1.jpg 2.jpg sift_result.jpg

# 性能基准测试
python3 benchmark_sift.py

运行输出:

Loading images...
Image1: 800x600
Image2: 1080x900
Extracting SIFT features from image1...
Features1: 156
Extracting SIFT features from image2...
Features2: 203
Matching features...
Matches: 89
Saving result to sift_result.jpg...
Done. Result saved to sift_result.jpg

许可证

本项目使用 stb_image 库，遵循其公共领域许可证条款。

posted @ 2026-04-18 12:59 qsBye 阅读(6) 评论(0) 收藏举报

刷新页面返回顶部

qsBye