语言-目标检测 MM Grounding Dino Large (1) 环境配置 -GroundingDino针对航空图像检测的改进

https://blog.csdn.net/gitblog_00330/article/details/152013136

MM Grounding Dino Large在无人机航拍图像中的检测性能

https://link.gitcode.com/i/9da6757aed6c4f33f18c964e0fed76c2?uuid_tt_dd=10_10332516180-1761162132749-683371&isLogin=1&from_id=152013136

https://arxiv.org/abs/2401.02361

测试数据集构建

针对无人机航拍特性，构建包含以下场景的测试集：

城市航拍：建筑物、车辆、行人（分辨率3840×2160）
乡村农田：农机、作物行、电线杆（分辨率2560×1440）
灾害救援：倒塌建筑、救援车辆、幸存者（分辨率1920×1080）

典型案例分析
小目标检测能力：在300米高空拍摄的农田图像中（单个农机目标像素尺寸约20×30），模型实现89.7%的召回率，优于YOLOv8x的76.2%。通过可视化特征图可见，Swin-Large的stage4特征层（1/32下采样）仍能保留农机的关键轮廓信息。

类别泛化能力：对于训练集中未出现的"太阳能光伏板"类别，通过文本提示"a solar panel with blue cells"，模型实现零样本检测mAP 37.5，验证了GoldG数据集带来的开放词汇理解能力。

DJI_0183

https://huggingface.co/collections/rziga/mm-grounding-dino

https://huggingface.co/openmmlab-community/mm_grounding_dino_large_all

1 下载工程

git clone https://gitcode.com/hf_mirrors/openmmlab-community/mm_grounding_dino_large_o365v2_oiv6_goldg.git
cd mm_grounding_dino_large_o365v2_oiv6_goldg

仓库中包含模型权重文件model.safetensors、配置文件config.json、预处理配置preprocessor_config.json以及分词器相关文件special_tokens_map.json和vocab.txt，这些文件共同构成了完整的模型运行环境。

2 安装库

测试环境：CUDA 11.8，Ubuntu 20.04，RTX 3070

# Create and activate the conda environment
conda create -n sam2 python=3.10 -y
conda activate sam2
 
# Install PyTorch (adjust the index URL to match your CUDA version)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
 
# Install core dependencies
# transformers 4.28.0 predates AutoModelForZeroShotObjectDetection and the
# MM Grounding DINO architecture, so install a current release instead.
pip install -U transformers
# accelerate is needed by transformers whenever a model is loaded with a device_map argument.
pip install accelerate
pip install datasets
pip install opencv-python
pip install matplotlib
pip install timm
# NOTE(review): mmcv-full is not imported by the scripts in this post, and this wheel
# index targets torch 1.13 while the command above installs a current torch - confirm it is needed.
pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu118/torch1.13.0/index.html

3 如何加速和准确

# Define defect types (Chinese label strings)
    defect_types = [
        "裂纹", "划痕", "凹痕", 
        "污渍", "变形"
    ]


# Define defect types (English label strings)
    defect_types = [
        "crack", "scratch", "dent", 
        "stain", "deformation"
    ]

测试数据集构建
针对无人机航拍特性，构建包含以下场景的测试集：

城市航拍：建筑物、车辆、行人（分辨率3840×2160）
乡村农田：农机、作物行、电线杆（分辨率2560×1440）
灾害救援：倒塌建筑、救援车辆、幸存者（分辨率1920×1080）


    # Text prompts (zero-shot categories) for the aerial scenes listed above
    text_labels = ["vehicle", "person", "building", "tree", 
                   "power line", "agricultural machinery", "water body"]

==========================

代码1 处理一张图片，使用Matplotlib显示

使用的是float16（半精度），而不是float32

import torch
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
 

import os
# Set the CUDA memory allocator configuration through the environment
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Release cached, currently unused GPU memory held by PyTorch
torch.cuda.empty_cache()


import numpy as np


# Model ID or local path
model_path = "./"  # current project directory
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on the GPU; on CPU fall back to float32,
# where float16 kernels are frequently missing or slow.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the processor and the model.
# The weights are placed with one explicit .to(device); device_map="auto" is
# not combined with it, because that would dispatch the model through
# accelerate first and then move it a second time.
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    model_path,
    torch_dtype=model_dtype,
).to(device)




from transformers.image_utils import load_image
 
# 1. Load the image
image_url = "DJI_0183.JPG"
image = load_image(image_url)  # a local path such as load_image("./test.jpg") also works

# 2. Define the text prompts (zero-shot categories)
text_labels = [
    "vehicle", "person", "building", "tree",
    "power line", "agricultural machinery", "water body"
]

# 3. Preprocess and run inference
inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)

# float16 autocast is only requested when the model actually runs on CUDA;
# on CPU the model is executed as loaded instead of asking for a CUDA
# autocast context that cannot be honoured.
from contextlib import nullcontext

amp_context = (
    torch.autocast(device_type="cuda", dtype=torch.float16)
    if device == "cuda"
    else nullcontext()
)
with torch.no_grad(), amp_context:
    outputs = model(**inputs)

# 4. Post-process the results
results = processor.post_process_grounded_object_detection(
    outputs,
    threshold=0.3,  # confidence threshold
    target_sizes=[(image.height, image.width)]
)


# Take the detections of the first image
result = results[0]

# Report each detection: label, confidence, and box corners rounded to two decimals
detections = zip(result["boxes"], result["scores"], result["labels"])
for raw_box, confidence, name in detections:
    xmin, ymin, xmax, ymax = (round(value, 2) for value in raw_box.tolist())
    message = (
        f"检测到: {name} "
        f"置信度: {confidence.item():.3f} "
        f"位置: [{xmin}, {ymin}, {xmax}, {ymax}]"
    )
    print(message)



import cv2
import matplotlib.pyplot as plt
 
def visualize_detection(image, result, threshold=0.3, output_path="detection_result.jpg"):
    """Draw detections on an image, show it with Matplotlib and save it to disk.

    Args:
        image: RGB image that ``np.array`` can convert (the script passes the
            image returned by ``load_image``).
        result: Mapping with ``"boxes"``, ``"scores"`` and ``"labels"``
            entries; each box unpacks to (xmin, ymin, xmax, ymax) pixel
            coordinates.
        threshold: Detections scoring below this value are not drawn.
        output_path: File the annotated image is written to.

    Returns:
        The annotated image as an RGB array.
    """
    # Convert the RGB input to OpenCV's BGR layout for drawing
    img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # Box colours cycle by detection index; the tuples are used on the BGR image
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]

    # Draw the bounding boxes and their labels
    for i, (box, score, label) in enumerate(zip(result["boxes"], result["scores"], result["labels"])):
        if score < threshold:
            continue

        xmin, ymin, xmax, ymax = [int(round(coord)) for coord in box.tolist()]
        color = colors[i % len(colors)]

        # Bounding box
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)

        # Label background: normally sits directly above the box, but is
        # pushed down to y=0 when the box touches the top edge so that the
        # label stays inside the image.
        label_text = f"{label}: {score.item():.2f}"
        (text_width, text_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        label_top = max(ymin - text_height - 10, 0)
        label_bottom = label_top + text_height + 10
        cv2.rectangle(img, (xmin, label_top), (xmin + text_width, label_bottom), color, -1)

        # Label text
        cv2.putText(img, label_text, (xmin, label_bottom - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

    # Convert back to RGB for Matplotlib
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Show the result
    plt.figure(figsize=(10, 10))
    plt.imshow(img_rgb)
    plt.axis('off')
    plt.show()

    # Save the BGR image, which is the layout cv2.imwrite expects
    cv2.imwrite(output_path, img)
    return img_rgb
 
# Call the visualization function
visualize_detection(image, result, threshold=0.3)

代码2 从文件夹读取数据 Opencv可视化

使用的是float16（半精度），而不是float32

图片可按img_scale缩小：img_scale=2时宽高各缩小一半；示例中img_scale=1，即不缩放

import os
import cv2
import torch
import numpy as np
import time
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
from transformers.image_utils import load_image
import matplotlib.pyplot as plt

# Set the CUDA memory allocator configuration through the environment,
# then release cached, currently unused GPU memory held by PyTorch
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()


# Resize step applied before detection
def preprocess_image(image, scale):
    """Return ``image`` resized with both sides divided by ``scale``.

    Dividing width and height by the same value keeps the aspect ratio (up
    to integer truncation): ``scale=2`` halves both sides and ``scale=1``
    keeps the original size.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (e.g. a PIL image).
        scale: Positive divisor applied to both sides.

    Returns:
        Whatever ``image.resize`` returns for the new size.

    Raises:
        ValueError: If ``scale`` is not greater than zero.
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale!r}")
    width, height = image.size
    # Clamp to one pixel so a large divisor never requests an empty image
    new_size = (max(1, int(width / scale)), max(1, int(height / scale)))
    return image.resize(new_size)



# Initialize the model and the processor
def initialize_model(model_path):
    """Load the processor and zero-shot detection model from ``model_path``.

    The model is loaded in float16 when CUDA is available and in float32
    otherwise, then moved to the selected device with one explicit
    ``.to(device)``. ``device_map="auto"`` is not combined with that call,
    because it would dispatch the model through accelerate first and then
    move it a second time.

    Args:
        model_path: Model ID or local directory passed to ``from_pretrained``.

    Returns:
        Tuple ``(processor, model, device)`` where ``device`` is ``"cuda"``
        when CUDA is available and ``"cpu"`` otherwise.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForZeroShotObjectDetection.from_pretrained(
        model_path,
        torch_dtype=dtype,
    ).to(device)
    return processor, model, device

# Run object detection
def detect_objects(image, processor, model, device, text_labels, threshold=0.3):
    """Run zero-shot detection on one image and return its result.

    Args:
        image: Image exposing ``height`` and ``width`` attributes; it is
            passed to ``processor`` as ``images``.
        processor: Callable preprocessor that also provides
            ``post_process_grounded_object_detection``.
        model: Callable model invoked with the processor outputs as keyword
            arguments.
        device: Device the inputs are moved to (string or ``torch.device``).
        text_labels: Text prompts naming the categories to detect.
        threshold: Confidence threshold forwarded to post-processing.

    Returns:
        The first element of the post-processed results.
    """
    from contextlib import nullcontext

    inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)

    # Request float16 autocast only when running on CUDA; on other devices a
    # CUDA autocast context cannot be honoured, so run the model as loaded.
    if torch.device(device).type == "cuda":
        amp_context = torch.autocast(device_type="cuda", dtype=torch.float16)
    else:
        amp_context = nullcontext()

    with torch.no_grad(), amp_context:
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        threshold=threshold,
        target_sizes=[(image.height, image.width)]
    )
    return results[0]

# Visualize detection results (with an optional FPS overlay)
def visualize_detection(image, result, fps=None, threshold=0.3):
    """Draw detections, and optionally an FPS counter, onto a copy of the image.

    Args:
        image: RGB image that ``np.array`` can convert.
        result: Mapping with ``"boxes"``, ``"scores"`` and ``"labels"``
            entries; each box unpacks to (xmin, ymin, xmax, ymax) pixel
            coordinates.
        fps: Frames-per-second value drawn in the top-left corner, or
            ``None`` to omit it.
        threshold: Detections scoring below this value are not drawn.

    Returns:
        The annotated image as an RGB array.
    """
    img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Box colours cycle by detection index; the tuples are used on the BGR image
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]

    # Draw the detections
    for i, (box, score, label) in enumerate(zip(result["boxes"], result["scores"], result["labels"])):
        if score < threshold:
            continue

        xmin, ymin, xmax, ymax = [int(round(coord)) for coord in box.tolist()]
        color = colors[i % len(colors)]

        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)

        label_text = f"{label}: {score.item():.2f}"
        (text_width, text_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        # The label strip normally sits directly above the box; it is pushed
        # down to y=0 when the box touches the top edge so it stays visible.
        label_top = max(ymin - text_height - 10, 0)
        label_bottom = label_top + text_height + 10
        cv2.rectangle(img, (xmin, label_top), (xmin + text_width, label_bottom), color, -1)
        cv2.putText(img, label_text, (xmin, label_bottom - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

    # Add the FPS overlay
    if fps is not None:
        fps_text = f"FPS: {fps:.1f}"
        cv2.putText(img, fps_text, (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Main routine: browse the images of a folder with detections and FPS overlaid
def process_folder_images(folder_path, model_path, img_scale=1):
    """Interactively run detection on the ``DJI_*.jpg`` images of a folder.

    Each image is loaded, resized by ``img_scale``, run through the detector
    and shown in an OpenCV window. ESC or ``q`` quits; ``n``, space and key
    codes 83/2 advance; ``p`` and key codes 81/3 go back (the numeric codes
    are presumably arrow keys on some platforms - verify for yours). Any
    other key re-processes the current image.

    Args:
        folder_path: Directory that is listed for image files.
        model_path: Model ID or local directory passed to ``initialize_model``.
        img_scale: Divisor passed to ``preprocess_image``.

    Returns:
        None. Prints a message and returns early when no image matches.
    """
    # Collect and sort every DJI_*.jpg file (extension matched case-insensitively)
    image_files = sorted(
        f for f in os.listdir(folder_path)
        if f.startswith('DJI_') and f.lower().endswith('.jpg')
    )

    if not image_files:
        print("未找到DJI_*.JPG格式的图像文件")
        return

    # Initialize the model
    processor, model, device = initialize_model(model_path)
    text_labels = ["vehicle", "person", "building", "tree",
                   "power line", "agricultural machinery", "water body"]

    # Create a resizable window
    window_name = 'Zero-Shot Object Detection'
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

    current_index = 0
    total_images = len(image_files)

    # try/finally so the window is closed even if loading or inference fails
    try:
        while True:
            # Start timing; the measured span covers loading, resizing and detection
            start_time = time.time()

            # Load the current image
            image_path = os.path.join(folder_path, image_files[current_index])
            image = load_image(image_path)

            # Shrink both sides by img_scale (1 keeps the original size)
            image = preprocess_image(image, img_scale)

            # Run detection
            result = detect_objects(image, processor, model, device, text_labels)

            # Derive FPS from the elapsed processing time
            inference_time = time.time() - start_time
            fps = 1.0 / inference_time if inference_time > 0 else 0

            # Draw the detections together with the FPS value
            result_img = visualize_detection(image, result, fps)

            # visualize_detection returns RGB; imshow expects BGR
            cv2.imshow(window_name, cv2.cvtColor(result_img, cv2.COLOR_RGB2BGR))

            # Log progress (including FPS)
            print(f"处理: {image_files[current_index]} ({current_index + 1}/{total_images}) | FPS: {fps:.1f}")

            # Block until a key is pressed
            key = cv2.waitKey(0) & 0xFF

            # Key handling
            if key == 27 or key == ord('q'):  # ESC or q quits
                break
            elif key == ord('n') or key == 32 or key == 83 or key == 2:  # next image
                current_index = (current_index + 1) % total_images
            elif key == ord('p') or key == 81 or key == 3:  # previous image
                current_index = (current_index - 1) % total_images
    finally:
        cv2.destroyAllWindows()

# Usage example
if __name__ == "__main__":
    folder_path = "/media/r9000k/DD_XS/2数据/2RTK/data_4_city/300_locatiopn_2pm/images"  # image folder path
    model_path = "./"   # model path
    img_scale=1 # scale value forwarded to process_folder_images
    process_folder_images(folder_path, model_path,img_scale)

import os
import cv2
import torch
import numpy as np
import time
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
from transformers.image_utils import load_image
import matplotlib.pyplot as plt

# Set the CUDA memory allocator configuration through the environment,
# then release cached, currently unused GPU memory held by PyTorch
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()


# Resize step applied before detection
def preprocess_image(image, scale):
    """Return ``image`` resized with both sides divided by ``scale``.

    Dividing width and height by the same value keeps the aspect ratio (up
    to integer truncation): ``scale=2`` halves both sides and ``scale=1``
    keeps the original size.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (e.g. a PIL image).
        scale: Positive divisor applied to both sides.

    Returns:
        Whatever ``image.resize`` returns for the new size.

    Raises:
        ValueError: If ``scale`` is not greater than zero.
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale!r}")
    width, height = image.size
    # Clamp to one pixel so a large divisor never requests an empty image
    new_size = (max(1, int(width / scale)), max(1, int(height / scale)))
    return image.resize(new_size)



# Initialize the model and the processor
def initialize_model(model_path):
    """Load the processor and zero-shot detection model from ``model_path``.

    The model is loaded in float16 when CUDA is available and in float32
    otherwise, then moved to the selected device with one explicit
    ``.to(device)``. ``device_map="auto"`` is not combined with that call,
    because it would dispatch the model through accelerate first and then
    move it a second time.

    Args:
        model_path: Model ID or local directory passed to ``from_pretrained``.

    Returns:
        Tuple ``(processor, model, device)`` where ``device`` is ``"cuda"``
        when CUDA is available and ``"cpu"`` otherwise.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForZeroShotObjectDetection.from_pretrained(
        model_path,
        torch_dtype=dtype,
    ).to(device)
    return processor, model, device

# Run object detection
def detect_objects(image, processor, model, device, text_labels, threshold=0.3):
    """Run zero-shot detection on one image and return its result.

    Args:
        image: Image exposing ``height`` and ``width`` attributes; it is
            passed to ``processor`` as ``images``.
        processor: Callable preprocessor that also provides
            ``post_process_grounded_object_detection``.
        model: Callable model invoked with the processor outputs as keyword
            arguments.
        device: Device the inputs are moved to (string or ``torch.device``).
        text_labels: Text prompts naming the categories to detect.
        threshold: Confidence threshold forwarded to post-processing.

    Returns:
        The first element of the post-processed results.
    """
    from contextlib import nullcontext

    inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)

    # Request float16 autocast only when running on CUDA; on other devices a
    # CUDA autocast context cannot be honoured, so run the model as loaded.
    if torch.device(device).type == "cuda":
        amp_context = torch.autocast(device_type="cuda", dtype=torch.float16)
    else:
        amp_context = nullcontext()

    with torch.no_grad(), amp_context:
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        threshold=threshold,
        target_sizes=[(image.height, image.width)]
    )
    return results[0]

# Visualize detection results (with an optional FPS overlay)
def visualize_detection(image, result, fps=None, threshold=0.3):
    """Draw detections, and optionally an FPS counter, onto a copy of the image.

    Args:
        image: RGB image that ``np.array`` can convert.
        result: Mapping with ``"boxes"``, ``"scores"`` and ``"labels"``
            entries; each box unpacks to (xmin, ymin, xmax, ymax) pixel
            coordinates.
        fps: Frames-per-second value drawn in the top-left corner, or
            ``None`` to omit it.
        threshold: Detections scoring below this value are not drawn.

    Returns:
        The annotated image as an RGB array.
    """
    img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Box colours cycle by detection index; the tuples are used on the BGR image
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]

    # Draw the detections
    for i, (box, score, label) in enumerate(zip(result["boxes"], result["scores"], result["labels"])):
        if score < threshold:
            continue

        xmin, ymin, xmax, ymax = [int(round(coord)) for coord in box.tolist()]
        color = colors[i % len(colors)]

        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)

        label_text = f"{label}: {score.item():.2f}"
        (text_width, text_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        # The label strip normally sits directly above the box; it is pushed
        # down to y=0 when the box touches the top edge so it stays visible.
        label_top = max(ymin - text_height - 10, 0)
        label_bottom = label_top + text_height + 10
        cv2.rectangle(img, (xmin, label_top), (xmin + text_width, label_bottom), color, -1)
        cv2.putText(img, label_text, (xmin, label_bottom - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

    # Add the FPS overlay
    if fps is not None:
        fps_text = f"FPS: {fps:.1f}"
        cv2.putText(img, fps_text, (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Main routine: browse the images of a folder with detections and FPS overlaid
def process_folder_images(folder_path, model_path, img_scale=1):
    """Interactively run detection on the ``DJI_*.jpg`` images of a folder.

    Each image is loaded, resized by ``img_scale``, run through the detector
    and shown in an OpenCV window. ESC or ``q`` quits; ``n``, space and key
    codes 83/2 advance; ``p`` and key codes 81/3 go back (the numeric codes
    are presumably arrow keys on some platforms - verify for yours). Any
    other key re-processes the current image.

    Args:
        folder_path: Directory that is listed for image files.
        model_path: Model ID or local directory passed to ``initialize_model``.
        img_scale: Divisor passed to ``preprocess_image``.

    Returns:
        None. Prints a message and returns early when no image matches.
    """
    # Collect and sort every DJI_*.jpg file (extension matched case-insensitively)
    image_files = sorted(
        f for f in os.listdir(folder_path)
        if f.startswith('DJI_') and f.lower().endswith('.jpg')
    )

    if not image_files:
        print("未找到DJI_*.JPG格式的图像文件")
        return

    # Initialize the model
    processor, model, device = initialize_model(model_path)
    text_labels = ["vehicle", "person", "building", "tree",
                   "power line", "agricultural machinery", "water body"]

    # Create a resizable window
    window_name = 'Zero-Shot Object Detection'
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

    current_index = 0
    total_images = len(image_files)

    # try/finally so the window is closed even if loading or inference fails
    try:
        while True:
            # Start timing; the measured span covers loading, resizing and detection
            start_time = time.time()

            # Load the current image
            image_path = os.path.join(folder_path, image_files[current_index])
            image = load_image(image_path)

            # Shrink both sides by img_scale (1 keeps the original size)
            image = preprocess_image(image, img_scale)

            # Run detection
            result = detect_objects(image, processor, model, device, text_labels)

            # Derive FPS from the elapsed processing time
            inference_time = time.time() - start_time
            fps = 1.0 / inference_time if inference_time > 0 else 0

            # Draw the detections together with the FPS value
            result_img = visualize_detection(image, result, fps)

            # visualize_detection returns RGB; imshow expects BGR
            cv2.imshow(window_name, cv2.cvtColor(result_img, cv2.COLOR_RGB2BGR))

            # Log progress (including FPS)
            print(f"处理: {image_files[current_index]} ({current_index + 1}/{total_images}) | FPS: {fps:.1f}")

            # Block until a key is pressed
            key = cv2.waitKey(0) & 0xFF

            # Key handling
            if key == 27 or key == ord('q'):  # ESC or q quits
                break
            elif key == ord('n') or key == 32 or key == 83 or key == 2:  # next image
                current_index = (current_index + 1) % total_images
            elif key == ord('p') or key == 81 or key == 3:  # previous image
                current_index = (current_index - 1) % total_images
    finally:
        cv2.destroyAllWindows()

# Usage example
if __name__ == "__main__":
    folder_path = "/media/r9000k/DD_XS/2数据/2RTK/data_4_city/300_locatiopn_2pm/images"  # image folder path
    model_path = "./"   # model path
    img_scale=1 # scale value forwarded to process_folder_images
    process_folder_images(folder_path, model_path,img_scale)

posted on 2025-10-28 01:13 MKT-porter 阅读(3) 评论(0) 收藏举报

刷新页面返回顶部