1 Apply for access
Model download:
https://huggingface.co/facebook/sam3
Apply on the website; approval took roughly a few dozen minutes.

Once approved, you are given a redirect link:
https://huggingface.co/collections/facebook/sam3

Download facebook/sam3.
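A minimal download sketch using huggingface_hub's snapshot_download (assumes your access request was approved and you are logged in, e.g. via `huggingface-cli login`; the local_dir path is just an example):
'''
from huggingface_hub import snapshot_download

# Download the gated facebook/sam3 repo to a local folder
# (example path; adjust to your own layout)
snapshot_download(
    repo_id="facebook/sam3",
    local_dir="assets/model",
)
'''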

Later, the code loads the local weights:
# Load the model from a local checkpoint
model = build_sam3_image_model(
    checkpoint_path="/home/r9000k/v2_project/sam/sam3/assets/model/sam3.pt"
)
processor = Sam3Processor(model, confidence_threshold=0.3)
2 Installation
1 Although the official requirement is CUDA 12, CUDA 11.8 also works; install the matching torch build.
2 On Windows 11, a conda install hit various missing-library errors.
'''
conda create -n sam3 python=3.12
conda deactivate
conda activate sam3
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
git clone https://github.com/facebookresearch/sam3.git
cd sam3
pip install -e .
# For running example notebooks
pip install -e ".[notebooks]"
# For development
pip install -e ".[train,dev]"
# Prebuilt Triton wheel for Windows (Python 3.12):
https://huggingface.co/madbuda/triton-windows-builds/blob/main/triton-3.0.0-cp312-cp312-win_amd64.whl
############ 1 Extra libraries
In PowerShell, run the following commands:
git clone https://github.com/triton-lang/triton.git
cd triton
pip install -r python/requirements.txt # build-time dependencies
pip install -e .
'''
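A quick sanity check after installation, a minimal sketch that only verifies the CUDA build of torch is active and that triton imports:
'''
import torch
import triton

# Expect a cu118 build, a CUDA version string, and True for GPU availability
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())
print(triton.__version__)
'''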
import torch
#################################### For Image ####################################
from PIL import Image
from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor
# Load the model
model = build_sam3_image_model()
processor = Sam3Processor(model)
# Load an image
image = Image.open("npu2pm.JPG")
inference_state = processor.set_image(image)
# Prompt the model with text
output = processor.set_text_prompt(state=inference_state, prompt="house")
# Get the masks, bounding boxes, and scores
masks, boxes, scores = output["masks"], output["boxes"], output["scores"]
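Since masks, boxes, and scores share the first dimension, low-confidence detections can be dropped with a boolean mask (a minimal sketch; 0.5 is an arbitrary example threshold):
'''
# Keep only detections above an example confidence threshold
keep = scores > 0.5
masks, boxes, scores = masks[keep], boxes[keep], scores[keep]
print(f"{len(scores)} detections kept")
'''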
Full run 1

Example result images (text prompts): car and building, building, tree, car.
1 Test 1
import torch
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.patches as patches
import time  # timing module

#################################### For Image ####################################
from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor

# Record overall start time
total_start_time = time.time()

# Record model-loading start time
model_load_start_time = time.time()

# Load the model
model = build_sam3_image_model(
    checkpoint_path="/home/r9000k/v2_project/sam/sam3/assets/model/sam3.pt"
)
processor = Sam3Processor(model, confidence_threshold=0.3)

# Record model-loading end time
model_load_end_time = time.time()
model_load_time = model_load_end_time - model_load_start_time
print(f"Model load time: {model_load_time:.3f} s")

# Record single-image detection start time
detection_start_time = time.time()

image_path = "testimage/微信图片_20251120225825_37.jpg"

# Load an image
image = Image.open(image_path)
inference_state = processor.set_image(image)

# Prompt the model with text
output = processor.set_text_prompt(state=inference_state, prompt="person in white clothes")  # other example prompts: building, car, people, bicycle

# Get the masks, bounding boxes, and scores
masks, boxes, scores = output["masks"], output["boxes"], output["scores"]

# Record single-image detection end time
detection_end_time = time.time()
detection_time = detection_end_time - detection_start_time
print(f"Single-image detection time: {detection_time:.3f} s")
print(f"Detected {len(masks)} segmentation results")
print(f"Mask shape: {masks.shape}")
def overlay_masks_with_info(image, masks, boxes, scores):
    """
    Overlay masks on the image and draw IDs, scores, and bounding boxes.
    masks: 4-D tensor of shape [N, 1, H, W]
    boxes: bounding-box tensor of shape [N, 4], [x1, y1, x2, y2]
    scores: score tensor of shape [N]
    """
    # Convert to RGB so we can draw on it
    image = image.convert("RGB")
    draw = ImageDraw.Draw(image)

    # Try to load a font; fall back to the default if unavailable
    try:
        # Try a system CJK font first
        font = ImageFont.truetype("SimHei.ttf", 20)
    except OSError:
        try:
            font = ImageFont.truetype("Arial.ttf", 20)
        except OSError:
            font = ImageFont.load_default()

    # Convert masks to a numpy array and drop the channel dimension
    masks_np = masks.cpu().numpy().astype(np.uint8)  # shape: [N, 1, H, W]
    masks_np = masks_np.squeeze(1)  # remove channel dim -> [N, H, W]
    boxes_np = boxes.cpu().numpy()  # shape: [N, 4]
    scores_np = scores.cpu().numpy()  # shape: [N]

    n_masks = masks_np.shape[0]
    cmap = plt.get_cmap("rainbow", n_masks)

    for i, (mask, box, score) in enumerate(zip(masks_np, boxes_np, scores_np)):
        # Pick a color for this instance
        color = tuple(int(c * 255) for c in cmap(i)[:3])

        # Make sure the mask is 2-D
        if mask.ndim == 3:
            mask = mask.squeeze(0)

        # Build the alpha mask (~0.5 transparency)
        alpha_mask = (mask * 128).astype(np.uint8)

        # Build the colored overlay and apply the alpha channel
        overlay = Image.new("RGBA", image.size, color + (128,))
        alpha = Image.fromarray(alpha_mask, mode='L')
        overlay.putalpha(alpha)

        # Composite onto the image
        image = Image.alpha_composite(image.convert("RGBA"), overlay).convert("RGB")
        draw = ImageDraw.Draw(image)

        # Draw the bounding box, clamped to the image bounds
        x1, y1, x2, y2 = box
        x1 = max(0, min(x1, image.width))
        y1 = max(0, min(y1, image.height))
        x2 = max(0, min(x2, image.width))
        y2 = max(0, min(y2, image.height))
        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        # Label with ID and score, placed above the box
        text = f"ID:{i} Score:{score:.3f}"
        text_width, text_height = draw.textbbox((0, 0), text, font=font)[2:4]
        text_x = x1
        text_y = max(0, y1 - text_height - 5)

        # Text background, then the text itself
        draw.rectangle([text_x, text_y, text_x + text_width + 10, text_y + text_height + 5],
                       fill=color)
        draw.text((text_x + 5, text_y + 2), text, fill="white", font=font)

    return image
# Record visualization start time
visualization_start_time = time.time()

# Apply the mask overlay (with IDs, scores, and boxes)
result_image = overlay_masks_with_info(Image.open(image_path), masks, boxes, scores)

# Save the result image
output_path = "segmentation_result_with_info.png"
result_image.save(output_path)

# Record visualization end time
visualization_end_time = time.time()
visualization_time = visualization_end_time - visualization_start_time
print(f"Visualization time: {visualization_time:.3f} s")
print(f"Annotated segmentation result saved to: {output_path}")

# Configure fonts (SimHei first for CJK, if installed) to avoid glyph warnings
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
title = f"Detected {len(masks)} segmentation results (with ID, score and bbox)"

# Display the image
plt.figure(figsize=(12, 8))
plt.imshow(result_image)
plt.axis('off')
plt.title(title)
plt.tight_layout()
plt.savefig("segmentation_plot_with_info.png", bbox_inches='tight', dpi=300, facecolor='white')
plt.show()

# Record overall end time
total_end_time = time.time()
total_time = total_end_time - total_start_time

# Print a detailed timing summary
print("\n" + "="*50)
print("Timing summary:")
print("="*50)
print(f"Model load time: {model_load_time:.3f} s")
print(f"Single-image detection time: {detection_time:.3f} s")
print(f"Visualization time: {visualization_time:.3f} s")
print("-"*50)
print(f"Total run time: {total_time:.3f} s")
print("="*50)
# # Optional: save each individual mask (with info)
# print("\nSaving individual masks with info...")
# for i, (mask, box, score) in enumerate(zip(masks, boxes, scores)):
#     # Build a visualization for this single mask
#     base_image = Image.open(image_path).convert("RGB")
#     single_draw = ImageDraw.Draw(base_image)
#     # Try to load a font
#     try:
#         single_font = ImageFont.truetype("SimHei.ttf", 24)
#     except OSError:
#         try:
#             single_font = ImageFont.truetype("Arial.ttf", 24)
#         except OSError:
#             single_font = ImageFont.load_default()
#     # Prepare the mask
#     mask_np = mask.cpu().numpy().squeeze().astype(np.uint8)
#     color = tuple(int(c * 255) for c in plt.get_cmap("rainbow", len(masks))(i)[:3])
#     # Alpha mask and colored overlay
#     alpha_mask = (mask_np * 128).astype(np.uint8)
#     overlay = Image.new("RGBA", base_image.size, color + (128,))
#     alpha = Image.fromarray(alpha_mask, mode='L')
#     overlay.putalpha(alpha)
#     base_image = Image.alpha_composite(base_image.convert("RGBA"), overlay).convert("RGB")
#     single_draw = ImageDraw.Draw(base_image)
#     # Draw the bounding box and label
#     x1, y1, x2, y2 = box.cpu().numpy()
#     single_draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
#     text = f"ID:{i} Score:{score:.3f}"
#     text_width, text_height = single_draw.textbbox((0, 0), text, font=single_font)[2:4]
#     text_x = x1
#     text_y = max(0, y1 - text_height - 5)
#     single_draw.rectangle([text_x, text_y, text_x + text_width + 10, text_y + text_height + 5],
#                           fill=color)
#     single_draw.text((text_x + 5, text_y + 2), text, fill="white", font=single_font)
#     base_image.save(f"mask_with_info_{i:02d}.png")
#     print(f"Saved annotated mask {i:02d}.png (score: {score:.3f})")
# print("All processing done!")
Future development / testing
1 Merge overlapping detections into a single object based on IoU (see the sketch below).
2 Test the video object-tracking feature, so that detections across frames are identified as the same object.
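For point 1, a minimal sketch of greedy, NMS-style merging with torchvision's box_iou; it assumes the [N, 4] XYXY boxes and [N] scores tensors returned by Sam3Processor above, and the 0.8 threshold is an arbitrary example value:
'''
import torch
from torchvision.ops import box_iou  # pairwise IoU between two sets of boxes

def merge_overlapping(boxes, scores, iou_thresh=0.8):
    """Of any boxes overlapping above iou_thresh, keep only the highest-scoring one."""
    ious = box_iou(boxes, boxes)             # [N, N] IoU matrix
    order = scores.argsort(descending=True)  # visit boxes from best to worst
    suppressed = torch.zeros(len(boxes), dtype=torch.bool)
    keep = []
    for i in order.tolist():
        if suppressed[i]:
            continue
        keep.append(i)
        suppressed |= ious[i] > iou_thresh   # drop everything overlapping box i
    keep = torch.tensor(keep)
    return boxes[keep], scores[keep]
'''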
SAM3 Video - promptable concept segmentation (PCS) for video
SAM3 Video performs promptable concept segmentation (PCS) on video: given a text prompt, it detects and tracks all matching object instances across the video frames.
from transformers import Sam3VideoModel, Sam3VideoProcessor
from accelerate import Accelerator
import torch
device = Accelerator().device
model = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")
# Load video frames
from transformers.video_utils import load_video
video_url = "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/bedroom.mp4"
video_frames, _ = load_video(video_url)
# Initialize video inference session
inference_session = processor.init_video_session(
    video=video_frames,
    inference_device=device,
    processing_device="cpu",
    video_storage_device="cpu",
    dtype=torch.bfloat16,
)
# Add text prompt to detect and track objects
text = "person"
inference_session = processor.add_text_prompt(
    inference_session=inference_session,
    text=text,
)
# Process all frames in the video
outputs_per_frame = {}
for model_outputs in model.propagate_in_video_iterator(
    inference_session=inference_session, max_frame_num_to_track=50
):
    processed_outputs = processor.postprocess_outputs(inference_session, model_outputs)
    outputs_per_frame[model_outputs.frame_idx] = processed_outputs
print(f"Processed {len(outputs_per_frame)} frames")
# Access results for a specific frame
frame_0_outputs = outputs_per_frame[0]
print(f"Detected {len(frame_0_outputs['object_ids'])} objects")
print(f"Object IDs: {frame_0_outputs['object_ids'].tolist()}")
print(f"Scores: {frame_0_outputs['scores'].tolist()}")
print(f"Boxes shape (XYXY format, absolute coordinates): {frame_0_outputs['boxes'].shape}")
print(f"Masks shape: {frame_0_outputs['masks'].shape}")