Deploying the SmolVLM-256M multimodal model with Ollama (Ollama API version)

For installing and deploying directly on Windows, see:

https://blog.csdn.net/m0_52919859/article/details/148643913


Deploying with Podman (Docker Desktop requires a paid license for commercial use)

1. Install Podman (omitted here)

2. Pull the Ollama image

podman pull ollama/ollama

3. Start the container, mounting the local SmolVLM model directory C:\Users\sdt\Desktop\smolVLM (PowerShell syntax; the backticks are line continuations)

podman run -d `
  --name ollama `
  -p 11434:11434 `
  -v ollama_data:/root/.ollama `
  -v C:\Users\sdt\Desktop\smolVLM:/models `
  ollama/ollama
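
Once the container is up, you can verify that the server answers on the published port. A minimal check in Python (assumes the defaults above; Ollama answers GET / with a plain-text banner):

import requests

# Sanity check: the Ollama server should answer on the published port.
resp = requests.get("http://localhost:11434/", timeout=5)
print(resp.status_code, resp.text)  # expect: 200 Ollama is running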


4. Open a shell inside the running container

podman exec -it ollama sh

5. Create and run your model inside the container

# Change into the mounted model directory
cd /models

# Create the model (assuming your modelfile is named smolvlm.modelfile)
ollama create smolvlm:256m -f smolvlm.modelfile

# Run the model
ollama run smolvlm:256m
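
To confirm the model was registered, you can query Ollama's /api/tags endpoint from the host. A minimal sketch in Python (assumes the port mapping above):

import requests

# List the models known to the Ollama server (GET /api/tags).
resp = requests.get("http://localhost:11434/api/tags", timeout=10)
resp.raise_for_status()
names = [m["name"] for m in resp.json().get("models", [])]
print("smolvlm:256m registered:", "smolvlm:256m" in names)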


Contents of smolvlm.modelfile:
# Tuned specifically for the 256M-parameter model
FROM ./SmolVLM-256M-Instruct-f16.gguf
ADAPTER ./mmproj-SmolVLM-256M-Instruct-f16.gguf

# Template configuration tuned for best results
TEMPLATE """
<|im_start|>system
{{ .System }}
<end_of_utterance>
{{- range .Messages }}
<|im_start|>{{ .Role }}:
{{ .Content }}
<end_of_utterance>
{{- end }}
<|im_start|>assistant
"""

# System prompt tailored to SmolVLM
SYSTEM "You are a visual assistant powered by SmolVLM-256M. Describe images clearly and answer questions based on visual content with high efficiency."

# Parameter tuning for the 256M model
PARAMETER num_ctx 4096
PARAMETER stop "<end_of_utterance>"
PARAMETER stop "<|im_start|>"
PARAMETER temperature 0.01
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1
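
The PARAMETER lines above set the model's defaults. Ollama's /api/generate endpoint also accepts per-request overrides through an "options" field, so you can experiment with sampling settings without rebuilding the model. A minimal sketch:

import requests

payload = {
    "model": "smolvlm:256m",
    "prompt": "Say hello.",
    "stream": False,
    "options": {"temperature": 0.3, "num_ctx": 2048},  # per-request overrides
}
resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=120)
print(resp.json()["response"])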

6. Cleanup: stop and remove the container
podman stop ollama
podman rm ollama

Note: the named volume ollama_data (which stores the created model) survives container removal; remove it as well with podman volume rm ollama_data if you want a completely clean slate.

7. Calling the API
Script: smolvlm-api.py
python smolvlm-api.py --image [image path] --stream
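For example, to take a webcam photo and ask a one-off question instead:
python smolvlm-api.py --camera --prompt "Describe the scene" --stream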

  Script source:

#!/usr/bin/env python3
# smolvlm-api.py
import argparse
import base64
from pathlib import Path
import cv2

from VisionModel import VisionModel

# ----------------------------------------------------------------------
def build_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Small CLI tool for text + image multimodal inference",
    )
    p.add_argument("--model",  default="smolvlm:256m",
                   help="Vision 模型标签(ollama list)")
    p.add_argument("--image",  
                   help="要推理的图像文件")
    p.add_argument("--camera", action="store_true",
                   help="使用摄像头拍照")
    p.add_argument("--prompt", default="",
                   help="首次提问内容;留空则进入交互循环")
    p.add_argument("--stream", action="store_true",
                   help="是否流式输出")
    return p.parse_args()

# ----------------------------------------------------------------------
def capture_from_camera() -> str:
    """Capture an image from the webcam and return it base64-encoded."""
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Cannot open camera")

    print("Press SPACE to take a photo, ESC to quit...")
    while True:
        ret, frame = cap.read()
        if not ret:
            raise RuntimeError("Failed to read a frame from the camera")

        cv2.imshow('Camera', frame)
        key = cv2.waitKey(1) & 0xFF

        if key == 27:  # ESC
            cap.release()
            cv2.destroyAllWindows()
            raise KeyboardInterrupt("Photo capture cancelled by user")
        elif key == 32:  # SPACE
            break

    cap.release()
    cv2.destroyAllWindows()

    # Encode the captured frame in memory; this avoids the Windows
    # file-locking problem of writing to a still-open NamedTemporaryFile.
    ok, buf = cv2.imencode('.jpg', frame)
    if not ok:
        raise RuntimeError("Failed to encode the captured frame")
    return base64.b64encode(buf.tobytes()).decode()

# ----------------------------------------------------------------------
def load_b64(img_path: Path) -> str:
    """Read an image file and return its contents base64-encoded."""
    if not img_path.is_file():
        raise FileNotFoundError(img_path)
    return base64.b64encode(img_path.read_bytes()).decode()

# ----------------------------------------------------------------------
def main() -> None:
    args = build_args()

    # 1) Load the model wrapper
    llm = VisionModel(vision_model_path=args.model, stream=args.stream)

    # 2) Obtain the image
    try:
        if args.camera:
            img_b64 = capture_from_camera()
        elif args.image:
            img_b64 = load_b64(Path(args.image))
        else:
            raise ValueError("Please specify --image or --camera")
    except Exception as e:
        print(f"Error: {e}")
        return

    # 3) If a prompt was given on the command line, run it once
    if args.prompt:
        run_once(llm, args.prompt, img_b64)
    else:
        # Otherwise enter an interactive loop
        try:
            while True:
                prompt = input("Enter a prompt (Ctrl-C to quit): ")
                run_once(llm, prompt, img_b64)
        except KeyboardInterrupt:
            print("\nExited")


# ----------------------------------------------------------------------
def run_once(llm, prompt: str, img_b64: str) -> None:
    """Call VisionModel and print the result (streaming or not)."""
    for chunk in llm.generate(prompt, img_b64):
        print(chunk, end="", flush=True)
    print()


# ----------------------------------------------------------------------
if __name__ == "__main__":
    main()
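
The script imports VisionModel from a companion module that is not included in this post. A minimal sketch of what such a wrapper could look like, assuming it talks to Ollama's /api/generate endpoint (the class name, constructor arguments, and generate() signature are taken from the usage above; everything else is an assumption):

# VisionModel.py -- minimal sketch, not the original implementation
import json
import requests

class VisionModel:
    """Thin wrapper around Ollama's /api/generate endpoint for vision models."""

    def __init__(self, vision_model_path: str = "smolvlm:256m",
                 stream: bool = False,
                 host: str = "http://localhost:11434"):
        self.model = vision_model_path
        self.stream = stream
        self.url = f"{host}/api/generate"

    def generate(self, prompt: str, img_b64: str):
        """Yield response text chunks (a single chunk when stream=False)."""
        payload = {
            "model": self.model,
            "prompt": prompt,
            "images": [img_b64],  # base64-encoded image bytes
            "stream": self.stream,
        }
        resp = requests.post(self.url, json=payload,
                             stream=self.stream, timeout=300)
        resp.raise_for_status()
        if self.stream:
            # Streaming mode: the server sends one JSON object per line.
            for line in resp.iter_lines():
                if line:
                    yield json.loads(line).get("response", "")
        else:
            yield resp.json().get("response", "")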
