本地视觉语言模型(VLMs)中的结构化输出

通过使用结构化输出,你可以将提示词转化为干净、易于维护的代码,同时利用 Pydantic 强大的验证功能。这种方法为构建 RAG 应用、错误管理以及实现 LLM 优雅重试机制提供了坚实基础。
from mlx_vlm import load, apply_chat_template, generate
from mlx_vlm.utils import load_image

import json

from pydantic import BaseModel, Field, ConfigDict

# One detected object: a name, a short description and an (x, y) position.
# NOTE: documented with comments instead of a class docstring on purpose:
# Pydantic copies a model's docstring into its JSON schema, and that schema is
# embedded verbatim in the prompt sent to the model.
class Object(BaseModel):
    name: str
    description: str = Field(..., description="short description")
    x: float = Field(..., description="x coordinate of the object")
    y: float = Field(..., description="y coordinate of the object")

    # Extra schema metadata: a sample entry the prompt refers to as
    # "the provided example".
    model_config = ConfigDict(
        json_schema_extra={
            "example": [
                {
                    "name": "object 1",
                    "description": "object 1 description",
                    "x": 13.3,
                    "y": 59.6,
                }
            ]
        }
    )

# Top-level response container: the model is expected to return a JSON object
# with a single "object_list" array of Object entries.
# (Comment rather than docstring so the generated JSON schema stays unchanged.)
class ObjectList(BaseModel):
    object_list: list[Object]

# Load the model and its processor from the named checkpoint;
# trust_remote_code is forwarded through the processor config.
model, processor = load("mlx-community/Molmo-7B-D-0924-4bit", processor_config={"trust_remote_code": True})
config = model.config

# Input image the model will be asked about.
image_path = "test.jpg"
image = load_image(image_path)

# Single user turn: the instruction text followed by the JSON schema generated
# from ObjectList, so the model is told exactly which fields to emit.
messages = [
{
"role": "user",
"content": f"""identify the main 3 objects in the image Your response should Return the correct JSON response within a ```json codeblock. not the JSON_SCHEMA".
JSON schema: {json.dumps(ObjectList.model_json_schema())}. Also use the provided example to format your json.
""",
}
]

# Build the final prompt from the messages via the processor/model config.
prompt = apply_chat_template(processor, config, messages)

# NOTE(review): the positional order of image/prompt appears to differ between
# mlx_vlm releases — confirm against the installed version.
output = generate(model, processor, image, prompt, max_tokens=1200, temperature=0.7)
print(output)

def _extract_json_payload(text: str) -> str:
    """Return the JSON text of a model reply, unwrapping a Markdown fence if present.

    Args:
        text: Raw model reply. It may wrap its JSON in a ```json ... ``` fence
            and may carry prose before or after the fence.

    Returns:
        The content of the first fenced block, without the fence markers and
        the optional ``json`` language tag; the stripped reply itself when no
        fence is found.
    """
    fence = "```"
    _, opener, rest = text.partition(fence)
    if not opener:
        # No code fence at all: assume the reply is bare JSON.
        return text.strip()
    # Keep only what sits between the opening fence and the closing one (or
    # the end of the reply if generation was cut off before the fence closed).
    payload = rest.partition(fence)[0].lstrip()
    # Drop the language tag only at the very start, so "json" occurring inside
    # a name or description is left untouched.
    if payload[:4].lower() == "json":
        payload = payload[4:]
    return payload.strip()


# Strip the Markdown code fence the prompt asked for, then validate the
# remaining JSON against the ObjectList schema (raises on invalid output).
cleaned_json = _extract_json_payload(output)
object_list = ObjectList.model_validate_json(cleaned_json)
print(object_list)

posted @ 2026-01-04 14:20  Python喵  阅读(4)  评论(0)    收藏  举报