BentoML
https://github.com/bentoml/BentoML
The easiest way to serve AI apps and models: build model inference APIs, job queues, LLM apps, multi-model pipelines, and more!
🍱 Build model inference APIs and multi-model serving systems with any open-source or custom AI models.
BentoML is a Python library for building online serving systems optimized for AI apps and model inference.
- 🍱 Easily build APIs for any AI/ML model. Turn any model inference script into a REST API server with just a few lines of code and standard Python type hints (see the minimal sketch after this list).
- 🐳 Docker containers made simple. No more dependency hell! Manage your environments, dependencies, and model versions with a simple config file. BentoML automatically generates Docker images, ensures reproducibility, and simplifies how you deploy to different environments.
- 🧭 Maximize CPU/GPU utilization. Build high-performance inference APIs leveraging built-in serving optimizations such as dynamic batching, model parallelism, multi-stage pipelines, and multi-model inference-graph orchestration.
- 👩‍💻 Fully customizable. Easily implement your own APIs or task queues with custom business logic, model inference, and multi-model composition. Supports any ML framework, modality, and inference runtime.
- 🚀 Ready for production. Develop, run, and debug locally, then seamlessly deploy to production with Docker containers or BentoCloud.
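To make the first bullet concrete, here is a minimal sketch of that pattern. The Echo class and its endpoint are illustrative only, not from the BentoML docs:

import bentoml

# A minimal sketch: one class = one service, one method = one HTTP endpoint.
# The str annotations on the method define the request/response schema.
@bentoml.service
class Echo:
    @bentoml.api
    def echo(self, text: str) -> str:
        return text.upper()

Running `bentoml serve` against a file containing this class should expose `echo` as a POST endpoint. The fuller quickstart below follows the same shape with a real model.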
https://docs.bentoml.com/en/latest/get-started/hello-world.html
https://github.com/bentoml/quickstart
from __future__ import annotations

import bentoml

with bentoml.importing():
    from transformers import pipeline

EXAMPLE_INPUT = "Breaking News: In an astonishing turn of events, the small \
town of Willow Creek has been taken by storm as local resident Jerry Thompson's cat, \
Whiskers, performed what witnesses are calling a 'miraculous and gravity-defying leap.' \
Eyewitnesses report that Whiskers, an otherwise unremarkable tabby cat, jumped \
a record-breaking 20 feet into the air to catch a fly. The event, which took \
place in Thompson's backyard, is now being investigated by scientists for potential \
breaches in the laws of physics. Local authorities are considering a town festival \
to celebrate what is being hailed as 'The Leap of the Century.'"

my_image = bentoml.images.Image(python_version="3.11") \
    .python_packages("torch", "transformers")

@bentoml.service(
    image=my_image,
    resources={"cpu": "2"},
    traffic={"timeout": 30},
)
class Summarization:
    # Define the Hugging Face model as a class variable
    model_path = bentoml.models.HuggingFaceModel("sshleifer/distilbart-cnn-12-6")

    def __init__(self) -> None:
        # Load model into pipeline
        self.pipeline = pipeline("summarization", model=self.model_path)

    @bentoml.api
    def summarize(self, text: str = EXAMPLE_INPUT) -> str:
        result = self.pipeline(text)
        return f"Hello world! Here's your summary: {result[0]['summary_text']}"
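Once the Summarization service above is running (for example via `bentoml serve`), it can be called from Python. A minimal sketch using BentoML's HTTP client, assuming the default local port 3000:

import bentoml

# Assumes the Summarization service above is already serving locally,
# e.g. started with `bentoml serve` (default port 3000).
with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # Endpoint methods are generated from the service's @bentoml.api names.
    summary = client.summarize(text="Your long article text goes here...")
    print(summary)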
https://github.com/bentoml/BentoLangGraph/blob/main/langgraph-anthropic/service.py
from __future__ import annotations

import typing, logging, traceback

import bentoml
from langchain_core.messages import HumanMessage
from langchain_anthropic import ChatAnthropic
from langchain_core.tools import tool
from langgraph.graph import END, START, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode
from langchain_community.tools import DuckDuckGoSearchRun

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

IMAGE = bentoml.images.PythonImage(
    python_version="3.11",
    lock_python_packages=False,
    python_requirements="requirements.txt",
)

@tool
def search(query: str):
    """A wrapper around DuckDuckGo Search.
    Useful for when you need to answer questions about current events,
    current weather, latest news, up-to-date information, etc.
    Input should be a search query.
    """
    duckduckgo_search = DuckDuckGoSearchRun()
    res = duckduckgo_search.invoke({"query": query})
    return [res]

# Define the function that determines whether to continue or not
def should_continue(state: MessagesState) -> typing.Literal["tools", END]:
    messages = state["messages"]
    last_message = messages[-1]
    # If the LLM makes a tool call, then we route to the "tools" node
    if last_message.tool_calls:
        return "tools"
    # Otherwise, we stop (reply to the user)
    return END

@bentoml.service(
    name="langgraph-anthropic-search-agent",
    workers=2,
    resources={"cpu": "2000m"},
    envs=[{"name": "ANTHROPIC_API_KEY"}],
    traffic={"concurrency": 16, "external_queue": True},
    labels={"owner": "bentoml-team", "project": "langgraph-anthropic"},
    image=IMAGE,
)
class SearchAgentService:
    @bentoml.on_startup
    def initialize_app(self):
        tools = [search]
        tool_node = ToolNode(tools)
        model = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0).bind_tools(tools)

        # Define the function that calls the model
        def call_model(state: MessagesState):
            messages = state["messages"]
            response = model.invoke(messages)
            # We return a list, because this will get added to the existing list
            return {"messages": [response]}

        # Define a new graph
        workflow = StateGraph(MessagesState)

        # Define the two nodes we will cycle between
        workflow.add_node("agent", call_model)
        workflow.add_node("tools", tool_node)

        # Set the entrypoint as `agent`
        # This means that this node is the first one called
        workflow.add_edge(START, "agent")

        # We now add a conditional edge
        workflow.add_conditional_edges(
            # First, we define the start node. We use `agent`.
            # This means these are the edges taken after the `agent` node is called.
            "agent",
            # Next, we pass in the function that will determine which node is called next.
            should_continue,
        )

        # We now add a normal edge from `tools` to `agent`.
        # This means that after `tools` is called, `agent` node is called next.
        workflow.add_edge("tools", "agent")

        self.app = workflow.compile()

    @bentoml.task
    async def invoke(
        self,
        input_query: str = "What is the weather in San Francisco today?",
    ) -> str:
        try:
            final_state = await self.app.ainvoke({"messages": [HumanMessage(content=input_query)]})
            return final_state["messages"][-1].content
        except Exception as e:
            logger.error(f"An error occurred: {e}")
            logger.error(traceback.format_exc())
            return "I'm sorry, but I encountered an error while processing your request. Please try again later."

    @bentoml.api
    async def stream(
        self,
        input_query: str = "What is the weather in San Francisco today?",
    ) -> typing.AsyncGenerator[str, None]:
        async for event in self.app.astream_events({"messages": [HumanMessage(content=input_query)]}, version="v2"):
            yield str(event) + "\n"
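Because `invoke` is declared with `@bentoml.task`, it runs as a queued background job rather than a synchronous request. A minimal client sketch, assuming the agent is served locally on port 3000 with ANTHROPIC_API_KEY set in its environment:

import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # @bentoml.task endpoints are queued: submit() returns a handle immediately.
    task = client.invoke.submit(input_query="What is the weather in San Francisco today?")
    print(task.get_status())  # poll the task state
    print(task.get())         # block until the final answer is ready

    # The streaming endpoint yields LangGraph events as they are produced.
    for chunk in client.stream(input_query="What is the weather in San Francisco today?"):
        print(chunk, end="")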
https://github.com/bentoml/BentoYolo/blob/main/service.py
from __future__ import annotations

import json
import os
import typing as t
from pathlib import Path

import bentoml
from bentoml.validators import ContentType

Image = t.Annotated[Path, ContentType("image/*")]

image = bentoml.images.Image(python_version="3.11", lock_python_packages=False) \
    .system_packages("libglib2.0-0", "libsm6", "libxext6", "libxrender1", "libgl1-mesa-glx") \
    .requirements_file("requirements.txt")

@bentoml.service(resources={"gpu": 1}, image=image)
class YoloService:
    def __init__(self):
        from ultralytics import YOLO

        yolo_model = os.getenv("YOLO_MODEL", "yolo11n.pt")
        self.model = YOLO(yolo_model)

    @bentoml.api(batchable=True)
    def predict(self, images: list[Image]) -> list[list[dict]]:
        results = self.model.predict(source=images)
        return [json.loads(result.tojson()) for result in results]

    @bentoml.api
    def render(self, image: Image) -> Image:
        result = self.model.predict(image)[0]
        output = image.parent.joinpath(f"{image.stem}_result{image.suffix}")
        result.save(str(output))
        return output
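As with the other services, the YOLO endpoints can be exercised with the Python client. A minimal sketch, assuming a local server on port 3000; the file name test.jpg is illustrative:

from pathlib import Path

import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # predict is batchable, so several images can go in one call.
    detections = client.predict(images=[Path("test.jpg")])
    print(detections[0])  # list of dicts: box coordinates, class, confidence

    # render returns an annotated copy of the input image.
    annotated = client.render(image=Path("test.jpg"))
    print(annotated)      # path to the downloaded result file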
Source: http://www.cnblogs.com/lightsong/
This article is jointly copyrighted by the author and cnblogs (博客园). Reposting is welcome, but unless the author agrees otherwise this notice must be retained, and a clearly visible link to the original article must be provided on the reposted page.