BentoML
https://github.com/bentoml/BentoML
The easiest way to serve AI apps and models: build model inference APIs, job queues, LLM apps, multi-model pipelines, and more!
🍱 Build model inference APIs and multi-model serving systems with any open-source or custom AI models.
BentoML is a Python library for building online serving systems optimized for AI apps and model inference.
- 🍱 Easily build APIs for any AI/ML model. Turn any model inference script into a REST API server with just a few lines of code and standard Python type hints (see the minimal sketch after this list).
- 🐳 Docker containers made simple. No more dependency hell! Manage your environments, dependencies, and model versions with a simple config file. BentoML automatically generates Docker images, ensures reproducibility, and simplifies how you deploy to different environments.
- 🧭 Maximize CPU/GPU utilization. Build high-performance inference APIs leveraging built-in serving optimizations such as dynamic batching, model parallelism, multi-stage pipelines, and multi-model inference-graph orchestration.
- 👩‍💻 Fully customizable. Easily implement your own APIs or task queues with custom business logic, model inference, and multi-model composition. Supports any ML framework, modality, and inference runtime.
- 🚀 Ready for production. Develop, run, and debug locally, then seamlessly deploy to production with Docker containers or BentoCloud.
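To make the first bullet concrete, here is a minimal sketch of that pattern. The Echo class and its endpoint are illustrative only, not from the BentoML docs:

import bentoml

# A minimal sketch: one class = one service, one method = one HTTP endpoint.
# The str annotations on the method define the request/response schema.
@bentoml.service
class Echo:
    @bentoml.api
    def echo(self, text: str) -> str:
        return text.upper()

Running `bentoml serve` against a file containing this class should expose `echo` as a POST endpoint. The fuller quickstart below follows the same shape with a real model.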
https://docs.bentoml.com/en/latest/get-started/hello-world.html
https://github.com/bentoml/quickstart
from __future__ import annotations

import bentoml

with bentoml.importing():
    from transformers import pipeline

EXAMPLE_INPUT = "Breaking News: In an astonishing turn of events, the small \
town of Willow Creek has been taken by storm as local resident Jerry Thompson's cat, \
Whiskers, performed what witnesses are calling a 'miraculous and gravity-defying leap.' \
Eyewitnesses report that Whiskers, an otherwise unremarkable tabby cat, jumped \
a record-breaking 20 feet into the air to catch a fly. The event, which took \
place in Thompson's backyard, is now being investigated by scientists for potential \
breaches in the laws of physics. Local authorities are considering a town festival \
to celebrate what is being hailed as 'The Leap of the Century.'"

my_image = bentoml.images.Image(python_version="3.11") \
    .python_packages("torch", "transformers")

@bentoml.service(
    image=my_image,
    resources={"cpu": "2"},
    traffic={"timeout": 30},
)
class Summarization:
    # Define the Hugging Face model as a class variable
    model_path = bentoml.models.HuggingFaceModel("sshleifer/distilbart-cnn-12-6")

    def __init__(self) -> None:
        # Load model into pipeline
        self.pipeline = pipeline("summarization", model=self.model_path)

    @bentoml.api
    def summarize(self, text: str = EXAMPLE_INPUT) -> str:
        result = self.pipeline(text)
        return f"Hello world! Here's your summary: {result[0]['summary_text']}"
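Once the Summarization service above is running (for example via `bentoml serve`), it can be called from Python. A minimal sketch using BentoML's HTTP client, assuming the default local port 3000:

import bentoml

# Assumes the Summarization service above is already serving locally,
# e.g. started with `bentoml serve` (default port 3000).
with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # Endpoint methods are generated from the service's @bentoml.api names.
    summary = client.summarize(text="Your long article text goes here...")
    print(summary)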
https://github.com/bentoml/BentoLangGraph/blob/main/langgraph-anthropic/service.py
from __future__ import annotations

import typing, logging, traceback

import bentoml
from langchain_core.messages import HumanMessage
from langchain_anthropic import ChatAnthropic
from langchain_core.tools import tool
from langgraph.graph import END, START, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode
from langchain_community.tools import DuckDuckGoSearchRun

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

IMAGE = bentoml.images.PythonImage(
    python_version="3.11",
    lock_python_packages=False,
    python_requirements="requirements.txt",
)

@tool
def search(query: str):
    """A wrapper around DuckDuckGo Search.
    Useful for when you need to answer questions about current events,
    current weather, latest news, up-to-date information, etc.
    Input should be a search query.
    """
    duckduckgo_search = DuckDuckGoSearchRun()
    res = duckduckgo_search.invoke({"query": query})
    return [res]

# Define the function that determines whether to continue or not
def should_continue(state: MessagesState) -> typing.Literal["tools", END]:
    messages = state["messages"]
    last_message = messages[-1]
    # If the LLM makes a tool call, then we route to the "tools" node
    if last_message.tool_calls:
        return "tools"
    # Otherwise, we stop (reply to the user)
    return END

@bentoml.service(
    name="langgraph-anthropic-search-agent",
    workers=2,
    resources={"cpu": "2000m"},
    envs=[{"name": "ANTHROPIC_API_KEY"}],
    traffic={"concurrency": 16, "external_queue": True},
    labels={"owner": "bentoml-team", "project": "langgraph-anthropic"},
    image=IMAGE,
)
class SearchAgentService:
    @bentoml.on_startup
    def initialize_app(self):
        tools = [search]
        tool_node = ToolNode(tools)
        model = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0).bind_tools(tools)

        # Define the function that calls the model
        def call_model(state: MessagesState):
            messages = state["messages"]
            response = model.invoke(messages)
            # We return a list, because this will get added to the existing list
            return {"messages": [response]}

        # Define a new graph
        workflow = StateGraph(MessagesState)

        # Define the two nodes we will cycle between
        workflow.add_node("agent", call_model)
        workflow.add_node("tools", tool_node)

        # Set the entrypoint as `agent`
        # This means that this node is the first one called
        workflow.add_edge(START, "agent")

        # We now add a conditional edge
        workflow.add_conditional_edges(
            # First, we define the start node. We use `agent`.
            # This means these are the edges taken after the `agent` node is called.
            "agent",
            # Next, we pass in the function that will determine which node is called next.
            should_continue,
        )

        # We now add a normal edge from `tools` to `agent`.
        # This means that after `tools` is called, `agent` node is called next.
        workflow.add_edge("tools", "agent")

        self.app = workflow.compile()

    @bentoml.task
    async def invoke(
        self,
        input_query: str = "What is the weather in San Francisco today?",
    ) -> str:
        try:
            final_state = await self.app.ainvoke({"messages": [HumanMessage(content=input_query)]})
            return final_state["messages"][-1].content
        except Exception as e:
            logger.error(f"An error occurred: {e}")
            logger.error(traceback.format_exc())
            return "I'm sorry, but I encountered an error while processing your request. Please try again later."

    @bentoml.api
    async def stream(
        self,
        input_query: str = "What is the weather in San Francisco today?",
    ) -> typing.AsyncGenerator[str, None]:
        async for event in self.app.astream_events({"messages": [HumanMessage(content=input_query)]}, version="v2"):
            yield str(event) + "\n"
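Because `invoke` is declared with `@bentoml.task`, it runs as a queued background job rather than a synchronous request. A minimal client sketch, assuming the agent is served locally on port 3000 with ANTHROPIC_API_KEY set in its environment:

import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # @bentoml.task endpoints are queued: submit() returns a handle immediately.
    task = client.invoke.submit(input_query="What is the weather in San Francisco today?")
    print(task.get_status())  # poll the task state
    print(task.get())         # block until the final answer is ready

    # The streaming endpoint yields LangGraph events as they are produced.
    for chunk in client.stream(input_query="What is the weather in San Francisco today?"):
        print(chunk, end="")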
https://github.com/bentoml/BentoYolo/blob/main/service.py
from __future__ import annotations

import json
import os
import typing as t
from pathlib import Path

import bentoml
from bentoml.validators import ContentType

Image = t.Annotated[Path, ContentType("image/*")]

image = bentoml.images.Image(python_version="3.11", lock_python_packages=False) \
    .system_packages("libglib2.0-0", "libsm6", "libxext6", "libxrender1", "libgl1-mesa-glx") \
    .requirements_file("requirements.txt")

@bentoml.service(resources={"gpu": 1}, image=image)
class YoloService:
    def __init__(self):
        from ultralytics import YOLO

        yolo_model = os.getenv("YOLO_MODEL", "yolo11n.pt")
        self.model = YOLO(yolo_model)

    @bentoml.api(batchable=True)
    def predict(self, images: list[Image]) -> list[list[dict]]:
        results = self.model.predict(source=images)
        return [json.loads(result.tojson()) for result in results]

    @bentoml.api
    def render(self, image: Image) -> Image:
        result = self.model.predict(image)[0]
        output = image.parent.joinpath(f"{image.stem}_result{image.suffix}")
        result.save(str(output))
        return output
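As with the other services, the YOLO endpoints can be exercised with the Python client. A minimal sketch, assuming a local server on port 3000; the file name test.jpg is illustrative:

from pathlib import Path

import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # predict is batchable, so several images can go in one call.
    detections = client.predict(images=[Path("test.jpg")])
    print(detections[0])  # list of dicts: box coordinates, class, confidence

    # render returns an annotated copy of the input image.
    annotated = client.render(image=Path("test.jpg"))
    print(annotated)      # path to the downloaded result file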
Source: http://www.cnblogs.com/lightsong/
This article is jointly copyrighted by the author and cnblogs (博客园). Reposting is welcome, but unless the author agrees otherwise this notice must be retained, and a clearly visible link to the original article must be provided on the reposted page.