Detailed Guide: Xiaojie on Large Models (Twelve) — LLM Deployment and Application — Gradio: Building the UI
Chatbot with Gradio
Introduction to Gradio
Gradio is an easy-to-use Python library that helps developers quickly build user-friendly web applications, and it is especially well suited for showcasing machine learning models. In this lesson we use Gradio to build a chatbot that talks to a FastAPI backend.
Gradio Components
First, the core Gradio components used in this lesson (a minimal sketch of them follows the list):
- Gradio Blocks: a container for organizing the page layout.
- Slider: adjusts generation parameters such as temperature and top_p.
- Textbox: where the user types their message.
- Button: sends the user input or clears the history.
- Chatbot: displays the conversation history.
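To see how these components fit together before connecting them to a backend, here is a minimal, self-contained sketch (not the course app; the echo handler is purely illustrative):
import gradio as gr

def echo(message, chat_history, temperature):
    # Hypothetical handler: echoes the input so the wiring can be tested without a backend
    chat_history = chat_history + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": f"(temperature={temperature}) you said: {message}"},
    ]
    return chat_history, ""

with gr.Blocks() as demo:                      # Blocks: the layout container
    chatbot = gr.Chatbot(type="messages")      # Chatbot: shows the conversation
    msg = gr.Textbox(label="Your message")     # Textbox: user input
    temperature = gr.Slider(0.0, 2.0, value=0.7, label="temperature")  # Slider: a generation parameter
    send = gr.Button("Send")                   # Button: triggers the handler
    send.click(echo, inputs=[msg, chatbot, temperature], outputs=[chatbot, msg])

demo.launch()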
Install Gradio
pip install gradio==5.25.2 -i https://pypi.tuna.tsinghua.edu.cn/simple
Import the dependencies
import gradio as gr
import requests
Define the backend API URL
backend_url = "http://127.0.0.1:6606/chat"
Define the function that talks to the backend
- prompt: the user's input.
- sys_prompt: the system prompt (used to steer the model's behavior).
- history: the variable that stores the conversation history.
- history_len: the number of past conversation turns to keep.
- temperature: controls generation diversity; higher values produce more random text.
- top_p: controls sampling diversity (nucleus sampling).
- max_tokens: maximum length of the generated text.
- stream: whether to enable streaming output.
def chat_with_backend(prompt, sys_prompt, history, history_len, temperature, top_p, max_tokens, stream):
    # Build the request payload
    data = {
        "query": prompt,
        "sys_prompt": sys_prompt,
        "history_len": history_len,
        "history": history,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }
    # Send the request to the FastAPI backend
    try:
        response = requests.post(backend_url, json=data, stream=True)
        if response.status_code == 200:
            chunks = ""
            if stream:
                # Yield a partial answer every time a new chunk arrives
                for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
                    if chunk:
                        chunks += chunk
                        chat_history_display = [(entry["role"], entry["content"]) for entry in history]
                        chat_history_display.append(("user", prompt))
                        chat_history_display.append(("assistant", chunks))
                        # # To make the streaming effect easier to see (requires: from time import sleep)
                        # sleep(0.1)
                        yield chat_history_display, gr.update(value='')
            else:
                # Accumulate the whole answer, then yield it once
                for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
                    chunks += chunk
                chat_history_display = [(entry["role"], entry["content"]) for entry in history]
                chat_history_display.append(("user", prompt))
                chat_history_display.append(("assistant", chunks))
                yield chat_history_display, gr.update(value='')
            # Persist the new turn in the conversation state
            history.append({"role": "user", "content": prompt})
            history.append({"role": "assistant", "content": chunks})
        else:
            yield [("assistant", "Request failed; check that the backend server is running.")], gr.update(value='')
    except Exception as e:
        yield [("assistant", f"An error occurred: {e}")], gr.update(value='')
Clearing the conversation history
This function clears the current conversation history.
def clear_history(history):
    history.clear()
    # Return empty values to clear the chat window and the input box
    return [], ""
AutoDL + vLLM setup
Building the frontend with Gradio
gr.Blocks() is the container for the page. gr.Row() creates one row, so two gr.Row() calls split the page into two rows. In the second row, gr.Column splits the layout into two columns, and scale controls the width ratio between them. The second column is itself split into two rows: the first shows the chat history (the chat window) and the second holds the input box and related components. clear_button and submit_button are the two buttons used to clear the history and to send a message. Clicking clear_button runs the clear_history function with history as input and chatbot, prompt as outputs (clearing the chat window and the input box). Clicking submit_button runs the chat_with_backend function with inputs prompt, sys_prompt, history, history_len, temperature, top_p, max_tokens, stream and outputs chatbot, prompt (updating the chat window and clearing the input box). history = gr.State([]) keeps the conversation state, as shown in the sketch below.
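The full script that follows uses gr.ChatInterface to handle the chat logic. For reference, here is a minimal sketch of the explicit button wiring described in the paragraph above, reusing the chat_with_backend and clear_history functions defined earlier (component names such as submit_button and clear_button follow the prose; treat this as an illustrative layout, not the final app):
import gradio as gr

with gr.Blocks() as demo:
    history = gr.State([])                          # conversation state
    with gr.Row():
        gr.Markdown("## Chatbot")
    with gr.Row():
        with gr.Column(scale=1, variant="panel"):   # left column: generation parameters
            sys_prompt = gr.Textbox(label="System prompt", value="You are a helpful assistant")
            history_len = gr.Slider(1, 10, value=1, label="history_len")
            temperature = gr.Slider(0.01, 2.0, value=0.5, step=0.01, label="temperature")
            top_p = gr.Slider(0.01, 1.0, value=0.5, step=0.01, label="top_p")
            max_tokens = gr.Slider(512, 4096, value=1024, step=8, label="max_tokens")
            stream = gr.Checkbox(label="stream", value=True)
        with gr.Column(scale=10):                   # right column: chat window on top, input below
            chatbot = gr.Chatbot(height=500)
            prompt = gr.Textbox(label="Your message")
            with gr.Row():
                submit_button = gr.Button("Send")
                clear_button = gr.Button("Clear history")
    # Send: run chat_with_backend; update the chat window and clear the input box
    submit_button.click(
        chat_with_backend,
        inputs=[prompt, sys_prompt, history, history_len, temperature, top_p, max_tokens, stream],
        outputs=[chatbot, prompt],
    )
    # Clear: run clear_history; empty the chat window and the input box
    clear_button.click(clear_history, inputs=[history], outputs=[chatbot, prompt])

demo.launch()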
import gradio as gr
import requests
# URL of the FastAPI backend
backend_url = "http://127.0.0.1:6067/chat"

def chat_with_backend(prompt, history, sys_prompt, history_len, temperature, top_p, max_tokens, stream):
    # Each history entry looks like {"role": "user", "metadata": {"title": None}, "content": "xxxx"};
    # drop the metadata field before sending it to the backend
    history_no_metadata = [{"role": h.get("role"), "content": h.get("content")} for h in history]
    # Build the request payload
    data = {
        "query": prompt,
        "sys_prompt": sys_prompt,
        "history": history_no_metadata,
        "history_len": history_len,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens
    }
    response = requests.post(backend_url, json=data, stream=True)
    if response.status_code == 200:
        chunks = ""
        if stream:
            # Yield the partial answer as each chunk arrives
            for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
                chunks += chunk
                yield chunks
        else:
            # Accumulate the whole answer, then yield it once
            for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
                chunks += chunk
            yield chunks

# Create a Blocks container that fills the available width and height
with gr.Blocks(fill_width=True, fill_height=True) as demo:
    # Create a tab
    with gr.Tab("Chatbot"):
        # Add a title
        gr.Markdown("## Chatbot")
        # Create a row layout
        with gr.Row():
            # Left column: generation parameters
            with gr.Column(scale=1, variant="panel") as sidebar_left:
                sys_prompt = gr.Textbox(label="System prompt", value="You are a helpful assistant")
                history_len = gr.Slider(minimum=1, maximum=10, value=1, label="Number of history turns to keep")
                temperature = gr.Slider(minimum=0.01, maximum=2.0, value=0.5, step=0.01, label="temperature")
                top_p = gr.Slider(minimum=0.01, maximum=1.0, value=0.5, step=0.01, label="top_p")
                max_tokens = gr.Slider(minimum=512, maximum=4096, value=1024, step=8, label="max_tokens")
                stream = gr.Checkbox(label="stream", value=True)
            # Right column, with scale 10
            with gr.Column(scale=10) as main:
                # Chat window, 500 px high
                chatbot = gr.Chatbot(type="messages", height=500)
                # ChatInterface handles the chat logic
                gr.ChatInterface(
                    fn=chat_with_backend,
                    type="messages",
                    chatbot=chatbot,
                    additional_inputs=[
                        sys_prompt,
                        history_len,
                        temperature,
                        top_p,
                        max_tokens,
                        stream
                    ]
                )

demo.launch()
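Note: on a remote host such as an AutoDL instance, you may need to launch with demo.launch(server_name="0.0.0.0", server_port=7860) (or Gradio's share option) so the interface is reachable from outside the machine; the plain launch() call above binds to localhost only.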
Launching the chat
FastAPI side
from fastapi import FastAPI, Body
# pip install openai==1.93.0
from openai import AsyncOpenAI
from typing import List, Optional
from fastapi.responses import StreamingResponse
# Initialize the FastAPI application
app = FastAPI()
# Initialize the OpenAI-compatible client, pointing at the vLLM server
api_key = "EMPTY"
base_url = "http://127.0.0.1:10222/v1"
aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
# Global message list sent to the model
messages = []
# Define the /chat route
@app.post("/chat")
async def chat(
    query: str = Body(..., description="User input"),
    sys_prompt: str = Body("You are a helpful assistant.", description="System prompt"),
    history: List = Body([], description="Conversation history"),
    history_len: int = Body(1, description="Number of history turns to keep"),
    temperature: float = Body(0.5, description="LLM sampling temperature"),
    top_p: float = Body(0.5, description="LLM nucleus-sampling probability"),
    max_tokens: Optional[int] = Body(None, description="Maximum number of tokens to generate")
):
    global messages
    # Trim the history to the requested number of turns (two messages per turn)
    if history_len > 0:
        history = history[-2 * history_len:]
    # Rebuild the message list
    messages.clear()
    messages.append({"role": "system", "content": sys_prompt})
    # Append the history
    messages.extend(history)
    # Append the user's prompt
    messages.append({"role": "user", "content": query})
    # Send the request to the model server
    response = await aclient.chat.completions.create(
        model="Qwen2___5-0___5B-Instruct",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True
    )
    # Relay the streamed model output
    async def generate_response():
        async for chunk in response:
            chunk_msg = chunk.choices[0].delta.content
            if chunk_msg:
                yield chunk_msg
    # Return a streaming response to the client
    return StreamingResponse(generate_response(), media_type="text/plain")
if __name__ == "__main__":
    import uvicorn
    # Listen on port 6067 so it matches the backend_url used by the Gradio frontend
    uvicorn.run(app, host="127.0.0.1", port=6067, log_level="info")
Start vLLM
python -m vllm.entrypoints.openai.api_server --port 10222 --model /root/models/Qwen/Qwen2___5-0___5B-Instruct --served-model-name Qwen2___5-0___5B-Instruct
Start FastAPI
python chatbot_fastapi.py
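Optionally, before starting the web UI, you can verify that the backend responds with a short requests script (a minimal sketch; the payload mirrors the /chat route defined above and assumes the FastAPI service is listening on port 6067):
import requests

resp = requests.post(
    "http://127.0.0.1:6067/chat",
    json={
        "query": "Hello, please introduce yourself.",
        "sys_prompt": "You are a helpful assistant",
        "history": [],
        "history_len": 1,
        "temperature": 0.5,
        "top_p": 0.5,
        "max_tokens": 256,
    },
    stream=True,
)
# Print the streamed chunks as they arrive
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)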
Start the web UI
python chatbot_gradio.py
Local setup
Frontend + FastAPI + Ollama model
Frontend UI program
The frontend UI program is the same Gradio script shown in the AutoDL + vLLM section above; reuse chatbot_gradio.py unchanged (backend_url stays http://127.0.0.1:6067/chat).
FastAPI + Ollama backend
from fastapi import FastAPI, Body
# pip install openai==1.93.0
from openai import AsyncOpenAI
from typing import List, Optional
from fastapi.responses import StreamingResponse
# Initialize the FastAPI application
app = FastAPI()
# Initialize the OpenAI-compatible client, pointing at Ollama's OpenAI-compatible endpoint
api_key = "EMPTY"
base_url = "http://127.0.0.1:11434/v1"
aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
# Global message list sent to the model
messages = []
# Define the /chat route
@app.post("/chat")
async def chat(
    query: str = Body(..., description="User input"),
    sys_prompt: str = Body("You are a helpful assistant.", description="System prompt"),
    history: List = Body([], description="Conversation history"),
    history_len: int = Body(1, description="Number of history turns to keep"),
    temperature: float = Body(0.5, description="LLM sampling temperature"),
    top_p: float = Body(0.5, description="LLM nucleus-sampling probability"),
    max_tokens: Optional[int] = Body(None, description="Maximum number of tokens to generate")
):
    global messages
    # Trim the history to the requested number of turns (two messages per turn)
    if history_len > 0:
        history = history[-2 * history_len:]
    # Rebuild the message list
    messages.clear()
    messages.append({"role": "system", "content": sys_prompt})
    # Append the history
    messages.extend(history)
    # Append the user's prompt
    messages.append({"role": "user", "content": query})
    # Send the request to the model server
    response = await aclient.chat.completions.create(
        model="qwen2.5:0.5b",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True
    )
    # Relay the streamed model output
    async def generate_response():
        async for chunk in response:
            chunk_msg = chunk.choices[0].delta.content
            if chunk_msg:
                yield chunk_msg
    # Return a streaming response to the client
    return StreamingResponse(generate_response(), media_type="text/plain")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=6067, log_level="info")
Start the local model with Ollama
ollama serve
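If the model has not been downloaded yet, pull it first in another terminal (the tag below assumes the same qwen2.5:0.5b model referenced in the FastAPI code above):
ollama pull qwen2.5:0.5b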
