手撸AI对话助手带上思考过程
之前文章《用 LangChain 驱动本地 Ollama 模型》讲述了使用 LangChain 进行大模型对话。
大模型的响应时间一般都会比较长,那么如何考虑给用户更好的体验呢?

流式输出
类似打字机一样的效果,按token输出。
安装依赖
pip install -U uvicorn "fastapi[standard]" "langchain[openai]"
调用流式输出
核心方法:stream/astream
import json

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from langchain_openai import ChatOpenAI

app = FastAPI()

# Allow the demo page (served from another origin / opened as a local file)
# to call this API from the browser.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests per the CORS spec — fine for
# a local demo, tighten for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/api/bot/chat")
async def bot_chat(request: dict):
    """Stream a chat completion token-by-token as NDJSON.

    Request body: a JSON object; only the "query" key is read (defaults to
    "你好"). The response is newline-delimited JSON: one
    {"type": "chunk", "content": "..."} object per streamed token, then a
    final {"type": "done"} marker.
    """
    query = request.get("query", "你好")
    llm = ChatOpenAI(
        model="qwen3.5:35b",
        base_url="http://192.168.31.24:4000",
        api_key="your api key",
        temperature=0.7,
        streaming=True,
    )
    system_prompt = (
        "你是一个会展示思考过程的AI。"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    async def generate():
        # Stream directly from the LLM via astream, emitting one NDJSON
        # line per token. Empty chunks (e.g. role-only deltas) are skipped.
        async for chunk in llm.astream(messages):
            content = chunk.content or ""
            if not content:
                continue
            # ensure_ascii=False keeps CJK text readable instead of \uXXXX.
            yield json.dumps({
                "type": "chunk",
                "content": content,
            }, ensure_ascii=False) + "\n"
        yield json.dumps({"type": "done"}) + "\n"

    return StreamingResponse(
        generate(),
        media_type="application/x-ndjson",
        headers={
            # Defeat intermediary buffering (X-Accel-Buffering targets
            # nginx) so tokens reach the client as soon as they are yielded.
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
测试
curl --location --request POST 'http://127.0.0.1:8000/api/bot/chat' \
--header 'Accept: application/json' \
--header 'Content-Type: application/json' \
--data-raw '{
"query": "请一步步思考:2+3等于多少?"
}'

思考过程
提示词
在大模型的响应参数里,只有部分模型带有 reasoning 字段;如果要兼容大部分模型,就要换一种方式:让模型在输出时用标签来标识思考过程。
# ... everything else unchanged
# Prompt the model to wrap its reasoning in <THINK> and the final answer in
# <FINAL>, so the client can split the stream with plain tag parsing instead
# of relying on model-specific "reasoning" response fields.
system_prompt = (
"你是一个会展示思考过程的AI。\n"
"请先输出你的思考过程(用<THINK>标签包裹),"
"然后再输出最终答案(用<FINAL>标签包裹)。\n\n"
"示例:\n"
"<THINK>这里是推理过程</THINK>\n"
"<FINAL>这里是最终答案</FINAL>"
)
# ... everything else unchanged
页面实现
通过fetch实现简单示例效果
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>MiMo Chat</title>
<style>
/* Reset default spacing; size boxes by border edge everywhere. */
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
/* Dark theme palette. The --think-* variables style the collapsible
   reasoning panel; --accent styles the send button and caret. */
:root {
--bg: #0a0a0f;
--surface: #12121a;
--border: #2a2a3a;
--think-bg: #13130a;
--think-border: #3a3a1a;
--think-text: #d4a843;
--think-label: #b8941f;
--answer-text: #e0e0e0;
--accent: #6366f1;
--accent-glow: rgba(99, 102, 241, 0.15);
--muted: #6b6b80;
}
body {
font-family: 'Inter', -apple-system, sans-serif;
background: var(--bg);
color: var(--answer-text);
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
}
/* Centered single-column layout for input + message list. */
.chat-container {
width: 100%;
max-width: 760px;
padding: 24px 16px;
display: flex;
flex-direction: column;
gap: 20px;
}
/* Input bar stays pinned to the top while messages scroll. */
.input-area {
position: sticky;
top: 0;
z-index: 10;
background: var(--bg);
padding: 16px 0;
display: flex;
gap: 10px;
}
.input-area textarea {
flex: 1;
background: var(--surface);
border: 1px solid var(--border);
border-radius: 12px;
padding: 12px 16px;
color: var(--answer-text);
font-family: inherit;
font-size: 14px;
resize: none;
outline: none;
min-height: 44px;
max-height: 120px;
transition: border-color 0.2s;
}
.input-area textarea:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px var(--accent-glow);
}
.send-btn {
background: var(--accent);
border: none;
border-radius: 12px;
padding: 0 20px;
color: #fff;
font-weight: 600;
font-size: 14px;
cursor: pointer;
transition: all 0.2s;
white-space: nowrap;
}
.send-btn:hover {
opacity: 0.85;
}
/* Disabled while a request is streaming (see sendMessage). */
.send-btn:disabled {
opacity: 0.4;
cursor: not-allowed;
}
/* One card per AI reply; new cards slide up as they are prepended. */
.message {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 16px;
overflow: hidden;
animation: slideUp 0.3s ease;
}
@keyframes slideUp {
from {
opacity: 0;
transform: translateY(12px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
.message-header {
padding: 12px 16px;
border-bottom: 1px solid var(--border);
display: flex;
justify-content: space-between;
font-size: 12px;
color: var(--muted);
}
.message-body {
padding: 16px;
}
/* Reasoning panel: hidden until the first <THINK> text arrives
   (the .visible class is added by render()). */
.think-section {
background: var(--think-bg);
border: 1px solid var(--think-border);
border-radius: 10px;
margin-bottom: 12px;
overflow: hidden;
display: none;
}
.think-section.visible {
display: block;
}
.think-header {
padding: 10px 14px;
display: flex;
align-items: center;
gap: 8px;
cursor: pointer;
user-select: none;
font-size: 13px;
color: var(--think-label);
font-weight: 500;
}
.think-arrow {
display: inline-block;
transition: transform 0.25s;
font-size: 10px;
}
.think-arrow.collapsed {
transform: rotate(-90deg);
}
.think-content {
padding: 0 14px 12px;
font-family: 'JetBrains Mono', monospace;
font-size: 13px;
line-height: 1.75;
color: var(--think-text);
white-space: pre-wrap;
word-break: break-word;
max-height: 800px;
opacity: 1;
overflow: hidden;
transition: max-height 0.3s, padding 0.3s, opacity 0.3s;
}
/* Collapsed state animates max-height/opacity to zero instead of
   display:none so the toggle is smooth. */
.think-content.collapsed {
max-height: 0;
padding: 0 14px;
opacity: 0;
}
.answer-content {
font-size: 15px;
line-height: 1.8;
white-space: pre-wrap;
word-break: break-word;
}
/* Blinking typewriter caret appended after the latest answer text. */
.cursor {
display: inline-block;
width: 2px;
height: 1.1em;
background: var(--accent);
margin-left: 1px;
animation: blink 0.7s step-end infinite;
vertical-align: text-bottom;
}
@keyframes blink {
50% {
opacity: 0;
}
}
/* Three bouncing dots shown in the think header while reasoning streams. */
.loading-dots {
display: inline-flex;
gap: 3px;
margin-left: auto;
}
.loading-dots span {
width: 5px;
height: 5px;
background: var(--think-label);
border-radius: 50%;
animation: bounce 1.2s infinite;
}
.loading-dots span:nth-child(2) {
animation-delay: 0.15s;
}
.loading-dots span:nth-child(3) {
animation-delay: 0.3s;
}
@keyframes bounce {
0%, 80%, 100% {
transform: scale(0.5);
opacity: 0.3;
}
40% {
transform: scale(1);
opacity: 1;
}
}
</style>
</head>
<body>
<div class="chat-container">
<div class="input-area">
<!-- Enter sends; Shift+Enter inserts a newline. -->
<textarea id="queryInput" placeholder="输入问题..." rows="1" onkeydown="if(event.key==='Enter'&&!event.shiftKey){event.preventDefault();sendMessage();}">
请一步步思考:2+3等于多少?
</textarea>
<button class="send-btn" id="sendBtn" onclick="sendMessage()">发送</button>
</div>
<!-- Newest reply is prepended here by sendMessage(). -->
<div id="messages"></div>
</div>
<script>
// Re-entrancy guard: only one streaming request at a time.
let isSending = false;
/**
 * Send the textarea's query to the chat API and stream the NDJSON reply.
 *
 * Creates a message card, reads the response body chunk by chunk, splits
 * it on newlines into JSON records ({type:"chunk"|"done"}), accumulates
 * the raw text in `state.raw`, and calls render() after each record.
 */
async function sendMessage() {
if (isSending) return;
const query = document.getElementById('queryInput').value.trim();
if (!query) return;
isSending = true;
document.getElementById('sendBtn').disabled = true;
// Fresh id per message; conversation id persists across page loads.
const msgId = crypto.randomUUID().replace(/-/g, '');
const conversationId = localStorage.getItem('convId') || crypto.randomUUID().replace(/-/g, '');
localStorage.setItem('convId', conversationId);
const msgEl = createMessage(msgId);
document.getElementById('messages').prepend(msgEl);
// Incremental-render state shared with render():
//   raw           - all streamed text so far
//   lastThinkLen  - chars of think text already in the DOM
//   lastAnswerLen - chars of answer text already in the DOM
//   thinkCollapsed- whether the think panel auto-collapsed on </THINK>
const state = {
raw: '',
lastThinkLen: 0,
lastAnswerLen: 0,
thinkCollapsed: false,
};
try {
// NOTE(review): the server shown above only reads "query"; the extra
// fields mirror a richer API shape and are ignored by it.
const resp = await fetch('http://127.0.0.1:8000/api/bot/chat', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({
msgId, conversationId, query,
isEditedQuery: false,
modelConfig: {enableThinking: true, webSearchStatus: "disabled", model: ""},
multiMedias: [],
}),
});
const reader = resp.body.getReader();
// stream:true keeps multi-byte characters split across chunks intact.
const decoder = new TextDecoder();
let buffer = '';
while (true) {
const {done, value} = await reader.read();
if (done) break;
buffer += decoder.decode(value, {stream: true});
// Complete NDJSON lines end with '\n'; the tail stays in the buffer.
const lines = buffer.split('\n');
buffer = lines.pop();
for (const line of lines) {
if (!line.trim()) continue;
let data;
try {
data = JSON.parse(line);
} catch (e) {
// Skip malformed lines (e.g. truncated by a dropped connection).
continue;
}
if (data.type === 'chunk') {
state.raw += data.content;
render(msgEl, state);
} else if (data.type === 'done') {
render(msgEl, state);
removeCursor(msgEl);
}
}
}
// Flush a final record that arrived without a trailing newline.
if (buffer.trim()) {
try {
const d = JSON.parse(buffer);
if (d.type === 'chunk') state.raw += d.content;
} catch (e) {
}
}
render(msgEl, state);
removeCursor(msgEl);
} catch (err) {
msgEl.querySelector('.answer-content').textContent = '错误: ' + err.message;
} finally {
isSending = false;
document.getElementById('sendBtn').disabled = false;
}
}
/**
 * Incrementally render the accumulated stream into the message card.
 *
 * The text between <THINK>…</THINK> goes into the reasoning panel; the
 * text between <FINAL>…</FINAL> becomes the answer. The `|$` alternative
 * in each regex lets a still-open tag match to end-of-input, so partial
 * content renders while streaming.
 *
 * When a closing tag is cut mid-chunk (e.g. "</THI" arrives, "NK>" later),
 * that `|$` fallback would leak the partial tag into the content:
 *
 *   raw = "<THINK>2+3等于5</THI"
 *   thinkMatch[1] = "2+3等于5</THI"       <- partial tag leaked by fallback
 *   raw.includes('</THINK>') === false     <- full tag not here yet
 *   stripPartial(thinkText, '</THINK>')    <- peel the residue off the end
 *     -> "2+3等于5"                        <- clean
 *
 *   raw = "<THINK>2+3等于5</THINK>"
 *   thinkMatch[1] = "2+3等于5"             <- exact match, no residue
 *   raw.includes('</THINK>') === true      -> no stripping needed
 */
function render(msgEl, state) {
const raw = state.raw;
const thinkMatch = raw.match(/<THINK>([\s\S]*?)(?:<\/THINK>|$)/);
const answerMatch = raw.match(/<FINAL>([\s\S]*?)(?:<\/FINAL>|$)/);
let thinkText = thinkMatch ? thinkMatch[1] : '';
let answerText = answerMatch ? answerMatch[1] : '';
// Closing tag incomplete: peel any partial-tag residue off the tail.
if (!raw.includes('</THINK>')) {
thinkText = stripPartial(thinkText, '</THINK>');
}
if (!raw.includes('</FINAL>')) {
answerText = stripPartial(answerText, '</FINAL>');
}
// Think panel: append only the new delta (DOM text is never rewritten).
if (thinkText.length > 0) {
const section = msgEl.querySelector('.think-section');
const content = msgEl.querySelector('.think-content');
section.classList.add('visible');
if (thinkText.length > state.lastThinkLen) {
const delta = thinkText.substring(state.lastThinkLen);
content.appendChild(document.createTextNode(delta));
content.scrollTop = content.scrollHeight;
state.lastThinkLen = thinkText.length;
}
// Reasoning finished: hide the dots and auto-collapse, exactly once.
if (raw.includes('</THINK>') && !state.thinkCollapsed) {
state.thinkCollapsed = true;
const dots = section.querySelector('.loading-dots');
if (dots) dots.style.display = 'none';
section.querySelector('.think-arrow').classList.add('collapsed');
content.classList.add('collapsed');
}
}
// Answer area: append the delta and move the blinking caret to the end.
if (answerText.length > 0) {
const content = msgEl.querySelector('.answer-content');
if (answerText.length > state.lastAnswerLen) {
const delta = answerText.substring(state.lastAnswerLen);
const oldCursor = content.querySelector('.cursor');
if (oldCursor) oldCursor.remove();
content.appendChild(document.createTextNode(delta));
const cursor = document.createElement('span');
cursor.className = 'cursor';
content.appendChild(cursor);
state.lastAnswerLen = answerText.length;
}
}
}
/**
* 从文本末尾剥掉部分标签
* 比如 stripPartial("2+3等于5</THI", "</THINK") → "2+3等于5"
* 逐个尝试 "</", "</T", "</TH", ... 直到完整标签
*/
function stripPartial(text, fullTag) {
for (let i = 2; i <= fullTag.length; i++) {
const suffix = fullTag.substring(0, i);
if (text.endsWith(suffix)) {
return text.slice(0, -suffix.length);
}
}
return text;
}
/**
 * Build an empty message card for one AI reply.
 *
 * Contains a header (label + timestamp), a hidden think-section with a
 * clickable header, loading dots and an empty content area, and an empty
 * answer-content div. render() fills the text in incrementally.
 */
function createMessage(msgId) {
const el = document.createElement('div');
el.className = 'message';
el.id = `msg-${msgId}`;
el.innerHTML = `
<div class="message-header">
<span>AI 助手</span>
<span class="timestamp">${new Date().toLocaleTimeString()}</span>
</div>
<div class="message-body">
<div class="think-section">
<div class="think-header" onclick="toggleThink(this)">
<span class="think-arrow">▼</span>
<span>思考过程</span>
<div class="loading-dots"><span></span><span></span><span></span></div>
</div>
<div class="think-content"></div>
</div>
<div class="answer-content"></div>
</div>
`;
return el;
}
// Remove every blinking caret element left inside a finished message card.
function removeCursor(msgEl) {
for (const cursorEl of msgEl.querySelectorAll('.cursor')) {
cursorEl.remove();
}
}
// Collapse/expand the reasoning panel when its header is clicked:
// rotate the arrow and animate the content's collapsed state.
function toggleThink(header) {
const arrow = header.querySelector('.think-arrow');
const body = header.parentElement.querySelector('.think-content');
arrow.classList.toggle('collapsed');
body.classList.toggle('collapsed');
}
</script>
</body>
</html>

思考模型
如果业务需要大模型输出 JSON 格式,可以考虑使用"仅思考"模型——这类模型在回复前总会先进行思考,后续有时间再展开介绍这个方案。

浙公网安备 33010602011771号