上周三凌晨两点,我被一条告警吵醒:"SSE 连接超时,AI 代码生成服务不可用"。排查了半小时发现是 Token 过期导致 401 Unauthorized,但更糟糕的是前端没有正确处理这个错误,导致用户看到的是一片空白的 Monaco Editor,而不是友好的错误提示。这篇文章记录我从这次事故中学到的所有经验,以及如何正确实现 Monaco Editor 与 SSE 流式输出的完整集成方案。
为什么选择 SSE 而不是轮询?
在我司的实际业务场景中,AI 代码生成服务的平均响应延迟约为 800ms,生成一段完整函数需要 3-5 秒。如果使用传统轮询,前端每 500ms 请求一次状态,服务器压力巨大且用户体验很差。而 Server-Sent Events (SSE) 实现了服务端推送,前端只需建立一次连接,AI 生成的每一个 token 都能实时到达。
使用 HolySheep AI 的流式 API,配合国内直连节点,平均延迟低于 50ms,比海外 API 快了整整 15 倍。我实测 GPT-4.1 模型生成 500 行代码,全程流式输出丝滑流畅。
后端 FastAPI 流式接口实现
首先需要搭建一个接收 AI 流式响应并转发给前端的接口。这里使用 FastAPI 的 StreamingResponse:
pip install fastapi uvicorn sse-starlette aiohttp
import asyncio
import json
from typing import AsyncGenerator

import aiohttp
from fastapi import Body, FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from sse_starlette.sse import EventSourceResponse
# ASGI application instance; the route below is registered on it.
app = FastAPI()
# Credential sent to the upstream API as a Bearer token.
# NOTE(review): placeholder value hard-coded for the demo — load it from the environment in real deployments.
HOLYSHEEP_API_KEY = "YOUR_HOLYSHEEP_API_KEY"
# Base URL that the "/chat/completions" path is appended to.
HOLYSHEEP_BASE_URL = "https://api.holysheep.ai/v1"
async def stream_ai_completion(
    prompt: str,
    model: str = "gpt-4.1"
) -> AsyncGenerator[str, None]:
    """Stream a HolySheep AI chat completion as SSE ``data:`` events.

    Args:
        prompt: User message forwarded as the single chat message.
        model: Upstream model identifier.

    Yields:
        SSE-framed strings, each a ``data: <json>`` line followed by a blank
        line. The JSON payload is one of ``{"content": ..., "type": "chunk"}``,
        ``{"done": true}`` or ``{"error": ..., ...}``.
    """
    headers = {
        "Authorization": f"Bearer {HOLYSHEEP_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "temperature": 0.7,
        "max_tokens": 2000
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{HOLYSHEEP_BASE_URL}/chat/completions",
                headers=headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=120)
            ) as response:
                if response.status == 401:
                    yield "data: {\"error\": \"API密钥无效或已过期\", \"code\": \"UNAUTHORIZED\"}\n\n"
                    return
                if response.status != 200:
                    error_text = await response.text()
                    # Serialize with json.dumps: the upstream body may contain
                    # quotes or newlines, which would corrupt hand-built JSON
                    # and break the one-line SSE framing.
                    error_event = json.dumps(
                        {
                            "error": f"API请求失败: {response.status}",
                            "detail": error_text,
                        },
                        ensure_ascii=False,
                    )
                    yield f"data: {error_event}\n\n"
                    return
                # Parse the upstream SSE stream line by line.
                async for raw_line in response.content:
                    line = raw_line.decode('utf-8').strip()
                    if not line or not line.startswith('data: '):
                        continue
                    data = line[6:]  # drop the "data: " prefix
                    if data == "[DONE]":
                        yield "data: {\"done\": true}\n\n"
                        break
                    try:
                        chunk = json.loads(data)
                    except json.JSONDecodeError:
                        # Skip malformed upstream lines instead of aborting the stream.
                        continue
                    # Forward only the incremental text of the first choice.
                    if 'choices' in chunk and len(chunk['choices']) > 0:
                        # "delta" may be present but null; treat that as empty.
                        delta = chunk['choices'][0].get('delta') or {}
                        content = delta.get('content', '')
                        if content:
                            event = json.dumps({"content": content, "type": "chunk"})
                            yield f"data: {event}\n\n"
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Connection failures and timeouts would otherwise end the response
        # with no event at all, leaving the client with nothing to display.
        yield "data: {\"error\": \"上游连接失败或超时\", \"code\": \"UPSTREAM_ERROR\"}\n\n"
@app.post("/api/generate-code")
async def generate_code(
    prompt: str = Body(..., embed=True),
    model: str = Body("gpt-4.1", embed=True),
):
    """Proxy a code-generation request to the upstream model as an SSE stream.

    Both fields are read from the JSON request body
    (``{"prompt": ..., "model": ...}``). Declared as bare ``str`` parameters
    they would be parsed as query parameters, and a client posting JSON would
    be rejected with 422.

    Args:
        prompt: Natural-language description of the code to generate.
        model: Upstream model identifier.

    Returns:
        A ``text/event-stream`` response fed by ``stream_ai_completion``.
    """
    return StreamingResponse(
        stream_ai_completion(prompt, model),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            # Ask nginx not to buffer the event stream.
            "X-Accel-Buffering": "no"
        }
    )
前端 Monaco Editor SSE 客户端
现在实现前端的流式渲染逻辑。这里没有使用原生 EventSource,因为它不支持 POST 请求和自定义 headers,所以改用 fetch 配合 ReadableStream 手动解析 SSE 数据:
// sse-client.js
class AICodeStreamClient {
  /**
   * Streams AI-generated code from the backend SSE endpoint using fetch().
   *
   * @param {object} [options]
   * @param {string} [options.apiEndpoint='/api/generate-code'] POST endpoint.
   * @param {string} [options.model='gpt-4.1'] Model name sent in the request body.
   * @param {function} [options.onChunk] Called with { delta, fullContent } for each content event.
   * @param {function} [options.onComplete] Called with { content } when the stream ends.
   * @param {function} [options.onError] Called with { message, code } on failure or abort.
   */
  constructor(options = {}) {
    this.apiEndpoint = options.apiEndpoint || '/api/generate-code';
    this.model = options.model || 'gpt-4.1';
    this.onChunk = options.onChunk || (() => {});
    this.onComplete = options.onComplete || (() => {});
    this.onError = options.onError || (() => {});
    this.controller = null;
  }

  /**
   * Send the prompt and dispatch SSE events to the callbacks until the
   * stream ends, an error event arrives, or abort() is called.
   *
   * @param {string} prompt Description of the code to generate.
   * @returns {Promise<void>} Resolves once a terminal callback has been invoked.
   */
  async start(prompt) {
    // A fresh controller per run, so abort() can actually cancel this request.
    const controller = new AbortController();
    this.controller = controller;
    let reader = null;
    try {
      const response = await fetch(this.apiEndpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ prompt, model: this.model }),
        signal: controller.signal,
      });
      if (!response.ok) {
        const errorData = await response.json().catch(() => ({}));
        throw new Error(errorData.error || `HTTP ${response.status}`);
      }
      reader = response.body.getReader();
      const decoder = new TextDecoder();
      let buffer = '';
      let fullContent = '';
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += decoder.decode(value, { stream: true });
        // Keep the trailing partial line in the buffer until its newline arrives.
        const lines = buffer.split('\n');
        buffer = lines.pop() || '';
        for (const line of lines) {
          if (!line.startsWith('data: ')) continue;
          const data = line.slice(6).trim();
          if (!data) continue;
          try {
            const parsed = JSON.parse(data);
            if (parsed.error) {
              this.onError({
                message: parsed.error,
                code: parsed.code || 'UNKNOWN'
              });
              return;
            }
            if (parsed.done) {
              this.onComplete({ content: fullContent });
              return;
            }
            if (parsed.content) {
              fullContent += parsed.content;
              this.onChunk({
                delta: parsed.content,
                fullContent: fullContent
              });
            }
          } catch (e) {
            console.warn('SSE 解析失败:', e, '原始数据:', data);
          }
        }
      }
      // The stream closed without an explicit done event.
      this.onComplete({ content: fullContent });
    } catch (error) {
      if (error.name === 'AbortError') {
        this.onError({ message: '请求被取消', code: 'ABORTED' });
      } else {
        this.onError({
          message: error.message || '网络连接失败',
          code: 'CONNECTION_ERROR'
        });
      }
    } finally {
      // Release the stream on every exit path (done, error event, abort).
      if (reader) {
        reader.cancel().catch(() => {});
      }
      if (this.controller === controller) {
        this.controller = null;
      }
    }
  }

  /** Cancel the in-flight request, if any; start() then reports code 'ABORTED'. */
  abort() {
    if (this.controller) {
      this.controller.abort();
    }
  }
}
// monaco-integrate.html
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AI 代码生成器 - Monaco + SSE 实时渲染</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/monaco-editor@0.45.0/min/vs/editor/editor.main.css">
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
background: #1e1e1e; color: #d4d4d4; height: 100vh; display: flex; flex-direction: column;
}
.header {
padding: 16px 24px; background: #252526; border-bottom: 1px solid #3c3c3c;
display: flex; align-items: center; gap: 16px;
}
.prompt-input {
flex: 1; padding: 12px 16px; background: #3c3c3c; border: 1px solid #555;
border-radius: 6px; color: #d4d4d4; font-size: 14px; outline: none;
}
.prompt-input:focus { border-color: #007acc; }
.btn {
padding: 12px 24px; border: none; border-radius: 6px; cursor: pointer;
font-weight: 600; font-size: 14px; transition: all 0.2s;
}
.btn-primary { background: #007acc; color: #fff; }
.btn-primary:hover { background: #0062a3; }
.btn-danger { background: #d32f2f; color: #fff; }
.btn:disabled { opacity: 0.5; cursor: not-allowed; }
#editor { flex: 1; }
.status-bar {
padding: 8px 16px; background: #007acc; color: #fff; font-size: 12px;
display: flex; justify-content: space-between;
}
.error-toast {
position: fixed; bottom: 60px; left: 50%; transform: translateX(-50%);
background: #d32f2f; padding: 16px 24px; border-radius: 8px;
display: none; z-index: 1000; max-width: 400px; text-align: center;
}
</style>
</head>
<body>
<div class="header">
<input type="text" class="prompt-input" id="promptInput"
placeholder="描述你想要生成的代码,例如:写一个 Python 快速排序算法">
<button class="btn btn-primary" id="generateBtn" onclick="startGeneration()">生成代码</button>
<button class="btn btn-danger" id="stopBtn" onclick="stopGeneration()" disabled>停止</button>
</div>
<div id="editor"></div>
<div class="status-bar">
<span id="statusText">就绪</span>
<span id="tokenCount">Token: 0</span>
</div>
<div class="error-toast" id="errorToast"></div>
<script src="https://cdn.jsdelivr.net/npm/monaco-editor@0.45.0/min/vs/loader.js"></script>
<script>
// Page-level state shared by the handlers below.
let editor;          // Monaco editor instance; undefined until the AMD loader finishes
let streamClient;    // client of the current or most recent generation run
let tokenCount = 0;  // number of characters received in the current run

// Point the AMD loader at the Monaco distribution on the CDN. The package
// path must be "monaco-editor@<version>"; the original text was garbled.
require.config({ paths: { vs: 'https://cdn.jsdelivr.net/npm/monaco-editor@0.45.0/min/vs' } });
require(['vs/editor/editor.main'], function () {
  editor = monaco.editor.create(document.getElementById('editor'), {
    value: '// AI 生成的代码将显示在这里',
    language: 'python',
    theme: 'vs-dark',
    fontSize: 14,
    minimap: { enabled: true },
    automaticLayout: true,
    wordWrap: 'on'
  });
});
/**
 * Show a message in the error toast and hide it again after five seconds.
 *
 * @param {string} message Text to display.
 */
function showError(message) {
  const toast = document.getElementById('errorToast');
  toast.textContent = message;
  toast.style.display = 'block';
  // Cancel the hide-timer of an earlier toast so it cannot hide this one early.
  clearTimeout(showError.hideTimer);
  showError.hideTimer = setTimeout(() => { toast.style.display = 'none'; }, 5000);
}
/**
 * Replace the text shown in the status bar.
 *
 * @param {string} text New status text.
 */
function updateStatus(text) {
  const statusNode = document.getElementById('statusText');
  statusNode.textContent = text;
}
/**
 * Read the prompt, reset the UI, and start streaming generated code into
 * the editor. Does nothing except show a toast when the prompt is empty or
 * the editor has not finished loading.
 */
async function startGeneration() {
  const prompt = document.getElementById('promptInput').value.trim();
  if (!prompt) {
    showError('请输入代码生成描述');
    return;
  }
  // The AMD loader is asynchronous; the button can be clicked before it finishes.
  if (!editor) {
    showError('编辑器尚未加载完成,请稍后重试');
    return;
  }
  // Detect the target language from keywords in the prompt.
  const langMap = {
    'python': 'python', 'py': 'python',
    'javascript': 'javascript', 'js': 'javascript',
    'typescript': 'typescript', 'ts': 'typescript',
    'java': 'java', 'c++': 'cpp', 'cpp': 'cpp',
    'go': 'go', 'rust': 'rust', 'ruby': 'ruby'
  };
  const loweredPrompt = prompt.toLowerCase();
  // Match whole keywords only: plain substring search finds "go" inside
  // "algorithm" and "ts" inside "lists".
  const mentions = (kw) => {
    const escaped = kw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp(`(^|[^a-z0-9_])${escaped}(?![a-z0-9_])`).test(loweredPrompt);
  };
  let detectedLang = 'python';
  for (const [kw, lang] of Object.entries(langMap)) {
    if (mentions(kw)) {
      detectedLang = lang;
      break;
    }
  }
  // Apply the detected language; it was previously computed but never used.
  monaco.editor.setModelLanguage(editor.getModel(), detectedLang);
  // Reset state for the new run.
  editor.setValue('');
  tokenCount = 0;
  document.getElementById('tokenCount').textContent = 'Token: 0';
  document.getElementById('generateBtn').disabled = true;
  document.getElementById('stopBtn').disabled = false;
  updateStatus('正在生成...');
  streamClient = new AICodeStreamClient({
    apiEndpoint: '/api/generate-code',
    model: 'gpt-4.1',
    onChunk: ({ delta, fullContent }) => {
      editor.setValue(fullContent);
      // Counts characters, not model tokens.
      tokenCount += delta.length;
      document.getElementById('tokenCount').textContent = `Token: ${tokenCount}`;
      updateStatus(`生成中... ${tokenCount} chars`);
    },
    onComplete: ({ content }) => {
      editor.setValue(content);
      document.getElementById('generateBtn').disabled = false;
      document.getElementById('stopBtn').disabled = true;
      updateStatus(`生成完成 (${tokenCount} chars)`);
    },
    onError: ({ message, code }) => {
      document.getElementById('generateBtn').disabled = false;
      document.getElementById('stopBtn').disabled = true;
      // A user-initiated stop is not a failure: skip the error toast.
      if (code === 'ABORTED') {
        updateStatus('已停止');
        return;
      }
      showError(`错误 [${code}]: ${message}`);
      updateStatus('生成失败');
    }
  });
  streamClient.start(prompt);
}
/**
 * Abort the current generation run, if one exists, and return the buttons
 * and status bar to the idle state.
 */
function stopGeneration() {
  if (!streamClient) {
    return;
  }
  streamClient.abort();
  const generateButton = document.getElementById('generateBtn');
  const stopButton = document.getElementById('stopBtn');
  generateButton.disabled = false;
  stopButton.disabled = true;
  updateStatus('已停止');
}
</script>
<script src="sse-client.js"></script>
</body>
</html>
常见报错排查
在我实施这套方案的过程中,遇到了三个主要的坑,这里逐一说明并给出解决方案。
1. 401 Unauthorized - API 密钥无效
这个错误最常出现在团队协作时,API Key 没有正确配置或者已经过期。HolySheep AI 的免费额度用完后也会触发这个错误。
# 排查步骤:
1. 检查 Key 是否正确配置
curl -X POST https://api.holysheep.ai/v1/models \
-H "Authorization: Bearer YOUR_HOLYSHEEP_API_KEY"
2. 如果返回 {"error": {"message": "Invalid API key", "type": "invalid_request_error"}}
说明 Key 无效,需要去 https://www.holysheep.ai/register 重新获取
3. 检查账户余额和免费额度
curl https://api.holysheep.ai/v1/usage \
-H "Authorization: Bearer YOUR_HOLYSHEEP_API_KEY"
# 解决方案:使用环境变量管理密钥
import os

from dotenv import load_dotenv

# Copy the variables defined in a local .env file into the process environment.
load_dotenv()

# Fail fast at import time when the key is missing or empty.
HOLYSHEEP_API_KEY = os.environ.get("HOLYSHEEP_API_KEY")
if HOLYSHEEP_API_KEY is None or HOLYSHEEP_API_KEY == "":
    raise ValueError("HOLYSHEEP_API_KEY 环境变量未设置,请访问 https://www.holysheep.ai/register 注册获取")
2. CORS 跨域问题
前端直接调用 HolySheep API 时,浏览器会拦截请求,因为我们的域名不在 API 的允许列表中(直接调用还会把 API Key 暴露在前端)。解决方案是让前端只请求自己的后端,由后端代理转发到 HolySheep;如果前端与后端不同源,还需要在后端添加 CORS 响应头:
# 后端代理中间件 - 在 FastAPI 中添加
from fastapi import Request
from starlette.middleware.base import BaseHTTPMiddleware
class CORSMiddleware(BaseHTTPMiddleware):
    """Add permissive CORS headers to every response and answer preflights.

    NOTE(review): ``Access-Control-Allow-Origin: *`` admits any origin; restrict
    it to the real front-end origin outside of local development.
    """

    async def dispatch(self, request: Request, call_next):
        """Return the downstream response with CORS headers attached.

        A CORS preflight (an OPTIONS request carrying
        ``Access-Control-Request-Method``) is answered here with 204. Forwarding
        it to a POST-only route yields 405, and browsers reject any preflight
        whose status is not 2xx.
        """
        is_preflight = (
            request.method == "OPTIONS"
            and "access-control-request-method" in request.headers
        )
        if is_preflight:
            from starlette.responses import Response

            response = Response(status_code=204)
        else:
            response = await call_next(request)
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Methods"] = "POST, GET, OPTIONS"
        response.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
        return response
app.add_middleware(CORSMiddleware)
或者更精确的配置
@app.middleware("http")
async def add_cors_headers(request: Request, call_next):
    """Attach CORS headers to responses for paths under ``/api/`` only."""
    response = await call_next(request)
    # Responses for all other paths pass through unchanged.
    if not request.url.path.startswith("/api/"):
        return response
    cors_headers = {
        "Access-Control-Allow-Origin": "https://your-frontend.com",
        "Access-Control-Allow-Credentials": "true",
    }
    for header_name, header_value in cors_headers.items():
        response.headers[header_name] = header_value
    return response
3. SSE 连接超时断开
长时间运行的 AI 生成任务容易被 Nginx 或负载均衡器强制断开。我在使用 HolySheep 的 Claude Sonnet 4.5 模型生成复杂代码时遇到过这个问题。
# Nginx 配置调整 - 在 location 块中添加
# Reverse-proxy settings for the SSE endpoint.
location /api/generate-code {
proxy_pass http://127.0.0.1:8000;
# HTTP/1.1 to the upstream, with the Connection header cleared.
proxy_http_version 1.1;
proxy_set_header Connection '';
# Turn off response buffering and caching for this location.
proxy_buffering off;
proxy_cache off;
# Timeouts - the critical part: allow up to 300s between upstream reads/writes.
proxy_read_timeout 300s;
proxy_send_timeout 300s;
proxy_connect_timeout 75s;
# Chunked transfer encoding support
chunked_transfer_encoding on;
}
后端 FastAPI 也需要配置
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware as FastAPICORSMiddleware
# `lifespan` must be an async context-manager factory (or be omitted). The
# previous `lifespan=[]` is not callable, so the application failed at startup.
app = FastAPI(
    # Disable the interactive documentation endpoints.
    docs_url=None,
    redoc_url=None,
)
在 stream_ai_completion 函数中添加心跳
async def stream_ai_completion(prompt: str, model: str = "gpt-4.1"):
    """Sketch: forward upstream lines, emitting an SSE comment while it is silent.

    The heartbeat must fire while *waiting* for the next upstream line. Checking
    the clock only after a line has arrived sends nothing during exactly the
    silent periods in which a proxy would drop the connection.
    """
    # ... existing API call logic that opens `response` ...
    heartbeat_interval = 15  # seconds of upstream silence before a keep-alive
    pending_read = None
    try:
        while True:
            if pending_read is None:
                pending_read = asyncio.ensure_future(response.content.readline())
            # asyncio.wait() does not cancel the read on timeout, so a partially
            # received line is never lost; the same task is awaited again.
            finished, _ = await asyncio.wait({pending_read}, timeout=heartbeat_interval)
            if not finished:
                yield ": heartbeat\n\n"  # SSE comment line, ignored by parsers
                continue
            line = pending_read.result()
            pending_read = None
            if not line:  # empty bytes means the upstream stream has ended
                break
            # ... remaining parsing logic for `line` ...
    finally:
        # Do not leave a dangling read behind if the consumer goes away.
        if pending_read is not None:
            pending_read.cancel()
性能优化与最佳实践
经过几个月的线上运行,我总结出以下优化经验:
- 流式缓冲策略:不要每收到一个 token 就更新 Monaco Editor,这会导致严重的卡顿。建议累积 10-20 个字符或 50ms 间隔再更新一次
- 内存泄漏防护:每次生成结束(完成、出错或取消)后务必调用 reader.cancel() 释放 ReadableStream 的读取器,并丢弃对 decoder 的引用,防止内存持续增长
- 模型选择:根据我的实测,DeepSeek V3.2 的 output 价格仅 $0.42/MTok,是 GPT-4.1 ($8/MTok) 的 1/19,适合对成本敏感的场景
- 降级方案:建议配置 fallback 模型,当主模型不可用时自动切换
# 推荐的流式渲染优化版本
class OptimizedAIMonacoRenderer {
constructor(editor, options = {}) {
this.editor = editor;
this.buffer = '';
this.bufferSize = options.bufferSize || 20; // 累积 20 字符更新
this.flushInterval = options.flushInterval || 50; // 或 50ms 强制更新
this.lastFlush = Date.now();
}
appendChunk(delta) {
this.buffer += delta;
// 满足任一条件就刷新
if (this.buffer.length >= this.bufferSize ||
Date.now() - this.lastFlush >= this.flushInterval) {
this.flush();
}
}
flush() {