ChatGLM-6B API设计：RESTful接口开发指南-平芜编程栈

ChatGLM-6B API设计：RESTful接口开发指南

1. 为什么需要专业的API设计

当你把ChatGLM-6B模型部署好，能通过命令行或网页界面和它对话时，可能觉得已经完成了大部分工作。但实际在工程落地中，真正考验能力的往往是API设计环节——它决定了你的模型服务能否被其他系统稳定调用、能否应对真实业务流量、能否在生产环境中长期可靠运行。

我见过太多团队把模型跑起来就以为万事大吉，结果上线后遇到各种问题：用户并发一高服务就卡死，没有认证机制导致资源被滥用，日志缺失让故障排查无从下手，限流策略缺失导致GPU显存被耗尽……这些问题都不是模型本身的问题，而是API设计没到位。

API不是简单地把模型封装成HTTP接口就完事了。它是一套完整的生产级服务契约，需要考虑安全性、可靠性、可观测性和可维护性。本文会带你从零开始，构建一个真正能上生产的ChatGLM-6B RESTful API服务，不讲空泛理论，只给可直接落地的代码和实践建议。

2. 基础API服务搭建

2.1 快速启动原生API服务

ChatGLM-6B官方仓库已经提供了基础的API实现，位于api.py文件中。这个版本使用FastAPI框架，轻量且性能优秀。我们先从这个起点出发，然后逐步增强功能。

首先确保已安装必要依赖：

pip install fastapi uvicorn transformers torch sentencepiece accelerate

创建一个基础的api.py文件（基于官方实现优化）：

# api.py
import datetime
import json
import threading
import time
import uuid

import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from transformers import AutoTokenizer, AutoModel

app = FastAPI(
    title="ChatGLM-6B API Service",
    description="生产级ChatGLM-6B RESTful API服务",
    version="1.0.0"
)

# Process-wide model and tokenizer, populated once by load_model().
model = None
tokenizer = None

# Generation runs in a worker thread (so the event loop stays responsive);
# this lock keeps it one-at-a-time, which is what the previous
# "blocking call inside async def" effectively enforced.
_inference_lock = threading.Lock()


def load_model():
    """Load the tokenizer and model into the module-level globals.

    Tries the INT4-quantised checkpoint first and falls back to the FP16
    checkpoint on CUDA if that fails.

    Raises:
        RuntimeError: if neither checkpoint can be loaded.
    """
    global model, tokenizer

    try:
        # Preferred: INT4-quantised checkpoint.
        tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True)
        model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).float()
        print(" 已加载INT4量化模型（推荐生产环境使用）")
    except Exception as e:
        print(f" INT4模型加载失败，尝试FP16版本: {e}")
        try:
            tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
            model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
            print(" 已加载FP16模型（需GPU支持）")
        except Exception as e2:
            raise RuntimeError(f"模型加载失败: {e2}") from e2


@app.on_event("startup")
async def startup_event():
    """Load the model when the application starts and switch it to eval mode."""
    print(" 正在加载ChatGLM-6B模型...")
    start_time = time.time()
    load_model()
    end_time = time.time()
    print(f"⏱ 模型加载完成，耗时: {end_time - start_time:.2f}秒")
    model.eval()


@app.get("/")
async def root():
    """Return a static service banner."""
    return {
        "message": "ChatGLM-6B API服务已启动",
        "status": "healthy",
        "model": "ChatGLM-6B",
        "version": "1.0.0"
    }


def _is_number(value):
    """Return True for int/float values, excluding bool."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _parse_chat_payload(data):
    """Validate the decoded JSON body of a /chat request.

    Returns:
        A ``(prompt, history, generation_kwargs)`` tuple ready for ``model.chat``.

    Raises:
        HTTPException: status 400 when a field is missing or has the wrong type.
    """
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="请求体必须是JSON对象")

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str):
        raise HTTPException(status_code=400, detail="prompt参数必须是字符串")
    prompt = prompt.strip()
    if not prompt:
        raise HTTPException(status_code=400, detail="prompt参数不能为空")
    if len(prompt) > 1000:
        raise HTTPException(status_code=400, detail="prompt长度不能超过1000字符")

    history = data.get("history", [])
    if not isinstance(history, list):
        raise HTTPException(status_code=400, detail="history参数必须是数组")

    max_length = data.get("max_length", 2048)
    if isinstance(max_length, bool) or not isinstance(max_length, int) or max_length < 1:
        raise HTTPException(status_code=400, detail="max_length参数必须是正整数")

    top_p = data.get("top_p", 0.7)
    temperature = data.get("temperature", 0.95)
    if not _is_number(top_p) or not _is_number(temperature):
        raise HTTPException(status_code=400, detail="top_p和temperature参数必须是数字")

    return prompt, history, {
        "max_length": max_length,
        "top_p": top_p,
        "temperature": temperature,
    }


def _generate(prompt, history, **generation_kwargs):
    """Run one blocking ``model.chat`` call while holding the inference lock."""
    with _inference_lock:
        return model.chat(tokenizer, prompt, history=history, **generation_kwargs)


@app.post("/chat")
async def chat_endpoint(request: Request):
    """Core chat endpoint.

    Expects a JSON object with ``prompt`` (required) and optional ``history``,
    ``max_length``, ``top_p`` and ``temperature``. Validation failures are
    reported as 400; they are raised outside the inference ``try`` so the
    catch-all 500 handler below cannot swallow them.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        raise HTTPException(status_code=400, detail="请求体必须是合法的JSON") from None

    prompt, history, generation_kwargs = _parse_chat_payload(data)

    try:
        # model.chat blocks for the whole generation; keep it off the event loop.
        response, updated_history = await run_in_threadpool(
            _generate, prompt, history, **generation_kwargs
        )
    except torch.cuda.OutOfMemoryError:
        raise HTTPException(status_code=503, detail="GPU内存不足，请减少请求并发或使用量化模型")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"服务内部错误: {str(e)}") from e

    now = datetime.datetime.now()
    return {
        "response": response,
        "history": updated_history,
        "status": 200,
        "time": now.strftime("%Y-%m-%d %H:%M:%S"),
        # uuid4 instead of hash(prompt): hash() is salted per process and a
        # one-second timestamp lets identical prompts collide.
        "request_id": f"req_{uuid.uuid4().hex}"
    }


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)

启动服务：

python api.py

测试接口：

curl -X POST "http://127.0.0.1:8000/chat" \ -H "Content-Type: application/json" \ -d '{"prompt": "你好，你是谁？", "history": []}'

这个基础版本已经比官方原版更实用：增加了输入验证、错误处理、请求ID生成和更友好的响应结构。但距离生产环境还有很大差距，接下来我们逐项增强。

3. 认证与授权机制

3.1 API密钥认证实现

没有认证的API就像敞开的大门，任何人都可以随意调用，不仅消耗你的计算资源，还可能带来安全风险。我们采用简单的API密钥认证方案，既轻量又足够安全。

首先创建一个密钥管理模块：

# auth.py
import hashlib
import secrets
import sqlite3
from contextlib import closing
from datetime import datetime, timedelta
from typing import Optional, Dict, Any


class APIKeyManager:
    """SQLite-backed store for API keys.

    Only the SHA-256 digest of a key is persisted; the plaintext is returned
    once by ``create_api_key`` and never stored.
    """

    def __init__(self, db_path: str = "api_keys.db"):
        self.db_path = db_path
        self._init_database()

    @staticmethod
    def _hash_key(api_key: str) -> str:
        """Return the hex SHA-256 digest used as the lookup column."""
        return hashlib.sha256(api_key.encode("utf-8")).hexdigest()

    def _init_database(self):
        """Create the ``api_keys`` table if it does not exist yet."""
        # closing() closes the connection; the inner `conn` context commits
        # on success and rolls back on error.
        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS api_keys (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    key_hash TEXT UNIQUE NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    expires_at TIMESTAMP,
                    is_active BOOLEAN DEFAULT 1,
                    description TEXT,
                    usage_count INTEGER DEFAULT 0,
                    last_used TIMESTAMP
                )
            ''')

    def create_api_key(self, description: str = "", expires_in_days: int = 365) -> str:
        """Create a new API key and return its plaintext.

        The stored hash is derived from the key itself, so the returned key
        can later be matched by ``validate_api_key``.
        """
        key = secrets.token_urlsafe(32)
        key_hash = self._hash_key(key)
        # Store an explicit ISO string instead of relying on sqlite3's
        # implicit datetime adapter (deprecated since Python 3.12).
        expires_at = (datetime.now() + timedelta(days=expires_in_days)).isoformat(" ")

        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            conn.execute('''
                INSERT INTO api_keys (key_hash, expires_at, description)
                VALUES (?, ?, ?)
            ''', (key_hash, expires_at, description))

        return key

    def validate_api_key(self, api_key: str) -> bool:
        """Return True if ``api_key`` is known, active and not expired.

        A successful validation also increments ``usage_count`` and updates
        ``last_used`` for that key.
        """
        if not api_key:
            return False

        key_hash = self._hash_key(api_key)

        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            row = conn.execute('''
                SELECT expires_at FROM api_keys
                WHERE key_hash = ? AND is_active = 1
            ''', (key_hash,)).fetchone()

            if not row:
                return False

            expires_at = row[0]
            if expires_at:
                expiry = datetime.fromisoformat(expires_at.replace('Z', '+00:00'))
                # Compare like with like: aware expiry against aware "now".
                now = datetime.now(expiry.tzinfo) if expiry.tzinfo else datetime.now()
                if now > expiry:
                    return False

            conn.execute('''
                UPDATE api_keys
                SET usage_count = usage_count + 1, last_used = ?
                WHERE key_hash = ?
            ''', (datetime.now().isoformat(" "), key_hash))

        return True

    def get_usage_stats(self) -> Dict[str, Any]:
        """Return total key count, active key count and the latest ``last_used``."""
        with closing(sqlite3.connect(self.db_path)) as conn:
            result = conn.execute('''
                SELECT
                    COUNT(*) as total_keys,
                    SUM(CASE WHEN is_active = 1 THEN 1 ELSE 0 END) as active_keys,
                    MAX(last_used) as last_used
                FROM api_keys
            ''').fetchone()

        return {
            "total_keys": result[0] if result else 0,
            # SUM() yields NULL on an empty table; report 0 instead of None.
            "active_keys": (result[1] or 0) if result else 0,
            "last_used": result[2] if result else None
        }


# Usage example
# key_manager = APIKeyManager()
# new_key = key_manager.create_api_key("生产环境主密钥", expires_in_days=365)
# print(f"新密钥: {new_key}")

然后在主API中集成认证：

# Add near the top of api.py
from fastapi import Depends, HTTPException, status
from fastapi.security import APIKeyHeader
from auth import APIKeyManager

# Shared key store used by every authenticated endpoint.
key_manager = APIKeyManager()

# auto_error=False lets a missing header reach verify_api_key, which then
# chooses the status code itself.
api_key_header = APIKeyHeader(
    description="API密钥，用于身份验证",
    auto_error=False,
    name="X-API-Key",
)


async def verify_api_key(api_key: str = Depends(api_key_header)):
    """Dependency that authenticates the ``X-API-Key`` header.

    Returns the key when it validates; raises 401 when the header is absent
    and 403 when the key is unknown, inactive or expired.
    """
    if api_key and key_manager.validate_api_key(api_key):
        return api_key

    if api_key:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="无效或已过期的API密钥",
        )

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="缺少API密钥",
        headers={"WWW-Authenticate": "X-API-Key"},
    )


# Change chat_endpoint so that it declares the authentication dependency.
@app.post("/chat")
async def chat_endpoint(
    request: Request,
    api_key: str = Depends(verify_api_key)  # add this line
):
    # ... existing body stays unchanged ...
    pass

现在调用API需要提供密钥头：

curl -X POST "http://127.0.0.1:8000/chat" \ -H "Content-Type: application/json" \ -H "X-API-Key: your-api-key-here" \ -d '{"prompt": "你好，你是谁？", "history": []}'

3.2 密钥管理API

为了方便管理密钥，我们添加管理接口：

# Add the admin endpoints to api.py
import os
import secrets


async def verify_admin_key(api_key: str = Depends(api_key_header)):
    """Dependency that only accepts the administrator key.

    The expected value is read from the ``ADMIN_API_KEY`` environment
    variable. Ordinary API keys are rejected, so a regular caller cannot
    mint further keys for itself. If the variable is unset, every request
    is refused.
    """
    if not api_key:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="缺少API密钥",
            headers={"WWW-Authenticate": "X-API-Key"},
        )

    expected = os.getenv("ADMIN_API_KEY", "")
    # Constant-time comparison on bytes (compare_digest rejects non-ASCII str).
    if not expected or not secrets.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="需要管理员密钥",
        )
    return api_key


@app.get("/admin/keys")
async def list_api_keys(api_key: str = Depends(verify_admin_key)):
    """Return aggregate key statistics (administrator only)."""
    stats = key_manager.get_usage_stats()
    return {
        "stats": stats,
        "message": "密钥管理接口，用于监控和管理API访问"
    }


@app.post("/admin/keys/generate")
async def generate_api_key(
    description: str = "",
    expires_in_days: int = 365,
    api_key: str = Depends(verify_admin_key)
):
    """Generate a new API key (administrator only).

    The plaintext key appears only in this response; it cannot be
    recovered afterwards.
    """
    if expires_in_days <= 0:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="expires_in_days必须大于0",
        )

    new_key = key_manager.create_api_key(description, expires_in_days)
    return {
        "key": new_key,
        "description": description,
        "expires_in_days": expires_in_days,
        "message": "新API密钥已生成，请妥善保管"
    }

4. 限流与资源保护

4.1 基于Redis的分布式限流

单机限流在多实例部署时无法工作，我们需要分布式限流方案。这里使用Redis实现固定窗口计数器算法（在每个时间窗口内对请求计数，窗口到期后计数自动清零）：

# rate_limit.py
import redis
import time
from typing import Optional, Tuple


class RedisRateLimiter:
    """Fixed-window request counter stored in Redis."""

    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis_client = redis.from_url(redis_url)

    def is_allowed(self, key: str, max_requests: int, window_seconds: int) -> Tuple[bool, int, int]:
        """Count one request against ``key`` and report whether it is allowed.

        Args:
            key: caller identity, e.g. ``"ip:1.2.3.4"``.
            max_requests: requests permitted per window.
            window_seconds: window length in seconds.

        Returns:
            ``(allowed, current_count, remaining_window_seconds)``. Rejected
            requests are counted too, so ``current_count`` may exceed
            ``max_requests``.

        Raises:
            ValueError: if ``max_requests`` or ``window_seconds`` is below 1.
        """
        if max_requests < 1 or window_seconds < 1:
            raise ValueError("max_requests and window_seconds must be >= 1")

        redis_key = f"rate_limit:{key}:count"

        # One MULTI/EXEC transaction, so concurrent callers cannot interleave
        # between the read and the write (the previous GET-then-SET sequence
        # allowed that):
        #   SET key 0 EX window NX  -> opens a new window only if none exists
        #   INCR key                -> counts this request
        #   TTL key                 -> seconds until the window resets
        pipe = self.redis_client.pipeline(transaction=True)
        pipe.set(redis_key, 0, ex=window_seconds, nx=True)
        pipe.incr(redis_key)
        pipe.ttl(redis_key)
        _, current_count, ttl = pipe.execute()

        if ttl < 0:
            # The key existed without an expiry (e.g. written by other
            # code); give it one so the counter cannot live forever.
            self.redis_client.expire(redis_key, window_seconds)
            ttl = window_seconds

        return current_count <= max_requests, current_count, ttl


# Usage example
# limiter = RedisRateLimiter()
# allowed, count, ttl = limiter.is_allowed("user:123", 10, 60)

在API中集成限流：

# Add to api.py
import os

from fastapi.responses import JSONResponse
from rate_limit import RedisRateLimiter

# Honour REDIS_URL (set by docker-compose) and fall back to a local Redis.
limiter = RedisRateLimiter(os.getenv("REDIS_URL", "redis://localhost:6379/0"))


@app.middleware("http")
async def rate_limit_middleware(request: Request, call_next):
    """Apply per-client-IP rate limiting to the /chat endpoint.

    Requests over the limit get a 429 response with a ``Retry-After``
    header; all other requests are passed through unchanged.
    """
    if request.url.path == "/chat":
        # request.client can be None (e.g. some test clients or transports).
        client_ip = request.client.host if request.client else "unknown"

        allowed, count, ttl = limiter.is_allowed(
            f"ip:{client_ip}",
            max_requests=30,  # 30 requests per minute
            window_seconds=60
        )

        if not allowed:
            return JSONResponse(
                status_code=429,
                content={
                    "error": "请求过于频繁",
                    "retry_after": ttl,
                    "message": f"您已达到每分钟30次的请求限制，{ttl}秒后可重试"
                },
                headers={"Retry-After": str(ttl)},
            )

    return await call_next(request)

4.2 模型资源保护策略

除了请求频率限制，我们还需要保护模型本身的资源使用：

# resource_protection.py
import threading
import time
from typing import Dict, List, Optional


class ModelResourceGuard:
    """Caps the number of requests that may run model inference at once."""

    def __init__(self, max_concurrent_requests: int = 5):
        if max_concurrent_requests < 1:
            # A zero limit would also divide by zero in get_status().
            raise ValueError("max_concurrent_requests must be >= 1")
        self.max_concurrent_requests = max_concurrent_requests
        self.current_requests = 0
        self.lock = threading.Lock()
        # Waiters sleep on this condition and are woken by release(),
        # instead of polling with time.sleep().
        self._slot_freed = threading.Condition(self.lock)
        self.request_queue = []
        self.last_cleanup = time.time()

    def acquire(self, timeout: float = 30.0) -> bool:
        """Take one execution slot, waiting up to ``timeout`` seconds.

        Returns True when a slot was obtained and False on timeout. With
        ``timeout <= 0`` a single non-blocking attempt is made.

        This call blocks the calling thread; from an ``async def`` endpoint
        run it in a worker thread.
        """
        deadline = time.monotonic() + timeout
        with self._slot_freed:
            while self.current_requests >= self.max_concurrent_requests:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    return False
                self._slot_freed.wait(remaining)
            self.current_requests += 1
            return True

    def release(self):
        """Return one execution slot and wake one waiter, if any."""
        with self._slot_freed:
            if self.current_requests > 0:
                self.current_requests -= 1
                self._slot_freed.notify()

    def get_status(self) -> Dict:
        """Return the current slot usage."""
        with self.lock:
            return {
                "current_requests": self.current_requests,
                "max_concurrent_requests": self.max_concurrent_requests,
                "utilization_percent": (self.current_requests / self.max_concurrent_requests) * 100
            }


# Usage in api.py
from fastapi.concurrency import run_in_threadpool

resource_guard = ModelResourceGuard(max_concurrent_requests=3)


@app.post("/chat")
async def chat_endpoint(
    request: Request,
    api_key: str = Depends(verify_api_key)
):
    # Wait for a slot in a worker thread: acquire() blocks, and blocking
    # the event loop would stall every other request, including the ones
    # that need to finish in order to release a slot.
    if not await run_in_threadpool(resource_guard.acquire, timeout=10.0):
        raise HTTPException(
            status_code=503,
            detail="服务繁忙，请稍后重试"
        )

    try:
        # ... original inference logic ...
        pass
    finally:
        # Always give the slot back, even if inference raised.
        resource_guard.release()

5. 监控与可观测性

5.1 请求日志与性能监控

生产环境必须有完善的日志记录和性能监控：

# monitoring.py
import logging
import threading
import time
from datetime import datetime
from typing import Dict, Any

# Log to both a file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('chatglm_api.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger("chatglm_api")


class APIMonitor:
    """In-process request counters and response-time statistics."""

    def __init__(self):
        self.request_count = 0
        self.error_count = 0
        self.total_response_time = 0.0
        self.max_response_time = 0.0
        self.min_response_time = float('inf')
        self.started_at = time.time()
        # `threading` was used here without being imported, which raised
        # NameError as soon as the monitor was constructed.
        self.lock = threading.Lock()

    def log_request(self, request_data: Dict, response_data: Dict,
                    response_time: float, status_code: int):
        """Record one request in the counters and write a log line.

        Any ``status_code`` of 400 or above counts as an error.
        """
        with self.lock:
            self.request_count += 1
            if status_code >= 400:
                self.error_count += 1

            self.total_response_time += response_time
            self.max_response_time = max(self.max_response_time, response_time)
            self.min_response_time = min(self.min_response_time, response_time)

        # Only the first 50 prompt characters and 8 key characters are logged.
        logger.info(
            "REQ %s... | STATUS %s | TIME %.2fs | LEN %d chars | KEY %s...",
            request_data.get('prompt', '')[:50],
            status_code,
            response_time,
            len(response_data.get('response', '')),
            request_data.get('api_key', 'N/A')[:8],
        )

    def get_metrics(self) -> Dict[str, Any]:
        """Return a snapshot of the collected metrics.

        Includes ``error_count`` (the /metrics endpoint reads it) and
        ``uptime_seconds``. The ``uptime`` key is kept for compatibility
        and holds the current timestamp.
        """
        with self.lock:
            avg_response_time = (self.total_response_time / self.request_count) if self.request_count > 0 else 0
            return {
                "total_requests": self.request_count,
                "error_count": self.error_count,
                "error_rate": (self.error_count / self.request_count * 100) if self.request_count > 0 else 0,
                "avg_response_time": round(avg_response_time, 3),
                "max_response_time": round(self.max_response_time, 3),
                "min_response_time": round(self.min_response_time, 3) if self.min_response_time != float('inf') else 0,
                "uptime_seconds": round(time.time() - self.started_at, 3),
                "uptime": datetime.now().isoformat()
            }


# Module-level monitor instance.
monitor = APIMonitor()


# Add monitoring to chat_endpoint
@app.post("/chat")
async def chat_endpoint(
    request: Request,
    api_key: str = Depends(verify_api_key)
):
    start_time = time.time()

    try:
        # ... original logic ...
        data = await request.json()
        # ... processing logic; it must bind `response` to the dict that is
        # returned to the client ...

        response_time = time.time() - start_time
        monitor.log_request(data, response, response_time, 200)

        return response

    except HTTPException as e:
        response_time = time.time() - start_time
        monitor.log_request({}, {"error": str(e)}, response_time, e.status_code)
        raise  # bare raise keeps the original traceback
    except Exception as e:
        response_time = time.time() - start_time
        monitor.log_request({}, {"error": str(e)}, response_time, 500)
        raise

5.2 健康检查与指标端点

添加健康检查和指标暴露端点：

# Add to api.py
# api.py does `import datetime` (the module), so the class is imported under
# its own alias here instead of calling the non-existent `datetime.now()`.
from datetime import datetime as _dt

from fastapi.responses import JSONResponse, Response


@app.get("/health")
async def health_check():
    """Health-check endpoint.

    Responds 200 with ``status: healthy`` when the model is loaded and Redis
    answers a PING, and 503 with ``status: degraded`` otherwise.
    """
    model_status = "loaded" if model is not None else "not loaded"

    redis_status = "ok"
    try:
        limiter.redis_client.ping()
    except Exception:
        # Any failure (connection refused, timeout, auth) means "unavailable".
        redis_status = "unavailable"

    healthy = model is not None and redis_status == "ok"
    return JSONResponse(
        status_code=200 if healthy else 503,
        content={
            "status": "healthy" if healthy else "degraded",
            "model_status": model_status,
            "redis_status": redis_status,
            "timestamp": _dt.now().isoformat()
        },
    )


@app.get("/metrics")
async def get_metrics():
    """Expose the monitor's counters in Prometheus text format."""
    metrics = monitor.get_metrics()

    # get_metrics() may not report error_count; derive it from the error
    # rate in that case instead of failing with KeyError.
    error_count = metrics.get("error_count")
    if error_count is None:
        error_count = round(metrics["error_rate"] * metrics["total_requests"] / 100)

    lines = [
        "# HELP chatglm_api_requests_total Total number of API requests",
        "# TYPE chatglm_api_requests_total counter",
        f"chatglm_api_requests_total {metrics['total_requests']}",
        "# HELP chatglm_api_errors_total Total number of API errors",
        "# TYPE chatglm_api_errors_total counter",
        f"chatglm_api_errors_total {error_count}",
        "# HELP chatglm_api_response_time_seconds Average response time in seconds",
        "# TYPE chatglm_api_response_time_seconds gauge",
        f"chatglm_api_response_time_seconds {metrics['avg_response_time']}",
        "# HELP chatglm_api_error_rate_percent Error rate percentage",
        "# TYPE chatglm_api_error_rate_percent gauge",
        f"chatglm_api_error_rate_percent {metrics['error_rate']}",
    ]
    # The exposition format is line-oriented and must end with a newline.
    return Response(content="\n".join(lines) + "\n", media_type="text/plain")


@app.get("/status")
async def get_status():
    """Return monitor metrics, resource-guard state and key statistics."""
    return {
        "status": "operational",
        "metrics": monitor.get_metrics(),
        "resource_guard": resource_guard.get_status(),
        "key_stats": key_manager.get_usage_stats(),
        "uptime": _dt.now().isoformat()
    }

6. 生产环境部署配置

6.1 完整的生产配置

创建一个生产就绪的启动脚本：

# main.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
import uvicorn
import os

from api import app, startup_event  # the API application and its model loader

# Production application that wraps the API app.
production_app = FastAPI(
    title="ChatGLM-6B 生产API服务",
    description="企业级ChatGLM-6B RESTful API服务",
    version="1.0.0",
    docs_url="/docs" if os.getenv("DEBUG", "false").lower() == "true" else None,
    redoc_url=None
)

# Startup/shutdown handlers of a mounted sub-application are not executed,
# so the model loader registered on `app` would never run. Register it on
# the outer application instead.
production_app.add_event_handler("startup", startup_event)

# CORS: origins come from CORS_ALLOW_ORIGINS (comma-separated, default "*").
# Credentials are enabled only for an explicit origin list, because a
# wildcard origin combined with credentials lets any site make
# credentialed calls.
allowed_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
]
production_app.add_middleware(
    CORSMiddleware,
    allow_origins=allowed_origins,
    allow_credentials="*" not in allowed_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# GZIP-compress responses of at least 1000 bytes.
production_app.add_middleware(GZipMiddleware, minimum_size=1000)

# Serve the API routes from the root path.
production_app.mount("/", app)

if __name__ == "__main__":
    # NOTE: each worker process loads its own copy of the model, and the
    # in-process monitor and resource guard are per worker; size WORKERS to
    # the available memory.
    uvicorn.run(
        "main:production_app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", "8000")),
        workers=int(os.getenv("WORKERS", "2")),
        reload=os.getenv("RELOAD", "false").lower() == "true",
        log_level="info",
        access_log=True,
        proxy_headers=True,
        # Only trust X-Forwarded-* from known proxies. Trusting "*" lets any
        # client spoof the address used for per-IP rate limiting. Set
        # FORWARDED_ALLOW_IPS to your reverse proxy's address.
        forwarded_allow_ips=os.getenv("FORWARDED_ALLOW_IPS", "127.0.0.1"),
        limit_concurrency=100,
        timeout_keep_alive=5,
        ssl_keyfile=os.getenv("SSL_KEYFILE"),
        ssl_certfile=os.getenv("SSL_CERTFILE")
    )

6.2 Docker部署配置

创建Dockerfile：

# Dockerfile
FROM python:3.9-slim

# Working directory for the application
WORKDIR /app

# Install dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create a dedicated non-root user (not a member of the root group) and let
# it own /app: the service creates api_keys.db and chatglm_api.log there.
RUN useradd -m -u 1001 appuser && chown appuser:appuser /app

# Copy the application code, owned by the runtime user
COPY --chown=appuser:appuser . .

USER appuser

# Port the service listens on
EXPOSE 8000

# --host takes an address only; the port is passed separately via --port
CMD ["uvicorn", "main:production_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]

创建requirements.txt：

fastapi==0.104.1 uvicorn==0.23.2 transformers==4.34.0 torch==2.1.0 sentencepiece==0.1.99 accelerate==0.23.0 redis==4.6.0 pydantic==2.4.2

创建docker-compose.yml：

# docker-compose.yml version: '3.8' services: chatglm-api: build: . ports: - "8000:8000" environment: - PORT=8000 - WORKERS=2 - DEBUG=false - REDIS_URL=redis://redis:6379/0 depends_on: - redis restart: unless-stopped deploy: resources: limits: memory: 8G cpus: '2.0' reservations: memory: 4G cpus: '0.5' redis: image: redis:7-alpine ports: - "6379:6379" command: redis-server --save 60 1 --loglevel warning restart: unless-stopped deploy: resources: limits: memory: 1G cpus: '0.5'

启动服务：