In the previous articles, we walked through the implementation of each module of the AI Agent. Today I want to talk about getting the system into production. Honestly, the process turned out to be far more complicated than I expected, because an AI Agent system comes with a lot of unusual operational requirements.
Starting from a Deployment Incident
I still remember what happened the first time we shipped the AI Agent to production:

Me: All the tests pass, we're good to deploy.
Ops: Sure, we'll roll it out like a regular Python app.
(after the deployment)
Users: Why are the responses so slow?
Monitoring: API costs are spiking...
Me: ... (that's when I realized how many details I had missed)
That experience taught me that an AI Agent is not an ordinary web application; it needs deployment and operations strategies of its own.
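Looking back, both symptoms from that incident (slow responses and a ballooning API bill) are per-request properties of the model calls, so the very first thing I now add to any agent service is per-call latency and cost logging. Here is a minimal sketch; the pricing table and the OpenAI-style `usage` attribute are assumptions for illustration, not real prices or a specific client.

```python
import time
import logging
import functools

logger = logging.getLogger("agent.usage")

# Hypothetical per-1K-token prices; replace with your provider's actual pricing.
PRICE_PER_1K = {"prompt": 0.03, "completion": 0.06}


def estimate_cost(prompt_tokens: int, completion_tokens: int) -> float:
    return (prompt_tokens / 1000) * PRICE_PER_1K["prompt"] + \
           (completion_tokens / 1000) * PRICE_PER_1K["completion"]


def track_usage(fn):
    """Log latency and an estimated cost for every model call."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = fn(*args, **kwargs)  # assumed to return an object with a .usage attribute
        elapsed = time.perf_counter() - start
        usage = getattr(result, "usage", None)
        if usage is not None:
            cost = estimate_cost(usage.prompt_tokens, usage.completion_tokens)
            logger.info("model call: %.2fs, ~$%.4f", elapsed, cost)
        else:
            logger.info("model call: %.2fs", elapsed)
        return result
    return wrapper
```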
Environment Setup
First, we need to prepare the deployment environment:
```bash
# 1. Create the deployment directory
mkdir -p /app/ai-agent
cd /app/ai-agent

# 2. Create a virtual environment
python -m venv venv
source venv/bin/activate

# 3. Install dependencies
pip install -r requirements.txt

# 4. Prepare the configuration file
cat > config.yaml << EOF
environment: production
log_level: INFO

# AI model configuration
model:
  provider: openai
  model_name: gpt-4
  temperature: 0.7
  max_tokens: 2000
  retry_count: 3
  timeout: 30

# Vector store configuration
vector_store:
  type: milvus
  host: milvus.internal
  port: 19530
  collection: agent_knowledge

# Cache configuration
cache:
  type: redis
  url: redis://redis.internal:6379
  ttl: 3600

# Monitoring configuration
monitoring:
  prometheus_port: 9090
  grafana_port: 3000
  alert_webhook: "https://hooks.slack.com/..."
EOF

# 5. Prepare the Docker configuration
cat > Dockerfile << EOF
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    build-essential \\
    curl \\
    && rm -rf /var/lib/apt/lists/*

# Copy application code
COPY . .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose ports
EXPOSE 8000 9090

# Start command
CMD ["uvicorn", "agent.main:app", "--host", "0.0.0.0", "--port", "8000"]
EOF

# 6. Prepare the docker-compose configuration
cat > docker-compose.yml << EOF
version: '3.8'

services:
  agent:
    build: .
    ports:
      - "8000:8000"
      - "9091:9090"   # metrics port; host port 9090 is used by Prometheus below
    environment:
      - ENVIRONMENT=production
      - OPENAI_API_KEY=\${OPENAI_API_KEY}
    volumes:
      - ./config.yaml:/app/config.yaml
    depends_on:
      - redis
      - milvus
    deploy:
      # Fixed host port mappings conflict with multiple replicas; in practice an
      # Nginx load balancer sits in front of the agent instances (see traffic switch below).
      replicas: 3
      resources:
        limits:
          cpus: '1'
          memory: 2G
        reservations:
          cpus: '0.5'
          memory: 1G

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data

  milvus:
    image: milvusdb/milvus:latest
    ports:
      - "19530:19530"
    volumes:
      - milvus_data:/var/lib/milvus

  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana

volumes:
  redis_data:
  milvus_data:
  prometheus_data:
  grafana_data:
EOF
```
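The config.yaml above carries retry, timeout, and token limits that the agent process is expected to honor when it calls the model. Below is a rough sketch of that wiring; `call_model` is a placeholder for whichever client the agent actually uses, and the backoff policy is my own choice, not something the setup above mandates.

```python
import asyncio
import os

import yaml


def load_config(path: str = "config.yaml") -> dict:
    with open(path) as f:
        return yaml.safe_load(f)


async def call_with_retry(prompt: str, config: dict) -> str:
    model_cfg = config["model"]
    api_key = os.environ["OPENAI_API_KEY"]  # injected by docker-compose

    last_error = None
    for attempt in range(model_cfg["retry_count"]):
        try:
            # call_model is a placeholder for the real client call, bounded by the
            # configured timeout so a hung upstream request cannot stall the agent.
            return await asyncio.wait_for(
                call_model(
                    prompt,
                    api_key=api_key,
                    model=model_cfg["model_name"],
                    temperature=model_cfg["temperature"],
                    max_tokens=model_cfg["max_tokens"],
                ),
                timeout=model_cfg["timeout"],
            )
        except (asyncio.TimeoutError, ConnectionError) as exc:
            last_error = exc
            await asyncio.sleep(2 ** attempt)  # simple exponential backoff
    raise RuntimeError("model call failed after retries") from last_error
```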
Deployment Script
Next comes the deployment script:
```python
import asyncio
import os
import socket
import subprocess
from typing import Dict, List

import aiohttp
import yaml


class Deployment:
    def __init__(self, config_path: str, environment: str):
        self.config = self._load_config(config_path)
        self.environment = environment

    async def deploy(self):
        # 1. Validate the environment
        self._validate_environment()
        # 2. Prepare resources
        await self._prepare_resources()
        # 3. Deploy the application
        await self._deploy_application()
        # 4. Health checks
        await self._health_check()
        # 5. Switch traffic
        await self._switch_traffic()

    def _load_config(self, path: str) -> Dict:
        with open(path) as f:
            return yaml.safe_load(f)

    def _validate_environment(self):
        # Check the required environment variables
        required_vars = [
            "OPENAI_API_KEY",
            "MILVUS_HOST",
            "REDIS_URL",
        ]
        missing = [var for var in required_vars if not os.getenv(var)]
        if missing:
            raise ValueError(f"Missing environment variables: {missing}")

    async def _prepare_resources(self):
        # 1. Create the network
        subprocess.run(["docker", "network", "create", "ai-agent-network"])
        # 2. Start the dependency services
        subprocess.run(["docker-compose", "up", "-d", "redis", "milvus"])
        # 3. Wait for the services to become ready
        await self._wait_for_services([
            ("redis", 6379),
            ("milvus", 19530),
        ])

    async def _deploy_application(self):
        # 1. Build the image
        subprocess.run(["docker-compose", "build", "agent"])
        # 2. Update the configuration
        self._update_config()
        # 3. Rolling deployment
        for i in range(3):  # 3 replicas
            # Start the new instance
            subprocess.run(["docker-compose", "up", "-d", f"agent_{i}"])
            # Wait until it is healthy
            await self._wait_for_health_check(f"agent_{i}")
            # Stop the old instance if one exists
            old_container = f"agent_old_{i}"
            if self._container_exists(old_container):
                subprocess.run(["docker-compose", "stop", old_container])

    async def _health_check(self):
        # Check every instance
        endpoints = [
            "http://localhost:8000/health",
            "http://localhost:8001/health",
            "http://localhost:8002/health",
        ]

        async def check_endpoint(url: str) -> bool:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    return response.status == 200

        results = await asyncio.gather(*[check_endpoint(url) for url in endpoints])
        if not all(results):
            raise Exception("Health check failed")

    async def _switch_traffic(self):
        # 1. Update the load balancer configuration
        self._update_nginx_config()
        # 2. Reload Nginx
        subprocess.run(["nginx", "-s", "reload"])
        # 3. Verify the traffic switch
        await self._verify_traffic()

    def _update_config(self):
        # Pick the configuration for the current environment
        config = self.config[self.environment]
        # Write the configuration file
        with open("config.yaml", "w") as f:
            yaml.dump(config, f)

    async def _wait_for_services(self, services: List[tuple]):
        def check_port(host: str, port: int) -> bool:
            try:
                with socket.create_connection((host, port), timeout=1):
                    return True
            except OSError:
                return False

        for host, port in services:
            while not check_port(host, port):
                print(f"Waiting for {host}:{port}...")
                await asyncio.sleep(1)
```
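For completeness, here is a hypothetical entry point that drives the class above; the module path in the import is an assumption.

```python
# deploy.py (minimal sketch)
import asyncio

from deployment import Deployment  # hypothetical module path for the class above

if __name__ == "__main__":
    deployment = Deployment(
        config_path="config.yaml",
        environment="production",
    )
    asyncio.run(deployment.deploy())
```

One thing worth noting: `_update_config` reads `self.config[self.environment]`, so it assumes the YAML is keyed by environment (for example a top-level `production:` section); the flat config.yaml from the setup step would need that extra level for this step to work.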
Monitoring Configuration
Once the deployment is done, we need to set up monitoring:
```yaml
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'ai-agent'
    static_configs:
      # Use the compose service name; from inside the Prometheus container,
      # localhost would point at Prometheus itself, not the agent.
      - targets: ['agent:8000']
    metrics_path: '/metrics'
    scheme: 'http'
```
```json
# grafana/dashboards/agent.json
{
  "dashboard": {
    "id": null,
    "title": "AI Agent Dashboard",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(requests_total[5m])",
            "legendFormat": "{{type}}"
          }
        ]
      },
      {
        "title": "Latency",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(request_latency_bucket[5m]))",
            "legendFormat": "p95"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(requests_error[5m])",
            "legendFormat": "{{error}}"
          }
        ]
      },
      {
        "title": "Token Usage",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(token_usage[5m])",
            "legendFormat": "tokens/s"
          }
        ]
      }
    ]
  }
}
```
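The dashboard only works if the agent actually exports those series. As a reference point, here is a hedged sketch using the `prometheus_client` library; the metric and label names are my assumptions chosen to line up with the PromQL above, and `run_agent` is a placeholder for the real agent call. Note that `prometheus_client` appends `_total` to counter names, so the request counter is declared as `requests`.

```python
from prometheus_client import Counter, Histogram, start_http_server

# Exposed as requests_total{type="..."} -- matches rate(requests_total[5m])
REQUESTS = Counter("requests", "Total agent requests", ["type"])

# Exposed as request_latency_bucket/_sum/_count -- matches the p95 panel
LATENCY = Histogram("request_latency", "Request latency in seconds")

# Error and token counters; the exporter adds a _total suffix, so the
# dashboard expressions may need to be adjusted to match.
ERRORS = Counter("requests_error", "Failed agent requests", ["error"])
TOKENS = Counter("token_usage", "Tokens consumed by model calls")


def handle_request(question: str) -> str:
    REQUESTS.labels(type="chat").inc()
    with LATENCY.time():
        try:
            answer, tokens_used = run_agent(question)  # hypothetical agent call
            TOKENS.inc(tokens_used)
            return answer
        except Exception as exc:
            ERRORS.labels(error=type(exc).__name__).inc()
            raise


if __name__ == "__main__":
    # Standalone exporter on the metrics port from config.yaml
    start_http_server(9090)
```

In the actual FastAPI service you would more likely mount `prometheus_client.make_asgi_app()` under `/metrics` instead of running a separate exporter port.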
Operations Scripts
Next up are some operations scripts we use day to day:
```python
import subprocess
from datetime import datetime

import yaml


class Operations:
    def __init__(self, config_path: str):
        self.config = self._load_config(config_path)

    def _load_config(self, path: str) -> dict:
        with open(path) as f:
            return yaml.safe_load(f)

    async def scale(self, replicas: int):
        # Scale the agent service up or down
        subprocess.run([
            "docker-compose", "up", "-d",
            "--scale", f"agent={replicas}",
        ])

    async def rollback(self, version: str):
        # Roll back to a specific version
        subprocess.run(["docker-compose", "stop", "agent"])
        subprocess.run(["docker-compose", "rm", "-f", "agent"])
        subprocess.run([
            "docker", "tag",
            f"ai-agent:{version}",
            "ai-agent:latest",
        ])
        subprocess.run(["docker-compose", "up", "-d", "agent"])

    async def backup(self):
        # Back up data
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Back up the vector database
        subprocess.run([
            "docker-compose", "exec", "milvus",
            "milvus_backup",
            f"/backup/milvus_{timestamp}",
        ])
        # Back up the Redis data
        subprocess.run([
            "docker-compose", "exec", "redis",
            "redis-cli", "save",
        ])
        # Compress the backup files
        subprocess.run([
            "tar", "czf",
            f"backup_{timestamp}.tar.gz",
            "backup/",
        ])

    async def rotate_logs(self):
        # Rotate the logs
        subprocess.run([
            "docker-compose", "exec", "agent",
            "logrotate", "/etc/logrotate.d/agent",
        ])

    async def update_model(self, new_model: str):
        # Switch to a different AI model
        config = self.config.copy()
        config["model"]["model_name"] = new_model
        # Write the updated configuration
        with open("config.yaml", "w") as f:
            yaml.dump(config, f)
        # Restart the service
        await self.restart()

    async def restart(self):
        # Restart the agent service
        subprocess.run(["docker-compose", "restart", "agent"])
```
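To make these runnable from cron jobs and runbooks, I wrap them in a tiny CLI. A sketch, with subcommand names of my own choosing and an assumed module path for the class above:

```python
# ops.py (minimal sketch)
import asyncio
import sys

from operations import Operations  # hypothetical module path for the class above


def main():
    ops = Operations("config.yaml")
    command = sys.argv[1] if len(sys.argv) > 1 else "help"
    if command == "scale":
        asyncio.run(ops.scale(int(sys.argv[2])))
    elif command == "backup":
        asyncio.run(ops.backup())
    elif command == "rollback":
        asyncio.run(ops.rollback(sys.argv[2]))
    elif command == "restart":
        asyncio.run(ops.restart())
    else:
        print("usage: ops.py scale <n> | backup | rollback <version> | restart")


if __name__ == "__main__":
    main()
```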
Emergency Playbook
Last of all, we also need to prepare an emergency playbook:
```python
import subprocess


class EmergencyPlan:
    def __init__(self, alert_manager):
        self.alert_manager = alert_manager
        self.operations = Operations("config.yaml")

    async def handle_high_load(self):
        # Handle a high-load situation
        try:
            # 1. Add more instances
            await self.operations.scale(5)
            # 2. Turn on request rate limiting
            await self._enable_rate_limit()
            # 3. Notify the team
            await self.alert_manager.notify("Handling high load situation")
        except Exception as e:
            await self.alert_manager.notify_error(e)

    async def handle_api_error(self):
        # Handle model API errors
        try:
            # 1. Switch to the backup API
            await self._switch_to_backup_api()
            # 2. Clear the cache
            await self._clear_cache()
            # 3. Restart the service
            await self.operations.restart()
        except Exception as e:
            await self.alert_manager.notify_error(e)

    async def handle_data_corruption(self):
        # Handle data corruption
        try:
            # 1. Stop the service
            subprocess.run(["docker-compose", "stop", "agent"])
            # 2. Restore from backup
            await self._restore_backup()
            # 3. Restart the service
            await self.operations.restart()
        except Exception as e:
            await self.alert_manager.notify_error(e)
```
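`_enable_rate_limit` is not shown in this article, so here is one possible, deliberately simple interpretation: a process-local concurrency cap built on an asyncio semaphore. In a multi-replica deployment the real limit would more likely live at the gateway (Nginx) or in a shared store such as Redis.

```python
import asyncio


class RequestLimiter:
    """Cap the number of concurrent model calls within one process."""

    def __init__(self, max_concurrent: int = 10):
        self._semaphore = asyncio.Semaphore(max_concurrent)

    async def run(self, coro_func, *args, **kwargs):
        # Requests beyond the cap wait here instead of piling onto the model API
        async with self._semaphore:
            return await coro_func(*args, **kwargs)


# Usage sketch: limiter = RequestLimiter(10); await limiter.run(agent.answer, question)
```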
Lessons Learned
While putting this deployment and operations setup into practice, I collected a few takeaways:
Environment isolation matters
- The development environment should mirror production
- The test environment should be deployed independently
- The production environment should be tightly controlled
Automation is key
- Automate the deployment pipeline
- Automate monitoring and alerting
- Automate routine operations
Cost control deserves real attention
- Optimize API calls
- Monitor resource usage
- Keep spending within budget (a small sketch follows below)
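As a concrete example of the last point, the sketch below compares the day's accumulated model spend against a budget and fires the Slack webhook from config.yaml when it is exceeded; the budget figure, the `get_daily_token_cost` helper, and the message format are all illustrative assumptions.

```python
import requests
import yaml

DAILY_BUDGET_USD = 50.0  # assumed budget; tune to your own spend


def check_budget(get_daily_token_cost) -> None:
    """Alert via the configured webhook when today's model spend exceeds the budget."""
    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    spent = get_daily_token_cost()  # hypothetical helper returning today's cost in USD
    if spent > DAILY_BUDGET_USD:
        requests.post(
            config["monitoring"]["alert_webhook"],
            json={
                "text": f"AI Agent spend ${spent:.2f} exceeded the daily budget ${DAILY_BUDGET_USD:.2f}"
            },
        )
```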
Closing Thoughts
A mature AI Agent system needs a solid deployment and operations foundation. Like caring for a newborn, it takes attentive care and quick responses to keep it growing healthily.
In the next article, I'll cover how to optimize the AI Agent's performance and cost. If you have your own thoughts on deployment and operations, I'd love to hear them in the comments.