在前面的文章中,我们讨论了 AI Agent 的各个模块实现。今天,我想聊聊如何把这个系统部署到生产环境。说实话,这个过程比想象的要复杂得多,因为 AI Agent 系统有很多特殊的运维需求。
还记得第一次部署 AI Agent 到生产环境时的场景:
我:「系统测试都通过了,可以部署了。」
运维:「好的,按常规 Python 应用部署。」
(部署完成后)
用户:「为什么响应这么慢?」
监控:API 费用飙升……
我:……(这才发现漏掉了很多细节)
这次经历让我意识到:AI Agent 不是普通的 Web 应用,它需要特别的部署和运维策略。
首先,我们需要准备好部署环境:
# 1. Create the deployment directory
mkdir -p /app/ai-agent
cd /app/ai-agent

# 2. Create an isolated Python virtual environment
python -m venv venv
source venv/bin/activate

# 3. Install application dependencies
pip install -r requirements.txt

# 4. Write the runtime configuration file
#    (heredoc content below is the file payload, kept verbatim)
cat > config.yaml << EOF
environment: production
log_level: INFO

# AI 模型配置
model:
  provider: openai
  model_name: gpt-4
  temperature: 0.7
  max_tokens: 2000
  retry_count: 3
  timeout: 30

# 向量数据库配置
vector_store:
  type: milvus
  host: milvus.internal
  port: 19530
  collection: agent_knowledge

# 缓存配置
cache:
  type: redis
  url: redis://redis.internal:6379
  ttl: 3600

# 监控配置
monitoring:
  prometheus_port: 9090
  grafana_port: 3000
  alert_webhook: "https://hooks.slack.com/..."
EOF

# 5. Write the Docker image definition
cat > Dockerfile << EOF
FROM python:3.11-slim

WORKDIR /app

# 安装系统依赖
RUN apt-get update && apt-get install -y \\
    build-essential \\
    curl \\
    && rm -rf /var/lib/apt/lists/*

# 复制应用代码
COPY . .

# 安装 Python 依赖
RUN pip install --no-cache-dir -r requirements.txt

# 暴露端口
EXPOSE 8000 9090

# 启动命令
CMD ["uvicorn", "agent.main:app", "--host", "0.0.0.0", "--port", "8000"]
EOF

# 6. Write the docker-compose stack definition
#    (agent app + redis cache + milvus vector DB + prometheus/grafana monitoring)
cat > docker-compose.yml << EOF
version: '3.8'

services:
  agent:
    build: .
    ports:
      - "8000:8000"
      - "9090:9090"
    environment:
      - ENVIRONMENT=production
      - OPENAI_API_KEY=\${OPENAI_API_KEY}
    volumes:
      - ./config.yaml:/app/config.yaml
    depends_on:
      - redis
      - milvus
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1'
          memory: 2G
        reservations:
          cpus: '0.5'
          memory: 1G

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data

  milvus:
    image: milvusdb/milvus:latest
    ports:
      - "19530:19530"
    volumes:
      - milvus_data:/var/lib/milvus

  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana

volumes:
  redis_data:
  milvus_data:
  prometheus_data:
  grafana_data:
EOF
接下来是部署脚本:
import subprocess
import yaml
import os
from typing import Dict, List, Tuple
import asyncio
from datetime import datetime


class Deployment:
    """Rolling-deployment driver for the AI Agent stack.

    Pipeline: validate environment variables -> start dependency services
    (Redis, Milvus) -> build the agent image -> roll the replicas one at a
    time -> verify health endpoints -> switch load-balancer traffic.
    """

    def __init__(self, config_path: str, environment: str):
        """
        Args:
            config_path: Path to the YAML deployment configuration file.
            environment: Key of the environment section to deploy
                (e.g. ``"production"``).
        """
        self.config = self._load_config(config_path)
        self.environment = environment

    async def deploy(self) -> None:
        """Run the full deployment pipeline; raises on the first failure."""
        # 1. Fail fast if required secrets/endpoints are missing.
        self._validate_environment()
        # 2. Bring up the network and supporting services.
        await self._prepare_resources()
        # 3. Build the image and roll out replicas.
        await self._deploy_application()
        # 4. Verify every replica answers its health endpoint.
        await self._health_check()
        # 5. Point the load balancer at the new replicas.
        await self._switch_traffic()

    def _load_config(self, path: str) -> Dict:
        """Parse the YAML config file into a dict."""
        with open(path) as f:
            return yaml.safe_load(f)

    def _validate_environment(self) -> None:
        """Raise ValueError listing any required env vars that are unset."""
        required_vars = ["OPENAI_API_KEY", "MILVUS_HOST", "REDIS_URL"]
        missing = [var for var in required_vars if not os.getenv(var)]
        if missing:
            raise ValueError(f"Missing environment variables: {missing}")

    async def _prepare_resources(self) -> None:
        """Create the Docker network and start/await dependency services."""
        # Best-effort: the network may already exist from a previous run,
        # in which case `docker network create` exits non-zero. Deliberately
        # no check=True here.
        subprocess.run(["docker", "network", "create", "ai-agent-network"])
        # check=True: a failed compose start must abort the deployment
        # instead of silently continuing (the original ignored the result).
        subprocess.run(
            ["docker-compose", "up", "-d", "redis", "milvus"],
            check=True,
        )
        await self._wait_for_services([("redis", 6379), ("milvus", 19530)])

    async def _deploy_application(self) -> None:
        """Build the new image and perform a rolling replacement of replicas."""
        subprocess.run(["docker-compose", "build", "agent"], check=True)
        self._update_config()
        # Rolling deployment: replace the 3 replicas one at a time so
        # serving capacity never drops to zero.
        for i in range(3):
            # Start the new instance for this slot.
            subprocess.run(
                ["docker-compose", "up", "-d", f"agent_{i}"],
                check=True,
            )
            # Wait until it reports healthy before touching the old one.
            await self._wait_for_health_check(f"agent_{i}")
            # Retire the corresponding old instance, if any.
            old_container = f"agent_old_{i}"
            if self._container_exists(old_container):
                subprocess.run(
                    ["docker-compose", "stop", old_container],
                    check=True,
                )

    def _container_exists(self, name: str) -> bool:
        """Return True if a container named *name* exists (running or not).

        NOTE(review): this helper was referenced but never defined in the
        original code; implemented via `docker ps -a`.
        """
        result = subprocess.run(
            ["docker", "ps", "-a", "--filter", f"name={name}",
             "--format", "{{.Names}}"],
            capture_output=True,
            text=True,
        )
        return name in result.stdout.split()

    async def _wait_for_health_check(
        self, service: str, timeout: float = 60.0
    ) -> None:
        """Poll a replica's /health endpoint until it answers HTTP 200.

        NOTE(review): previously referenced but undefined. Replica
        ``agent_{i}`` is assumed to listen on host port ``8000 + i``,
        matching the endpoints probed by _health_check — confirm against
        the compose port mappings.
        """
        import aiohttp  # local import, matching the original's style

        index = int(service.rsplit("_", 1)[-1])
        url = f"http://localhost:{8000 + index}/health"
        deadline = asyncio.get_event_loop().time() + timeout
        async with aiohttp.ClientSession() as session:
            while True:
                try:
                    async with session.get(url) as response:
                        if response.status == 200:
                            return
                except aiohttp.ClientError:
                    pass  # not up yet; keep polling until the deadline
                if asyncio.get_event_loop().time() > deadline:
                    raise TimeoutError(
                        f"{service} failed health check within {timeout}s"
                    )
                await asyncio.sleep(1)

    async def _health_check(self) -> None:
        """Check every replica's /health endpoint concurrently; raise if any fails."""
        endpoints = [
            "http://localhost:8000/health",
            "http://localhost:8001/health",
            "http://localhost:8002/health",
        ]

        async def check_endpoint(url: str) -> bool:
            import aiohttp

            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    return response.status == 200

        results = await asyncio.gather(
            *[check_endpoint(url) for url in endpoints]
        )
        if not all(results):
            raise Exception("Health check failed")

    async def _switch_traffic(self) -> None:
        """Repoint the load balancer at the new replicas and verify."""
        # 1. Regenerate the upstream configuration.
        self._update_nginx_config()
        # 2. Reload Nginx; must not proceed if the reload itself fails.
        subprocess.run(["nginx", "-s", "reload"], check=True)
        # 3. Confirm the new replicas are actually serving traffic.
        await self._verify_traffic()

    def _update_nginx_config(self) -> None:
        """Rewrite the Nginx upstream block for the new replicas.

        NOTE(review): referenced but never defined in the original code.
        The upstream template is site-specific; fill this in before using
        _switch_traffic. Raising keeps the failure explicit instead of the
        original's AttributeError.
        """
        raise NotImplementedError("nginx upstream templating is site-specific")

    async def _verify_traffic(self) -> None:
        """Post-switch verification.

        NOTE(review): previously referenced but undefined; reuse the
        replica health check as a minimal verification.
        """
        await self._health_check()

    def _update_config(self) -> None:
        """Write the current environment's config section to config.yaml."""
        config = self.config[self.environment]
        with open("config.yaml", "w") as f:
            yaml.dump(config, f)

    async def _wait_for_services(self, services: List[Tuple[str, int]]) -> None:
        """Block until every (host, port) pair accepts TCP connections."""
        import socket

        def port_open(host: str, port: int) -> bool:
            try:
                # Context manager closes the probe connection — the
                # original leaked one socket per successful probe.
                with socket.create_connection((host, port), timeout=1):
                    return True
            except OSError:
                # Narrowed from a bare `except:` — only connection
                # failures (refused/timeout/DNS) mean "not ready".
                return False

        for host, port in services:
            # Run the blocking probe in a worker thread so the event loop
            # stays responsive (the original blocked the loop directly).
            while not await asyncio.to_thread(port_open, host, port):
                print(f"Waiting for {host}:{port}...")
                await asyncio.sleep(1)
部署完成后,我们需要配置监控:
# prometheus.yml
# Scrape the agent's /metrics endpoint every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'ai-agent'
    static_configs:
      - targets: ['localhost:8000']
    metrics_path: '/metrics'
    scheme: 'http'

# grafana/dashboards/agent.json
# Dashboard with four panels: request rate, p95 latency, error rate,
# and token consumption (the AI-specific cost signal).
{
  "dashboard": {
    "id": null,
    "title": "AI Agent Dashboard",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(requests_total[5m])",
            "legendFormat": "{{type}}"
          }
        ]
      },
      {
        "title": "Latency",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(request_latency_bucket[5m]))",
            "legendFormat": "p95"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(requests_error[5m])",
            "legendFormat": "{{error}}"
          }
        ]
      },
      {
        "title": "Token Usage",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(token_usage[5m])",
            "legendFormat": "tokens/s"
          }
        ]
      }
    ]
  }
}
最后是一些常用的运维脚本:
class Operations:
    """Day-2 operational helpers: scaling, rollback, backup, log rotation,
    and model updates for the docker-compose AI Agent stack."""

    def __init__(self, config_path: str):
        """
        Args:
            config_path: Path to the YAML configuration file.
        """
        self.config = self._load_config(config_path)

    def _load_config(self, path: str) -> Dict:
        """Parse the YAML config file into a dict.

        NOTE(review): this method was called in __init__ but never defined
        in the original code.
        """
        with open(path) as f:
            return yaml.safe_load(f)

    async def scale(self, replicas: int) -> None:
        """Scale the agent service to *replicas* instances."""
        subprocess.run(
            ["docker-compose", "up", "-d", "--scale", f"agent={replicas}"],
            check=True,
        )

    async def rollback(self, version: str) -> None:
        """Retag a previously built image as latest and restart the agent."""
        # `docker-compose down` takes no service argument (the original used
        # `down agent`, which is invalid); stop and remove just the agent
        # service instead, leaving redis/milvus running.
        subprocess.run(["docker-compose", "stop", "agent"], check=True)
        subprocess.run(["docker-compose", "rm", "-f", "agent"], check=True)
        subprocess.run(
            ["docker", "tag", f"ai-agent:{version}", "ai-agent:latest"],
            check=True,
        )
        subprocess.run(["docker-compose", "up", "-d", "agent"], check=True)

    async def backup(self) -> None:
        """Snapshot Milvus and Redis data into a timestamped tarball."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Dump the vector database.
        subprocess.run(
            ["docker-compose", "exec", "milvus", "milvus_backup",
             f"/backup/milvus_{timestamp}"],
            check=True,
        )
        # SAVE performs a synchronous dump of the Redis dataset to disk.
        subprocess.run(
            ["docker-compose", "exec", "redis", "redis-cli", "save"],
            check=True,
        )
        # Archive the backup directory.
        subprocess.run(
            ["tar", "czf", f"backup_{timestamp}.tar.gz", "backup/"],
            check=True,
        )

    async def rotate_logs(self) -> None:
        """Trigger logrotate inside the agent container."""
        subprocess.run(
            ["docker-compose", "exec", "agent", "logrotate",
             "/etc/logrotate.d/agent"],
            check=True,
        )

    async def update_model(self, new_model: str) -> None:
        """Switch the configured model name, persist it, and restart."""
        # Rebuild the nested "model" mapping instead of dict.copy(): the
        # original's *shallow* copy mutated self.config["model"] in place
        # as an unintended side effect.
        config = {
            **self.config,
            "model": {**self.config["model"], "model_name": new_model},
        }
        with open("config.yaml", "w") as f:
            yaml.dump(config, f)
        await self.restart()

    async def restart(self) -> None:
        """Restart the agent service in place."""
        subprocess.run(["docker-compose", "restart", "agent"], check=True)
最后,我们还需要准备应急预案:
class EmergencyPlan:
    """Predefined incident responses: load spikes, upstream API failures,
    and data corruption.

    Every handler swallows its own exceptions and reports them through the
    injected alert manager, so an incident response never crashes the
    process that triggered it.
    """

    def __init__(self, alert_manager, operations=None):
        """
        Args:
            alert_manager: Object exposing async ``notify(msg)`` and
                ``notify_error(exc)``.
            operations: Optional pre-built Operations helper. The original
                code constructed ``Operations()`` inside each handler with
                no arguments, which raises TypeError (config_path is
                required) — every handler failed straight into
                notify_error. Inject one, or it is built lazily from the
                default config path.
        """
        self.alert_manager = alert_manager
        self._operations = operations

    def _ops(self):
        """Return the Operations helper, building it lazily on first use."""
        if self._operations is None:
            self._operations = Operations("config.yaml")
        return self._operations

    async def handle_high_load(self):
        """Scale out, enable throttling, and page the team."""
        try:
            # 1. Add capacity.
            await self._ops().scale(5)
            # 2. Shed excess load at the gateway.
            await self._enable_rate_limit()
            # 3. Tell the on-call team what happened.
            await self.alert_manager.notify("Handling high load situation")
        except Exception as e:
            await self.alert_manager.notify_error(e)

    async def handle_api_error(self):
        """Fail over to the backup API provider and restart the agent."""
        try:
            await self._switch_to_backup_api()
            await self._clear_cache()
            await self._ops().restart()
        except Exception as e:
            await self.alert_manager.notify_error(e)

    async def handle_data_corruption(self):
        """Stop the agent, restore from backup, and bring it back up."""
        try:
            # 1. Stop serving before touching data.
            subprocess.run(["docker-compose", "stop", "agent"], check=True)
            # 2. Restore the last known-good backup.
            await self._restore_backup()
            # 3. Resume service.
            await self._ops().restart()
        except Exception as e:
            await self.alert_manager.notify_error(e)

    # --- helpers below were referenced but never defined in the original ---
    # They raise NotImplementedError so the gap stays explicit (the original
    # failed with AttributeError at the same points).

    async def _enable_rate_limit(self):
        # NOTE(review): wire this to the gateway's rate-limit API.
        raise NotImplementedError("rate-limit hook not implemented")

    async def _switch_to_backup_api(self):
        # NOTE(review): swap the model provider credentials/endpoint here.
        raise NotImplementedError("backup API switch not implemented")

    async def _clear_cache(self):
        # NOTE(review): flush the Redis response cache here.
        raise NotImplementedError("cache clearing not implemented")

    async def _restore_backup(self):
        # NOTE(review): restore the Milvus/Redis snapshot produced by
        # Operations.backup here.
        raise NotImplementedError("backup restore not implemented")
在实施这套部署和运维方案的过程中,我总结了几点经验:部署环境和配置要提前完整准备;监控要和应用同步上线,尤其要盯住 Token 用量和 API 费用;扩缩容、回滚、备份等常用运维操作要脚本化;应急预案要在事故发生之前写好并演练。
一个成熟的 AI Agent 系统需要完善的部署和运维体系。就像照顾一个婴儿一样,需要细心呵护,及时响应,才能保证它健康成长。
在下一篇文章中,我会讲解如何优化 AI Agent 的性能和成本。如果你对部署和运维有什么想法,欢迎在评论区交流。
原文链接:https://blog.csdn.net/ChengFengTech/article/details/145375761
评论 ( 0 )