diff --git a/docker/compose/docker-compose.nilai-router-1.yml b/docker/compose/docker-compose.nilai-router-1.yml
new file mode 100644
index 00000000..229c437d
--- /dev/null
+++ b/docker/compose/docker-compose.nilai-router-1.yml
@@ -0,0 +1,48 @@
+services:
+  qwen3_coder_30b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+    command: >
+      --model Qwen/Qwen3-Coder-30B-A3B-Instruct
+      --gpu-memory-utilization 0.95
+      --max-model-len 100000
+      --max-num-batched-tokens 8192
+      --tensor-parallel-size 1
+      --dtype bfloat16
+      --kv-cache-dtype fp8
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=qwen3_coder_30b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 180s
+      timeout: 10s
+volumes:
+  hugging_face_models:
+
diff --git a/docker/compose/docker-compose.nilai-router-2.yml b/docker/compose/docker-compose.nilai-router-2.yml
new file mode 100644
index 00000000..6ac36b71
--- /dev/null
+++ b/docker/compose/docker-compose.nilai-router-2.yml
@@ -0,0 +1,93 @@
+services:
+  gpt_oss_20b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+    command: >
+      --model openai/gpt-oss-20b
+      --gpu-memory-utilization 0.75
+      --max-model-len 100000
+      --max-num-batched-tokens 100000
+      --tensor-parallel-size 1
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=gpt_oss_20b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 180s
+      timeout: 10s
+
+  qwen3_thinking_4b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+      gpt_oss_20b_gpu:
+        condition: service_healthy
+    command: >
+      --model Qwen/Qwen3-4B-Thinking-2507
+      --gpu-memory-utilization 0.20
+      --max-model-len 10000
+      --max-num-batched-tokens 10000
+      --tensor-parallel-size 1
+      --dtype bfloat16
+      --kv-cache-dtype fp8
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=qwen3_thinking_4b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 60s
+      timeout: 10s
+volumes:
+  hugging_face_models:
+
diff --git a/docker/compose/docker-compose.nilai-router-3.yml b/docker/compose/docker-compose.nilai-router-3.yml
new file mode 100644
index 00000000..68b0fc52
--- /dev/null
+++ b/docker/compose/docker-compose.nilai-router-3.yml
@@ -0,0 +1,95 @@
+services:
+  arch_router_1_5b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+    command: >
+      --model katanemo/Arch-Router-1.5B
+      --gpu-memory-utilization 0.15
+      --max-model-len 8000
+      --max-num-batched-tokens 8000
+      --tensor-parallel-size 1
+      --dtype bfloat16
+      --kv-cache-dtype fp8
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=arch_router_1_5b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=false
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 60s
+      timeout: 10s
+
+  qwen3_vl_4b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+      arch_router_1_5b_gpu:
+        condition: service_healthy
+    command: >
+      --model Qwen/Qwen3-VL-4B-Instruct
+      --gpu-memory-utilization 0.8
+      --max-model-len 10000
+      --max-num-batched-tokens 10000
+      --tensor-parallel-size 1
+      --dtype bfloat16
+      --kv-cache-dtype fp8
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=qwen3_vl_4b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+      - MULTIMODAL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 60s
+      timeout: 10s
+volumes:
+  hugging_face_models:
\ No newline at end of file
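
Note: every service above declares a depends_on on an etcd service with condition: service_healthy and loads a shared .env file, but neither the etcd service nor the .env file is part of this diff; both are assumed to come from a companion compose file in the same project. As a minimal sketch of what that companion service might look like, assuming the bitnami/etcd image and an etcdctl-based healthcheck (both assumptions, not taken from this change):

    services:
      etcd:
        image: bitnami/etcd:latest            # assumption: any etcd image that ships etcdctl would work
        restart: unless-stopped
        environment:
          - ALLOW_NONE_AUTHENTICATION=yes     # assumption: no client auth for a local deployment
        healthcheck:                          # satisfies the service_healthy conditions used above
          test: ["CMD", "etcdctl", "endpoint", "health"]
          interval: 10s
          retries: 5
          timeout: 5s

The etcd client port defaults to 2379, which matches the ETCD_HOST=etcd / ETCD_PORT=2379 settings used by all services in this change.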
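
Assuming the etcd definition lives in a file named docker-compose.etcd.yml (a hypothetical name), one router stack could then be brought up and spot-checked roughly like this:

    docker compose -f docker-compose.etcd.yml -f docker-compose.nilai-router-1.yml up -d
    docker compose -f docker-compose.etcd.yml -f docker-compose.nilai-router-1.yml exec qwen3_coder_30b_gpu curl -sf http://localhost:8000/v1/models

The second command assumes the nilai-vllm entrypoint runs vLLM's OpenAI-compatible server on SVC_PORT (8000), which is consistent with the /health endpoint the built-in healthchecks already poll.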