From a9fd60fb7a97993bca4c46ec235d6f71984d65db Mon Sep 17 00:00:00 2001 From: Joel Drotleff Date: Wed, 4 Feb 2026 17:58:58 -0800 Subject: [PATCH 1/3] Remove legacy llama.cpp inference code Remove dead code for the llama.cpp Docker Compose service that was replaced by SGLang. This includes: - docker-compose.yml.tmpl: Remove inference-engine service block - config.yml.tmpl: Remove 12 inference_* config fields - manager.go: Remove DefaultInference* constants, Inference* struct fields, EnableInferenceEngine toggle, and validation block - up.go/root.go: Remove --enable-inference-engine CLI flag - daemon/api_types.go, handlers.go: Remove EnableInferenceEngine from API - manager_test.go: Update tests to remove llama.cpp references The SGLang inference engine (silo inference up/down) remains unchanged. Agent-thread: https://ampcode.com/threads/T-019c2b39-fff2-70df-b334-fc8a0b56c719 Co-authored-by: Amp Amp-Thread-ID: https://ampcode.com/threads/T-019c2b39-fff2-70df-b334-fc8a0b56c719 --- internal/assets/config.yml.tmpl | 15 ----- internal/assets/docker-compose.yml.tmpl | 43 +------------- internal/cli/root.go | 13 ++-- internal/cli/up.go | 2 - internal/config/manager.go | 79 ++----------------------- internal/config/manager_test.go | 70 +++++++--------------- internal/daemon/api_types.go | 7 +-- internal/daemon/handlers.go | 1 - 8 files changed, 36 insertions(+), 194 deletions(-) diff --git a/internal/assets/config.yml.tmpl b/internal/assets/config.yml.tmpl index 72a89e3..2853e50 100644 --- a/internal/assets/config.yml.tmpl +++ b/internal/assets/config.yml.tmpl @@ -6,20 +6,5 @@ port: {{.Port}} llm_base_url: "{{.LLMBaseURL}}" default_model: "{{.DefaultModel}}" -# Inference Engine Configuration -inference_port: {{.InferencePort}} -inference_model_file: "{{.InferenceModelFile}}" -inference_shm_size: "{{.InferenceShmSize}}" -inference_context_size: {{.InferenceContextSize}} -inference_batch_size: {{.InferenceBatchSize}} -inference_gpu_layers: {{.InferenceGPULayers}} -inference_tensor_split: "{{.InferenceTensorSplit}}" -inference_main_gpu: {{.InferenceMainGPU}} -inference_threads: {{.InferenceThreads}} -inference_http_threads: {{.InferenceHTTPThreads}} -inference_fit: "{{.InferenceFit}}" -inference_gpu_devices: '{{.InferenceGPUDevices}}' - # Service Toggles -enable_inference_engine: {{.EnableInferenceEngine}} enable_proxy_agent: {{.EnableProxyAgent}} diff --git a/internal/assets/docker-compose.yml.tmpl b/internal/assets/docker-compose.yml.tmpl index 66cb4cb..5bf35de 100644 --- a/internal/assets/docker-compose.yml.tmpl +++ b/internal/assets/docker-compose.yml.tmpl @@ -53,48 +53,7 @@ services: depends_on: - backend restart: unless-stopped -{{if .EnableInferenceEngine}} - inference-engine: - image: ghcr.io/ggml-org/llama.cpp:full-cuda - ports: - - "{{.InferencePort}}:{{.InferencePort}}" - volumes: - - {{.DataDir}}/models:/models:ro - shm_size: '{{.InferenceShmSize}}' - ipc: host - command: - - --server - - -m - - /models/{{.InferenceModelFile}} - - -c - - "{{.InferenceContextSize}}" - - -b - - "{{.InferenceBatchSize}}" - - -ngl - - "{{.InferenceGPULayers}}" - - --tensor-split - - {{.InferenceTensorSplit}} - - -mg - - "{{.InferenceMainGPU}}" - - -t - - "{{.InferenceThreads}}" - - --threads-http - - "{{.InferenceHTTPThreads}}" - - -fit - - "{{.InferenceFit}}" - - --host - - 0.0.0.0 - - --port - - "{{.InferencePort}}" - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: [{{.InferenceGPUDevices}}] - capabilities: [gpu] - restart: unless-stopped -{{end}} + {{if .EnableProxyAgent}} 
silo-proxy-agent: image: eternis/silo-proxy-agent diff --git a/internal/cli/root.go b/internal/cli/root.go index 587629f..b6e3663 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -9,13 +9,12 @@ import ( ) var ( - verbose bool - configDir string - imageTag string - port int - enableInferenceEngine bool - enableProxyAgent bool - log *logger.Logger + verbose bool + configDir string + imageTag string + port int + enableProxyAgent bool + log *logger.Logger ) var rootCmd = &cobra.Command{ diff --git a/internal/cli/up.go b/internal/cli/up.go index fda7a13..1d83fd4 100644 --- a/internal/cli/up.go +++ b/internal/cli/up.go @@ -38,7 +38,6 @@ By default, the inference engine is NOT started. Use --all to include it.`, if port > 0 { cfg.Port = port } - cfg.EnableInferenceEngine = enableInferenceEngine cfg.EnableProxyAgent = enableProxyAgent if err := config.Validate(cfg); err != nil { @@ -130,7 +129,6 @@ func init() { upCmd.Flags().StringVar(&imageTag, "image-tag", config.DefaultImageTag, "Docker image tag (first install only)") upCmd.Flags().IntVar(&port, "port", config.DefaultPort, "Application port (first install only)") - upCmd.Flags().BoolVar(&enableInferenceEngine, "enable-inference-engine", config.DefaultEnableInferenceEngine, "Enable local inference engine (first install only)") upCmd.Flags().BoolVar(&enableProxyAgent, "enable-proxy-agent", config.DefaultEnableProxyAgent, "Enable proxy agent (first install only)") upCmd.Flags().BoolVar(&upAll, "all", false, "Include inference engine") } diff --git a/internal/config/manager.go b/internal/config/manager.go index e344f86..f20bb6f 100644 --- a/internal/config/manager.go +++ b/internal/config/manager.go @@ -17,24 +17,9 @@ const ( DefaultLLMBaseURL = "http://host.docker.internal:30000/v1" DefaultModel = "glm47-awq" - // Inference engine defaults (legacy llama.cpp - kept for compatibility) - DefaultInferencePort = 30000 - DefaultInferenceModelFile = "GLM-4.7-Q4_K_M.gguf" - DefaultInferenceShmSize = "16g" - DefaultInferenceContextSize = 8192 - DefaultInferenceBatchSize = 256 - DefaultInferenceGPULayers = 999 - DefaultInferenceTensorSplit = "1,1,1" - DefaultInferenceMainGPU = 0 - DefaultInferenceThreads = 16 - DefaultInferenceHTTPThreads = 8 - DefaultInferenceFit = "off" - DefaultInferenceGPUDevices = `"0", "1", "2"` - // Service toggles - DefaultEnableInferenceEngine = false - DefaultEnableProxyAgent = false - DefaultEnableDeepResearch = true + DefaultEnableProxyAgent = false + DefaultEnableDeepResearch = true // Deep research defaults DefaultDeepResearchImage = "ghcr.io/eternisai/deep_research:sha-2e9f2ef" @@ -118,24 +103,9 @@ type Config struct { DataDir string `yaml:"-"` SocketFile string `yaml:"-"` - // Inference engine configuration - InferencePort int `yaml:"inference_port"` - InferenceModelFile string `yaml:"inference_model_file"` - InferenceShmSize string `yaml:"inference_shm_size"` - InferenceContextSize int `yaml:"inference_context_size"` - InferenceBatchSize int `yaml:"inference_batch_size"` - InferenceGPULayers int `yaml:"inference_gpu_layers"` - InferenceTensorSplit string `yaml:"inference_tensor_split"` - InferenceMainGPU int `yaml:"inference_main_gpu"` - InferenceThreads int `yaml:"inference_threads"` - InferenceHTTPThreads int `yaml:"inference_http_threads"` - InferenceFit string `yaml:"inference_fit"` - InferenceGPUDevices string `yaml:"inference_gpu_devices"` - // Service toggles - EnableInferenceEngine bool `yaml:"enable_inference_engine"` - EnableProxyAgent bool `yaml:"enable_proxy_agent"` - EnableDeepResearch bool 
`yaml:"enable_deep_research"` + EnableProxyAgent bool `yaml:"enable_proxy_agent"` + EnableDeepResearch bool `yaml:"enable_deep_research"` // Deep research configuration DeepResearchImage string `yaml:"deep_research_image"` @@ -167,24 +137,9 @@ func NewDefaultConfig(paths *Paths) *Config { DataDir: paths.AppDataDir, SocketFile: paths.SocketFile, - // Inference engine defaults - InferencePort: DefaultInferencePort, - InferenceModelFile: DefaultInferenceModelFile, - InferenceShmSize: DefaultInferenceShmSize, - InferenceContextSize: DefaultInferenceContextSize, - InferenceBatchSize: DefaultInferenceBatchSize, - InferenceGPULayers: DefaultInferenceGPULayers, - InferenceTensorSplit: DefaultInferenceTensorSplit, - InferenceMainGPU: DefaultInferenceMainGPU, - InferenceThreads: DefaultInferenceThreads, - InferenceHTTPThreads: DefaultInferenceHTTPThreads, - InferenceFit: DefaultInferenceFit, - InferenceGPUDevices: DefaultInferenceGPUDevices, - // Service toggles - EnableInferenceEngine: DefaultEnableInferenceEngine, - EnableProxyAgent: DefaultEnableProxyAgent, - EnableDeepResearch: DefaultEnableDeepResearch, + EnableProxyAgent: DefaultEnableProxyAgent, + EnableDeepResearch: DefaultEnableDeepResearch, // Deep research defaults DeepResearchImage: DefaultDeepResearchImage, @@ -351,28 +306,6 @@ func Validate(config *Config) error { return fmt.Errorf("default_model cannot be empty") } - // Inference engine validation (only when enabled) - if config.EnableInferenceEngine { - if config.InferencePort < 1 || config.InferencePort > 65535 { - return fmt.Errorf("inference_port must be between 1 and 65535") - } - if config.InferenceModelFile == "" { - return fmt.Errorf("inference_model_file cannot be empty") - } - if config.InferenceContextSize < 1 { - return fmt.Errorf("inference_context_size must be positive") - } - if config.InferenceBatchSize < 1 { - return fmt.Errorf("inference_batch_size must be positive") - } - if config.InferenceThreads < 1 { - return fmt.Errorf("inference_threads must be positive") - } - if config.InferenceHTTPThreads < 1 { - return fmt.Errorf("inference_http_threads must be positive") - } - } - return nil } diff --git a/internal/config/manager_test.go b/internal/config/manager_test.go index ebc21bb..d6b918d 100644 --- a/internal/config/manager_test.go +++ b/internal/config/manager_test.go @@ -28,8 +28,8 @@ func TestLoadOrDefault_NewFile(t *testing.T) { if cfg.Port != DefaultPort { t.Errorf("Expected Port=%d, got %d", DefaultPort, cfg.Port) } - if cfg.EnableInferenceEngine != DefaultEnableInferenceEngine { - t.Errorf("Expected EnableInferenceEngine=%v, got %v", DefaultEnableInferenceEngine, cfg.EnableInferenceEngine) + if cfg.EnableProxyAgent != DefaultEnableProxyAgent { + t.Errorf("Expected EnableProxyAgent=%v, got %v", DefaultEnableProxyAgent, cfg.EnableProxyAgent) } } @@ -49,7 +49,7 @@ func TestLoadOrDefault_ExistingFileWithMissingFields(t *testing.T) { image_tag: "0.1.2" port: 8080 llm_base_url: "http://custom-url:9000/v1" -default_model: "custom-model.gguf" +default_model: "custom-model" ` if err := os.WriteFile(paths.ConfigFile, []byte(partialConfig), 0644); err != nil { t.Fatalf("Failed to write test config: %v", err) @@ -72,17 +72,11 @@ default_model: "custom-model.gguf" } // Check that missing fields are filled with defaults - if cfg.InferencePort != DefaultInferencePort { - t.Errorf("Expected default InferencePort=%d, got %d", DefaultInferencePort, cfg.InferencePort) - } - if cfg.EnableInferenceEngine != DefaultEnableInferenceEngine { - t.Errorf("Expected default 
EnableInferenceEngine=%v, got %v", DefaultEnableInferenceEngine, cfg.EnableInferenceEngine) - } if cfg.EnableProxyAgent != DefaultEnableProxyAgent { t.Errorf("Expected default EnableProxyAgent=%v, got %v", DefaultEnableProxyAgent, cfg.EnableProxyAgent) } - if cfg.InferenceGPULayers != DefaultInferenceGPULayers { - t.Errorf("Expected default InferenceGPULayers=%d, got %d", DefaultInferenceGPULayers, cfg.InferenceGPULayers) + if cfg.EnableDeepResearch != DefaultEnableDeepResearch { + t.Errorf("Expected default EnableDeepResearch=%v, got %v", DefaultEnableDeepResearch, cfg.EnableDeepResearch) } } @@ -102,21 +96,9 @@ func TestLoadOrDefault_ExistingFileWithAllFields(t *testing.T) { image_tag: "0.1.2" port: 9999 llm_base_url: "http://my-llm:5000/v1" -default_model: "my-model.gguf" -inference_port: 40000 -inference_model_file: "my-inference.gguf" -inference_shm_size: "32g" -inference_context_size: 16384 -inference_batch_size: 512 -inference_gpu_layers: 50 -inference_tensor_split: "2,2,2" -inference_main_gpu: 1 -inference_threads: 32 -inference_http_threads: 16 -inference_fit: "on" -inference_gpu_devices: "\"1\", \"2\"" -enable_inference_engine: true +default_model: "my-model" enable_proxy_agent: true +enable_deep_research: true ` if err := os.WriteFile(paths.ConfigFile, []byte(completeConfig), 0644); err != nil { t.Fatalf("Failed to write test config: %v", err) @@ -131,18 +113,12 @@ enable_proxy_agent: true if cfg.Port != 9999 { t.Errorf("Expected preserved Port=9999, got %d", cfg.Port) } - if cfg.InferencePort != 40000 { - t.Errorf("Expected preserved InferencePort=40000, got %d", cfg.InferencePort) - } - if cfg.InferenceGPULayers != 50 { - t.Errorf("Expected preserved InferenceGPULayers=50, got %d", cfg.InferenceGPULayers) - } - if !cfg.EnableInferenceEngine { - t.Errorf("Expected preserved EnableInferenceEngine=true, got false") - } if !cfg.EnableProxyAgent { t.Errorf("Expected preserved EnableProxyAgent=true, got false") } + if !cfg.EnableDeepResearch { + t.Errorf("Expected preserved EnableDeepResearch=true, got false") + } } func TestMergeConfigs(t *testing.T) { @@ -150,18 +126,16 @@ func TestMergeConfigs(t *testing.T) { Version: "0.1.1", ImageTag: "0.1.1", Port: 8080, - // Missing: LLMBaseURL, DefaultModel, InferencePort, etc. + // Missing: LLMBaseURL, DefaultModel, etc. 
} defaults := &Config{ - Version: DefaultVersion, - ImageTag: DefaultImageTag, - Port: DefaultPort, - LLMBaseURL: DefaultLLMBaseURL, - DefaultModel: DefaultModel, - InferencePort: DefaultInferencePort, - EnableInferenceEngine: DefaultEnableInferenceEngine, - EnableProxyAgent: DefaultEnableProxyAgent, + Version: DefaultVersion, + ImageTag: DefaultImageTag, + Port: DefaultPort, + LLMBaseURL: DefaultLLMBaseURL, + DefaultModel: DefaultModel, + EnableProxyAgent: DefaultEnableProxyAgent, } merged := mergeConfigs(existing, defaults) @@ -181,9 +155,6 @@ func TestMergeConfigs(t *testing.T) { if merged.DefaultModel != DefaultModel { t.Errorf("Expected merged.DefaultModel=%s, got %s", DefaultModel, merged.DefaultModel) } - if merged.InferencePort != DefaultInferencePort { - t.Errorf("Expected merged.InferencePort=%d, got %d", DefaultInferencePort, merged.InferencePort) - } } func TestFindMissingFields(t *testing.T) { @@ -211,10 +182,9 @@ port: 8080 // Check that specific fields are in the missing list expectedMissing := map[string]bool{ - "llm_base_url": true, - "default_model": true, - "enable_inference_engine": true, - "enable_proxy_agent": true, + "llm_base_url": true, + "default_model": true, + "enable_proxy_agent": true, } for _, field := range missing { diff --git a/internal/daemon/api_types.go b/internal/daemon/api_types.go index 9043cd8..f8b871f 100644 --- a/internal/daemon/api_types.go +++ b/internal/daemon/api_types.go @@ -19,10 +19,9 @@ type LogEntry struct { // UpRequest represents request body for /api/v1/up type UpRequest struct { - ImageTag string `json:"image_tag,omitempty"` - Port int `json:"port,omitempty"` - EnableInferenceEngine bool `json:"enable_inference_engine,omitempty"` - EnableProxyAgent bool `json:"enable_proxy_agent,omitempty"` + ImageTag string `json:"image_tag,omitempty"` + Port int `json:"port,omitempty"` + EnableProxyAgent bool `json:"enable_proxy_agent,omitempty"` } // RestartRequest represents request body for /api/v1/restart diff --git a/internal/daemon/handlers.go b/internal/daemon/handlers.go index 7aed4a7..a88caf0 100644 --- a/internal/daemon/handlers.go +++ b/internal/daemon/handlers.go @@ -82,7 +82,6 @@ func (s *Server) handleUp(w http.ResponseWriter, r *http.Request) { if req.Port > 0 { cfg.Port = req.Port } - cfg.EnableInferenceEngine = req.EnableInferenceEngine cfg.EnableProxyAgent = req.EnableProxyAgent // Validate config From f78bf825355fafbc870b8ce4283cdd5a1a87a45f Mon Sep 17 00:00:00 2001 From: Joel Drotleff Date: Wed, 4 Feb 2026 18:40:07 -0800 Subject: [PATCH 2/3] docs: update AGENTS.md for llama.cpp removal and deep research deployment - Remove deprecated inference_* config fields from Config struct - Remove llama.cpp-related config.yml schema fields - Add Deep Research Deployment section documenting: - GHCR vs Docker Hub registry differences - SHA-based vs semver versioning - Update flow for propagating changes - GHCR authentication requirements - Graceful pull handling Agent-thread: https://ampcode.com/threads/T-019c2ba5-483b-743c-91b9-47a419fab06b Co-authored-by: Amp Amp-Thread-ID: https://ampcode.com/threads/T-019c2ba5-483b-743c-91b9-47a419fab06b --- AGENTS.md | 84 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 8fec5e8..4eda88f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -122,23 +122,17 @@ silo/ **Config Struct** (`manager.go`): ```go type Config struct { - Version string // CLI version - ImageTag string // Docker image tag - Port int // Frontend port 
- LLMBaseURL string // Inference engine URL - DefaultModel string // Default LLM model - InferencePort int - InferenceModelFile string // GGUF filename - InferenceShmSize string - InferenceContextSize int - InferenceBatchSize int - InferenceGPULayers int // 999 = all layers on GPU - InferenceTensorSplit string - InferenceMainGPU int - InferenceThreads int - InferenceHTTPThreads int - InferenceFit string - InferenceGPUDevices string // Quoted CSV: "0", "1", "2" + Version string // CLI version + ImageTag string // Docker image tag + Port int // Frontend port + LLMBaseURL string // Inference engine URL + DefaultModel string // Default LLM model + EnableProxyAgent bool // Enable remote proxy agent + EnableDeepResearch bool // Enable deep research service + DeepResearchImage string // GHCR image (sha-tagged) + DeepResearchPort int // Default 3031 + SearchProvider string // "perplexity" or "tavily" + PerplexityAPIKey string // Required for deep research } ``` @@ -224,29 +218,18 @@ version: "0.1.2" # CLI version image_tag: "0.1.2" # Docker image tag port: 80 # Frontend port llm_base_url: "http://inference-engine:30000/v1" -default_model: "GLM-4.7-Q4_K_M.gguf" -inference_port: 30000 -inference_model_file: "GLM-4.7-Q4_K_M.gguf" -inference_shm_size: "16g" -inference_context_size: 8192 -inference_batch_size: 256 -inference_gpu_layers: 999 # 999 = all layers on GPU -inference_tensor_split: "1,1,1" -inference_main_gpu: 0 -inference_threads: 16 -inference_http_threads: 8 -inference_fit: "off" -inference_gpu_devices: "\"0\", \"1\", \"2\"" # Quoted CSV for YAML +default_model: "model-name" # Service toggles -enable_inference_engine: false # Enable llama.cpp inference enable_proxy_agent: false # Enable remote proxy agent enable_deep_research: true # Enable deep research service # Deep research configuration -deep_research_image: "ghcr.io/eternisai/deep_research:sha-ff37ec2" +# NOTE: Image uses SHA tags from GHCR (not semver from Docker Hub) +# The default is pinned in manager.go and auto-updated during silo upgrade +deep_research_image: "ghcr.io/eternisai/deep_research:sha-XXXXXXX" deep_research_port: 3031 -search_provider: "perplexity" +search_provider: "perplexity" # "perplexity" or "tavily" perplexity_api_key: "" # Required for deep research web search ``` @@ -265,10 +248,43 @@ perplexity_api_key: "" # Required for deep research web search 1. **Template-driven Configuration**: docker-compose.yml and config.yml generated from embedded templates with user values 2. **Single-responsibility Packages**: Each package handles one concern (installer, updater, docker, config) 3. **Stateful Operations**: Tracks install timestamps and versions in state.json -4. **Selective Image Pulls**: Only pulls backend/frontend (inference engine image is larger, pre-packaged) +4. **Selective Image Pulls**: Pulls backend/frontend/deep-research; inference managed separately via docker run 5. **Non-blocking Updates**: Version checks warn but don't fail operations 6. 
**Graceful Degradation**: Warns on errors but continues where possible +## Deep Research Deployment + +The deep research service uses a different deployment model than frontend/backend: + +| Aspect | Frontend / Backend | Deep Research | +|--------|-------------------|---------------| +| **Registry** | Docker Hub (`eternis/silo-box-*`) | GHCR (`ghcr.io/eternisai/deep_research`) | +| **Versioning** | Semantic versioning (`0.1.2`) | Commit SHA tags (`sha-2e9f2ef`) | +| **Update Source** | CLI queries Docker Hub API for latest | Version pinned as `DefaultDeepResearchImage` in `manager.go` | +| **Pull Criticality** | Critical (blocks upgrade on failure) | Non-critical (warns but continues) | + +### Update Flow + +1. Push changes to `silo_deep_research` repo +2. GitHub Actions builds and pushes to GHCR with `sha-{commit}` tag +3. Update `DefaultDeepResearchImage` constant in `internal/config/manager.go` +4. Release new CLI version (`gh workflow run Release`) +5. Users run `silo upgrade` to get new image + +### GHCR Authentication + +If the deep research image is private, users need GHCR auth: + +```bash +# Create PAT (classic) with read:packages scope +# If org uses SAML SSO, authorize PAT for the org +echo "YOUR_PAT" | docker login ghcr.io -u YOUR_USERNAME --password-stdin +``` + +### Graceful Pull Handling + +The CLI pulls services individually. If deep research fails to pull (auth issues), it logs a warning but continues deploying frontend/backend. + ## Development Workflow ```bash From 1d220f3377346cd929623729b4893d002e12884a Mon Sep 17 00:00:00 2001 From: Joel Drotleff Date: Wed, 4 Feb 2026 19:44:33 -0800 Subject: [PATCH 3/3] Add deep research and SGLang fields to config template --- internal/assets/config.yml.tmpl | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/internal/assets/config.yml.tmpl b/internal/assets/config.yml.tmpl index 2853e50..54f2cbd 100644 --- a/internal/assets/config.yml.tmpl +++ b/internal/assets/config.yml.tmpl @@ -8,3 +8,40 @@ default_model: "{{.DefaultModel}}" # Service Toggles enable_proxy_agent: {{.EnableProxyAgent}} +enable_deep_research: {{.EnableDeepResearch}} + +# Deep Research +deep_research_image: "{{.DeepResearchImage}}" +deep_research_port: {{.DeepResearchPort}} +search_provider: "{{.SearchProvider}}" +perplexity_api_key: "{{.PerplexityAPIKey}}" + +# SGLang Inference Engine (managed separately from docker-compose) +sglang: + enabled: {{.SGLang.Enabled}} + image: "{{.SGLang.Image}}" + container_name: "{{.SGLang.ContainerName}}" + port: {{.SGLang.Port}} + gpu_devices:{{- if .SGLang.GPUDevices}} +{{- range .SGLang.GPUDevices}} + - "{{.}}" +{{- end}} +{{- else}} [] +{{- end}} + shm_size: "{{.SGLang.ShmSize}}" + model_path: "{{.SGLang.ModelPath}}" + huggingface_cache: "{{.SGLang.HuggingFaceCache}}" + dp_size: {{.SGLang.DPSize}} + tp_size: {{.SGLang.TPSize}} + max_running_requests: {{.SGLang.MaxRunningRequests}} + max_total_tokens: {{.SGLang.MaxTotalTokens}} + context_length: {{.SGLang.ContextLength}} + mem_fraction_static: {{.SGLang.MemFractionStatic}} + chunked_prefill_size: {{.SGLang.ChunkedPrefillSize}} + schedule_policy: "{{.SGLang.SchedulePolicy}}" + kv_cache_dtype: "{{.SGLang.KVCacheDtype}}" + attention_backend: "{{.SGLang.AttentionBackend}}" + disable_radix_cache: {{.SGLang.DisableRadixCache}} + reasoning_parser: "{{.SGLang.ReasoningParser}}" + trust_remote_code: {{.SGLang.TrustRemoteCode}} + log_level: "{{.SGLang.LogLevel}}"
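
Note (not part of the patches above): the `sglang:` block added in PATCH 3 implies a nested `SGLang` struct on `Config`. A minimal sketch of what such a struct could look like, with field names and YAML tags inferred from the template placeholders only; the actual definition in `internal/config` is not shown in this series and may differ:

```go
package config

// SGLangConfig is an illustrative stand-in for the nested struct the
// config.yml.tmpl references via {{.SGLang.*}}. Types are guesses based
// on how each placeholder is quoted (or not) in the template.
type SGLangConfig struct {
	Enabled            bool     `yaml:"enabled"`
	Image              string   `yaml:"image"`
	ContainerName      string   `yaml:"container_name"`
	Port               int      `yaml:"port"`
	GPUDevices         []string `yaml:"gpu_devices"`
	ShmSize            string   `yaml:"shm_size"`
	ModelPath          string   `yaml:"model_path"`
	HuggingFaceCache   string   `yaml:"huggingface_cache"`
	DPSize             int      `yaml:"dp_size"`
	TPSize             int      `yaml:"tp_size"`
	MaxRunningRequests int      `yaml:"max_running_requests"`
	MaxTotalTokens     int      `yaml:"max_total_tokens"`
	ContextLength      int      `yaml:"context_length"`
	MemFractionStatic  float64  `yaml:"mem_fraction_static"`
	ChunkedPrefillSize int      `yaml:"chunked_prefill_size"`
	SchedulePolicy     string   `yaml:"schedule_policy"`
	KVCacheDtype       string   `yaml:"kv_cache_dtype"`
	AttentionBackend   string   `yaml:"attention_backend"`
	DisableRadixCache  bool     `yaml:"disable_radix_cache"`
	ReasoningParser    string   `yaml:"reasoning_parser"`
	TrustRemoteCode    bool     `yaml:"trust_remote_code"`
	LogLevel           string   `yaml:"log_level"`
}
```

With this shape, the `gpu_devices` template logic renders `gpu_devices: []` when the slice is empty, and otherwise one quoted device ID per line as a YAML list.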