Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmd/api/api/builds.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,16 @@ func (s *ApiService) CreateBuild(ctx context.Context, request oapi.CreateBuildRe
Code: "invalid_source",
Message: err.Error(),
}, nil
case errors.Is(err, builds.ErrResourcesExhausted):
return oapi.CreateBuild503JSONResponse{
Body: oapi.Error{
Code: "resources_exhausted",
Message: "insufficient resources for build, please retry later",
},
Headers: oapi.CreateBuild503ResponseHeaders{
RetryAfter: 30,
},
}, nil
default:
log.ErrorContext(ctx, "failed to create build", "error", err)
return oapi.CreateBuild500JSONResponse{
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ require (
github.com/nrednav/cuid2 v1.1.0
github.com/oapi-codegen/nethttp-middleware v1.1.2
github.com/oapi-codegen/runtime v1.1.2
github.com/opencontainers/go-digest v1.0.0
github.com/opencontainers/image-spec v1.1.1
github.com/opencontainers/runtime-spec v1.2.1
github.com/opencontainers/umoci v0.6.0
Expand Down Expand Up @@ -90,7 +91,6 @@ require (
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
github.com/oasdiff/yaml v0.0.0-20250309154309-f31be36b4037 // indirect
github.com/oasdiff/yaml3 v0.0.0-20250309153720-d2182401db90 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/perimeterx/marshmallow v1.1.5 // indirect
github.com/pierrec/lz4/v4 v4.1.22 // indirect
github.com/pkg/errors v0.9.1 // indirect
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,6 @@ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/go-containerregistry v0.20.6 h1:cvWX87UxxLgaH76b4hIvya6Dzz9qHB31qAwjAohdSTU=
github.com/google/go-containerregistry v0.20.6/go.mod h1:T0x8MuoAoKX/873bkeSfLD2FAkwCDf9/HZgsFJ02E2Y=
github.com/google/subcommands v1.2.0 h1:vWQspBTo2nEqTUFita5/KeEWlUL8kQObDFbub/EN9oE=
github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
Expand Down
47 changes: 16 additions & 31 deletions lib/builds/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ The build system provides source-to-image builds inside ephemeral Cloud Hypervis
```
┌─────────────────────────────────────────────────────────────────┐
│ Hypeman API │
│ POST /builds → BuildManager → BuildQueue
│ POST /builds → BuildManager → Builder VM
│ │ │
│ Start() → VsockHandler (port 5001) │
└─────────────────────────────────────────────────────────────────┘
Expand Down Expand Up @@ -53,19 +53,6 @@ The build system provides source-to-image builds inside ephemeral Cloud Hypervis
| `BuildProvenance` | Audit trail for reproducibility |
| `BuildPolicy` | Resource limits and network policy |

### Build Queue (`queue.go`)

In-memory queue with configurable concurrency:

```go
queue := NewBuildQueue(maxConcurrent)
position := queue.Enqueue(buildID, request, startFunc)
queue.Cancel(buildID)
queue.GetPosition(buildID)
```

**Recovery**: On startup, `listPendingBuilds()` scans disk metadata for incomplete builds and re-enqueues them in FIFO order.

### Storage (`storage.go`)

Builds are persisted to `$DATA_DIR/builds/{id}/`:
Expand All @@ -87,14 +74,15 @@ Orchestrates the build lifecycle:

1. Validate request and store source
2. Write build config to disk
3. Enqueue build job
4. Create source volume from archive
5. Create config volume with `build.json`
6. Create builder VM with both volumes attached
7. Wait for build completion
8. Update metadata and cleanup
3. Create source volume from archive
4. Create config volume with `build.json`
5. Create builder VM with both volumes attached
6. Wait for build completion via vsock
7. Update metadata and cleanup

Builds start immediately when created; there is no queue. If host resources are exhausted, the build fails with an `insufficient resources for build` error (`ErrResourcesExhausted`) and should be retried later.

**Important**: The `Start()` method must be called to start the vsock handler for builder communication.
**Recovery**: On startup, `listPendingBuilds()` scans disk metadata for incomplete builds (building/pushing status) and restarts them.

### Cache System (`cache.go`)

Expand Down Expand Up @@ -136,9 +124,7 @@ OpenTelemetry metrics for monitoring:
| Metric | Type | Description |
|--------|------|-------------|
| `hypeman_build_duration_seconds` | Histogram | Build duration |
| `hypeman_builds_total` | Counter | Total builds by status/runtime |
| `hypeman_build_queue_length` | Gauge | Pending builds in queue |
| `hypeman_builds_active` | Gauge | Currently running builds |
| `hypeman_builds_total` | Counter | Total builds by status |

### Builder Agent (`builder_agent/main.go`)

Expand Down Expand Up @@ -196,7 +182,7 @@ CMD [\"node\", \"index.js\"]" \
```json
{
"id": "abc123",
"status": "queued",
"status": "building",
"created_at": "2025-01-15T10:00:00Z"
}
```
Expand Down Expand Up @@ -231,11 +217,11 @@ Builder VMs authenticate to the registry using short-lived JWT tokens:
## Build Status Flow

```
queued → building → pushing → ready
↘ ↗
failed
cancelled
building → pushing → ready
↘ ↗
failed
cancelled
```

## Security Model
Expand Down Expand Up @@ -348,7 +334,6 @@ Each build records provenance for reproducibility:
go test ./lib/builds/... -v

# Test specific components
go test ./lib/builds/queue_test.go ./lib/builds/queue.go ./lib/builds/types.go -v
go test ./lib/builds/cache_test.go ./lib/builds/cache.go ./lib/builds/types.go ./lib/builds/errors.go -v
go test ./lib/builds/registry_token_test.go ./lib/builds/registry_token.go -v
```
Expand Down
3 changes: 3 additions & 0 deletions lib/builds/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,7 @@ var (

// ErrBuildInProgress is returned when trying to cancel a build that's already complete
ErrBuildInProgress = errors.New("build in progress")

// ErrResourcesExhausted is returned when there are insufficient resources to start a build
ErrResourcesExhausted = errors.New("insufficient resources for build")
)
55 changes: 17 additions & 38 deletions lib/builds/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ func DefaultConfig() Config {
type manager struct {
config Config
paths *paths.Paths
queue *BuildQueue
instanceManager instances.Manager
volumeManager volumes.Manager
secretProvider SecretProvider
Expand Down Expand Up @@ -116,7 +115,6 @@ func NewManager(
m := &manager{
config: config,
paths: p,
queue: NewBuildQueue(config.MaxConcurrentBuilds),
instanceManager: instanceMgr,
volumeManager: volumeMgr,
secretProvider: secretProvider,
Expand Down Expand Up @@ -171,7 +169,7 @@ func (m *manager) CreateBuild(ctx context.Context, req CreateBuildRequest, sourc
// Create build metadata
meta := &buildMetadata{
ID: id,
Status: StatusQueued,
Status: StatusBuilding,
Request: &req,
CreatedAt: time.Now(),
}
Expand Down Expand Up @@ -222,17 +220,12 @@ func (m *manager) CreateBuild(ctx context.Context, req CreateBuildRequest, sourc
return nil, fmt.Errorf("write build config: %w", err)
}

// Enqueue the build
queuePos := m.queue.Enqueue(id, req, func() {
m.runBuild(context.Background(), id, req, policy)
})
// Start the build immediately in background
go m.runBuild(context.Background(), id, req, policy)

build := meta.toBuild()
if queuePos > 0 {
build.QueuePosition = &queuePos
}

m.logger.Info("build created", "id", id, "queue_position", queuePos)
m.logger.Info("build created", "id", id)
return build, nil
}

Expand Down Expand Up @@ -386,6 +379,11 @@ func (m *manager) executeBuild(ctx context.Context, id string, req CreateBuildRe
},
})
if err != nil {
// Check if this is a resource exhaustion error
errStr := err.Error()
if strings.Contains(errStr, "exceeds") && strings.Contains(errStr, "limit") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's unclear how this ErrResourcesExhausted error makes it back to the API handler: since executeBuild is called from runBuild, which runs in a background goroutine, the caller has already received a 202 by the time this runs. The 503 path in the handler seems unreachable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're absolutely right - that was exactly the issue. I've fixed this by adding a synchronous preflight resource check in CreateBuild that runs before spawning the async goroutine:

// Preflight check: verify resources are available before accepting the build
builderMemory := int64(policy.MemoryMB) * 1024 * 1024
if err := m.instanceManager.CheckResourceAvailability(ctx, policy.CPUs, builderMemory); err != nil {
    if errors.Is(err, instances.ErrResourcesExhausted) {
        return nil, fmt.Errorf("%w: %v", ErrResourcesExhausted, err)
    }
    return nil, fmt.Errorf("check resource availability: %w", err)
}

Added a new CheckResourceAvailability() method to instances.Manager that checks per-instance and aggregate limits without actually creating an instance. Now the 503 path is reachable when resources are truly exhausted at build creation time.

Also added proper sentinel errors (ErrResourcesExhausted) to both packages so we can use errors.Is() instead of brittle string matching.

return nil, fmt.Errorf("%w: %v", ErrResourcesExhausted, err)
}
return nil, fmt.Errorf("create builder instance: %w", err)
}

Expand Down Expand Up @@ -693,14 +691,7 @@ func (m *manager) GetBuild(ctx context.Context, id string) (*Build, error) {
return nil, err
}

build := meta.toBuild()

// Add queue position if queued
if meta.Status == StatusQueued {
build.QueuePosition = m.queue.GetPosition(id)
}

return build, nil
return meta.toBuild(), nil
}

// ListBuilds returns all builds
Expand All @@ -712,35 +703,22 @@ func (m *manager) ListBuilds(ctx context.Context) ([]*Build, error) {

builds := make([]*Build, 0, len(metas))
for _, meta := range metas {
build := meta.toBuild()
if meta.Status == StatusQueued {
build.QueuePosition = m.queue.GetPosition(meta.ID)
}
builds = append(builds, build)
builds = append(builds, meta.toBuild())
}

return builds, nil
}

// CancelBuild cancels a pending build
// CancelBuild cancels a running build
func (m *manager) CancelBuild(ctx context.Context, id string) error {
meta, err := readMetadata(m.paths, id)
if err != nil {
return err
}

switch meta.Status {
case StatusQueued:
// Remove from queue
if m.queue.Cancel(id) {
m.updateStatus(id, StatusCancelled, nil)
return nil
}
return ErrBuildInProgress // Was already picked up

case StatusBuilding, StatusPushing:
// Can't cancel a running build easily
// Would need to terminate the builder instance
// Terminate the builder instance to cancel the build
if meta.BuilderInstance != nil {
m.instanceManager.DeleteInstance(ctx, *meta.BuilderInstance)
}
Expand Down Expand Up @@ -936,7 +914,7 @@ func (m *manager) RecoverPendingBuilds() {
meta := meta // Shadow loop variable for closure capture
m.logger.Info("recovering build", "id", meta.ID, "status", meta.Status)

// Re-enqueue the build
// Start the build directly
if meta.Request != nil {
// Regenerate registry token since the original token may have expired
// during server downtime. Token TTL is minimum 30 minutes.
Expand All @@ -949,13 +927,14 @@ func (m *manager) RecoverPendingBuilds() {
continue
}

m.queue.Enqueue(meta.ID, *meta.Request, func() {
// Start the build in background
go func() {
policy := DefaultBuildPolicy()
if meta.Request.BuildPolicy != nil {
policy = *meta.Request.BuildPolicy
}
m.runBuild(context.Background(), meta.ID, *meta.Request, &policy)
})
}()
}
}

Expand Down
Loading