Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ func LoadConfig(path string) (Config, error) {
viper.SetDefault("fim::periodicConfig::followSymlinks", false)
viper.SetDefault("fim::exporters::stdoutExporter", false)
// Host sensor defaults
viper.SetDefault("hostSensorEnabled", true)
viper.SetDefault("hostSensorEnabled", false)
viper.SetDefault("hostSensorInterval", 5*time.Minute)

viper.AutomaticEnv()
Expand Down
73 changes: 70 additions & 3 deletions pkg/containerwatcher/v2/tracer_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@ import (
"github.com/kubescape/node-agent/pkg/utils"
)

// TracerManager manages the lifecycle of all eBPF and custom tracers
//
// Responsibilities:
// - Register and coordinate ~20 different tracers (exec, network, dns, http, etc.)
// - Stagger tracer startup to prevent concurrent eBPF loading (reduces peak RSS from 1.4GB to 500MB)
// - Handle graceful shutdown of all tracers
//
// Key implementation detail: Tracers spawn goroutines in Start() for eBPF loading.
// Without controlled startup, all tracers load concurrently, causing kernel memory pressure.
type TracerManager struct {
cfg config.Config
tracers map[utils.EventType]containerwatcher.TracerInterface
Expand Down Expand Up @@ -41,20 +50,70 @@ func (tm *TracerManager) GetAllTracers() map[utils.EventType]containerwatcher.Tr
return tm.tracers
}

// StartAllTracers initializes and starts all tracers with staggered startup
//
// Memory optimization strategy:
// Tracers load eBPF programs into kernel space via Inspektor Gadget. Without delays,
// all ~20 tracers call runtime.RunGadget() in a tight loop, causing concurrent
// bpf() syscalls. This results in peak RSS of ~1.4GB (kernel allocates separate
// regions for each tracer's eBPF programs/maps).
//
// Adding delays between tracers allows sequential eBPF loading, reducing peak to
// ~500MB. The delay enables kernel to share BTF data and establish shared maps
// before the next tracer loads. Go heap memory remains unaffected (~100-180MB).
//
// The delay must exceed the eBPF initialization time (~250-500ms per tracer).
// The current value of 2s leaves ample margin; 1s also worked in testing.
// A zero delay (0ms or time.After(0)) provides no benefit: it is equivalent
// to starting the tracers in a tight loop.
func (tm *TracerManager) StartAllTracers(ctx context.Context) error {
tm.tracerFactory.CreateAllTracers(tm)

if err := tm.startProcfsTracer(ctx); err != nil {
return err
}

tracerCount := 0
for _, tracer := range tm.tracers {
if tracer.IsEnabled(tm.cfg) {
if !tracer.IsEnabled(tm.cfg) {
continue
}

select {
case <-ctx.Done():
return ctx.Err()
default:
// Start tracer and continue on error instead of failing entire startup.
// This keeps partial tracer availability even if some tracers fail to initialize.
// Combined with staggered delays, this prevents cascading failures from overwhelming the system.
if err := tracer.Start(ctx); err != nil {
return err
logger.L().Error("error starting tracer", helpers.String("tracer", tracer.GetName()), helpers.Error(err))
continue
}
tracerCount++
logger.L().Info("Started tracer", helpers.String("tracer", tracer.GetName()), helpers.Int("count", tracerCount))
}

logger.L().Info("Started tracer", helpers.String("tracer", tracer.GetName()))
// Wait before starting next tracer to prevent concurrent eBPF loading

// Memory reduction mechanism:
// Each tracer.Start() spawns a goroutine that calls runtime.RunGadget(),
// which loads eBPF programs into kernel space via bpf() syscalls.
// Concurrent loading causes kernel to allocate separate memory regions
// for each tracer's eBPF programs and maps, leading to ~1.4GB peak RSS.
//
// Sequential loading allows kernel to:
// 1. Share BTF (BPF Type Format) data between eBPF programs
// 2. Establish shared maps (socket enrichment, kube metadata) once
// 3. Avoid memory fragmentation from concurrent bpf() syscalls
//
// Note: Delay value must exceed eBPF load time (~250-500ms per tracer).
// Values of 1s, 2s, and 5s all achieve same memory profile (~500MB peak).
// The key is preventing overlap, not the exact delay duration.
// Go heap remains stable (~100-180MB); reduction is in kernel eBPF memory.
select {
case <-time.After(2 * time.Second):
case <-ctx.Done():
return ctx.Err()
}
}

Expand All @@ -73,6 +132,12 @@ func (tm *TracerManager) StopAllTracers() error {
return lastErr
}

// startProcfsTracer starts the procfs tracer ahead of eBPF tracers
//
// The procfs tracer scans /proc for container/process information and needs
// time to initialize before the other tracers, which may depend on this data.
// Starting it first and then waiting reduces the risk of race conditions where
// eBPF tracers attempt to enrich process info before procfs has completed its
// initial scan.
func (tm *TracerManager) startProcfsTracer(ctx context.Context) error {
if tracer, exists := tm.GetTracer(utils.ProcfsEventType); exists {
delete(tm.tracers, utils.ProcfsEventType)
Expand All @@ -84,6 +149,8 @@ func (tm *TracerManager) startProcfsTracer(ctx context.Context) error {
}
}

// Wait one ProcfsScanInterval before starting the eBPF tracers. This is a
// fixed delay, not a completion signal: it gives the procfs scan time to
// gather the container/process data used for enrichment.
select {
case <-time.After(tm.cfg.ProcfsScanInterval):
case <-ctx.Done():
Expand Down
Loading