From 842a02bf4124f14c60e783caa9bab2bc8254907d Mon Sep 17 00:00:00 2001 From: Ashraf Fouda Date: Tue, 16 Dec 2025 13:19:13 +0200 Subject: [PATCH] improve logging Signed-off-by: Ashraf Fouda --- pkg/container/container.go | 7 ++- pkg/flist/cleanup.go | 4 +- pkg/network/networker.go | 3 ++ pkg/network/nr/net_resource.go | 3 ++ pkg/primitives/zdb/zdb.go | 7 +++ pkg/provision/engine.go | 7 +++ pkg/provision/provisiner.go | 11 +++++ pkg/storage_light/storage.go | 4 +- pkg/vm/ch.go | 50 +++++++++++++++++++-- pkg/vm/machine.go | 3 ++ pkg/vm/manager.go | 80 ++++++++++++++++++++++++++++++++-- pkg/zdb/pool.go | 2 + 12 files changed, 170 insertions(+), 11 deletions(-) diff --git a/pkg/container/container.go b/pkg/container/container.go index 26398529..65dea4f8 100644 --- a/pkg/container/container.go +++ b/pkg/container/container.go @@ -142,7 +142,10 @@ func (c *Module) startup() error { // Run creates and starts a container func (c *Module) Run(ns string, data pkg.Container) (id pkg.ContainerID, err error) { - log.Debug().Str("ns", ns).Msg("starting container") + log.Info(). + Str("namespace", ns). + Str("data", fmt.Sprintf("%+v", data)). 
+ Msg("starting container") // create a new client connected to the default socket path for containerd client, err := containerd.New(c.containerd) @@ -570,7 +573,7 @@ func (c *Module) SignalDelete(ns string, id pkg.ContainerID) error { // Delete stops and remove a container func (c *Module) Delete(ns string, id pkg.ContainerID) error { - log.Debug().Str("id", string(id)).Str("ns", ns).Msg("delete container") + log.Info().Str("namespace", ns).Str("container-id", string(id)).Msg("deleting container") client, err := containerd.New(c.containerd) if err != nil { diff --git a/pkg/flist/cleanup.go b/pkg/flist/cleanup.go index fb1a9fa1..a751f4b8 100644 --- a/pkg/flist/cleanup.go +++ b/pkg/flist/cleanup.go @@ -111,11 +111,11 @@ func (f *flistModule) cleanUnusedMounts() error { delete(roTargets, info.Pid) } if len(roTargets) == 0 { - log.Debug().Msg("no unused mounts detected") + log.Info().Msg("no unused mounts detected") } // cleaning up remaining un-used mounts for pid, mount := range roTargets { - log.Debug().Int64("source", pid).Msgf("cleaning up mount: %+v", mount) + log.Info().Int64("source", pid).Msgf("cleaning up mount: %+v", mount) if err := f.system.Unmount(mount.Target, 0); err != nil { log.Error().Err(err).Str("target", mount.Target).Msg("failed to clean up mount") continue diff --git a/pkg/network/networker.go b/pkg/network/networker.go index d63ef2f8..e83d786a 100644 --- a/pkg/network/networker.go +++ b/pkg/network/networker.go @@ -1068,6 +1068,7 @@ func (n *networker) DeleteNR(wl gridtypes.WorkloadID) error { return err } + log.Info().Str("network-id", string(netID)).Msg("deleting network resource") nr := nr.New(netNR, n.myceliumKeyDir) if err := nr.Delete(); err != nil { @@ -1116,6 +1117,8 @@ func (n *networker) SetPublicConfig(cfg pkg.PublicConfig) error { return nil } + log.Info().Str("ipv4", cfg.IPv4.String()).Str("ipv6", cfg.IPv6.String()).Str("gateway", cfg.GW4.String()).Msg("configuring public network") + id := n.identity.NodeID(context.Background()) 
_, err = public.EnsurePublicSetup(id, environment.MustGet().PubVlan, &cfg) if err != nil { diff --git a/pkg/network/nr/net_resource.go b/pkg/network/nr/net_resource.go index 51836804..23a80acf 100644 --- a/pkg/network/nr/net_resource.go +++ b/pkg/network/nr/net_resource.go @@ -203,6 +203,7 @@ func (nr *NetResource) WGName() (string, error) { // Create setup the basic components of the network resource // network namespace, bridge, wireguard interface and veth pair func (nr *NetResource) Create() error { + log.Info().Str("network-id", nr.ID()).Str("subnet", nr.resource.Subnet.String()).Msg("creating network resource") log.Debug().Str("nr", nr.String()).Msg("create network resource") if err := nr.ensureNRBridge(); err != nil { @@ -604,6 +605,8 @@ func (nr *NetResource) ConfigureWG(privateKey string) error { // Delete removes all the interfaces and namespaces created by the Create method func (nr *NetResource) Delete() error { + log.Info().Str("network-id", nr.ID()).Str("subnet", nr.resource.Subnet.String()).Msg("deleting network resource") + netnsName, err := nr.Namespace() if err != nil { return err diff --git a/pkg/primitives/zdb/zdb.go b/pkg/primitives/zdb/zdb.go index 0f312ee8..250b16b5 100644 --- a/pkg/primitives/zdb/zdb.go +++ b/pkg/primitives/zdb/zdb.go @@ -484,6 +484,13 @@ func (p *Manager) createZdbContainerLight(ctx context.Context, device pkg.Device } func (p *Manager) zdbRun(ctx context.Context, name string, rootfs string, cmd string, netns string, volumepath string, socketdir string) error { + log.Info(). + Str("zdb-name", name). + Str("network-ns", netns). + Str("volume", volumepath). + Str("rootfs", rootfs). 
+ Msg("starting ZDB container") + cont := stubs.NewContainerModuleStub(p.zbus) // we do data migration here because this is called diff --git a/pkg/provision/engine.go b/pkg/provision/engine.go index 14cb8800..7cb580af 100644 --- a/pkg/provision/engine.go +++ b/pkg/provision/engine.go @@ -347,6 +347,11 @@ func (e *NativeEngine) Admins() Twins { // Provision workload func (e *NativeEngine) Provision(ctx context.Context, deployment gridtypes.Deployment) error { + log.Info(). + Uint32("twin", deployment.TwinID). + Uint64("contract", deployment.ContractID). + Msg("scheduling deployment for provision") + if deployment.Version != 0 { return errors.Wrap(ErrInvalidVersion, "expected version to be 0 on deployment creation") } @@ -987,6 +992,8 @@ func (e *NativeEngine) updateDeployment(ctx context.Context, ops []gridtypes.Upg // DecommissionCached implements the zbus interface func (e *NativeEngine) DecommissionCached(id string, reason string) error { + log.Info().Str("workload-id", id).Str("reason", reason).Msg("decommissioning cached workload") + globalID := gridtypes.WorkloadID(id) twin, dlID, name, err := globalID.Parts() if err != nil { diff --git a/pkg/provision/provisiner.go b/pkg/provision/provisiner.go index 9729848c..07d2cf22 100644 --- a/pkg/provision/provisiner.go +++ b/pkg/provision/provisiner.go @@ -6,6 +6,7 @@ import ( "fmt" "github.com/pkg/errors" + "github.com/rs/zerolog/log" "github.com/threefoldtech/zosbase/pkg/gridtypes" ) @@ -120,6 +121,8 @@ func (p *mapProvisioner) Initialize(ctx context.Context) error { // Provision implements provision.Provisioner func (p *mapProvisioner) Provision(ctx context.Context, wl *gridtypes.WorkloadWithID) (result gridtypes.Result, err error) { + log.Info().Str("workload-id", string(wl.ID)).Str("workload-type", string(wl.Type)).Msg("provisioning workload") + manager, ok := p.managers[wl.Type] if !ok { return result, fmt.Errorf("unknown workload type '%s' for reservation id '%s'", wl.Type, wl.ID) @@ -135,6 +138,8 @@ func (p 
*mapProvisioner) Provision(ctx context.Context, wl *gridtypes.WorkloadWi // Decommission implementation for provision.Provisioner func (p *mapProvisioner) Deprovision(ctx context.Context, wl *gridtypes.WorkloadWithID) error { + log.Info().Str("workload-id", string(wl.ID)).Str("workload-type", string(wl.Type)).Msg("deprovisioning workload") + manager, ok := p.managers[wl.Type] if !ok { return fmt.Errorf("unknown workload type '%s' for reservation id '%s'", wl.Type, wl.ID) @@ -145,6 +150,8 @@ func (p *mapProvisioner) Deprovision(ctx context.Context, wl *gridtypes.Workload // Pause a workload func (p *mapProvisioner) Pause(ctx context.Context, wl *gridtypes.WorkloadWithID) (gridtypes.Result, error) { + log.Info().Str("workload-id", string(wl.ID)).Str("workload-type", string(wl.Type)).Msg("pausing workload") + if wl.Result.State != gridtypes.StateOk { return wl.Result, fmt.Errorf("can only pause workloads in ok state") } @@ -172,6 +179,8 @@ func (p *mapProvisioner) Pause(ctx context.Context, wl *gridtypes.WorkloadWithID // Resume a workload func (p *mapProvisioner) Resume(ctx context.Context, wl *gridtypes.WorkloadWithID) (gridtypes.Result, error) { + log.Info().Str("workload-id", string(wl.ID)).Str("workload-type", string(wl.Type)).Msg("resuming workload") + if wl.Result.State != gridtypes.StatePaused { return wl.Result, fmt.Errorf("can only resume workloads in paused state") } @@ -198,6 +207,8 @@ func (p *mapProvisioner) Resume(ctx context.Context, wl *gridtypes.WorkloadWithI // Provision implements provision.Provisioner func (p *mapProvisioner) Update(ctx context.Context, wl *gridtypes.WorkloadWithID) (result gridtypes.Result, err error) { + log.Info().Str("workload-id", string(wl.ID)).Str("workload-type", string(wl.Type)).Msg("updating workload") + manager, ok := p.managers[wl.Type] if !ok { return result, fmt.Errorf("unknown workload type '%s' for reservation id '%s'", wl.Type, wl.ID) diff --git a/pkg/storage_light/storage.go b/pkg/storage_light/storage.go index 
95a4aa5a..39186265 100644 --- a/pkg/storage_light/storage.go +++ b/pkg/storage_light/storage.go @@ -193,7 +193,7 @@ func (s *Module) poolType(pool filesystem.Pool, vm bool) (zos.DeviceType, error) } func (s *Module) mountPool(device filesystem.DeviceInfo, vm bool) { - log.Debug().Any("device", device).Msg("mounting device") + log.Info().Str("device", device.Path).Str("label", device.Label).Msg("mounting storage device") if device.IsPXEPartition() { log.Debug().Str("device", device.Path).Msg("skip device has 'ZOSPXE' label") @@ -231,9 +231,11 @@ func (s *Module) mountPool(device filesystem.DeviceInfo, vm bool) { case zos.SSDDevice: s.totalSSD += usage.Size s.ssds = append(s.ssds, pool) + log.Info().Str("device", device.Path).Str("type", "SSD").Str("pool", pool.Name()).Uint64("size", usage.Size).Msg("mounted storage pool") case zos.HDDDevice: s.totalHDD += usage.Size s.hdds = append(s.hdds, pool) + log.Info().Str("device", device.Path).Str("type", "HDD").Str("pool", pool.Name()).Uint64("size", usage.Size).Msg("mounted storage pool") default: log.Error().Str("type", string(typ)).Str("device", device.Path).Msg("unknown device type") } diff --git a/pkg/vm/ch.go b/pkg/vm/ch.go index 3218f828..1fe2aac5 100644 --- a/pkg/vm/ch.go +++ b/pkg/vm/ch.go @@ -50,7 +50,7 @@ func (m *Machine) startCloudConsole(ctx context.Context, namespace string, netwo logs, } - log.Debug().Msgf("running cloud-console : %+v", args) + log.Info().Str("vm-id", m.ID).Str("namespace", namespace).Str("console-ip", networkAddr.IP.String()).Msgf("starting cloud-console on port %d", port) cmd := exec.CommandContext(ctx, "busybox", args...) 
if err := cmd.Start(); err != nil { @@ -100,6 +100,7 @@ func (m *Machine) startCloudConsoleLight(ctx context.Context, namespace string, fmt.Sprint(port), logs, } + log.Info().Str("vm-id", m.ID).Str("namespace", namespace).Str("mycelium-ip", mycIp).Msgf("starting cloud-console on port %d", port) log.Debug().Msgf("running cloud-console : %+v", args) cmd := exec.CommandContext(ctx, "busybox", args...) @@ -228,7 +229,8 @@ func (m *Machine) Run(ctx context.Context, socket, logs string) (pkg.MachineInfo // this. fullArgs = append(fullArgs, "setsid", chBin) fullArgs = append(fullArgs, argsList...) - log.Debug().Msgf("ch: %+v", fullArgs) + log.Info().Str("vm-id", m.ID).Uint8("cpu", uint8(m.Config.CPU)).Str("memory", m.Config.Mem.String()).Msg("starting cloud-hypervisor VM") + log.Info().Msgf("ch: %+v", fullArgs) cmd := exec.CommandContext(ctx, "busybox", fullArgs...) cmd.Stdout = logFd @@ -280,6 +282,48 @@ func (m *Machine) Run(ctx context.Context, socket, logs string) (pkg.MachineInfo if err != nil { return pkg.MachineInfo{}, errors.Wrapf(err, "failed to Inspect vm with id: '%s'", m.ID) } + + // Log VM network interfaces with full address information + logEvent := log.Info().Str("vm-id", m.ID) + for idx, ifc := range m.Interfaces { + logEvent = logEvent. + Str(fmt.Sprintf("interface-%d-id", idx), ifc.ID). + Str(fmt.Sprintf("interface-%d-tap", idx), ifc.Tap). 
+ Str(fmt.Sprintf("interface-%d-mac", idx), ifc.Mac) + + // Log IP addresses from NetworkInfo if available + if m.NetworkInfo != nil && idx < len(m.NetworkInfo.Ifaces) { + vmIface := m.NetworkInfo.Ifaces[idx] + + // Log all IP addresses + for ipIdx, ip := range vmIface.IPs { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-ip-%d", idx, ipIdx), ip.String()) + } + + // Log gateways + if vmIface.IP4DefaultGateway != nil { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-ipv4-gateway", idx), vmIface.IP4DefaultGateway.String()) + } + if vmIface.IP6DefaultGateway != nil { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-ipv6-gateway", idx), vmIface.IP6DefaultGateway.String()) + } + + // Log public IP flags + if vmIface.PublicIPv4 { + logEvent = logEvent.Bool(fmt.Sprintf("interface-%d-public-ipv4", idx), true) + } + if vmIface.PublicIPv6 { + logEvent = logEvent.Bool(fmt.Sprintf("interface-%d-public-ipv6", idx), true) + } + + // Log network ID if present + if vmIface.NetID != "" { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-network-id", idx), string(vmIface.NetID)) + } + } + } + logEvent.Msg("VM started with network interfaces and addresses") + consoleURL := "" for _, ifc := range m.Interfaces { if ifc.Console != nil { @@ -323,7 +367,7 @@ func (m *Machine) waitAndAdjOom(ctx context.Context, name string, socket string) ctx, ), func(err error, d time.Duration) { - log.Debug().Err(err).Str("id", name).Msg("vm is not up yet") + log.Info().Err(err).Str("id", name).Msg("vm is not up yet") }); err != nil { return err diff --git a/pkg/vm/machine.go b/pkg/vm/machine.go index a8985f55..a91cb236 100644 --- a/pkg/vm/machine.go +++ b/pkg/vm/machine.go @@ -9,6 +9,7 @@ import ( "github.com/pkg/errors" "github.com/rs/zerolog/log" + "github.com/threefoldtech/zosbase/pkg" "github.com/vishvananda/netlink" ) @@ -138,6 +139,8 @@ type Machine struct { // NoKeepAlive is not used by firecracker, but instead a marker // for the vm mananger to not restart the machine when it stops 
NoKeepAlive bool `json:"no-keep-alive"` + // NetworkInfo holds the full network configuration with IPs (not serialized to config file) + NetworkInfo *pkg.VMNetworkInfo `json:"-"` } // Save saves a machine into a file diff --git a/pkg/vm/manager.go b/pkg/vm/manager.go index f460b502..c06a7aba 100644 --- a/pkg/vm/manager.go +++ b/pkg/vm/manager.go @@ -551,6 +551,7 @@ func (m *Module) Run(vm pkg.VM) (pkg.MachineInfo, error) { Disks: disks, Devices: vm.Devices, NoKeepAlive: vm.NoKeepAlive, + NetworkInfo: &vm.Network, } log.Debug().Str("name", vm.Name).Msg("saving machine") @@ -569,6 +570,78 @@ func (m *Module) Run(vm pkg.VM) (pkg.MachineInfo, error) { m.failures.Set(vm.Name, permanent, cache.NoExpiration) } + // Log comprehensive VM configuration before starting + logEvent := log.Info(). + Str("vm-id", vm.Name). + Str("hostname", vm.Hostname). + Uint8("cpu", uint8(vm.CPU)). + Uint64("memory-bytes", uint64(vm.Memory)) + + // Log network interfaces with IPs and MACs + for idx, nic := range nics { + logEvent = logEvent. + Str(fmt.Sprintf("interface-%d-tap", idx), nic.Tap). + Str(fmt.Sprintf("interface-%d-mac", idx), nic.Mac) + + // Log console info if available + if nic.Console != nil { + logEvent = logEvent. + Str(fmt.Sprintf("interface-%d-console-ip", idx), nic.Console.ListenAddress.IP.String()). + Str(fmt.Sprintf("interface-%d-vm-ip", idx), nic.Console.VmAddress.IP.String()). 
+ Str(fmt.Sprintf("interface-%d-namespace", idx), nic.Console.Namespace) + } + + // Find corresponding interface config to get IPs + if idx < len(vm.Network.Ifaces) { + ifcfg := vm.Network.Ifaces[idx] + if len(ifcfg.IPs) > 0 { + ips := make([]string, len(ifcfg.IPs)) + for i, ip := range ifcfg.IPs { + ips[i] = ip.String() + } + logEvent = logEvent.Strs(fmt.Sprintf("interface-%d-ips", idx), ips) + } + // Log gateways if present + if ifcfg.IP4DefaultGateway != nil { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-gw4", idx), ifcfg.IP4DefaultGateway.String()) + } + if ifcfg.IP6DefaultGateway != nil { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-gw6", idx), ifcfg.IP6DefaultGateway.String()) + } + // Log if this is a public interface + if ifcfg.PublicIPv4 { + logEvent = logEvent.Bool(fmt.Sprintf("interface-%d-public-ipv4", idx), true) + } + if ifcfg.PublicIPv6 { + logEvent = logEvent.Bool(fmt.Sprintf("interface-%d-public-ipv6", idx), true) + } + // Log network ID for private networks + if ifcfg.NetID != "" { + logEvent = logEvent.Str(fmt.Sprintf("interface-%d-network-id", idx), string(ifcfg.NetID)) + } + } + } + + // Log mounts + if len(cfg.Mounts) > 0 { + mountPaths := make([]string, len(cfg.Mounts)) + for i, mnt := range cfg.Mounts { + mountPaths[i] = fmt.Sprintf("%s->%s", mnt.Source, mnt.Target) + } + logEvent = logEvent.Strs("mounts", mountPaths) + } + + // Log disks + if len(disks) > 0 { + diskPaths := make([]string, len(disks)) + for i, disk := range disks { + diskPaths[i] = disk.Path + } + logEvent = logEvent.Strs("disks", diskPaths) + } + + logEvent.Msg("starting VM with full configuration") + machineInfo, err := machine.Run(ctx, m.socketPath(vm.Name), m.logsPath(vm.Name)) if err != nil { return pkg.MachineInfo{}, m.withLogs(m.logsPath(vm.Name), err) @@ -615,6 +688,7 @@ func (m *Module) removeConfig(name string) { // Delete deletes a machine by name (id) func (m *Module) Delete(name string) error { + log.Info().Str("vm-id", name).Msg("deleting VM") 
defer m.failures.Delete(name) // before we do anything we set failures to permanent to prevent monitoring from trying @@ -649,7 +723,7 @@ func (m *Module) Delete(name string) error { killAfter = 10 * time.Second ) - log.Debug().Str("name", name).Msg("shutting vm down [client]") + log.Info().Str("name", name).Msg("shutting vm down [client]") if err := client.Shutdown(ctx); err != nil { log.Error().Err(err).Str("name", name).Msg("failed to shutdown machine") } @@ -659,13 +733,13 @@ func (m *Module) Delete(name string) error { return nil } - log.Debug().Str("name", name).Msg("shutting vm down [sigterm]") + log.Info().Str("name", name).Msg("shutting vm down [sigterm]") if time.Since(now) > termAfter { _ = syscall.Kill(ps.Pid, syscall.SIGTERM) } if time.Since(now) > killAfter { - log.Debug().Str("name", name).Msg("shutting vm down [sigkill]") + log.Info().Str("name", name).Msg("shutting vm down [sigkill]") _ = syscall.Kill(ps.Pid, syscall.SIGKILL) break } diff --git a/pkg/zdb/pool.go b/pkg/zdb/pool.go index 5e0c56a8..16cde98b 100644 --- a/pkg/zdb/pool.go +++ b/pkg/zdb/pool.go @@ -9,6 +9,7 @@ import ( "regexp" "github.com/pkg/errors" + "github.com/rs/zerolog/log" "github.com/threefoldtech/zosbase/pkg/gridtypes" ) @@ -166,6 +167,7 @@ func (p *Index) IndexMode(name string) (mode IndexMode, err error) { // Delete both the data and an index of a namespace func (p *Index) Delete(name string) error { + log.Info().Str("namespace", name).Str("pool", p.root).Msg("deleting ZDB namespace") if err := p.valid(name); err != nil { return err }