Skip to content

Commit d95c9f5

Browse files
committed
feat: restructure the network/system/machine health checks
1 parent 99ceed3 commit d95c9f5

File tree

16 files changed

+584
-466
lines changed

16 files changed

+584
-466
lines changed

pkg/debugcmd/checks/check.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
6+
"github.com/threefoldtech/zosbase/pkg/gridtypes"
7+
"github.com/threefoldtech/zosbase/pkg/gridtypes/zos"
8+
)
9+
10+
// HealthCheck is the outcome of a single diagnostic check.
type HealthCheck struct {
	// Name identifies the check, e.g. "network.config".
	Name string `json:"name"`

	// OK is true when the check passed.
	OK bool `json:"ok"`

	// Message is a short human-readable summary; omitted from JSON when empty.
	Message string `json:"message,omitempty"`

	// Evidence holds free-form supporting details; omitted from JSON when empty.
	Evidence map[string]interface{} `json:"evidence,omitempty"`
}
16+
17+
type CheckData struct {
18+
Twin uint32
19+
Contract uint64
20+
Workload gridtypes.Workload
21+
VM func(ctx context.Context, id string) bool
22+
Network func(ctx context.Context, id zos.NetID) string
23+
}
24+
25+
type NetworkCheck func(ctx context.Context, data *CheckData) HealthCheck
26+
27+
var NetworkChecks = []NetworkCheck{
28+
CheckNetworkConfig,
29+
CheckNetworkNamespace,
30+
CheckNetworkInterfaces,
31+
CheckNetworkBridge,
32+
CheckNetworkMycelium,
33+
}
34+
35+
type VMCheck func(ctx context.Context, data *CheckData) HealthCheck
36+
37+
var VMChecks = []VMCheck{
38+
CheckVMConfig,
39+
CheckVMVMD,
40+
CheckVMProcess,
41+
CheckVMDisks,
42+
CheckVMVirtioFS,
43+
}
44+
45+
type SystemCheck func(ctx context.Context, data *SystemProbeData) HealthCheck
46+
47+
var SystemProbeCheck = []SystemCheck{
48+
CheckSystemProbe,
49+
}

pkg/debugcmd/checks/network.go

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"os"
8+
"path/filepath"
9+
10+
cnins "github.com/containernetworking/plugins/pkg/ns"
11+
"github.com/threefoldtech/zosbase/pkg"
12+
"github.com/threefoldtech/zosbase/pkg/gridtypes/zos"
13+
"github.com/threefoldtech/zosbase/pkg/network/namespace"
14+
"github.com/threefoldtech/zosbase/pkg/network/nr"
15+
"github.com/threefoldtech/zosbase/pkg/versioned"
16+
"github.com/threefoldtech/zosbase/pkg/zinit"
17+
"github.com/vishvananda/netlink"
18+
)
19+
20+
// Filesystem locations used by the network checks.
const (
	// networkdVolatileDir is the base directory the checks read networkd state from.
	networkdVolatileDir = "/var/run/cache/networkd"

	// networksDir is the subdirectory of networkdVolatileDir holding one
	// config file per network ID.
	networksDir = "networks"

	// myceliumKeyDir is the subdirectory of networkdVolatileDir passed to
	// nr.New as the mycelium key directory.
	myceliumKeyDir = "mycelium-key"
)
25+
26+
// CheckNetworkConfig verifies network configuration file exists and is valid
27+
func CheckNetworkConfig(ctx context.Context, data *CheckData) HealthCheck {
28+
result := HealthCheck{
29+
Name: "network.config",
30+
OK: false,
31+
}
32+
33+
netID := zos.NetworkID(data.Twin, data.Workload.Name)
34+
netCfgPath := filepath.Join(networkdVolatileDir, networksDir, netID.String())
35+
36+
_, raw, err := versioned.ReadFile(netCfgPath)
37+
if err != nil {
38+
result.Message = fmt.Sprintf("config file not found: %v", err)
39+
result.Evidence = map[string]interface{}{"path": netCfgPath, "netid": netID.String()}
40+
return result
41+
}
42+
43+
var netCfg pkg.Network
44+
if err := json.Unmarshal(raw, &netCfg); err != nil {
45+
result.Message = fmt.Sprintf("config file invalid or unparseable: %v", err)
46+
result.Evidence = map[string]interface{}{"path": netCfgPath, "netid": netID.String()}
47+
return result
48+
}
49+
50+
if netCfg.NetID != netID {
51+
result.Message = fmt.Sprintf("config netid mismatch: expected %s, got %s", netID.String(), netCfg.NetID.String())
52+
result.Evidence = map[string]interface{}{"expected": netID.String(), "got": netCfg.NetID.String()}
53+
return result
54+
}
55+
56+
result.OK = true
57+
result.Message = "config valid"
58+
result.Evidence = map[string]interface{}{"path": netCfgPath, "netid": netID.String()}
59+
return result
60+
}
61+
62+
// CheckNetworkNamespace verifies network namespace exists and is accessible
63+
func CheckNetworkNamespace(ctx context.Context, data *CheckData) HealthCheck {
64+
result := HealthCheck{
65+
Name: "network.namespace",
66+
OK: false,
67+
}
68+
69+
netID := zos.NetworkID(data.Twin, data.Workload.Name)
70+
nsName := data.Network(ctx, netID)
71+
72+
if !namespace.Exists(nsName) {
73+
result.Message = "namespace not found"
74+
result.Evidence = map[string]interface{}{"namespace": nsName}
75+
return result
76+
}
77+
78+
result.OK = true
79+
result.Message = "namespace exists and accessible"
80+
result.Evidence = map[string]interface{}{"namespace": nsName}
81+
return result
82+
}
83+
84+
// CheckNetworkInterfaces verifies required network interfaces exist inside namespace
85+
func CheckNetworkInterfaces(ctx context.Context, data *CheckData) HealthCheck {
86+
result := HealthCheck{
87+
Name: "network.interfaces",
88+
OK: false,
89+
}
90+
91+
netID := zos.NetworkID(data.Twin, data.Workload.Name)
92+
nsName := data.Network(ctx, netID)
93+
94+
nrr := nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir))
95+
wgIface, _ := nrr.WGName() // `w-*` iface
96+
nrIface, _ := nrr.NRIface() // `n-*` iface
97+
pubIface := "public" // `public` iface
98+
// TODO: add mycelium iface if configured for the network
99+
100+
netnsLinks := map[string]struct{}{}
101+
if netNS, err := namespace.GetByName(nsName); err == nil {
102+
_ = netNS.Do(func(_ cnins.NetNS) error {
103+
links, err := netlink.LinkList()
104+
if err == nil {
105+
for _, l := range links {
106+
netnsLinks[l.Attrs().Name] = struct{}{}
107+
}
108+
}
109+
return nil
110+
})
111+
netNS.Close()
112+
}
113+
114+
missing := []string{}
115+
if _, ok := netnsLinks[wgIface]; !ok {
116+
missing = append(missing, wgIface)
117+
}
118+
if _, ok := netnsLinks[nrIface]; !ok {
119+
missing = append(missing, nrIface)
120+
}
121+
if _, ok := netnsLinks[pubIface]; !ok {
122+
missing = append(missing, pubIface)
123+
}
124+
125+
if len(missing) > 0 {
126+
result.Message = fmt.Sprintf("missing interfaces: %v", missing)
127+
result.Evidence = map[string]interface{}{"namespace": nsName, "missing": missing}
128+
return result
129+
}
130+
131+
result.OK = true
132+
result.Message = "all required interfaces present"
133+
result.Evidence = map[string]interface{}{"namespace": nsName}
134+
return result
135+
}
136+
137+
// CheckNetworkBridge verifies network bridge exists and has members
138+
func CheckNetworkBridge(ctx context.Context, data *CheckData) HealthCheck {
139+
netDir := "/sys/class/net"
140+
brIfaceDir := "brif"
141+
142+
result := HealthCheck{
143+
Name: "network.bridge",
144+
OK: false,
145+
}
146+
147+
netID := zos.NetworkID(data.Twin, data.Workload.Name)
148+
nrr := nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir))
149+
brName, _ := nrr.BridgeName()
150+
151+
if _, err := os.Stat(filepath.Join(netDir, brName)); err != nil {
152+
result.Message = fmt.Sprintf("bridge not found: %v", err)
153+
result.Evidence = map[string]interface{}{"bridge": brName}
154+
return result
155+
}
156+
157+
brifDir := filepath.Join(netDir, brName, brIfaceDir)
158+
ents, err := os.ReadDir(brifDir)
159+
if err != nil || len(ents) == 0 {
160+
result.Message = fmt.Sprintf("bridge has no members: %v", err)
161+
result.Evidence = map[string]interface{}{"bridge": brName}
162+
return result
163+
}
164+
165+
// TODO: check if the members are up interfaces
166+
167+
result.OK = true
168+
result.Message = "bridge has members"
169+
result.Evidence = map[string]interface{}{"bridge": brName}
170+
return result
171+
}
172+
173+
// CheckNetworkMycelium verifies mycelium service is running (if configured)
174+
func CheckNetworkMycelium(ctx context.Context, data *CheckData) HealthCheck {
175+
result := HealthCheck{
176+
Name: "network.mycelium",
177+
OK: false,
178+
}
179+
180+
netID := zos.NetworkID(data.Twin, data.Workload.Name)
181+
nrr := nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir))
182+
service := nrr.MyceliumServiceName()
183+
184+
st, err := zinit.Default().Status(service)
185+
if err != nil {
186+
result.Message = fmt.Sprintf("cannot get service status: %v", err)
187+
result.Evidence = map[string]interface{}{"service": service}
188+
return result
189+
}
190+
191+
if !st.State.Is(zinit.ServiceStateRunning) {
192+
result.Message = fmt.Sprintf("service not running: %s", st.State.String())
193+
result.Evidence = map[string]interface{}{"service": service, "state": st.State.String()}
194+
return result
195+
}
196+
197+
result.OK = true
198+
result.Message = "service running"
199+
result.Evidence = map[string]interface{}{"service": service, "pid": st.Pid}
200+
return result
201+
}

pkg/debugcmd/checks/system.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os/exec"
7+
"strings"
8+
"time"
9+
)
10+
11+
// systemProbeTimeout caps how long CheckSystemProbe lets a probe command run.
var systemProbeTimeout = time.Minute
12+
13+
// SystemProbeData is the input of CheckSystemProbe.
type SystemProbeData struct {
	// Command is the probe command line. CheckSystemProbe splits it on
	// whitespace into program and arguments; no shell is involved, so quoting
	// is not interpreted.
	Command string
}
16+
17+
// CheckSystemProbe executes a custom system probe command
18+
func CheckSystemProbe(ctx context.Context, data *SystemProbeData) HealthCheck {
19+
result := HealthCheck{
20+
Name: "system.probe.custom",
21+
OK: false,
22+
Message: "system state probe execution",
23+
Evidence: map[string]interface{}{
24+
"probe_type": "custom",
25+
"exit_code": 0,
26+
},
27+
}
28+
29+
probeCtx, cancel := context.WithTimeout(ctx, systemProbeTimeout)
30+
defer cancel()
31+
32+
parts := strings.Fields(data.Command)
33+
if len(parts) == 0 {
34+
result.Message = "empty probe command"
35+
result.Evidence["error"] = "empty probe command"
36+
result.OK = false
37+
return result
38+
}
39+
40+
execCmd := exec.CommandContext(probeCtx, parts[0], parts[1:]...)
41+
output, err := execCmd.CombinedOutput()
42+
if err != nil {
43+
result.Message = fmt.Sprintf("probe command failed: %v", err)
44+
result.Evidence["error"] = err.Error()
45+
result.OK = false
46+
return result
47+
}
48+
49+
result.OK = true
50+
result.Message = "probe command executed successfully"
51+
result.Evidence["output"] = string(output)
52+
result.Evidence["exit_code"] = execCmd.ProcessState.ExitCode()
53+
return result
54+
}

0 commit comments

Comments
 (0)