Skip to content

Commit a05a1fa

Browse files
committed
Collect coredumps on all nodes
Configure all nodes to save coredumps, and collect any saved coredumps during the gather-core-dump step.
1 parent f92e0e4 commit a05a1fa

File tree

6 files changed

+151
-0
lines changed

6 files changed

+151
-0
lines changed

ci-operator/config/openshift/release/openshift-release-master__ci-4.20-upgrade-from-stable-4.19.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ tests:
9797
observers:
9898
enable:
9999
- observers-resource-watch
100+
post:
101+
- chain: gather-core-dump
102+
- chain: ipi-azure-post
103+
pre:
104+
- ref: enable-node-coredumps
105+
- chain: ipi-azure-pre-stableinitial
100106
workflow: openshift-upgrade-azure-ovn
101107
timeout: 5h30m0s
102108
- as: e2e-aws-ovn-uwm

ci-operator/config/openshift/router/openshift-router-master.yaml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ tests:
6161
skip_if_only_changed: ^docs/|\.md$|^(?:.*/)?(?:\.gitignore|OWNERS|OWNERS_ALIASES|PROJECT|LICENSE)$
6262
steps:
6363
cluster_profile: gcp-3
64+
post:
65+
- chain: gather-core-dump
66+
- chain: ipi-gcp-post
67+
pre:
68+
- ref: enable-node-coredumps
69+
- chain: ipi-gcp-pre
6470
workflow: openshift-e2e-gcp
6571
- as: e2e-metal-ipi-ovn-ipv6
6672
cluster: build05
@@ -72,6 +78,12 @@ tests:
7278
DEVSCRIPTS_CONFIG: |
7379
IP_STACK=v6
7480
NETWORK_TYPE=OVNKubernetes
81+
post:
82+
- chain: gather-core-dump
83+
- chain: baremetalds-ofcir-post
84+
pre:
85+
- ref: enable-node-coredumps
86+
- chain: baremetalds-ofcir-pre
7587
workflow: baremetalds-e2e
7688
- as: e2e-metal-ipi-ovn-dualstack
7789
cluster: build05
@@ -83,6 +95,12 @@ tests:
8395
DEVSCRIPTS_CONFIG: |
8496
IP_STACK=v4v6
8597
NETWORK_TYPE=OVNKubernetes
98+
post:
99+
- chain: gather-core-dump
100+
- chain: baremetalds-ofcir-post
101+
pre:
102+
- ref: enable-node-coredumps
103+
- chain: baremetalds-ofcir-pre
86104
workflow: baremetalds-e2e
87105
- as: e2e-aws-serial
88106
skip_if_only_changed: ^docs/|\.md$|^(?:.*/)?(?:\.gitignore|OWNERS|OWNERS_ALIASES|PROJECT|LICENSE)$
@@ -100,6 +118,12 @@ tests:
100118
DEVSCRIPTS_CONFIG: |
101119
IP_STACK=v4v6
102120
NETWORK_TYPE=OVNKubernetes
121+
post:
122+
- chain: gather-core-dump
123+
- chain: baremetalds-ofcir-post
124+
pre:
125+
- ref: enable-node-coredumps
126+
- chain: baremetalds-ofcir-pre
103127
test:
104128
- as: baremetalds-e2e-conf-router
105129
commands: |
@@ -148,6 +172,12 @@ tests:
148172
skip_if_only_changed: ^docs/|\.md$|^(?:.*/)?(?:\.gitignore|OWNERS|OWNERS_ALIASES|PROJECT|LICENSE)$
149173
steps:
150174
cluster_profile: azure4
175+
post:
176+
- chain: gather-core-dump
177+
- chain: ipi-azure-post
178+
pre:
179+
- ref: enable-node-coredumps
180+
- chain: ipi-azure-pre
151181
workflow: openshift-upgrade-azure
152182
- always_run: false
153183
as: perfscale-aws-ingress-perf
@@ -162,8 +192,10 @@ tests:
162192
OPENSHIFT_INFRA_NODE_INSTANCE_TYPE: c5.4xlarge
163193
SET_ENV_BY_PLATFORM: custom
164194
post:
195+
- chain: gather-core-dump
165196
- chain: ipi-aws-post
166197
pre:
198+
- ref: enable-node-coredumps
167199
- chain: ipi-aws-pre
168200
- chain: create-infra-move-ingress-monitoring-registry
169201
test:
@@ -183,8 +215,10 @@ tests:
183215
OPENSHIFT_INFRA_NODE_INSTANCE_TYPE: c5.4xlarge
184216
SET_ENV_BY_PLATFORM: custom
185217
post:
218+
- chain: gather-core-dump
186219
- chain: ipi-aws-post
187220
pre:
221+
- ref: enable-node-coredumps
188222
- chain: ipi-aws-pre
189223
- ref: fips-check
190224
- chain: create-infra-move-ingress-monitoring-registry
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
approvers:
2+
- knobunc
3+
- Miciah
4+
- candita
5+
- rfredette
6+
- alebedev87
7+
- gcs278
8+
- Thealisyed
9+
- grzpiotrowski
10+
- rikatz
11+
- bentito
12+
options: {}
13+
reviewers:
14+
- knobunc
15+
- Miciah
16+
- candita
17+
- rfredette
18+
- alebedev87
19+
- gcs278
20+
- Thealisyed
21+
- grzpiotrowski
22+
- rikatz
23+
- bentito
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail

# Create one MachineConfig manifest per node role (master, worker) in
# SHARED_DIR. Each MachineConfig installs a oneshot systemd unit that sets
# fs.suid_dumpable=1 on the node so that generated coredumps are saved; the
# saved coredumps are collected later by the gather-core-dump chain.
#
# Required environment:
#   SHARED_DIR - directory the manifests are written to.

# Fail with a clear message if SHARED_DIR is unset or empty; an empty value
# would otherwise make the manifests land in the filesystem root.
: "${SHARED_DIR:?SHARED_DIR must be set to the directory for generated manifests}"

echo "Creating manifests to enable coredump collection on nodes"

for role in master worker; do
  manifest_name="manifest_enable_node_coredumps_machineconfig_${role}.yaml"
  manifest_path="${SHARED_DIR}/${manifest_name}"

  # The heredoc delimiter is intentionally unquoted so that ${role} is
  # expanded into the role label and the MachineConfig name.
  cat > "${manifest_path}" <<EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: $role
  name: enable-node-coredumps-${role}
spec:
  config:
    ignition:
      version: 3.2.0
    systemd:
      units:
      - contents: |
          [Unit]
          After=multi-user.target

          [Service]
          Type=oneshot
          ExecStart=sysctl -w fs.suid_dumpable=1

          [Install]
          WantedBy=multi-user.target
        enabled: true
        name: enable-node-coredumps.service
EOF

  # Echo the generated manifest into the step log for debugging.
  echo "${manifest_name}"
  echo "---------------------------------------------"
  cat "${manifest_path}"
done
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"path": "enable-node-coredumps/enable-node-coredumps-ref.yaml",
3+
"owners": {
4+
"approvers": [
5+
"knobunc",
6+
"Miciah",
7+
"candita",
8+
"rfredette",
9+
"alebedev87",
10+
"gcs278",
11+
"Thealisyed",
12+
"grzpiotrowski",
13+
"rikatz",
14+
"bentito"
15+
],
16+
"reviewers": [
17+
"knobunc",
18+
"Miciah",
19+
"candita",
20+
"rfredette",
21+
"alebedev87",
22+
"gcs278",
23+
"Thealisyed",
24+
"grzpiotrowski",
25+
"rikatz",
26+
"bentito"
27+
]
28+
}
29+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
ref:
2+
as: enable-node-coredumps
3+
from_image:
4+
namespace: origin
5+
name: centos
6+
tag: '8'
7+
commands: enable-node-coredumps-commands.sh
8+
resources:
9+
requests:
10+
cpu: 10m
11+
memory: 100Mi
12+
documentation: |-
13+
The coredump service configures nodes to save all generated coredumps. This is useful for debugging failures of
14+
components running in privileged pods, such as router pods.
15+
The service is deployed by injecting an installer manifest containing a MachineConfig. This contains one systemd
16+
unit, which sets the sysctl variable fs.suid_dumpable to 1, instructing systemd-coredump to save any coredumps it
17+
encounters into /var/lib/systemd/coredump/ . Those coredump files can then be collected using the gather-core-dump
18+
chain.

0 commit comments

Comments
 (0)