Skip to content

Commit 0eab17e

Browse files
committed
Collect coredumps on all nodes
Configure all nodes to save coredumps, and collect any coredumps that were saved during the gather-extra step.
1 parent e95b3d1 commit 0eab17e

File tree

7 files changed

+126
-0
lines changed

7 files changed

+126
-0
lines changed

ci-operator/config/openshift/release/openshift-release-master__ci-4.20-upgrade-from-stable-4.19.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ tests:
8989
interval: 168h
9090
steps:
9191
cluster_profile: azure-2
92+
pre:
93+
- ref: collect-coredumps
9294
env:
9395
BASE_DOMAIN: ci2.azure.devcluster.openshift.com
9496
CONTROL_PLANE_INSTANCE_TYPE: Standard_D8as_v5

ci-operator/config/openshift/router/openshift-router-master.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,8 @@ tests:
142142
- as: e2e-upgrade
143143
steps:
144144
cluster_profile: azure4
145+
pre:
146+
- ref: collect-coredumps
145147
workflow: openshift-upgrade-azure
146148
- always_run: false
147149
as: perfscale-aws-ingress-perf
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
approvers:
2+
- knobunc
3+
- Miciah
4+
- candita
5+
- rfredette
6+
- alebedev87
7+
- gcs278
8+
- Thealisyed
9+
- grzpiotrowski
10+
- rikatz
11+
- bentito
12+
options: {}
13+
reviewers:
14+
- knobunc
15+
- Miciah
16+
- candita
17+
- rfredette
18+
- alebedev87
19+
- gcs278
20+
- Thealisyed
21+
- grzpiotrowski
22+
- rikatz
23+
- bentito
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail

# Create one MachineConfig manifest per node role (master, worker) in
# ${SHARED_DIR}. Each MachineConfig installs a systemd oneshot unit that sets
# the sysctl fs.suid_dumpable=1 on the node so that generated coredumps are
# saved; the saved coredumps are collected later during the gather-extra step.
#
# Required environment:
#   SHARED_DIR - directory the manifest files are written to.

echo "Creating manifests to enable coredump collection"

for role in master worker; do
  # Build the path once so the file that is written and the file that is
  # printed below can never drift apart, and quote it everywhere so a
  # SHARED_DIR containing whitespace or glob characters is handled correctly.
  manifest_name="manifest_collect_coredumps_machineconfig_${role}.yaml"
  manifest_path="${SHARED_DIR}/${manifest_name}"

  # The heredoc delimiter is intentionally unquoted: ${role} must be expanded
  # inside the manifest.
  cat > "${manifest_path}" <<EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: ${role}
  name: collect-coredumps-${role}
spec:
  config:
    ignition:
      version: 3.2.0
    systemd:
      units:
      - contents: |
          [Unit]
          After=multi-user.target

          [Service]
          Type=oneshot
          ExecStart=sysctl -w fs.suid_dumpable=1

          [Install]
          WantedBy=multi-user.target
        enabled: true
        name: collect-coredumps.service
EOF

  # Echo the generated manifest into the step log for debugging.
  echo "${manifest_name}"
  echo "---------------------------------------------"
  cat "${manifest_path}"
done
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"path": "collect-coredumps/collect-coredumps-ref.yaml",
3+
"owners": {
4+
"approvers": [
5+
"knobunc",
6+
"Miciah",
7+
"candita",
8+
"rfredette",
9+
"alebedev87",
10+
"gcs278",
11+
"Thealisyed",
12+
"grzpiotrowski",
13+
"rikatz",
14+
"bentito"
15+
],
16+
"reviewers": [
17+
"knobunc",
18+
"Miciah",
19+
"candita",
20+
"rfredette",
21+
"alebedev87",
22+
"gcs278",
23+
"Thealisyed",
24+
"grzpiotrowski",
25+
"rikatz",
26+
"bentito"
27+
]
28+
}
29+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
ref:
2+
as: collect-coredumps
3+
from_image:
4+
namespace: origin
5+
name: centos
6+
tag: '8'
7+
commands: collect-coredumps-commands.sh
8+
resources:
9+
requests:
10+
cpu: 10m
11+
memory: 100Mi
12+
documentation: |-
13+
The coredump service configures nodes to save all generated coredumps. This is useful for debugging failures of
14+
components running in privileged pods, such as router pods.
15+
The service is deployed by injecting an installer manifest containing a MachineConfig. This contains one systemd
16+
unit, which sets the sysctl variable fs.suid_dumpable to 1, instructing systemd-coredump to save any coredumps it
17+
encounters into /var/lib/systemd/coredump/ . Those coredump files are then collected during the gather-extra step.

ci-operator/step-registry/gather/extra/gather-extra-commands.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,18 @@ for pqn in $(oc get pods -n openshift-etcd -l app=etcd --no-headers -o=name); do
214214
done
215215
echo "INFO: done attempting to fetch etcd debug info"
216216

217+
# Gather any coredumps saved under /var/lib/systemd/coredump/ on the Linux
# nodes. The directory listing is saved to ${output_dir}/.coredumps_listing and
# each listed file is then downloaded and gzip-compressed through queue.
echo "INFO: gathering coredumps if present"
output_dir="${ARTIFACT_DIR}/coredumps"
mkdir -p "${output_dir}"
oc adm node-logs -l kubernetes.io/os=linux --path="/var/lib/systemd/coredump/" | \
  tee "${output_dir}/.coredumps_listing"
# The loop below consumes each listing line as "<node> <file name>"; any extra
# fields are ignored.
while read -r node fname _; do
  # Skip blank or single-field lines: without both a node and a file name
  # there is nothing to download, and queueing one would only create a junk
  # artifact.
  if [[ -z "${node}" || -z "${fname}" ]]; then
    continue
  fi
  echo "INFO: Queueing download/gzip of /var/lib/systemd/coredump/${fname} from ${node}"
  echo "INFO: gziping to ${output_dir}/${node}-${fname}.gz"
  # FILTER is set only for this queue invocation.
  # NOTE(review): queue is defined elsewhere in this file; it presumably pipes
  # the command output through ${FILTER} into the target file - confirm.
  FILTER=gzip queue "${output_dir}/${node}-${fname}.gz" \
    oc --insecure-skip-tls-verify adm node-logs "${node}" \
    --path="/var/lib/systemd/coredump/${fname}"
done < "${output_dir}/.coredumps_listing"
217229

218230
function gather_network() {
219231
local namespace=$1

0 commit comments

Comments
 (0)