From 285ffda57805f3de884d34a3d00e76de83d2795e Mon Sep 17 00:00:00 2001
From: William Yue
Date: Thu, 5 Feb 2026 14:32:48 -0800
Subject: [PATCH] added checks to see if ec2 instance is online before job succeeds

The "Start GPU Runner" job in both GPU workflows now:

  1. sets the Auto Scaling group's desired capacity to 1,
  2. polls the group (40 attempts, 15 s apart) until an instance reports
     the InService lifecycle state,
  3. waits for that instance's EC2 status checks to pass,
  4. sleeps 3 minutes so the GitHub runner can register.

The instance ID found by the polling loop is reused for the status-check
wait, and the literal "None" that the CLI prints for a null query result
is treated as "not ready". A 15-minute job timeout bounds the sequence.
---
Review notes (text in this section is not part of the commit message):

- Line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch. If context lines do not match
  the target files, try `git apply --ignore-whitespace`.
- The commit SHA and blob index hashes predate the revised shell lines.
- timeout-minutes: 15 is shorter than the worst case of the steps inside
  the job (10 min of polling + the status-check wait + a 3 min sleep).
  Confirm this budget is intended.

 .github/workflows/gpu_test.yml        | 23 +++++++++++++++++++++--
 .github/workflows/regression_test.yml | 23 +++++++++++++++++++++--
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 768a875..a75d789 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -30,6 +30,7 @@ jobs:
   start-runner:
     name: Start GPU Runner
     runs-on: ubuntu-latest
+    timeout-minutes: 15  # Fail fast if ASG/runner never becomes ready
     permissions:
       id-token: write # Required for requesting the JWT
       contents: read
@@ -42,8 +43,26 @@ jobs:
 
       - name: Start Instance
         run: |
-          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 1
-          echo "Waiting for instance to be ready..."
+          ASG_NAME="github-runner-asg-g6-2xlarge"
+          aws autoscaling set-desired-capacity --auto-scaling-group-name "$ASG_NAME" --desired-capacity 1
+          echo "Waiting for instance to be InService (max 10 min)..."
+          for i in $(seq 1 40); do
+            # Keep the first InService instance ID so the same instance is checked below.
+            # Text output is the literal "None" when the query result is null.
+            INSTANCE_ID=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$ASG_NAME" \
+              --query 'AutoScalingGroups[0].Instances[?LifecycleState==`InService`].InstanceId' --output text | awk '{print $1}')
+            [ -n "$INSTANCE_ID" ] && [ "$INSTANCE_ID" != "None" ] && break
+            echo "Instance not ready yet ($i/40)..."
+            sleep 15
+          done
+          if [ -z "$INSTANCE_ID" ] || [ "$INSTANCE_ID" = "None" ]; then
+            echo "Timeout: instance did not become InService"
+            exit 1
+          fi
+          echo "Waiting for EC2 status checks on $INSTANCE_ID..."
+          aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID"
+          echo "Instance ready. Waiting 3 min for GitHub runner to register..."
+          sleep 180
 
   gpu-test:
     name: Run Pytest on GPU
diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml
index 82ce8a2..cd4911d 100644
--- a/.github/workflows/regression_test.yml
+++ b/.github/workflows/regression_test.yml
@@ -32,6 +32,7 @@ jobs:
   start-runner:
     name: Start GPU Runner
     runs-on: ubuntu-latest
+    timeout-minutes: 15  # Fail fast if ASG/runner never becomes ready
     permissions:
       id-token: write
       contents: read
@@ -44,8 +45,26 @@ jobs:
 
       - name: Start Instance
         run: |
-          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-12xlarge --desired-capacity 1
-          echo "Waiting for instance to be ready..."
+          ASG_NAME="github-runner-asg-g6-12xlarge"
+          aws autoscaling set-desired-capacity --auto-scaling-group-name "$ASG_NAME" --desired-capacity 1
+          echo "Waiting for instance to be InService (max 10 min)..."
+          for i in $(seq 1 40); do
+            # Keep the first InService instance ID so the same instance is checked below.
+            # Text output is the literal "None" when the query result is null.
+            INSTANCE_ID=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$ASG_NAME" \
+              --query 'AutoScalingGroups[0].Instances[?LifecycleState==`InService`].InstanceId' --output text | awk '{print $1}')
+            [ -n "$INSTANCE_ID" ] && [ "$INSTANCE_ID" != "None" ] && break
+            echo "Instance not ready yet ($i/40)..."
+            sleep 15
+          done
+          if [ -z "$INSTANCE_ID" ] || [ "$INSTANCE_ID" = "None" ]; then
+            echo "Timeout: instance did not become InService"
+            exit 1
+          fi
+          echo "Waiting for EC2 status checks on $INSTANCE_ID..."
+          aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID"
+          echo "Instance ready. Waiting 3 min for GitHub runner to register..."
+          sleep 180
 
   train-regression:
     name: Train with Model Parallelism