From e63b2b44fe16ddde76adbca34bcd73d5b96dade3 Mon Sep 17 00:00:00 2001 From: Aniket Patil Date: Sun, 30 Nov 2025 16:46:57 +0530 Subject: [PATCH 1/2] feat: wait for pod to be running when follow=True in get_job_logs Signed-off-by: Aniket Patil --- .../trainer/backends/kubernetes/backend.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/kubeflow/trainer/backends/kubernetes/backend.py b/kubeflow/trainer/backends/kubernetes/backend.py index 776db5354..05424d54a 100644 --- a/kubeflow/trainer/backends/kubernetes/backend.py +++ b/kubeflow/trainer/backends/kubernetes/backend.py @@ -336,13 +336,43 @@ def get_job_logs( """Get the TrainJob logs""" # Get the TrainJob Pod name. pod_name = None - for c in self.get_job(name).steps: - if c.status != constants.POD_PENDING and c.name == step: + job = self.get_job(name) + + # First search if pod already exists + for c in job.steps: + if c.name == step and c.pod_name and c.status != constants.POD_PENDING: pod_name = c.pod_name break - if pod_name is None: + + # If follow=False → old behaviour + if pod_name is None and not follow: return + # If follow=True → wait for pod to be created & running + if pod_name is None and follow: + import time + + timeout = 120 # seconds + interval = 2 # seconds + waited = 0 + + while waited < timeout: + job = self.get_job(name) + for c in job.steps: + if c.name == step and c.pod_name and c.status != constants.POD_PENDING: + pod_name = c.pod_name + break + + if pod_name: + break + + time.sleep(interval) + waited += interval + + # Timeout → no pod found + if pod_name is None: + return + # Remove the number for the node step. container_name = re.sub(r"-\d+$", "", step) yield from self._read_pod_logs( From afd78e242d1e865b1b235e95f26648a37fc1dc38 Mon Sep 17 00:00:00 2001 From: Aniket Patil Date: Sun, 30 Nov 2025 16:51:33 +0530 Subject: [PATCH 2/2] Apply ruff auto-formatting Signed-off-by: Aniket Patil --- kubeflow/trainer/backends/kubernetes/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubeflow/trainer/backends/kubernetes/backend.py b/kubeflow/trainer/backends/kubernetes/backend.py index 05424d54a..95847e50f 100644 --- a/kubeflow/trainer/backends/kubernetes/backend.py +++ b/kubeflow/trainer/backends/kubernetes/backend.py @@ -352,8 +352,8 @@ def get_job_logs( if pod_name is None and follow: import time - timeout = 120 # seconds - interval = 2 # seconds + timeout = 120 # seconds + interval = 2 # seconds waited = 0 while waited < timeout: