diff --git a/kubeflow/trainer/backends/kubernetes/backend_test.py b/kubeflow/trainer/backends/kubernetes/backend_test.py index 45b98e7ae..07895b1d8 100644 --- a/kubeflow/trainer/backends/kubernetes/backend_test.py +++ b/kubeflow/trainer/backends/kubernetes/backend_test.py @@ -223,26 +223,30 @@ def get_custom_trainer( """ Get the custom trainer for the TrainJob. """ - pip_command = [f"--index-url {pip_index_urls[0]}"] - pip_command.extend([f"--extra-index-url {repo}" for repo in pip_index_urls[1:]]) - pip_command = " ".join(pip_command) + # Use the same helper as production code to build the pip install script so + # tests stay in sync with the runtime behavior. + install_script = utils.get_script_for_python_packages( + packages_to_install=packages_to_install, + pip_index_urls=pip_index_urls, + ) + + # Append the embedded training function script that matches EXEC_FUNC_SCRIPT + # with torchrun as the entrypoint and a fixed lambda for deterministic tests. + func_script = ( + "\nread -r -d '' SCRIPT << EOM\n\n" + 'func=lambda: print("Hello World"),\n\n' + "(**{'learning_rate': 0.001, 'batch_size': 32})\n\n" + 'EOM\nprintf "%s" "$SCRIPT" > "backend_test.py"\n' + 'torchrun "backend_test.py"' + ) + + full_command = install_script + func_script - packages_command = " ".join(packages_to_install) return models.TrainerV1alpha1Trainer( command=[ "bash", "-c", - '\nif ! [ -x "$(command -v pip)" ]; then\n python -m ensurepip ' - "|| python -m ensurepip --user || apt-get install python-pip" - "\nfi\n\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet" - f" --no-warn-script-location {pip_command} --user {packages_command}" - " ||\nPIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet" - f" --no-warn-script-location {pip_command} {packages_command}" - "\n\nread -r -d '' SCRIPT << EOM\n\nfunc=lambda: " - 'print("Hello World"),\n\n(**' - "{'learning_rate': 0.001, 'batch_size': 32})\n\nEOM\nprintf \"%s\" " - '"$SCRIPT" > "backend_test.py"\ntorchrun "backend_test.py"', + full_command, ], numNodes=2, env=env, diff --git a/kubeflow/trainer/backends/kubernetes/utils.py b/kubeflow/trainer/backends/kubernetes/utils.py index 955e34479..dc895f41a 100644 --- a/kubeflow/trainer/backends/kubernetes/utils.py +++ b/kubeflow/trainer/backends/kubernetes/utils.py @@ -268,6 +268,7 @@ def get_script_for_python_packages( # first url will be the index-url. options = [f"--index-url {pip_index_urls[0]}"] options.extend(f"--extra-index-url {extra_index_url}" for extra_index_url in pip_index_urls[1:]) + options_str = " ".join(options) header_script = textwrap.dedent( """ @@ -278,18 +279,29 @@ def get_script_for_python_packages( """ ) - script_for_python_packages = ( - header_script - + "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - + "--no-warn-script-location {} --user {}".format( - " ".join(options), - packages_str, - ) - + " ||\nPIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - + "--no-warn-script-location {} {}\n".format( - " ".join(options), - packages_str, - ) + # First try per-user installation, then fall back to system-wide installation. + # Pip output is captured to a log file and only printed when both attempts fail; + # on success we emit a single concise confirmation line. + script_for_python_packages = header_script + textwrap.dedent( + f""" + PACKAGES="{packages_str}" + PIP_OPTS="{options_str}" + LOG_FILE=/tmp/pip_install.log + rm -f "$LOG_FILE" + + if PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\ + --no-warn-script-location $PIP_OPTS --user $PACKAGES >"$LOG_FILE" 2>&1; then + echo "Successfully installed Python packages: $PACKAGES" + elif PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\ + --no-warn-script-location $PIP_OPTS $PACKAGES >>"$LOG_FILE" 2>&1; then + echo "Successfully installed Python packages: $PACKAGES" + else + echo "ERROR: Failed to install Python packages: $PACKAGES" >&2 + cat "$LOG_FILE" >&2 + exit 1 + fi + + """ ) return script_for_python_packages diff --git a/kubeflow/trainer/backends/kubernetes/utils_test.py b/kubeflow/trainer/backends/kubernetes/utils_test.py index af5f45ea7..4fd933348 100644 --- a/kubeflow/trainer/backends/kubernetes/utils_test.py +++ b/kubeflow/trainer/backends/kubernetes/utils_test.py @@ -151,17 +151,23 @@ def test_get_resources_per_node(test_case: TestCase): '\nif ! [ -x "$(command -v pip)" ]; then\n' " python -m ensurepip || python -m ensurepip --user || " "apt-get install python-pip\n" + "fi\n\n\n" + 'PACKAGES="torch numpy custom-package"\n' + 'PIP_OPTS="--index-url https://pypi.org/simple --extra-index-url https://private.repo.com/simple --extra-index-url https://internal.company.com/simple"\n' + "LOG_FILE=/tmp/pip_install.log\n" + 'rm -f "$LOG_FILE"\n' + "\n" + "if PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS --user $PACKAGES >"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "elif PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS $PACKAGES >>"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "else\n" + ' echo "ERROR: Failed to install Python packages: $PACKAGES" >&2\n' + ' cat "$LOG_FILE" >&2\n' + " exit 1\n" "fi\n\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - "--no-warn-script-location --index-url https://pypi.org/simple " - "--extra-index-url https://private.repo.com/simple " - "--extra-index-url https://internal.company.com/simple " - "--user torch numpy custom-package ||\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - "--no-warn-script-location --index-url https://pypi.org/simple " - "--extra-index-url https://private.repo.com/simple " - "--extra-index-url https://internal.company.com/simple " - "torch numpy custom-package\n" ), ), TestCase( @@ -175,13 +181,23 @@ def test_get_resources_per_node(test_case: TestCase): '\nif ! [ -x "$(command -v pip)" ]; then\n' " python -m ensurepip || python -m ensurepip --user || " "apt-get install python-pip\n" + "fi\n\n\n" + 'PACKAGES="torch numpy custom-package"\n' + 'PIP_OPTS="--index-url https://pypi.org/simple"\n' + "LOG_FILE=/tmp/pip_install.log\n" + 'rm -f "$LOG_FILE"\n' + "\n" + "if PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS --user $PACKAGES >"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "elif PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS $PACKAGES >>"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "else\n" + ' echo "ERROR: Failed to install Python packages: $PACKAGES" >&2\n' + ' cat "$LOG_FILE" >&2\n' + " exit 1\n" "fi\n\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - "--no-warn-script-location --index-url https://pypi.org/simple " - "--user torch numpy custom-package ||\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - "--no-warn-script-location --index-url https://pypi.org/simple " - "torch numpy custom-package\n" ), ), TestCase( @@ -199,17 +215,23 @@ def test_get_resources_per_node(test_case: TestCase): '\nif ! [ -x "$(command -v pip)" ]; then\n' " python -m ensurepip || python -m ensurepip --user || " "apt-get install python-pip\n" + "fi\n\n\n" + 'PACKAGES="torch numpy custom-package"\n' + 'PIP_OPTS="--index-url https://pypi.org/simple --extra-index-url https://private.repo.com/simple --extra-index-url https://internal.company.com/simple"\n' + "LOG_FILE=/tmp/pip_install.log\n" + 'rm -f "$LOG_FILE"\n' + "\n" + "if PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS --user $PACKAGES >"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "elif PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS $PACKAGES >>"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "else\n" + ' echo "ERROR: Failed to install Python packages: $PACKAGES" >&2\n' + ' cat "$LOG_FILE" >&2\n' + " exit 1\n" "fi\n\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - "--no-warn-script-location --index-url https://pypi.org/simple " - "--extra-index-url https://private.repo.com/simple " - "--extra-index-url https://internal.company.com/simple " - "--user torch numpy custom-package ||\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - "--no-warn-script-location --index-url https://pypi.org/simple " - "--extra-index-url https://private.repo.com/simple " - "--extra-index-url https://internal.company.com/simple " - "torch numpy custom-package\n" ), ), TestCase( @@ -223,13 +245,24 @@ def test_get_resources_per_node(test_case: TestCase): '\nif ! [ -x "$(command -v pip)" ]; then\n' " python -m ensurepip || python -m ensurepip --user || " "apt-get install python-pip\n" + "fi\n\n\n" + 'PACKAGES="torch numpy"\n' + "PIP_OPTS=" + f'"--index-url {constants.DEFAULT_PIP_INDEX_URLS[0]}"\n' + "LOG_FILE=/tmp/pip_install.log\n" + 'rm -f "$LOG_FILE"\n' + "\n" + "if PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS --user $PACKAGES >"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "elif PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \\\n" + ' --no-warn-script-location $PIP_OPTS $PACKAGES >>"$LOG_FILE" 2>&1; then\n' + ' echo "Successfully installed Python packages: $PACKAGES"\n' + "else\n" + ' echo "ERROR: Failed to install Python packages: $PACKAGES" >&2\n' + ' cat "$LOG_FILE" >&2\n' + " exit 1\n" "fi\n\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - f"--no-warn-script-location --index-url " - f"{constants.DEFAULT_PIP_INDEX_URLS[0]} --user torch numpy ||\n" - "PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet " - f"--no-warn-script-location --index-url " - f"{constants.DEFAULT_PIP_INDEX_URLS[0]} torch numpy\n" ), ), ],