diff --git a/examples/local/local-docker-mnist.ipynb b/examples/local/local-docker-mnist.ipynb new file mode 100644 index 0000000000..a759668c87 --- /dev/null +++ b/examples/local/local-docker-mnist.ipynb @@ -0,0 +1,641 @@ +{ + "cells": [ +  { +   "cell_type": "markdown", +   "id": "534927d3", +   "metadata": {}, +   "source": [ +    "# PyTorch DDP Fashion MNIST Training Example Run Locally with Docker\n", +    "\n", +    "This example demonstrates how to use Kubeflow Trainer locally with Docker. It simulates an experience similar to distributed training on Kubernetes, all from your local machine.\n", +    "\n", +    "The notebook shows how to train a convolutional neural network (CNN) to classify images using the [Fashion MNIST](https://github.com/zalandoresearch/fashion-mnist) dataset and [PyTorch Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).\n" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "2fae7c07", +   "metadata": {}, +   "source": [ +    "## Install the Kubeflow SDK\n", +    "\n", +    "You need to install the Kubeflow SDK with the `docker` extra to interact with the Kubeflow Trainer APIs:" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 1, +   "id": "ef6c2a96", +   "metadata": { +    "ExecuteTime": { +     "end_time": "2025-09-30T11:52:45.197094Z", +     "start_time": "2025-09-30T11:52:42.345293Z" +    } +   }, +   "outputs": [ +    { +     "name": "stdout", +     "output_type": "stream", +     "text": [ +      "\u001B[33mDEPRECATION: git+https://github.com/briangallagher/sdk.git@docker-backend#egg=kubeflow[docker] contains an egg fragment with a non-PEP 508 name. pip 25.3 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/13157\u001B[0m\u001B[33m\r\n", +      "\u001B[0mCollecting kubeflow (from kubeflow[docker])\r\n", +      "  Cloning https://github.com/briangallagher/sdk.git (to revision docker-backend) to /private/var/folders/rv/666pnlds63945vbrm6zbfkhc0000gn/T/pip-install-b48lu5mk/kubeflow_f2531a4401fa4eddbfb115d575a3353d\r\n", +      "  Running command git clone --filter=blob:none --quiet https://github.com/briangallagher/sdk.git /private/var/folders/rv/666pnlds63945vbrm6zbfkhc0000gn/T/pip-install-b48lu5mk/kubeflow_f2531a4401fa4eddbfb115d575a3353d\r\n", +      "  Running command git checkout -b docker-backend --track origin/docker-backend\r\n", +      "  Switched to a new branch 'docker-backend'\r\n", +      "  branch 'docker-backend' set up to track 'origin/docker-backend'.\r\n", +      "  Resolved https://github.com/briangallagher/sdk.git to commit 407c08e5960a18d0c4e44106627b5edb19a0582d\r\n", +      "  Installing build dependencies ... \u001B[?25ldone\r\n", +      "\u001B[?25h  Getting requirements to build wheel ... \u001B[?25ldone\r\n", +      "\u001B[?25h  Preparing metadata (pyproject.toml) ... 
\u001B[?25ldone\r\n", + "\u001B[?25hRequirement already satisfied: kubeflow-trainer-api>=2.0.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubeflow->kubeflow[docker]) (2.0.0)\r\n", + "Requirement already satisfied: kubernetes>=27.2.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubeflow->kubeflow[docker]) (33.1.0)\r\n", + "Requirement already satisfied: pydantic>=2.10.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubeflow->kubeflow[docker]) (2.11.7)\r\n", + "Requirement already satisfied: docker>=6.1.3 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubeflow->kubeflow[docker]) (7.1.0)\r\n", + "Requirement already satisfied: requests>=2.26.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from docker>=6.1.3->kubeflow->kubeflow[docker]) (2.32.4)\r\n", + "Requirement already satisfied: urllib3>=1.26.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from docker>=6.1.3->kubeflow->kubeflow[docker]) (2.4.0)\r\n", + "Requirement already satisfied: certifi>=14.05.14 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (2025.4.26)\r\n", + "Requirement already satisfied: six>=1.9.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (1.17.0)\r\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (2.9.0.post0)\r\n", + "Requirement already satisfied: pyyaml>=5.4.1 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (6.0.2)\r\n", + "Requirement already satisfied: google-auth>=1.0.1 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (2.40.3)\r\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (1.8.0)\r\n", + "Requirement already satisfied: requests-oauthlib in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (2.0.0)\r\n", + "Requirement already satisfied: oauthlib>=3.2.2 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (3.2.2)\r\n", + "Requirement already satisfied: durationpy>=0.7 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (0.10)\r\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (5.5.2)\r\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (0.4.2)\r\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from 
google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (4.9.1)\r\n", +      "Requirement already satisfied: pyasn1>=0.1.3 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from rsa<5,>=3.1.4->google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow->kubeflow[docker]) (0.6.1)\r\n", +      "Requirement already satisfied: annotated-types>=0.6.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from pydantic>=2.10.0->kubeflow->kubeflow[docker]) (0.7.0)\r\n", +      "Requirement already satisfied: pydantic-core==2.33.2 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from pydantic>=2.10.0->kubeflow->kubeflow[docker]) (2.33.2)\r\n", +      "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from pydantic>=2.10.0->kubeflow->kubeflow[docker]) (4.14.0)\r\n", +      "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from pydantic>=2.10.0->kubeflow->kubeflow[docker]) (0.4.1)\r\n", +      "Requirement already satisfied: charset_normalizer<4,>=2 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from requests>=2.26.0->docker>=6.1.3->kubeflow->kubeflow[docker]) (3.4.2)\r\n", +      "Requirement already satisfied: idna<4,>=2.5 in /Users/briangallagher/dev/kubeflow/sdk/.venv/lib/python3.11/site-packages (from requests>=2.26.0->docker>=6.1.3->kubeflow->kubeflow[docker]) (3.10)\r\n", +      "Note: you may need to restart the kernel to use updated packages.\n" +     ] +    } +   ], +   "source": "!pip install -U kubeflow[docker]" +  }, +  { +   "cell_type": "markdown", +   "id": "e781251e", +   "metadata": {}, +   "source": [ +    "## Define the Training Function\n", +    "\n", +    "The first step is to create a function that trains the CNN model on the Fashion MNIST dataset.\n",
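+    "\n", +    "Note that all imports happen inside the function body. The SDK ships this function to each training container and runs it there as a self-contained script, so it should not rely on names defined elsewhere in the notebook. A minimal sketch of the pattern (the function name is just a placeholder):\n", +    "\n", +    "```python\n", +    "def my_train_func():\n", +    "    # Import inside the function so the import still resolves when the SDK\n", +    "    # executes the extracted function inside the training container.\n", +    "    import torch\n", +    "\n", +    "    print(f\"PyTorch version: {torch.__version__}\")\n", +    "```"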
+   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 2, +   "id": "1a46c3d9", +   "metadata": { +    "ExecuteTime": { +     "end_time": "2025-09-30T11:52:46.981813Z", +     "start_time": "2025-09-30T11:52:46.974826Z" +    } +   }, +   "outputs": [], +   "source": [ +    "def train_fashion_mnist():\n", +    "    import os\n", +    "\n", +    "    import torch\n", +    "    import torch.distributed as dist\n", +    "    import torch.nn.functional as F\n", +    "    from torch import nn\n", +    "    from torch.utils.data import DataLoader, DistributedSampler\n", +    "    from torchvision import datasets, transforms\n", +    "\n", +    "    # Define the PyTorch CNN model to be trained\n", +    "    class Net(nn.Module):\n", +    "        def __init__(self):\n", +    "            super(Net, self).__init__()\n", +    "            self.conv1 = nn.Conv2d(1, 20, 5, 1)\n", +    "            self.conv2 = nn.Conv2d(20, 50, 5, 1)\n", +    "            self.fc1 = nn.Linear(4 * 4 * 50, 500)\n", +    "            self.fc2 = nn.Linear(500, 10)\n", +    "\n", +    "        def forward(self, x):\n", +    "            x = F.relu(self.conv1(x))\n", +    "            x = F.max_pool2d(x, 2, 2)\n", +    "            x = F.relu(self.conv2(x))\n", +    "            x = F.max_pool2d(x, 2, 2)\n", +    "            x = x.view(-1, 4 * 4 * 50)\n", +    "            x = F.relu(self.fc1(x))\n", +    "            x = self.fc2(x)\n", +    "            return F.log_softmax(x, dim=1)\n", +    "\n", +    "    # Use NCCL if a GPU is available, otherwise use Gloo as communication backend.\n", +    "    device, backend = (\"cuda\", \"nccl\") if torch.cuda.is_available() else (\"cpu\", \"gloo\")\n", +    "    print(f\"Using Device: {device}, Backend: {backend}\")\n", +    "\n", +    "    # Setup PyTorch distributed.\n", +    "    local_rank = int(os.getenv(\"LOCAL_RANK\", 0))\n", +    "    dist.init_process_group(backend=backend)\n", +    "    rank = dist.get_rank()\n", +    "    print(\n", +    "        \"Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}\".format(\n", +    "            dist.get_world_size(),\n", +    "            rank,\n", +    "            local_rank,\n", +    "        )\n", +    "    )\n", +    "\n", +    "    # Create the model and load it into the device.\n", +    "    device = torch.device(f\"{device}:{local_rank}\")\n", +    "    model = nn.parallel.DistributedDataParallel(Net().to(device))\n", +    "    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n", +    "\n", +    "\n", +    "    # Use a rank-specific dataset directory to avoid concurrent writes to a shared mount\n", +    "    data_dir = f\"/tmp/fashion-mnist-{rank}\"\n", +    "    os.makedirs(data_dir, exist_ok=True)\n", +    "    dataset = datasets.FashionMNIST(\n", +    "        data_dir,\n", +    "        train=True,\n", +    "        download=True,\n", +    "        transform=transforms.Compose([transforms.ToTensor()]),\n", +    "    )\n", +    "\n", +    "\n", +    "    # Shard the dataset across workers.\n", +    "    train_loader = DataLoader(\n", +    "        dataset,\n", +    "        batch_size=100,\n", +    "        sampler=DistributedSampler(dataset)\n", +    "    )\n", +    "\n", +    "    # TODO(astefanutti): add parameters to the training function\n", +    "    dist.barrier()\n", +    "    for epoch in range(1, 3):\n", +    "        model.train()\n", +    "\n", +    "        # Iterate over mini-batches from the training set\n", +    "        for batch_idx, (inputs, labels) in enumerate(train_loader):\n", +    "            # Copy the data to the GPU device if available\n", +    "            inputs, labels = inputs.to(device), labels.to(device)\n", +    "            # Forward pass\n", +    "            outputs = model(inputs)\n", +    "            loss = F.nll_loss(outputs, labels)\n", +    "            # Backward pass\n", +    "            optimizer.zero_grad()\n", +    "            loss.backward()\n", +    "            optimizer.step()\n", +    "\n", +    "            if batch_idx % 10 == 0 and dist.get_rank() == 0:\n", +    "                print(\n", +    "                    \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}\".format(\n", +    "                        epoch,\n", +    "                        batch_idx * len(inputs),\n", +    "                        len(train_loader.dataset),\n", +    "                        100.0 * batch_idx / len(train_loader),\n", +    "                        loss.item(),\n",
+    "                    )\n", +    "                )\n", +    "\n", +    "    # Wait for the distributed training to complete\n", +    "    dist.barrier()\n", +    "    if dist.get_rank() == 0:\n", +    "        print(\"Training is finished\")\n", +    "\n", +    "    # Finally clean up PyTorch distributed\n", +    "    dist.destroy_process_group()" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "2beea3c9", +   "metadata": {}, +   "source": [ +    "## Run PyTorch DDP with Kubeflow TrainJob\n", +    "\n", +    "You can use `TrainerClient()` from the Kubeflow SDK to communicate with the Kubeflow Trainer APIs and scale your training function across multiple PyTorch training nodes.\n", +    "\n", +    "`TrainerClient(backend_config=LocalDockerBackendConfig())` verifies that you have the required access to a local Docker client.\n", +    "\n", +    "Kubeflow Trainer creates a `TrainJob` resource and automatically sets the appropriate environment variables to set up PyTorch in a distributed environment. Distributed in this context means a local Docker instance with multiple containers communicating over a Docker network.\n", +    "\n" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 3, +   "id": "a7f34d36", +   "metadata": { +    "ExecuteTime": { +     "end_time": "2025-09-30T11:52:49.751268Z", +     "start_time": "2025-09-30T11:52:49.064491Z" +    } +   }, +   "outputs": [], +   "source": [ +    "from kubeflow.trainer import CustomTrainer, TrainerClient, LocalDockerBackendConfig\n", +    "import os\n", +    "\n", +    "backend_config = LocalDockerBackendConfig()\n", +    "\n", +    "# The SDK will look for the docker socket in the default location, for example: /var/run/docker.sock\n", +    "# If it's not in the default location, for example if you are using Colima on Mac, you can specify the path to the docker socket.\n", +    "# backend_config = LocalDockerBackendConfig(\n", +    "#     docker_host=f\"unix://{os.path.expanduser('~')}/.colima/default/docker.sock\"\n", +    "# )\n", +    "\n", +    "client = TrainerClient(backend_config=backend_config)" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "cc00ed3d", +   "metadata": {}, +   "source": [ +    "## List the Training Runtimes\n", +    "\n", +    "You can get the list of available Training Runtimes to start your TrainJob.\n", +    "\n", +    "Additionally, it may show the available accelerator type and the number of available resources." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 4, +   "id": "4af918c7", +   "metadata": { +    "ExecuteTime": { +     "end_time": "2025-09-30T11:52:50.618471Z", +     "start_time": "2025-09-30T11:52:50.611698Z" +    } +   }, +   "outputs": [ +    { +     "name": "stdout", +     "output_type": "stream", +     "text": [ +      "Runtime(name='torch-distributed', trainer=RuntimeTrainer(trainer_type=, framework='torch', num_nodes=1, device='Unknown', device_count='Unknown'), pretrained_model=None)\n" +     ] +    } +   ], +   "source": [ +    "for runtime in client.list_runtimes():\n", +    "    print(runtime)\n", +    "    if runtime.name == \"torch-distributed\":\n", +    "        torch_runtime = runtime" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "b2bb8527", +   "metadata": {}, +   "source": [ +    "## Run the Distributed TrainJob\n", +    "\n", +    "Kubeflow TrainJob will train the above model on 2 PyTorch nodes." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 5, +   "id": "f7f06d52", +   "metadata": { +    "ExecuteTime": { +     "end_time": "2025-09-30T11:52:52.088869Z", +     "start_time": "2025-09-30T11:52:51.719310Z" +    } +   }, +   "outputs": [], +   "source": [ +    "job_name = client.train(\n", +    "    trainer=CustomTrainer(\n", +    "        func=train_fashion_mnist,\n", +    "        # Set how many PyTorch nodes you want to use for distributed training.\n",
\n", + " # num_nodes will equal the number of local containers running\n", + " num_nodes=2, \n", + " ),\n", + " runtime=torch_runtime,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2aa697d3", + "metadata": {}, + "source": [ + "## Check the TrainJob steps\n", + "\n", + "You can check the components of TrainJob that's created.\n", + "\n", + "Since the TrainJob performs distributed training across 3 nodes, it generates 3 steps: `trainer-node-0` .. `trainer-node-2`.\n", + "\n", + "You can get the individual status for each of these steps." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cdf75d0f", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T11:52:54.522538Z", + "start_time": "2025-09-30T11:52:54.511406Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "TrainJob(name='db70754a1731', creation_timestamp=datetime.datetime(2025, 9, 30, 12, 52, 52, 87272), runtime=Runtime(name='torch-distributed', trainer=RuntimeTrainer(trainer_type=, framework='torch', num_nodes=1, device='Unknown', device_count='Unknown'), pretrained_model=None), steps=[Step(name='node-0', status='Running', pod_name='db70754a1731-node-0', device='Unknown', device_count='Unknown'), Step(name='node-1', status='Running', pod_name='db70754a1731-node-1', device='Unknown', device_count='Unknown')], num_nodes=2, status='Running')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Wait for the running status.\n", + "client.wait_for_job_status(name=job_name, status={\"Running\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b806c9f2", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T11:52:55.065707Z", + "start_time": "2025-09-30T11:52:55.057253Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: node-0, Status: Running, Devices: Unknown x Unknown\n", + "\n", + "Step: node-1, Status: Running, Devices: Unknown x Unknown\n", + "\n" + ] + } + ], + "source": [ + "for c in client.get_job(name=job_name).steps:\n", + " print(f\"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c6092a2", + "metadata": {}, + "source": [ + "## Watch the TrainJob logs\n", + "\n", + "We can use the `get_job_logs()` API to get the TrainJob logs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2539b6f3", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T13:20:38.174847Z", + "start_time": "2025-09-30T11:52:56.516604Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using Device: cpu, Backend: gloo\n", + "\n", + "Distributed Training for WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0\n", + "\n", + "100%|██████████| 26.4M/26.4M [00:04<00:00, 5.45MB/s]\n", + "\n", + "100%|██████████| 29.5k/29.5k [00:00<00:00, 810kB/s]\n", + "\n", + "100%|██████████| 4.42M/4.42M [00:00<00:00, 4.76MB/s]\n", + "\n", + "100%|██████████| 5.15k/5.15k [00:00<00:00, 10.8MB/s]\n", + "\n", + "Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.303622\n", + "\n", + "Train Epoch: 1 [1000/60000 (3%)]\tLoss: 2.123634\n", + "\n", + "Train Epoch: 1 [3000/60000 (10%)]\tLoss: 1.551790\n", + "\n", + "Train Epoch: 1 [4000/60000 (13%)]\tLoss: 1.275517\n", + "\n", + "Train Epoch: 1 [5000/60000 (17%)]\tLoss: 0.750075\n", + "\n", + "Train Epoch: 1 [6000/60000 (20%)]\tLoss: 0.775872\n", + "\n", + "Train Epoch: 1 [7000/60000 (23%)]\tLoss: 0.744973\n", + "\n", + "Train Epoch: 1 [8000/60000 (27%)]\tLoss: 0.620656\n", + "\n", + "Train Epoch: 1 [9000/60000 (30%)]\tLoss: 0.495775\n", + "\n", + "Train Epoch: 1 [10000/60000 (33%)]\tLoss: 0.381999\n", + "\n", + "Train Epoch: 1 [11000/60000 (37%)]\tLoss: 0.430183\n", + "\n", + "Train Epoch: 1 [12000/60000 (40%)]\tLoss: 0.466719\n", + "\n", + "Train Epoch: 1 [13000/60000 (43%)]\tLoss: 0.541330\n", + "\n", + "Train Epoch: 1 [14000/60000 (47%)]\tLoss: 0.515363\n", + "\n", + "Train Epoch: 1 [15000/60000 (50%)]\tLoss: 0.397409\n", + "\n", + "Train Epoch: 1 [16000/60000 (53%)]\tLoss: 0.492191\n", + "\n", + "Train Epoch: 1 [17000/60000 (57%)]\tLoss: 0.560209\n", + "\n", + "Train Epoch: 1 [18000/60000 (60%)]\tLoss: 0.486869\n", + "\n", + "Train Epoch: 1 [19000/60000 (63%)]\tLoss: 0.445933\n", + "\n", + "Train Epoch: 1 [20000/60000 (67%)]\tLoss: 0.424316\n", + "\n", + "Train Epoch: 1 [21000/60000 (70%)]\tLoss: 0.418526\n", + "\n", + "Train Epoch: 1 [22000/60000 (73%)]\tLoss: 0.252146\n", + "\n", + "Train Epoch: 1 [23000/60000 (77%)]\tLoss: 0.462454\n", + "\n", + "Train Epoch: 1 [24000/60000 (80%)]\tLoss: 0.508566\n", + "\n", + "Train Epoch: 1 [25000/60000 (83%)]\tLoss: 0.356379\n", + "\n", + "Train Epoch: 1 [26000/60000 (87%)]\tLoss: 0.499496\n", + "\n", + "Train Epoch: 1 [27000/60000 (90%)]\tLoss: 0.471140\n", + "\n", + "Train Epoch: 1 [28000/60000 (93%)]\tLoss: 0.275854\n", + "\n", + "Train Epoch: 1 [29000/60000 (97%)]\tLoss: 0.356416\n", + "\n", + "Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.431857\n", + "\n", + "Train Epoch: 2 [1000/60000 (3%)]\tLoss: 0.305270\n", + "\n", + "Train Epoch: 2 [2000/60000 (7%)]\tLoss: 0.405192\n", + "\n", + "Train Epoch: 2 [3000/60000 (10%)]\tLoss: 0.357206\n", + "\n", + "Train Epoch: 2 [4000/60000 (13%)]\tLoss: 0.321652\n", + "\n", + "Train Epoch: 2 [5000/60000 (17%)]\tLoss: 0.376541\n", + "\n", + "Train Epoch: 2 [6000/60000 (20%)]\tLoss: 0.367622\n", + "\n", + "Train Epoch: 2 [7000/60000 (23%)]\tLoss: 0.499754\n", + "\n", + "Train Epoch: 2 [8000/60000 (27%)]\tLoss: 0.319743\n", + "\n", + "Train Epoch: 2 [9000/60000 (30%)]\tLoss: 0.327233\n", + "\n", + "Train Epoch: 2 [10000/60000 (33%)]\tLoss: 0.239145\n", + "\n", + "Train Epoch: 2 [11000/60000 (37%)]\tLoss: 0.402087\n", + "\n", + "Train Epoch: 2 [12000/60000 (40%)]\tLoss: 0.454580\n", + "\n", + "Train Epoch: 2 [13000/60000 (43%)]\tLoss: 0.366190\n", + "\n", + "Train Epoch: 2 [14000/60000 (47%)]\tLoss: 
0.442235\n", + "\n", + "Train Epoch: 2 [15000/60000 (50%)]\tLoss: 0.312524\n", + "\n", + "Train Epoch: 2 [16000/60000 (53%)]\tLoss: 0.361948\n", + "\n", + "Train Epoch: 2 [17000/60000 (57%)]\tLoss: 0.473095\n", + "\n", + "Train Epoch: 2 [18000/60000 (60%)]\tLoss: 0.388052\n", + "\n", + "Train Epoch: 2 [19000/60000 (63%)]\tLoss: 0.314100\n", + "\n", + "Train Epoch: 2 [20000/60000 (67%)]\tLoss: 0.354624\n", + "\n", + "Train Epoch: 2 [21000/60000 (70%)]\tLoss: 0.297445\n", + "\n", + "Train Epoch: 2 [22000/60000 (73%)]\tLoss: 0.200486\n", + "\n", + "Train Epoch: 2 [23000/60000 (77%)]\tLoss: 0.393040\n", + "\n", + "Train Epoch: 2 [24000/60000 (80%)]\tLoss: 0.441462\n", + "\n", + "Train Epoch: 2 [25000/60000 (83%)]\tLoss: 0.280500\n", + "\n", + "Train Epoch: 2 [26000/60000 (87%)]\tLoss: 0.350549\n", + "\n", + "Train Epoch: 2 [27000/60000 (90%)]\tLoss: 0.371629\n", + "\n", + "Train Epoch: 2 [28000/60000 (93%)]\tLoss: 0.210757\n", + "\n", + "Train Epoch: 2 [29000/60000 (97%)]\tLoss: 0.329476\n", + "\n", + "Training is finished\n", + "\n", + "Using Device: cpu, Backend: gloo\n", + "\n", + "Distributed Training for WORLD_SIZE: 2, RANK: 1, LOCAL_RANK: 0\n", + "\n", + "100%|██████████| 26.4M/26.4M [00:05<00:00, 5.22MB/s]\n", + "\n", + "100%|██████████| 29.5k/29.5k [00:00<00:00, 958kB/s]\n", + "\n", + "100%|██████████| 4.42M/4.42M [00:00<00:00, 5.38MB/s]\n", + "\n", + "100%|██████████| 5.15k/5.15k [00:00<00:00, 13.6MB/s]\n", + "\n" + ] + } + ], + "source": [ + "for logline in client.get_job_logs(job_name, follow=True):\n", + " print(logline)" + ] + }, + { + "cell_type": "markdown", + "id": "e0e2c68b", + "metadata": {}, + "source": [ + "## Optional: Examine Docker resources\n", + "\n", + "- Containers for this training job\n", + "\n", + "```bash\n", + "docker ps --filter label=trainer.kubeflow.ai/trainjob-name\n", + "```\n", + "\n", + "Example:\n", + "```text\n", + "CONTAINER ID IMAGE NAMES\n", + "f6a786574f73 pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime ydb5bf3c10c4-node-1\n", + "c36274db6eb9 pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime ydb5bf3c10c4-node-0\n", + "```\n", + "\n", + "- Network created for this training job\n", + "\n", + "```bash\n", + "docker network ls --filter label=trainer.kubeflow.org/trainjob-name\n", + "```\n", + "\n", + "Example:\n", + "```text\n", + "NETWORK ID NAME DRIVER SCOPE\n", + "2cded187f9e7 b69f13d3f8dc-net bridge local\n", + "```\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "aa80a27a", + "metadata": {}, + "source": [ + "## Delete the TrainJob\n", + "\n", + "When TrainJob is finished, you can delete the resource.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf91da62", + "metadata": {}, + "outputs": [], + "source": [ + "# client.delete_job(job_name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}