From 5d73216c1ee3bfddc417d77c402911d11edf9d38 Mon Sep 17 00:00:00 2001
From: Kiro Agent <244629292+kiro-agent@users.noreply.github.com>
Date: Tue, 27 Jan 2026 17:39:39 +0000
Subject: [PATCH] Add YOLO-based SageMaker training notebook for defect
 detection

- Add DDA_SageMaker_YOLO_Training.ipynb notebook that uses YOLOv8 models
- Provides open-source alternative to Lookout for Vision marketplace algorithm
- Supports object detection, segmentation, and classification
- Includes data preparation, SageMaker training job setup, and model export
- Exports models to ONNX and TorchScript for edge deployment
- Update README.md to document both training options

Co-authored-by: rajjainl <182391521+rajjainl@users.noreply.github.com>
---
 DDA_SageMaker_YOLO_Training.ipynb | 427 ++++++++++++++++++++++++++++++
 README.md                         |  26 +-
 2 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 DDA_SageMaker_YOLO_Training.ipynb

diff --git a/DDA_SageMaker_YOLO_Training.ipynb b/DDA_SageMaker_YOLO_Training.ipynb
new file mode 100644
index 0000000..f597777
--- /dev/null
+++ b/DDA_SageMaker_YOLO_Training.ipynb
@@ -0,0 +1,427 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# SageMaker YOLO Training for DDA (Defect Detection Application)\n",
+    "\n",
+    "This notebook demonstrates training YOLOv8 models for defect detection using Amazon SageMaker.\n",
+    "\n",
+    "**Advantages of YOLO over Lookout for Vision:**\n",
+    "- Open-source and no marketplace subscription required\n",
+    "- Supports detection, segmentation, and classification\n",
+    "- Faster inference speeds\n",
+    "- More flexible deployment options"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "prereq",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "\n",
+    "1. Amazon SageMaker Notebook Instance or SageMaker Studio\n",
+    "2. IAM role with SageMaker and S3 permissions\n",
+    "3. Training images labeled in YOLO format or SageMaker Ground Truth format"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "setup_header",
+   "metadata": {},
+   "source": [
+    "## Step 1: Environment Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "imports",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install into the running kernel's environment (%pip targets the kernel; !pip may hit another Python)\n",
+    "%pip install -q ultralytics sagemaker boto3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import boto3\n",
+    "import sagemaker\n",
+    "import json\n",
+    "import os\n",
+    "import datetime\n",
+    "import time\n",
+    "from sagemaker.pytorch import PyTorch\n",
+    "\n",
+    "# Session-wide context reused by every later cell\n",
+    "session = sagemaker.Session()\n",
+    "region, bucket = session.boto_region_name, session.default_bucket()\n",
+    "role = sagemaker.get_execution_role()\n",
+    "project = \"YOLO-defect-detection\"\n",
+    "\n",
+    "# Echo the resolved settings\n",
+    "for label, value in ((\"Region\", region), (\"Bucket\", bucket), (\"Role\", role)):\n",
+    "    print(f\"{label}: {value}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "s3_setup_header",
+   "metadata": {},
+   "source": [
+    "## Step 2: S3 Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "s3_setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lay out the project prefixes in S3 by writing body-less marker objects\n",
+    "s3_client = boto3.client('s3')\n",
+    "s3_uri = f\"s3://{bucket}/{project}/\"\n",
+    "\n",
+    "# '' produces the marker for the project root itself\n",
+    "for prefix in ('', 'data/', 'data/images/', 'data/labels/', 'output/', 'models/'):\n",
+    "    s3_client.put_object(Bucket=bucket, Key=f\"{project}/{prefix}\")\n",
+    "\n",
+    "print(f\"S3 structure created: {s3_uri}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "data_header",
+   "metadata": {},
+   "source": [
+    "## Step 3: Prepare Training Data\n",
+    "\n",
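+    "YOLO expects an `images/` and `labels/` directory tree in which every image has a same-named `.txt` label file; each label line is `class x_center y_center width height`, with coordinates normalized to the image size."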
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "download_sample",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download sample cookie dataset (same as LFV notebook): shallow-clone the repo, keep only cookie-dataset/, delete the clone\n",
+    "!git clone --depth 1 https://github.com/aws-samples/amazon-lookout-for-vision.git\n",
+    "!cp -r amazon-lookout-for-vision/computer-vision-defect-detection/cookie-dataset ./\n",
+    "!rm -rf amazon-lookout-for-vision\n",
+    "print(f\"Dataset downloaded\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "convert_to_yolo",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import random\n",
+    "import shutil\n",
+    "from pathlib import Path\n",
+    "\n",
+    "SPLIT_SEED = 42       # fixed seed -> the same train/val split on every run\n",
+    "TRAIN_FRACTION = 0.8  # 80/20 train/val\n",
+    "\n",
+    "# Create YOLO dataset structure: {images,labels}/{train,val}\n",
+    "yolo_data = Path('yolo_dataset')\n",
+    "for kind in ('images', 'labels'):\n",
+    "    for split in ('train', 'val'):\n",
+    "        (yolo_data / kind / split).mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "# glob() order depends on the filesystem, so sort before shuffling to keep the split reproducible\n",
+    "src_dir = Path('cookie-dataset/dataset-files/training-images')\n",
+    "images = sorted(src_dir.glob('*.jpg'))\n",
+    "assert images, f\"No .jpg images found in {src_dir}\"\n",
+    "\n",
+    "# Shuffle so the split is not biased by filename order (the class is encoded in the filename), then cut\n",
+    "random.Random(SPLIT_SEED).shuffle(images)\n",
+    "split_idx = int(len(images) * TRAIN_FRACTION)\n",
+    "train_imgs, val_imgs = images[:split_idx], images[split_idx:]\n",
+    "\n",
+    "for split, split_imgs in (('train', train_imgs), ('val', val_imgs)):\n",
+    "    for img in split_imgs:\n",
+    "        shutil.copy(img, yolo_data / 'images' / split / img.name)\n",
+    "        # Class comes from the filename: 1=anomaly, 0=normal\n",
+    "        label = 1 if 'anomaly' in img.name else 0\n",
+    "        # with_suffix() swaps only the extension; str.replace() would hit any '.jpg' in the name\n",
+    "        label_file = yolo_data / 'labels' / split / img.with_suffix('.txt').name\n",
+    "        # YOLO detection format: class x_center y_center width height (normalized);\n",
+    "        # the whole image is used as the bounding box\n",
+    "        label_file.write_text(f\"{label} 0.5 0.5 1.0 1.0\\n\")\n",
+    "\n",
+    "print(f\"Train images: {len(train_imgs)}, Val images: {len(val_imgs)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "create_yaml",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dataset config read by train_yolo.py; `path` is the training channel directory inside the container\n",
+    "yaml_content = \"\"\"# YOLO Dataset Configuration\n",
+    "path: /opt/ml/input/data/training\n",
+    "train: images/train\n",
+    "val: images/val\n",
+    "\n",
+    "# Classes\n",
+    "names:\n",
+    "  0: normal\n",
+    "  1: anomaly\n",
+    "\"\"\"\n",
+    "\n",
+    "with open('yolo_dataset/data.yaml', 'w') as yaml_file:\n",
+    "    yaml_file.write(yaml_content)\n",
+    "\n",
+    "print(\"Created data.yaml configuration\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "upload_data",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sync the local YOLO dataset to the S3 prefix used as the training channel\n",
+    "training_data_uri = f\"s3://{bucket}/{project}/data/\"\n",
+    "!aws s3 sync yolo_dataset {training_data_uri} --quiet\n",
+    "print(f\"Data uploaded to: {training_data_uri}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "training_header",
+   "metadata": {},
+   "source": [
+    "## Step 4: Create Training Script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "training_script",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile train_yolo.py\n",
+    "import os\n",
+    "import argparse\n",
+    "from ultralytics import YOLO\n",
+    "\n",
+    "def main():\n",
+    "    \"\"\"Train a YOLO model on the SageMaker 'training' channel, then export it to ONNX and TorchScript.\"\"\"\n",
+    "    parser = argparse.ArgumentParser()\n",
+    "    parser.add_argument('--epochs', type=int, default=50)\n",
+    "    parser.add_argument('--batch-size', type=int, default=16)\n",
+    "    parser.add_argument('--img-size', type=int, default=640)\n",
+    "    parser.add_argument('--model', type=str, default='yolov8n.pt')\n",
+    "    args = parser.parse_args()\n",
+    "\n",
+    "    # SageMaker exposes the container paths as SM_* variables; the literals are the fallback defaults\n",
+    "    data_dir = os.environ.get('SM_CHANNEL_TRAINING', '/opt/ml/input/data/training')\n",
+    "    model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')\n",
+    "\n",
+    "    # Load pretrained model\n",
+    "    model = YOLO(args.model)\n",
+    "\n",
+    "    # Train; the run is written to <model_dir>/defect_detection (return value is not needed)\n",
+    "    model.train(\n",
+    "        data=os.path.join(data_dir, 'data.yaml'),\n",
+    "        epochs=args.epochs,\n",
+    "        batch=args.batch_size,\n",
+    "        imgsz=args.img_size,\n",
+    "        project=model_dir,\n",
+    "        name='defect_detection'\n",
+    "    )\n",
+    "\n",
+    "    # Export the best checkpoint to ONNX and TorchScript for deployment\n",
+    "    best_model = YOLO(os.path.join(model_dir, 'defect_detection/weights/best.pt'))\n",
+    "    best_model.export(format='onnx')\n",
+    "    best_model.export(format='torchscript')\n",
+    "    print(f\"Training complete. Model saved to {model_dir}\")\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    main()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "sm_training_header",
+   "metadata": {},
+   "source": [
+    "## Step 5: Launch SageMaker Training Job"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sm_estimator",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.pytorch import PyTorch\n",
+    "\n",
+    "# Define the estimator. NOTE(review): source_dir='.' packages the whole working directory (datasets included)\n",
+    "yolo_estimator = PyTorch(\n",
+    "    entry_point='train_yolo.py',\n",
+    "    role=role,\n",
+    "    instance_count=1,\n",
+    "    instance_type='ml.g4dn.xlarge',  # GPU instance\n",
+    "    framework_version='2.0.0',\n",
+    "    py_version='py310',\n",
+    "    output_path=f's3://{bucket}/{project}/output',\n",
+    "    hyperparameters={\n",
+    "        'epochs': 50,\n",
+    "        'batch-size': 16,\n",
+    "        'img-size': 640,\n",
+    "        'model': 'yolov8n.pt'  # nano model for fast training\n",
+    "    },\n",
+    "    # source_dir is packaged when fit() is called; the container pip-installs the\n",
+    "    # requirements.txt found in it (written by the next cell, before fit() runs)\n",
+    "    source_dir='.'\n",
+    ")\n",
+    "\n",
+    "print(\"Estimator configured\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "create_requirements",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Packages to be pip-installed inside the training container\n",
+    "requirements = ['ultralytics>=8.0.0', 'onnx', 'onnxruntime']\n",
+    "with open('requirements.txt', 'w') as req_file:\n",
+    "    req_file.writelines(f\"{pkg}\\n\" for pkg in requirements)\n",
+    "\n",
+    "print(\"Created requirements.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "start_training",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Launch the training job under a timestamped name and block until it finishes, streaming logs\n",
+    "timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')\n",
+    "job_name = f\"yolo-defect-{timestamp}\"\n",
+    "channels = {'training': training_data_uri}\n",
+    "yolo_estimator.fit(\n",
+    "    inputs=channels, job_name=job_name,\n",
+    "    wait=True, logs='All',\n",
+    ")\n",
+    "print(f\"Training job {job_name} completed!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "download_header",
+   "metadata": {},
+   "source": [
+    "## Step 6: Download and Prepare Model for DDA Deployment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "download_model",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Look up the S3 location of the model artifact recorded on the estimator\n",
+    "model_artifact = yolo_estimator.model_data\n",
+    "print(f\"Model artifact: {model_artifact}\")\n",
+    "# Copy the archive locally, unpack it into ./yolo_model, and list the contents\n",
+    "!aws s3 cp {model_artifact} ./yolo_model.tar.gz\n",
+    "!mkdir -p yolo_model && tar -xzf yolo_model.tar.gz -C yolo_model\n",
+    "!ls -la yolo_model/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "compile_header",
+   "metadata": {},
+   "source": [
+    "## Step 7: Compile Model for Edge Deployment (Optional)\n",
+    "\n",
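+    "This step repackages the exported ONNX model as `model.onnx` and uploads it to S3, where it can serve as the input to a SageMaker Neo compilation job for a specific target device. The cell below does not start a compilation job itself."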
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "compile_model",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tarfile\n",
+    "\n",
+    "# Optional: repackage the exported ONNX model as model.onnx in a tarball and upload it to S3.\n",
+    "# No Neo compilation job is started in this cell; the SageMaker client is only created here.\n",
+    "sagemaker_client = boto3.client('sagemaker')\n",
+    "onnx_path = 'yolo_model/defect_detection/weights/best.onnx'\n",
+    "if not os.path.exists(onnx_path):\n",
+    "    print(\"ONNX model not found - check training output\")\n",
+    "else:\n",
+    "    with tarfile.open('yolo_onnx.tar.gz', 'w:gz') as archive:\n",
+    "        archive.add(onnx_path, arcname='model.onnx')\n",
+    "\n",
+    "    # Upload for compilation\n",
+    "    onnx_s3 = f's3://{bucket}/{project}/models/yolo_onnx.tar.gz'\n",
+    "    !aws s3 cp yolo_onnx.tar.gz {onnx_s3}\n",
+    "    print(f\"ONNX model uploaded to: {onnx_s3}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "summary",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "This notebook trained a YOLOv8 model for defect detection.\n",
+    "\n",
+    "**Output models:**\n",
+    "- `best.pt` - PyTorch model\n",
+    "- `best.onnx` - ONNX model for cross-platform deployment\n",
+    "- `best.torchscript` - TorchScript for optimized inference\n",
+    "\n",
+    "**Next steps:**\n",
+    "- Use the ONNX model with the DDA edge application\n",
+    "- Fine-tune hyperparameters for better accuracy\n",
+    "- Add more training data for improved performance"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index c02486e..711dadd 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,28 @@ DDA consists of several key components:
 - AWS IAM (for permissions)
 - Amazon SageMaker for Model Training and Compiling
 
+## SageMaker Training Notebooks
+
+DDA provides two approaches for training defect detection models:
+
+### Option 1: YOLO Models (Recommended)
+**Notebook**: `DDA_SageMaker_YOLO_Training.ipynb`
+
+- Uses open-source YOLOv8 models (no marketplace subscription required)
+- Supports object detection, segmentation, and classification
+- Faster inference speeds suitable for real-time edge deployment
+- Exports to ONNX and TorchScript for flexible deployment
+
+### Option 2: Lookout for Vision Algorithm
+**Notebook**: `DDA_SageMaker_Model_Training_and_Compilation.ipynb`
+
+- Uses AWS Marketplace Computer Vision Defect Detection algorithm
+- Requires marketplace subscription
+- Supports classification and segmentation models
+- Compiles for multiple targets (x86, ARM64, Jetson Xavier)
+
+Both notebooks support the same deployment workflow via Greengrass components.
+
 ## Quick Start
 
 
@@ -448,7 +470,9 @@ DDA consists of several key components:
 
 #### Step 4: Deploy ML Model (Optional)
 
-1. **Train and Compile model using Amazon SageMaker** (see [SageMaker blog guide](https://aws.amazon.com/blogs/machine-learning/))
+1. **Train and Compile model using Amazon SageMaker**:
+   - **Option A (Lookout for Vision)**: Use `DDA_SageMaker_Model_Training_and_Compilation.ipynb` for AWS Marketplace algorithm
+   - **Option B (YOLO - Recommended)**: Use `DDA_SageMaker_YOLO_Training.ipynb` for open-source YOLOv8 models
 
 2. **Create model component**:
    - Use `DDA_Greengrass_Component_Creator.ipynb` notebook