@@ -186,7 +186,7 @@ Here's a complete reference of all available flags for local testing:
You can combine multiple flags to create a customized local testing environment:
```sh
-python main.py --rp_serve_api \
+python handler.py --rp_serve_api \
--rp_api_port 8080 \
--rp_api_concurrency 4 \
--rp_log_level DEBUG \
@@ -196,13 +196,13 @@ python main.py --rp_serve_api \
This command:
- Starts the local API server on port 8080.
- Uses 4 concurrent workers.
-- Sets the log level to DEBUG for maximum information.
+- Sets the log level to `DEBUG` for maximum information.
- Enables the debugger for troubleshooting.
## Next steps
Once you've tested your handler locally, learn about:
-- [SDK utilities](/serverless/development/sdk-utilities) - Helper functions for validation and cleanup.
-- [Pod-first development](/serverless/development/pod-first-development) - Develop on a Pod before deploying to Serverless.
-- [Logs](/serverless/development/logs) - Understand logging in production.
+- [SDK utilities](/serverless/development/sdk-utilities): Helper functions for validation and cleanup.
+- [Pod-first development](/serverless/development/dual-mode-worker): Develop on a Pod before deploying to Serverless.
+- [Logs](/serverless/development/logs): Understand logging in production.
diff --git a/serverless/development/logs.mdx b/serverless/development/logs.mdx
index 215f963f..a3471950 100644
--- a/serverless/development/logs.mdx
+++ b/serverless/development/logs.mdx
@@ -1,6 +1,6 @@
---
-title: "Logs"
-sidebarTitle: "Logs"
+title: "Logs and monitoring"
+sidebarTitle: "Logs and monitoring"
description: "Access and manage logs for Serverless endpoints and workers."
---
diff --git a/serverless/development/optimization.mdx b/serverless/development/optimization.mdx
index d7d6027c..89b0c87f 100644
--- a/serverless/development/optimization.mdx
+++ b/serverless/development/optimization.mdx
@@ -1,21 +1,21 @@
---
title: "Benchmarking and optimization"
-sidebarTitle: "Optimization"
-description: "Optimize your Serverless workers for performance and cost."
+sidebarTitle: "Benchmarking"
+description: "Benchmark your Serverless workers to optimize for performance and cost."
---
-Optimizing your Serverless workers improves performance, reduces costs, and creates a better experience for your users. This guide covers benchmarking, error handling, and CI/CD integration.
+Benchmarking your Serverless workers can help you improve performance, reduce costs, and create a better experience for your users. This guide covers how to measure performance metrics and automate benchmarking to identify bottlenecks.
## Benchmarking response times
Understanding your worker's performance helps you choose the right GPU and optimize your code. You can measure two key metrics:
-- **Delay time**: Time spent waiting for a worker to become available (cold start time).
-- **Execution time**: Time the GPU takes to actually process the request.
+ - **Delay time**: The time spent waiting for a worker to become available. This includes the cold start time if a new worker needs to be spun up.
+ - **Execution time**: The time the GPU takes to actually process the request once the worker has received the job.
### Send a test request
-Use `curl` to send a request to your endpoint:
+To gather initial metrics, use `curl` to send a request to your endpoint. This will initiate the job and return a request ID that you can use to poll for status.
```sh
curl -X POST https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/run \
@@ -24,7 +24,7 @@ curl -X POST https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/run \
-d '{"input": {"prompt": "Hello, world!"}}'
```
-This returns a request ID:
+This returns a JSON object containing the request ID:
```json
{
@@ -35,14 +35,14 @@ This returns a request ID:
### Check the status
-Use the request ID to check the status:
+Once you have the request ID, query the status endpoint to retrieve the performance metrics.
```sh
curl -X GET https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/status/abc123-def456-ghi789 \
-H "Authorization: Bearer YOUR_API_KEY"
```
-The response includes timing metrics:
+The response includes the `delayTime` and `executionTime` in milliseconds:
```json
{
@@ -56,12 +56,9 @@ The response includes timing metrics:
}
```
-- `delayTime`: Milliseconds spent waiting for a worker (includes cold start if applicable).
-- `executionTime`: Milliseconds the GPU took to process the request.
-
### Automate benchmarking
-Create a Python script to automate benchmarking:
+To get a statistically significant view of your worker's performance, you should automate the process. The following Python script sends multiple requests and calculates the minimum, maximum, and average times for both delay and execution.
```python benchmark.py
import requests
@@ -115,183 +112,14 @@ if __name__ == "__main__":
run_benchmark(num_requests=5)
```
-Run the script:
-
-```sh
-python benchmark.py
-```
-
### Optimize based on results
-- **High delay time**: Increase active workers or use FlashBoot to reduce cold starts.
-- **High execution time**: Optimize your code, use a faster GPU, or reduce batch sizes.
-- **Inconsistent times**: Check for resource contention or inefficient code paths.
-
-## Error handling
-
-Robust error handling prevents your worker from crashing and provides helpful error messages to users.
-
-### Basic error handling
-
-Wrap your handler logic in a try-except block:
-
-```python
-import runpod
-
-def handler(job):
- try:
- input = job["input"]
-
- # Replace process_input() with your own handler logic
- result = process_input(input)
-
- return {"output": result}
- except KeyError as e:
- return {"error": f"Missing required input: {str(e)}"}
- except Exception as e:
- return {"error": f"An error occurred: {str(e)}"}
-
-runpod.serverless.start({"handler": handler})
-```
-
-### Structured error responses
-
-Return consistent error objects with useful information:
-
-```python
-import runpod
-import traceback
-
-def handler(job):
- try:
- # Validate input
- if "prompt" not in job.get("input", {}):
- return {
- "error": {
- "type": "ValidationError",
- "message": "Missing required field: prompt",
- "details": "The 'prompt' field is required in the input object"
- }
- }
-
- prompt = job["input"]["prompt"]
- result = process_prompt(prompt)
- return {"output": result}
-
- except ValueError as e:
- return {
- "error": {
- "type": "ValueError",
- "message": str(e),
- "details": "Invalid input value provided"
- }
- }
- except Exception as e:
- # Log the full traceback for debugging
- print(f"Unexpected error: {traceback.format_exc()}")
- return {
- "error": {
- "type": "UnexpectedError",
- "message": "An unexpected error occurred",
- "details": str(e)
- }
- }
-
-runpod.serverless.start({"handler": handler})
-```
-
-### Timeout handling
-
-For long-running operations, implement timeout logic:
-
-```python
-import runpod
-import signal
-
-class TimeoutError(Exception):
- pass
-
-def timeout_handler(signum, frame):
- raise TimeoutError("Operation timed out")
-
-def handler(job):
- try:
- # Set a timeout (e.g., 60 seconds)
- signal.signal(signal.SIGALRM, timeout_handler)
- signal.alarm(60)
-
- # Your processing code here
- result = long_running_operation(job["input"])
-
- # Cancel the timeout
- signal.alarm(0)
-
- return {"output": result}
-
- except TimeoutError:
- return {"error": "Request timed out after 60 seconds"}
- except Exception as e:
- return {"error": str(e)}
-
-runpod.serverless.start({"handler": handler})
-```
-
-## CI/CD integration
-
-Automate your deployment workflow with GitHub integration.
-
-### Manual CI/CD with GitHub Actions
-
-For more control, you can use GitHub Actions to build and deploy your worker:
-
-```yaml .github/workflows/deploy.yml
-name: Deploy to Runpod Serverless
-
-on:
- push:
- branches:
- - main
-
-jobs:
- deploy:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v3
-
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v2
-
- - name: Login to Docker Hub
- uses: docker/login-action@v2
- with:
- username: ${{ secrets.DOCKER_USERNAME }}
- password: ${{ secrets.DOCKER_PASSWORD }}
-
- - name: Build and push
- uses: docker/build-push-action@v4
- with:
- context: .
- push: true
- tags: your-username/your-worker:latest
-
- - name: Update Runpod endpoint
- run: |
- curl -X POST https://api.runpod.ai/v2/${{ secrets.RUNPOD_ENDPOINT_ID }}/update \
- -H "Authorization: Bearer ${{ secrets.RUNPOD_API_KEY }}" \
- -H "Content-Type: application/json" \
- -d '{"imageName": "your-username/your-worker:latest"}'
-```
+Once you have your benchmark data, you can take specific actions to improve performance:
-Add these secrets to your GitHub repository:
+If you observe **high delay times**, consider increasing your minimum active workers. Alternatively, enable [FlashBoot](/serverless/endpoints/endpoint-configurations#flashboot) to significantly reduce the cold start time required to boot new workers.
-- `DOCKER_USERNAME`: Your Docker Hub username.
-- `DOCKER_PASSWORD`: Your Docker Hub password or access token.
-- `RUNPOD_ENDPOINT_ID`: Your Runpod endpoint ID.
-- `RUNPOD_API_KEY`: Your Runpod API key.
+If you observe **high execution times**, focus on optimizing your code logic. You might also consider upgrading to a more powerful GPU or reducing the batch size of your inputs to speed up processing.
-## Next steps
+If you observe **inconsistent times**, investigate your code for resource contention or inefficient paths that might be causing sporadic slowdowns.
-- [Local testing](/serverless/development/local-testing) - Test your optimizations locally.
-- [Logs](/serverless/development/logs) - Monitor your worker's performance in production.
-- [Environment variables](/serverless/development/environment-variables) - Configure your workers for different environments.
+For a complete list of endpoint settings and best practices, see [Endpoint settings and optimization](/serverless/endpoints/endpoint-configurations).
\ No newline at end of file
diff --git a/serverless/development/overview.mdx b/serverless/development/overview.mdx
index 42b2b0bb..c7514f85 100644
--- a/serverless/development/overview.mdx
+++ b/serverless/development/overview.mdx
@@ -1,5 +1,6 @@
---
title: "Serverless development"
+sidebarTitle: "Overview"
description: "Build, test, and deploy Serverless workers to production."
---
@@ -9,11 +10,51 @@ When developing for Runpod Serverless, you'll typically start by writing handler
The typical workflow starts with writing your handler function. Your handler receives an event object with input data and returns a response. Once you have a handler function, test it locally using the Runpod SDK's testing environment. You can test with inline JSON inputs, use a local API server, or simulate concurrency, all without actually deploying your code and incurring charges.
-For GPU-intensive applications, you might want to develop on a Pod first before deploying to Serverless. This "Pod-first" workflow gives you direct access to the GPU environment with tools like Jupyter Notebooks and SSH, letting you iterate faster than deploying repeatedly to Serverless.
-
When your handler is working correctly, package it into a Docker image and deploy it to a Serverless endpoint. Your worker will auto-scale based on demand. Once deployed, use logs, metrics, and SSH access to troubleshoot issues and optimize performance in production.
-## Development tools
+
+```mermaid
+%%{init: {'theme':'base', 'themeVariables': { 'primaryColor':'#5D29F0','primaryTextColor':'#fff','primaryBorderColor':'#874BFF','lineColor':'#AE6DFF','secondaryColor':'#AE6DFF','tertiaryColor':'#FCB1FF','edgeLabelBackground':'#AE6DFF', 'fontSize':'15px','fontFamily':'font-inter'}}}%%
+
+flowchart TD
+ Start([Write handler function]) --> Test[Test locally with SDK]
+
+ Test --> Check{Tests pass?}
+
+ Check -->|" No "| Fix[Fix code & debug]
+
+ Fix --> Test
+
+ Check -->|" Yes "| Package[Package Docker image]
+
+ Package --> Deploy[Deploy to serverless endpoint]
+
+ subgraph Production [Production environment]
+ Deploy --> Running[Auto-scaling execution]
+ Running --> Monitor[Monitor logs & metrics]
+ end
+
+ Monitor -.-> Start
+
+ style Start fill:#5D29F0,stroke:#874BFF,color:#FFFFFF,stroke-width:2px
+ style Test fill:#874BFF,stroke:#AE6DFF,color:#FFFFFF,stroke-width:2px
+ style Check fill:#1B0656,stroke:#874BFF,color:#FFFFFF,stroke-width:2px
+ style Fix fill:#FFC01F,stroke:#FF6214,color:#000000,stroke-width:2px
+ style Package fill:#5D29F0,stroke:#874BFF,color:#FFFFFF,stroke-width:2px
+ style Deploy fill:#5D29F0,stroke:#874BFF,color:#FFFFFF,stroke-width:2px
+ style Running fill:#5D29F0,stroke:#874BFF,color:#FFFFFF,stroke-width:2px
+ style Monitor fill:#FCB1FF,stroke:#AE6DFF,color:#000000,stroke-width:2px
+ style Production fill:#100433,stroke:#874BFF,color:#FFFFFF,stroke-width:2px
+
+ linkStyle default stroke-width:2px
+```
+
+
+
+For faster iteration and debugging of GPU-intensive applications, you can develop on a Pod first before deploying to Serverless. This "Pod-first" workflow gives you direct access to the GPU environment with tools like Jupyter Notebooks and SSH, letting you iterate faster than deploying repeatedly to Serverless. Learn more in [Pod-first development](/serverless/development/dual-mode-worker).
+
+
+## Development features
### Local testing environment
@@ -37,13 +78,13 @@ Learn more in [SDK utilities](/serverless/development/sdk-utilities).
### Pod-first development
-For complex GPU applications, develop on a Pod first, then deploy the same Docker image to Serverless. This workflow provides:
+For faster iteration and debugging of GPU-intensive applications, develop on a Pod first, then deploy the same Docker image to Serverless. This workflow provides:
- Interactive development with Jupyter Notebooks.
- Direct SSH access to the GPU environment.
- Faster iteration compared to deploying repeatedly to Serverless.
-Learn more in [Pod-first development](/serverless/development/pod-first-development).
+Learn more in [Pod-first development](/serverless/development/dual-mode-worker).
### Debugging and observability
@@ -55,22 +96,20 @@ Runpod provides several tools for debugging and monitoring:
Learn more in [Logs](/serverless/development/logs) and [SSH access](/serverless/development/ssh-into-workers).
-## Configuration
-
-### Environment variables
+## Environment variables
Use environment variables to configure your workers without hardcoding credentials or settings in your code. Environment variables are set in the Runpod console and are available to your handler at runtime.
Learn more in [Environment variables](/serverless/development/environment-variables).
-### Optimization
+## Benchmarking and optimization
Optimize your workers for performance and cost:
- **Benchmark response times**: Measure cold start and execution time.
- **Error handling**: Implement robust error handling in your handler.
-Learn more in [Optimization](/serverless/development/optimization).
+Learn more in [Benchmarking and optimization](/serverless/development/optimization).
## Next steps
diff --git a/serverless/development/pod-first-development.mdx b/serverless/development/pod-first-development.mdx
deleted file mode 100644
index e77912dc..00000000
--- a/serverless/development/pod-first-development.mdx
+++ /dev/null
@@ -1,213 +0,0 @@
----
-title: "Pod-first development"
-description: "Develop on a Pod before deploying to Serverless for faster iteration."
----
-
-Developing machine learning applications often requires powerful GPUs, making local development challenging. Instead of repeatedly deploying to Serverless for testing, you can develop on a Pod first and then deploy the same Docker image to Serverless when ready.
-
-This "Pod-first" workflow lets you develop and test interactively in a GPU environment, then seamlessly transition to Serverless for production. You'll use a Pod as your cloud-based development machine with tools like Jupyter Notebooks and SSH, catching issues early before deploying to Serverless.
-
-
-To get started quickly, you can [clone this repository](https://github.com/justinwlin/Runpod-GPU-And-Serverless-Base) for a ready-to-use dual-mode worker base.
-
-
-## What you'll learn
-
-In this guide you'll learn how to:
-
-- Set up a project for a dual-mode Serverless worker.
-- Create a handler that adapts based on an environment variable.
-- Write a startup script to manage different operational modes.
-- Build a Docker image that works in both Pod and Serverless environments.
-- Deploy and test your worker in both environments.
-
-## Requirements
-
-- You've [created a Runpod account](/get-started/manage-accounts).
-- You've installed [Python 3.x](https://www.python.org/downloads/) and [Docker](https://docs.docker.com/get-started/get-docker/) and configured them for your command line.
-- Basic understanding of Docker concepts and shell scripting.
-
-## Step 1: Set up your project structure
-
-Create a directory for your project and the necessary files:
-
-```sh
-mkdir dual-mode-worker
-cd dual-mode-worker
-touch handler.py start.sh Dockerfile requirements.txt
-```
-
-This creates:
-
-- `handler.py`: Your Python script with the Runpod handler logic.
-- `start.sh`: A shell script that will be the entrypoint for your Docker container.
-- `Dockerfile`: Instructions to build your Docker image.
-- `requirements.txt`: A file to list Python dependencies.
-
-## Step 2: Create the handler
-
-This Python script will check for a `MODE_TO_RUN` environment variable to determine whether to run in Pod or Serverless mode.
-
-Add the following code to `handler.py`:
-
-```python handler.py
-import os
-import asyncio
-import runpod
-
-# Check the MODE_TO_RUN environment variable; default to "pod"
-mode_to_run = os.getenv("MODE_TO_RUN", "pod")
-
-print("------- ENVIRONMENT VARIABLES -------")
-print("Mode running: ", mode_to_run)
-print("------- -------------------- -------")
-
-async def handler(event):
- inputReq = event.get("input", {})
- return inputReq
-
-if mode_to_run == "pod":
- # Pod mode: run a test directly
- async def main():
- prompt = "Hello World"
- requestObject = {"input": {"prompt": prompt}}
- response = await handler(requestObject)
- print(response)
-
- asyncio.run(main())
-else:
- # Serverless mode: start the serverless worker
- runpod.serverless.start({
- "handler": handler,
- "concurrency_modifier": lambda current: 1,
- })
-```
-
-## Step 3: Create the startup script
-
-The `start.sh` script serves as the entrypoint for your Docker container and manages different operational modes.
-
-Add the following code to `start.sh`:
-
-```bash start.sh
-#!/bin/bash
-
-echo "Pod Started"
-
-# Check if MODE_TO_RUN is set; if not, start an interactive shell
-if [ -z "$MODE_TO_RUN" ]; then
- echo "MODE_TO_RUN not set. Starting interactive mode..."
- exec /bin/bash
-else
- echo "MODE_TO_RUN is set to: $MODE_TO_RUN"
- python -u /handler.py
-fi
-```
-
-Make the script executable:
-
-```sh
-chmod +x start.sh
-```
-
-## Step 4: Create the Dockerfile
-
-Create a Dockerfile that includes your handler and startup script:
-
-```dockerfile Dockerfile
-FROM runpod/base:0.4.0-cuda11.8.0
-
-# Set the working directory
-WORKDIR /
-
-# Copy your handler and startup script
-COPY handler.py /handler.py
-COPY start.sh /start.sh
-
-# Install Python dependencies
-COPY requirements.txt /requirements.txt
-RUN pip install --no-cache-dir -r /requirements.txt
-
-# Make the startup script executable
-RUN chmod +x /start.sh
-
-# Set the entrypoint
-CMD ["/start.sh"]
-```
-
-## Step 5: Add dependencies
-
-Add the Runpod SDK to your `requirements.txt` file:
-
-```txt requirements.txt
-runpod
-```
-
-## Step 6: Build your Docker image
-
-Build your Docker image:
-
-```sh
-docker build -t your-username/dual-mode-worker:latest .
-```
-
-Push it to a container registry like Docker Hub:
-
-```sh
-docker push your-username/dual-mode-worker:latest
-```
-
-## Step 7: Deploy to a Pod for development
-
-Deploy your image to a Pod for interactive development:
-
-1. Navigate to the [Pods page](https://www.runpod.io/console/pods) in the Runpod console.
-2. Click **Deploy**.
-3. Select your preferred GPU.
-4. Under **Docker Image Name**, enter `your-username/dual-mode-worker:latest`.
-5. Leave the `MODE_TO_RUN` environment variable unset (or don't add it).
-6. Click **Deploy**.
-
-Once your Pod is running, you can:
-
-- Connect via SSH to test your handler interactively.
-- Use Jupyter Notebooks if you've configured them.
-- Debug and iterate on your code.
-- Test GPU-specific operations.
-
-## Step 8: Deploy to Serverless for production
-
-Once your handler works correctly on a Pod, deploy the same image to Serverless:
-
-1. Navigate to the [Serverless page](https://www.runpod.io/console/serverless) in the Runpod console.
-2. Click **New Endpoint**.
-3. Under **Docker Image**, enter `your-username/dual-mode-worker:latest`.
-4. Under **Environment Variables**, add:
- - Key: `MODE_TO_RUN`
- - Value: `serverless`
-5. Configure your endpoint settings (GPU type, workers, etc.).
-6. Click **Deploy**.
-
-Your worker will now run in Serverless mode, processing requests from your endpoint.
-
-## How it works
-
-The key to this workflow is the `MODE_TO_RUN` environment variable:
-
-- **Pod mode** (`MODE_TO_RUN` not set or set to `"pod"`): The handler runs a test directly and then keeps the container alive for interactive development.
-- **Serverless mode** (`MODE_TO_RUN="serverless"`): The handler starts the Runpod Serverless worker to process incoming requests.
-
-This lets you use the same Docker image for both development and production, eliminating the need to rebuild and redeploy when transitioning between environments.
-
-## Benefits of Pod-first development
-
-- **Faster iteration**: Develop and test interactively without waiting for Serverless deployments.
-- **Better debugging**: Use SSH, Jupyter Notebooks, and other interactive tools.
-- **GPU access**: Test GPU-specific code directly in the cloud.
-- **Seamless transition**: Deploy the same image to Serverless without modifications.
-
-## Next steps
-
-- [Local testing](/serverless/development/local-testing) - Test your handler locally before deploying.
-- [Environment variables](/serverless/development/environment-variables) - Learn more about configuring workers with environment variables.
-- [SSH access](/serverless/development/ssh-into-workers) - Connect to running workers for debugging.
diff --git a/serverless/development/sdk-utilities.mdx b/serverless/development/sdk-utilities.mdx
index d1ea3314..a35f75f6 100644
--- a/serverless/development/sdk-utilities.mdx
+++ b/serverless/development/sdk-utilities.mdx
@@ -180,6 +180,6 @@ clean(folder_list=["temp_images", "cache", "downloads"])
Learn about other development tools:
-- [Local testing](/serverless/development/local-testing) - Test your handler before deploying.
-- [Pod-first development](/serverless/development/pod-first-development) - Develop on a Pod before deploying to Serverless.
-- [Environment variables](/serverless/development/environment-variables) - Configure your workers without hardcoding credentials.
+- [Local testing](/serverless/development/local-testing): Test your handler before deploying.
+- [Pod-first development](/serverless/development/dual-mode-worker): Develop on a Pod before deploying to Serverless.
+- [Environment variables](/serverless/development/environment-variables): Configure your workers without hardcoding credentials.
diff --git a/serverless/endpoints/endpoint-configurations.mdx b/serverless/endpoints/endpoint-configurations.mdx
index 121851af..b1972798 100644
--- a/serverless/endpoints/endpoint-configurations.mdx
+++ b/serverless/endpoints/endpoint-configurations.mdx
@@ -6,51 +6,47 @@ description: "Configure your endpoints to optimize for performance, cost, and re
import GPUTable from '/snippets/serverless-gpu-pricing-table.mdx';
-This guide explains all available settings and best practices for configuring your Serverless endpoints.
+This guide details the configuration options available for Runpod Serverless endpoints. These settings control how your endpoint scales, how it utilizes hardware, and how it manages request lifecycles.
-
+
-## Endpoint name
+## General configuration
-The name you assign to your endpoint for easy identification in your dashboard. This name is only visible to you and doesn't affect the endpoint ID used for API calls.
+### Endpoint name
-## Endpoint type
+The name assigned to your endpoint helps you identify it within the Runpod console. This is a local display name and does not impact the endpoint ID used for API requests.
-Choose between two endpoint types based on your workload requirements:
+### Endpoint type
-**Queue based endpoints** are well-suited for long-running requests, batch processing, or asynchronous tasks. They process requests through a queueing system that guarantees execution and provides built-in retry mechanisms. These endpoints are easy to implement using [handler functions](/serverless/workers/handler-functions), and are ideal for workloads that can be processed asynchronously.
+Select the architecture that best fits your application's traffic pattern:
-**Load balancing endpoints** are best for high-throughput or low-latency workloads, or non-standard request/response patterns. They route requests directly to worker HTTP servers, bypassing the queue for faster response times. These endpoints support custom REST API paths and are ideal for real-time applications requiring immediate processing.
+**Queue based endpoints** utilize a built-in queueing system to manage requests. They are ideal for asynchronous tasks, batch processing, and long-running jobs where immediate synchronous responses are not required. These endpoints provide guaranteed execution and automatic retries for failed requests.
-For detailed information about load balancing endpoints, see [Load balancing endpoints](/serverless/load-balancing/overview).
+Queue based endpoints are implemented using [handler functions](/serverless/workers/handler-functions).
-## GPU configuration
+**Load balancing endpoints** route traffic directly to available workers, bypassing the internal queue. They are designed for high-throughput, low-latency applications that require synchronous request/response cycles, such as real-time inference or custom REST APIs.
-Choose one or more GPU categories (organized by memory) for your endpoint in order of preference. Runpod prioritizes allocating the first category in your list and falls back to subsequent GPUs if your first choice is unavailable.
+For implementation details, see [Load balancing endpoints](/serverless/load-balancing/overview).
-The following GPU categories are available:
+### GPU configuration
-
+This setting determines the hardware tier your workers will utilize. You can select multiple GPU categories to create a prioritized list. Runpod attempts to allocate the first category in your list. If that hardware is unavailable, it automatically falls back to the subsequent options.
-
-
-Selecting multiple GPU types improves availability, especially for high-demand GPUs.
+Selecting multiple GPU types significantly improves endpoint availability during periods of high demand.
-
+
-## Worker configuration
+## Worker scaling
### Active workers
-Sets the minimum number of workers that remain running at all times. Setting this at one or higher eliminates cold start delays for faster response times. Active workers incur charges immediately, but receive up to 30% discount from regular pricing.
-
-Default: 0
+This setting defines the minimum number of workers that remain warm and ready to process requests at all times. Setting this to 1 or higher eliminates cold starts for the initial wave of requests. Active workers incur charges even when idle, but they receive a 20-30% discount compared to on-demand workers.
-For workloads with long cold start times, consider using active workers to eliminate startup delays. You can estimate the optimal number by:
+For workloads with long cold start times, use active workers to eliminate startup delays. You can estimate the optimal number by:
1. Measuring your requests per minute during typical usage.
2. Calculating average request duration in seconds.
@@ -64,25 +60,17 @@ Even a small number of active workers can significantly improve performance for
### Max workers
-The maximum number of concurrent workers your endpoint can scale to.
+This setting controls the maximum number of concurrent instances your endpoint can scale to. This acts as a safety limit for costs and a cap on concurrency.
-Default: 3
+
+We recommend setting your max worker count approximately 20% higher than your expected maximum concurrency. This buffer allows for smoother scaling during traffic spikes.
-
-
-Setting max workers to 1 restricts your deployment to a single machine, creating potential bottlenecks if that machine becomes unavailable.
-
-We recommend setting your max worker count approximately 20% higher than your expected maximum concurrency. This headroom allows for smoother scaling during traffic spikes and helps prevent request throttling.
-
-
+Avoid setting this to 1, as this restricts your deployment to a single machine, creating potential bottlenecks if that machine becomes unavailable.
+
### GPUs per worker
-The number of GPUs assigned to each worker instance.
-
-Default: 1
-
-
+This defines how many GPUs are assigned to a single worker instance. The default is 1.
When choosing between multiple lower-tier GPUs or fewer high-end GPUs, you should generally prioritize high-end GPUs with lower GPU count per worker when possible.
@@ -90,131 +78,61 @@ When choosing between multiple lower-tier GPUs or fewer high-end GPUs, you shoul
- Multi-GPU configurations introduce parallel processing overhead that can offset performance gains.
- Higher GPU-per-worker requirements can reduce availability, as finding machines with multiple free GPUs is more challenging than locating single available GPUs.
-
-
-## Timeout settings
+### Auto-scaling type
-### Idle timeout
+This setting determines the logic used to scale workers up and down.
-The amount of time that a worker continues running after completing a request. You're still charged for this time, even if the worker isn't actively processing any requests.
+**Queue delay** scaling adds workers based on wait times. If requests sit in the queue for longer than a defined threshold (default 4 seconds), the system provisions new workers. This is best for workloads where slight delays are acceptable in exchange for higher utilization.
-By default, the idle timeout is set to 5 seconds to help avoid frequent start/stop cycles and reduce the likelihood of cold starts. Setting a longer idle timeout can help minimize cold starts for intermittent traffic, but it may also increase your costs.
+**Request count** scaling is more aggressive. It adjusts worker numbers based on the total volume of pending and active work. The formula used is `Math.ceil((requestsInQueue + requestsInProgress) / scalerValue)`. Use a scaler value of 1 for maximum responsiveness, or increase it to scale more conservatively. This strategy is recommended for LLM workloads or applications with frequent, short requests.
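+
+As an illustration of how the request count formula resolves, the sketch below plugs hypothetical queue numbers into the same calculation; the values are examples only, not output from a real endpoint.
+
+```python
+import math
+
+# Hypothetical snapshot of endpoint load (illustrative values)
+requests_in_queue = 10
+requests_in_progress = 3
+scaler_value = 4  # default request count scaler
+
+# Request count scaling: Math.ceil((requestsInQueue + requestsInProgress) / scalerValue)
+workers_needed = math.ceil((requests_in_queue + requests_in_progress) / scaler_value)
+print(workers_needed)  # 4 workers for 13 pending and active requests
+```
+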
-When configuring idle timeout, start by matching it to your average cold start time to reduce startup delays. For workloads with extended cold starts, consider longer idle timeouts to minimize repeated initialization costs.
+## Lifecycle and timeouts
-
+### Idle timeout
-That idle timeout is only effective when using [queue delay scaling](#queue-delay). Be cautious with high timeout values, as workers with constant traffic may never reach the idle state necessary to scale down properly.
+The idle timeout determines how long a worker remains active after completing a request before shutting down. While a worker is idle, you are billed for the time, but the worker remains "warm," allowing it to process subsequent requests immediately. The default is 5 seconds.
-
+
+If you observe frequent cold starts, consider increasing this value to match your average traffic gaps. However, be aware that if you use the [Queue delay](#auto-scaling-type) scaling strategy, setting this value too high may prevent workers from scaling down properly.
+
### Execution timeout
-The maximum time a job can run before automatic termination. This prevents runaway jobs from consuming excessive resources. You can turn off this setting, but we highly recommend keeping it on.
-
-Default: 600 seconds (10 minutes)
-Maximum: 24 hours (can be extended using job TTL)
-
-
+The execution timeout acts as a failsafe to prevent runaway jobs from consuming infinite resources. It specifies the maximum duration a single job is allowed to run before being forcibly terminated.
-We strongly recommend enabling execution timeout for all endpoints. Set the timeout value to your typical request duration plus a 10-20% buffer. This safeguard prevents unexpected or faulty requests from running indefinitely and consuming unnecessary resources.
-
-
+We strongly recommend keeping this enabled. Set the value to your longest expected request duration plus a 20% buffer. The default is 600 seconds (10 minutes), and it can be extended up to 24 hours.
### Job TTL (time-to-live)
-The maximum time a job remains in the queue before automatic termination.
-
-Default: 86,400,000 milliseconds (24 hours)
-Minimum: 10,000 milliseconds (10 seconds)
-
-See [Execution policies](/serverless/endpoints/send-requests#execution-policies) for more information.
-
-
-
-You can use the `/status` operation to configure the time-to-live (TTL) for an individual job by appending a TTL parameter when checking the status of a job. For example, `https://api.runpod.ai/v2/{endpoint_id}/status/{job_id}?ttl=6000` sets the TTL for the job to 6 seconds. Use this when you want to tell the system to remove a job result sooner than the default retention time.
-
-
+This setting defines how long a job request remains valid in the queue before expiring. If a worker does not pick up the job within this window, the system discards it. The default is 24 hours.
-## FlashBoot
+## Performance features
-FlashBoot is Runpod's solution for reducing the average cold-start times on your endpoint. It works by retaining worker resources for some time after they're no longer in use, so they can be rebooted quickly. When your endpoint has consistent traffic, your workers have a higher chance of benefiting from FlashBoot for faster spin-ups. However, if your endpoint isn't receiving frequent requests, FlashBoot has fewer opportunities to optimize performance. There is no additional cost associated with FlashBoot.
+### FlashBoot
-
-
-The effectiveness of FlashBoot increases exponentially with higher request volumes and worker counts, making it ideal for busy production endpoints. For endpoints with fewer than 3 workers, FlashBoot's overhead may exceed its benefits.
-
-
+FlashBoot reduces cold start times by retaining the state of worker resources shortly after they spin down. This allows the system to "revive" a worker much faster than a standard fresh boot. FlashBoot is most effective on endpoints with consistent traffic, where workers frequently cycle between active and idle states. There is no additional cost for enabling FlashBoot.
-## Model (optional)
+### Model (optional)
-You can select from a list of [cached models](/serverless/endpoints/model-caching) using the **Model (optional)** field. Selecting a model signals the system to place your workers on host machines that contain the selected model, resulting in faster cold starts and significant cost savings.
+The Model field allows you to select from a list of [cached models](/serverless/endpoints/model-caching). When selected, Runpod schedules your workers on host machines that already have these large model files pre-loaded. This significantly reduces the network time required to download models during initialization.
## Advanced settings
-When configuring advanced settings, remember that each constraint (data center, storage, CUDA version, GPU type) may limit resource availability. For maximum availability and reliability, select all data centers and CUDA versions, and avoid network volumes unless your workload specifically requires them.
-
### Data centers
-Control which data centers can deploy and cache your workers. Allowing multiple data centers improves availability, while using a network volume restricts your endpoint to a single data center.
-
-Default: All data centers
-
-
-
-For the highest availability, allow all data centers (i.e., keep the default setting in place) and avoid using network volumes unless necessary.
-
-
+You can restrict your endpoint to specific geographical regions. For maximum reliability and availability, we recommend allowing all data centers. Restricting this list decreases the pool of available GPUs your endpoint can draw from.
### Network volumes
-Attach persistent storage to your workers. [Network volumes](/storage/network-volumes) have higher latency than local storage, and restrict workers to the data center containing your volume. However, they can be very useful for sharing large models or data between workers on an endpoint.
-
-### Auto-scaling type
-
-#### Queue delay
-
-Adds workers based on request wait times.
-
-The queue delay scaling strategy adjusts worker numbers based on request wait times. Workers are added if requests spend more than X seconds in the queue, where X is a threshold you define. By default, this threshold is set at 4 seconds.
-
-#### Request count
-
-The request count scaling strategy adjusts worker numbers according to total requests in the queue and in progress. It automatically adds workers as the number of requests increases, ensuring tasks are handled efficiently.
-
-Total workers formula: `Math.ceil((requestsInQueue + requestsInProgress) / 4)`
-
-
-
-**Optimizing your auto-scaling strategy:**
-
-- For maximum responsiveness, use "request count" with a scaler value of 1 to provision workers immediately for each incoming request.
-- LLM workloads with frequent, short requests typically perform better with "request count" scaling.
-- For gradual scaling, increase the request count scaler value to provision workers more conservatively.
-- Use queue delay when you want workers to remain available briefly after request completion to handle follow-up requests.
-- With long cold start times, favor conservative scaling to minimize the performance and cost impacts of frequent worker initialization.
-
-
-
-### Expose HTTP/TCP ports
-
-Enables direct communication with your worker via its public IP and port. This can be useful for real-time applications requiring minimal latency, such as [WebSocket applications](https://github.com/runpod-workers/worker-websocket).
-
-### Enabled GPU types
-
-Here you can specify which [GPU types](/references/gpu-types) to use within your selected GPU size categories. By default, all GPU types are enabled.
+[Network volumes](/storage/network-volumes) provide persistent storage that survives worker restarts. While they enable data sharing between workers, they introduce network latency and restrict your endpoint to the specific data center where the volume resides. Use network volumes only if your workload specifically requires shared persistence or datasets larger than the container limit.
### CUDA version selection
-Specify which CUDA versions can be used with your workload to ensures your code runs on compatible GPU hardware. Runpod will match your workload to GPU instances with the selected CUDA versions.
-
-
+This filter ensures your workers are scheduled on host machines with compatible drivers. While you should select the version your code requires, we recommend also selecting all newer versions. CUDA is generally backward compatible, and selecting a wider range of versions increases the pool of available hardware.
-CUDA versions are generally backward compatible, so we recommend that you check for the version you need and any higher versions. For example, if your code requires CUDA 12.4, you should also try running it on 12.5, 12.6, and so on.
-
-Limiting your endpoint to just one or two CUDA versions can significantly reduce GPU availability. Runpod continuously updates GPU drivers to support the latest CUDA versions, so keeping more CUDA versions selected gives you access to more resources.
+### Expose HTTP/TCP ports
-
+Enabling this option exposes the public IP and port of the worker, allowing for direct external communication. This is required for applications that need persistent connections, such as WebSockets.
## Reducing worker startup times
@@ -251,17 +169,17 @@ Use these strategies to reduce worker startup times:
## Best practices summary
- **Understand optimization tradeoffs** and make conscious tradeoffs between cost, speed, and model size.
-- **Start conservative** with max workers and scale up as needed.
-- **Monitor throttling** and adjust max workers accordingly.
-- **Use active workers** for latency-sensitive applications.
-- **Select multiple GPU types** to improve availability.
-- **Choose appropriate timeouts** based on your workload characteristics.
+- **Start conservative** with [max workers](#max-workers) and scale up as needed.
+- **Monitor throttling** and adjust [max workers](#max-workers) accordingly.
+- **Use [active workers](#active-workers)** for latency-sensitive applications.
+- **Select multiple [GPU types](#gpu-configuration)** to improve availability.
+- **Choose appropriate [timeouts](#execution-timeout)** based on your workload characteristics.
- **Consider data locality** when using network volumes.
-- **Avoid setting max workers to 1** to prevent bottlenecks.
-- **Plan for 20% headroom** in max workers to handle load spikes.
+- **Avoid setting [max workers](#max-workers) to 1** to prevent bottlenecks.
+- **Plan for 20% headroom** in [max workers](#max-workers) to handle load spikes.
- **Prefer high-end GPUs with lower GPU count** for better performance.
-- **Set execution timeout** to prevent runaway processes.
-- **Match auto-scaling strategy** to your workload patterns.
+- **Set [execution timeout](#execution-timeout)** to prevent runaway processes.
+- **Match [auto-scaling strategy](#auto-scaling-type)** to your workload patterns.
- **Embed models in Docker images** when possible for faster loading.
-- **Extend idle timeouts** to prevent frequent cold starts.
-- **Consider disabling FlashBoot** for endpoints with few workers or infrequent traffic.
+- **Extend [idle timeouts](#idle-timeout)** to prevent frequent cold starts.
+- **Consider disabling [FlashBoot](#flashboot)** for endpoints with few workers or infrequent traffic.
From 22b02882418cf3bfa99259bf03f05bd948a9c7eb Mon Sep 17 00:00:00 2001
From: Mo King
Date: Thu, 11 Dec 2025 15:39:55 -0500
Subject: [PATCH 3/7] Separate optimization and benchmarking guides
---
docs.json | 1 +
serverless/development/benchmarking.mdx | 97 ++++++++++
serverless/development/optimization.mdx | 170 ++++++------------
.../endpoints/endpoint-configurations.mdx | 110 +-----------
4 files changed, 161 insertions(+), 217 deletions(-)
create mode 100644 serverless/development/benchmarking.mdx
diff --git a/docs.json b/docs.json
index f4944279..baaa4807 100644
--- a/docs.json
+++ b/docs.json
@@ -97,6 +97,7 @@
"serverless/development/local-testing",
"serverless/development/sdk-utilities",
"serverless/development/error-handling",
+ "serverless/development/benchmarking",
"serverless/development/optimization",
"serverless/development/logs",
"serverless/development/dual-mode-worker",
diff --git a/serverless/development/benchmarking.mdx b/serverless/development/benchmarking.mdx
new file mode 100644
index 00000000..e16b3c27
--- /dev/null
+++ b/serverless/development/benchmarking.mdx
@@ -0,0 +1,97 @@
+---
+title: "Benchmarking"
+sidebarTitle: "Benchmarking"
+description: "Benchmark your Serverless workers and measure delay and execution times."
+---
+
+Understanding your worker's performance helps you choose the right GPU and [optimize your code](/serverless/development/optimization). You can measure two key metrics:
+
+ - **Delay time**: The time spent waiting for a worker to become available. This includes the cold start time if a new worker needs to be spun up.
+ - **Execution time**: The time the GPU takes to actually process the request once the worker has received the job.
+
+## Send a test request
+
+To gather initial metrics, use `curl` to send a request to your endpoint. This will initiate the job and return a request ID that you can use to poll for status.
+
+```sh
+curl -X POST https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/run \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer YOUR_API_KEY" \
+ -d '{"input": {"prompt": "Hello, world!"}}'
+```
+
+This returns a JSON object containing the request ID. Poll the `/status` endpoint to get the delay time and execution time:
+
+```sh
+curl -X GET https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/status/REQUEST_ID \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer YOUR_API_KEY"
+```
+
+The response includes the request status along with `delayTime` and `executionTime` in milliseconds:
+
+```json
+{
+ "id": "1234567890",
+ "status": "COMPLETED",
+ "delayTime": 1000,
+ "executionTime": 2000
+}
+```
+
+
+## Automate benchmarking
+
+To get a statistically significant view of your worker's performance, you should automate the benchmarking process. The following Python script sends multiple requests and calculates the minimum, maximum, and average times for both delay and execution.
+
+```python benchmark.py
+import requests
+import time
+import statistics
+
+ENDPOINT_ID = "YOUR_ENDPOINT_ID"
+API_KEY = "YOUR_API_KEY"
+BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"
+HEADERS = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {API_KEY}"
+}
+
+def run_benchmark(num_requests=5):
+ delay_times = []
+ execution_times = []
+
+ for i in range(num_requests):
+ # Send request
+ response = requests.post(
+ f"{BASE_URL}/run",
+ headers=HEADERS,
+ json={"input": {"prompt": f"Test request {i+1}"}}
+ )
+ request_id = response.json()["id"]
+
+ # Poll for completion
+ while True:
+ status_response = requests.get(
+ f"{BASE_URL}/status/{request_id}",
+ headers=HEADERS
+ )
+ status_data = status_response.json()
+
+ if status_data["status"] == "COMPLETED":
+ delay_times.append(status_data["delayTime"])
+ execution_times.append(status_data["executionTime"])
+ break
+ elif status_data["status"] == "FAILED":
+ print(f"Request {i+1} failed")
+ break
+
+ time.sleep(1)
+
+ # Calculate statistics
+ print(f"Delay Time - Min: {min(delay_times)}ms, Max: {max(delay_times)}ms, Avg: {statistics.mean(delay_times):.0f}ms")
+ print(f"Execution Time - Min: {min(execution_times)}ms, Max: {max(execution_times)}ms, Avg: {statistics.mean(execution_times):.0f}ms")
+
+if __name__ == "__main__":
+ run_benchmark(num_requests=5)
+```
\ No newline at end of file
diff --git a/serverless/development/optimization.mdx b/serverless/development/optimization.mdx
index 89b0c87f..b677f3fd 100644
--- a/serverless/development/optimization.mdx
+++ b/serverless/development/optimization.mdx
@@ -1,125 +1,63 @@
---
-title: "Benchmarking and optimization"
-sidebarTitle: "Benchmarking"
-description: "Benchmark your Serverless workers to optimize for performance and cost."
+title: "Optimize your workers"
+sidebarTitle: "Optimization guide"
+description: "Implement strategies to reduce latency and cost for your Serverless workers."
---
-Benchmarking your Serverless workers can help you improve performance, reduce costs, and create a better experience for your users. This guide covers how to measure performance metrics and automate benchmarking to identify bottlenecks.
+Optimizing your Serverless workers involves a cycle of measuring performance (for example, with [benchmarking](/serverless/development/benchmarking)), identifying bottlenecks, and tuning your [endpoint configurations](/serverless/endpoints/endpoint-configurations). This guide covers specific strategies to reduce startup times and improve throughput.
-## Benchmarking response times
+## Optimization overview
-Understanding your worker's performance helps you choose the right GPU and optimize your code. You can measure two key metrics:
+Effective optimization requires making conscious tradeoffs between cost, speed, and model size.
+
+To ensure high availability during peak traffic, select multiple [GPU types](/references/gpu-types) in your configuration rather than relying on a single hardware specification, prioritizing the [GPU categories](/serverless/endpoints/endpoint-configurations#gpu-configuration) that are most likely to be available in your desired data centers. When choosing hardware, a single high-end GPU is generally preferable to multiple lower-tier cards, as its superior memory bandwidth and newer architecture often yield better inference performance than parallelization across weaker cards.
+
+For latency-sensitive applications, utilizing active workers is the most effective way to eliminate cold starts. You should also configure your [max workers](/serverless/endpoints/endpoint-configurations#max-workers) setting with approximately 20% headroom above your expected concurrency. This buffer ensures that your endpoint can handle sudden load spikes without throttling requests or hitting capacity limits.
+
+Your architectural choices also significantly impact performance. Whenever possible, bake your models directly into the Docker image to leverage the high-speed local NVMe storage of the host machine. If you utilize [network volumes](/storage/network-volumes) for larger datasets, remember that this restricts your endpoint to specific data centers, which effectively shrinks your pool of available compute resources.
+
+
+## Reducing worker startup times
+
+
+There are two key metrics to consider when optimizing your workers:
- **Delay time**: The time spent waiting for a worker to become available. This includes the cold start time if a new worker needs to be spun up.
- **Execution time**: The time the GPU takes to actually process the request once the worker has received the job.
-### Send a test request
-
-To gather initial metrics, use `curl` to send a request to your endpoint. This will initiate the job and return a request ID that you can use to poll for status.
-
-```sh
-curl -X POST https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/run \
- -H "Content-Type: application/json" \
- -H "Authorization: Bearer YOUR_API_KEY" \
- -d '{"input": {"prompt": "Hello, world!"}}'
-```
-
-This returns a JSON object containing the request ID:
-
-```json
-{
- "id": "abc123-def456-ghi789",
- "status": "IN_QUEUE"
-}
-```
-
-### Check the status
-
-Once you have the request ID, query the status endpoint to retrieve the performance metrics.
-
-```sh
-curl -X GET https://api.runpod.ai/v2/YOUR_ENDPOINT_ID/status/abc123-def456-ghi789 \
- -H "Authorization: Bearer YOUR_API_KEY"
-```
-
-The response includes the `delayTime` and `executionTime` in milliseconds:
-
-```json
-{
- "delayTime": 2341,
- "executionTime": 1563,
- "id": "abc123-def456-ghi789",
- "output": {
- "result": "Hello, world!"
- },
- "status": "COMPLETED"
-}
-```
-
-### Automate benchmarking
-
-To get a statistically significant view of your worker's performance, you should automate the process. The following Python script sends multiple requests and calculates the minimum, maximum, and average times for both delay and execution.
-
-```python benchmark.py
-import requests
-import time
-import statistics
-
-ENDPOINT_ID = "YOUR_ENDPOINT_ID"
-API_KEY = "YOUR_API_KEY"
-BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"
-HEADERS = {
- "Content-Type": "application/json",
- "Authorization": f"Bearer {API_KEY}"
-}
-
-def run_benchmark(num_requests=5):
- delay_times = []
- execution_times = []
-
- for i in range(num_requests):
- # Send request
- response = requests.post(
- f"{BASE_URL}/run",
- headers=HEADERS,
- json={"input": {"prompt": f"Test request {i+1}"}}
- )
- request_id = response.json()["id"]
-
- # Poll for completion
- while True:
- status_response = requests.get(
- f"{BASE_URL}/status/{request_id}",
- headers=HEADERS
- )
- status_data = status_response.json()
-
- if status_data["status"] == "COMPLETED":
- delay_times.append(status_data["delayTime"])
- execution_times.append(status_data["executionTime"])
- break
- elif status_data["status"] == "FAILED":
- print(f"Request {i+1} failed")
- break
-
- time.sleep(1)
-
- # Calculate statistics
- print(f"Delay Time - Min: {min(delay_times)}ms, Max: {max(delay_times)}ms, Avg: {statistics.mean(delay_times):.0f}ms")
- print(f"Execution Time - Min: {min(execution_times)}ms, Max: {max(execution_times)}ms, Avg: {statistics.mean(execution_times):.0f}ms")
-
-if __name__ == "__main__":
- run_benchmark(num_requests=5)
-```
-
-### Optimize based on results
-
-Once you have your benchmark data, you can take specific actions to improve performance:
-
-If you observe **high delay times**, consider increasing your minimum active workers. Alternatively, enable [FlashBoot](/serverless/endpoints/endpoint-configurations#flashboot) to significantly reduce the cold start time required to boot new workers.
-
-If you observe **high execution times**, focus on optimizing your code logic. You might also consider upgrading to a more powerful GPU or reducing the batch size of your inputs to speed up processing.
-
-If you observe **inconsistent times**, investigate your code for resource contention or inefficient paths that might be causing sporadic slowdowns.
-
-For a complete list of endpoint settings and best practices, see [Endpoint settings and optimization](/serverless/endpoints/endpoint-configurations).
\ No newline at end of file
+
+Try [benchmarking your workers](/serverless/development/benchmarking) to measure these metrics.
+
+
+**Delay time** consists of:
+
+ - **Initialization time**: The time spent downloading the Docker image.
+ - **Cold start time**: The time spent loading the model into memory.
+
+If your delay time is high, use these strategies to reduce it.
+
+
+If your worker's cold start time exceeds the default 7-minute limit, the system may mark it as unhealthy. You can extend this limit by setting the `RUNPOD_INIT_TIMEOUT` environment variable (e.g. `RUNPOD_INIT_TIMEOUT=800` for 800 seconds).
+
+
+### Embed models in Docker images
+
+For production environments, package your ML models directly within your worker container image instead of downloading them in your handler function. This strategy places models on the worker's high-speed local storage (SSD/NVMe), dramatically reducing the time needed to load models into GPU memory. Note that extremely large models (500GB+) may still require network volume storage.
+
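+The sketch below illustrates this pattern. It assumes a hypothetical model directory baked into the image at `/models/my-model`, and the loader is a placeholder for whatever loading call your framework provides (for example, `torch.load` or `transformers.AutoModel.from_pretrained`).
+
+```python
+import os
+import runpod
+
+# Assumes the Dockerfile copied the weights into the image, for example:
+#   COPY models/my-model /models/my-model
+MODEL_DIR = os.environ.get("MODEL_DIR", "/models/my-model")
+
+def load_model(model_dir):
+    # Placeholder loader: swap in your framework's own loading call.
+    weights_path = os.path.join(model_dir, "weights.bin")
+    with open(weights_path, "rb") as f:
+        return f.read()
+
+# Load once at module import (during the cold start), not inside the handler,
+# so every request served by this worker reuses the model already in memory.
+MODEL = load_model(MODEL_DIR)
+
+def handler(job):
+    prompt = job["input"]["prompt"]
+    # Replace with real inference that uses MODEL
+    return {"output": f"Loaded {len(MODEL)} bytes of weights for prompt: {prompt}"}
+
+runpod.serverless.start({"handler": handler})
+```
+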
+### Use network volumes during development
+
+For flexibility during development, save large models to a [network volume](/storage/network-volumes) using a Pod or one-time handler, then mount this volume to your Serverless workers. While network volumes offer slower model loading compared to embedding models directly, they can speed up your workflow by enabling rapid iteration and seamless switching between different models and configurations.
+
+### Maintain active workers
+
+Set [active worker counts](/serverless/endpoints/endpoint-configurations#active-workers) above zero to completely eliminate cold starts. These workers remain ready to process requests instantly and cost up to 30% less when idle compared to standard (flex) workers.
+
+You can estimate the optimal number of active workers using the formula: `(Requests per Minute × Request Duration) / 60`. For example, with 6 requests per minute taking 30 seconds each, you would need 3 active workers to handle the load without queuing.
+
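+As a quick sanity check, the snippet below applies the formula to the illustrative traffic numbers above; swap in your own measurements.
+
+```python
+import math
+
+# Illustrative traffic profile (replace with your own measurements)
+requests_per_minute = 6
+avg_request_duration_seconds = 30
+
+# Active workers ~= (requests per minute * request duration) / 60
+active_workers = math.ceil(requests_per_minute * avg_request_duration_seconds / 60)
+print(active_workers)  # 3
+```
+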
+### Optimize scaling parameters
+
+Fine-tune your [auto-scaling configuration](/serverless/endpoints/endpoint-configurations#auto-scaling-type) for more responsive worker provisioning. Lowering the queue delay threshold to 2-3 seconds (default 4) or decreasing the request count threshold allows the system to respond more swiftly to traffic fluctuations.
+
+### Increase maximum worker limits
+
+Set a higher [max worker](/serverless/endpoints/endpoint-configurations#max-workers) limit to ensure your Docker images are pre-cached across multiple compute nodes and data centers. This proactive approach eliminates image download delays during scaling events, significantly reducing startup times.
\ No newline at end of file
diff --git a/serverless/endpoints/endpoint-configurations.mdx b/serverless/endpoints/endpoint-configurations.mdx
index b1972798..963bdf11 100644
--- a/serverless/endpoints/endpoint-configurations.mdx
+++ b/serverless/endpoints/endpoint-configurations.mdx
@@ -1,17 +1,13 @@
---
-title: "Endpoint settings and optimization guide"
+title: "Endpoint settings"
sidebarTitle: "Endpoint settings"
-description: "Configure your endpoints to optimize for performance, cost, and reliability."
+description: "Reference guide for all Serverless endpoint settings and parameters."
---
import GPUTable from '/snippets/serverless-gpu-pricing-table.mdx';
This guide details the configuration options available for Runpod Serverless endpoints. These settings control how your endpoint scales, how it utilizes hardware, and how it manages request lifecycles.
-
-
-
-
## General configuration
### Endpoint name
@@ -22,19 +18,13 @@ The name assigned to your endpoint helps you identify it within the Runpod conso
Select the architecture that best fits your application's traffic pattern:
-**Queue based endpoints** utilize a built-in queueing system to manage requests. They are ideal for asynchronous tasks, batch processing, and long-running jobs where immediate synchronous responses are not required. These endpoints provide guaranteed execution and automatic retries for failed requests.
-
-Queue based endpoints are implemented using [handler functions](/serverless/workers/handler-functions).
-
-**Load balancing endpoints** route traffic directly to available workers, bypassing the internal queue. They are designed for high-throughput, low-latency applications that require synchronous request/response cycles, such as real-time inference or custom REST APIs.
+**Queue based endpoints** utilize a built-in queueing system to manage requests. They are ideal for asynchronous tasks, batch processing, and long-running jobs where immediate synchronous responses are not required. These endpoints provide guaranteed execution and automatic retries for failed requests. Queue based endpoints are implemented using [handler functions](/serverless/workers/handler-functions).
-For implementation details, see [Load balancing endpoints](https://www.google.com/search?q=/serverless/load-balancing/overview).
+**Load balancing endpoints** route traffic directly to available workers, bypassing the internal queue. They are designed for high-throughput, low-latency applications that require synchronous request/response cycles, such as real-time inference or custom REST APIs. For implementation details, see [Load balancing endpoints](/serverless/load-balancing/overview).
### GPU configuration
-This setting determines the hardware tier your workers will utilize. You can select multiple GPU categories to create a prioritized list. Runpod attempts to allocate the first category in your list. If that hardware is unavailable, it automatically falls back to the subsequent options.
-
-Selecting multiple GPU types significantly improves endpoint availability during periods of high demand.
+This setting determines the hardware tier your workers will utilize. You can select multiple GPU categories to create a prioritized list. Runpod attempts to allocate the first category in your list. If that hardware is unavailable, it automatically falls back to the subsequent options. Selecting multiple GPU types significantly improves endpoint availability during periods of high demand.
@@ -44,39 +34,13 @@ Selecting multiple GPU types significantly improves endpoint availability during
This setting defines the minimum number of workers that remain warm and ready to process requests at all times. Setting this to 1 or higher eliminates cold starts for the initial wave of requests. Active workers incur charges even when idle, but they receive a 20-30% discount compared to on-demand workers.
-
-
-For workloads with long cold start times, use active workers to eliminate startup delays. You can estimate the optimal number by:
-
-1. Measuring your requests per minute during typical usage.
-2. Calculating average request duration in seconds.
-3. Using the formula: Active Workers = (Requests per Minute × Request Duration) / 60
-
-For example, with 6 requests per minute taking 30 seconds each: 6 × 30 / 60 = 3 active workers.
-
-Even a small number of active workers can significantly improve performance for steady traffic patterns while maintaining cost efficiency.
-
-
-
### Max workers
-This setting controls the maximum number of concurrent instances your endpoint can scale to. This acts as a safety limit for costs and a cap on concurrency.
-
-
-We recommend setting your max worker count approximately 20% higher than your expected maximum concurrency. This buffer allows for smoother scaling during traffic spikes.
-
-Avoid setting this to 1, as this restricts your deployment to a single machine, creating potential bottlenecks if that machine becomes unavailable.
-
+This setting controls the maximum number of concurrent instances your endpoint can scale to. This acts as a safety limit for costs and a cap on concurrency. We recommend setting your max worker count approximately 20% higher than your expected maximum concurrency. This buffer allows for smoother scaling during traffic spikes.
### GPUs per worker
-This defines how many GPUs are assigned to a single worker instance. The default is 1.
-
-When choosing between multiple lower-tier GPUs or fewer high-end GPUs, you should generally prioritize high-end GPUs with lower GPU count per worker when possible.
-
-- High-end GPUs typically offer faster memory speeds and newer architectures, improving model loading and inference times.
-- Multi-GPU configurations introduce parallel processing overhead that can offset performance gains.
-- Higher GPU-per-worker requirements can reduce availability, as finding machines with multiple free GPUs is more challenging than locating single available GPUs.
+This defines how many GPUs are assigned to a single worker instance. The default is 1. When choosing between multiple lower-tier GPUs or fewer high-end GPUs, you should generally prioritize high-end GPUs with lower GPU count per worker when possible.
### Auto-scaling type
@@ -92,15 +56,9 @@ This setting determines the logic used to scale workers up and down.
The idle timeout determines how long a worker remains active after completing a request before shutting down. While a worker is idle, you are billed for the time, but the worker remains "warm," allowing it to process subsequent requests immediately. The default is 5 seconds.
-
-If you observe frequent cold starts, consider increasing this value to match your average traffic gaps. However, be aware that if you use the [Queue delay](#auto-scaling-type) scaling strategy, setting this value too high may prevent workers from scaling down properly.
-
-
### Execution timeout
-The execution timeout acts as a failsafe to prevent runaway jobs from consuming infinite resources. It specifies the maximum duration a single job is allowed to run before being forcibly terminated.
-
-We strongly recommend keeping this enabled. Set the value to your longest expected request duration plus a 20% buffer. The default is 600 seconds (10 minutes), and it can be extended up to 24 hours.
+The execution timeout acts as a failsafe to prevent runaway jobs from consuming infinite resources. It specifies the maximum duration a single job is allowed to run before being forcibly terminated. We strongly recommend keeping this enabled. The default is 600 seconds (10 minutes), and it can be extended up to 24 hours.
### Job TTL (time-to-live)
@@ -110,7 +68,7 @@ This setting defines how long a job request remains valid in the queue before ex
### FlashBoot
-FlashBoot reduces cold start times by retaining the state of worker resources shortly after they spin down. This allows the system to "revive" a worker much faster than a standard fresh boot. FlashBoot is most effective on endpoints with consistent traffic, where workers frequently cycle between active and idle states. There is no additional cost for enabling FlashBoot.
+FlashBoot reduces cold start times by retaining the state of worker resources shortly after they spin down. This allows the system to "revive" a worker much faster than a standard fresh boot. FlashBoot is most effective on endpoints with consistent traffic, where workers frequently cycle between active and idle states.
### Model (optional)
@@ -133,53 +91,3 @@ This filter ensures your workers are scheduled on host machines with compatible
### Expose HTTP/TCP ports
Enabling this option exposes the public IP and port of the worker, allowing for direct external communication. This is required for applications that need persistent connections, such as WebSockets.
-
-## Reducing worker startup times
-
-There are two primary factors that impact worker start times:
-
-1. **Worker initialization time:** Worker initialization occurs when a Docker image is downloaded to a new worker. This takes place after you create a new endpoint, adjust worker counts, or deploy a new worker image. Requests that arrive during initialization face delays, as a worker must be fully initialized before it can start processing.
-
-2. **Cold start:** A cold start occurs when a worker is revived from an idle state. Cold starts can get very long if your handler code loads large ML models (several gigabytes to hundreds of gigabytes) into GPU memory.
-
-
-
-If your worker's cold start time exceeds the default 7-minute limit (which can occur when loading large models), the system may mark it as unhealthy. To prevent this, you can extend the cold start timeout by setting the `RUNPOD_INIT_TIMEOUT` environment variable. For example, setting `RUNPOD_INIT_TIMEOUT=800` allows up to 800 seconds (13.3 minutes) for revival.
-
-
-
-Use these strategies to reduce worker startup times:
-
-1. **Embed models in Docker images:** Package your ML models directly within your worker container image instead of downloading them in your handler function. This strategy places models on the worker's high-speed local storage (SSD/NVMe), dramatically reducing the time needed to load models into GPU memory. This approach is optimal for production environments, though extremely large models (500GB+) may require network volume storage.
-
-2. **Store large models on network volumes:** For flexibility during development, save large models to a [network volume](/storage/network-volumes) using a Pod or one-time handler, then mount this volume to your Serverless workers. While network volumes offer slower model loading compared to embedding models directly, they can speed up your workflow by enabling rapid iteration and seamless switching between different models and configurations.
-
-3. **Maintain active workers:** Set active worker counts above zero to completely eliminate cold starts. These workers remain ready to process requests instantly and cost up to 30% less when idle compared to standard (flex) workers.
-
-4. **Extend idle timeouts:** Configure longer idle periods to preserve worker availability between requests. This strategy prevents premature worker shutdown during temporary traffic lulls, ensuring no cold starts for subsequent requests.
-
-5. **Optimize scaling parameters:** Fine-tune your auto-scaling configuration for more responsive worker provisioning:
- - Lower queue delay thresholds to 2-3 seconds (default 4).
- - Decrease request count thresholds to 2-3 (default 4).
-
- These refinements create a more agile scaling system that responds swiftly to traffic fluctuations.
-
-6. **Increase maximum worker limits:** Set higher maximum worker capacities to ensure your Docker images are pre-cached across multiple compute nodes and data centers. This proactive approach eliminates image download delays during scaling events, significantly reducing startup times.
-
-## Best practices summary
-
-- **Understand optimization tradeoffs** and make conscious tradeoffs between cost, speed, and model size.
-- **Start conservative** with [max workers](#max-workers) and scale up as needed.
-- **Monitor throttling** and adjust [max workers](#max-workers) accordingly.
-- **Use [active workers](#active-workers)** for latency-sensitive applications.
-- **Select multiple [GPU types](#gpu-configuration)** to improve availability.
-- **Choose appropriate [timeouts](#execution-timeout)** based on your workload characteristics.
-- **Consider data locality** when using network volumes.
-- **Avoid setting [max workers](#max-workers) to 1** to prevent bottlenecks.
-- **Plan for 20% headroom** in [max workers](#max-workers) to handle load spikes.
-- **Prefer high-end GPUs with lower GPU count** for better performance.
-- **Set [execution timeout](#execution-timeout)** to prevent runaway processes.
-- **Match [auto-scaling strategy](#auto-scaling-type)** to your workload patterns.
-- **Embed models in Docker images** when possible for faster loading.
-- **Extend [idle timeouts](#idle-timeout)** to prevent frequent cold starts.
-- **Consider disabling [FlashBoot](#flashboot)** for endpoints with few workers or infrequent traffic.
From cc8ef1c2f64da39bf3998ed042588cbf44b3a887 Mon Sep 17 00:00:00 2001
From: Mo King
Date: Fri, 12 Dec 2025 11:19:20 -0500
Subject: [PATCH 4/7] Split validation/cleanup, Delete unused files
---
docs.json | 21 ++--
serverless/development/benchmarking.mdx | 10 +-
serverless/development/cleanup.mdx | 88 +++++++++++++++
serverless/development/concurrency.mdx | 0
serverless/development/debugger.mdx | 0
serverless/development/dual-mode-worker.mdx | 10 +-
.../development/environment-variables.mdx | 5 -
serverless/development/error-handling.mdx | 4 +-
serverless/development/local-testing.mdx | 10 +-
serverless/development/optimization.mdx | 4 +-
serverless/development/overview.mdx | 62 +++++------
serverless/development/sdk-utilities.mdx | 8 --
serverless/development/ssh-into-workers.mdx | 8 +-
.../development/test-response-times.mdx | 0
serverless/development/validation.mdx | 103 ++++++++++++++++++
serverless/development/validator.mdx | 0
serverless/overview.mdx | 4 +
17 files changed, 250 insertions(+), 87 deletions(-)
delete mode 100644 serverless/development/concurrency.mdx
delete mode 100644 serverless/development/debugger.mdx
delete mode 100644 serverless/development/test-response-times.mdx
create mode 100644 serverless/development/validation.mdx
delete mode 100644 serverless/development/validator.mdx
diff --git a/docs.json b/docs.json
index baaa4807..052ef8f7 100644
--- a/docs.json
+++ b/docs.json
@@ -93,16 +93,17 @@
{
"group": "Development",
"pages": [
- "serverless/development/overview",
- "serverless/development/local-testing",
- "serverless/development/sdk-utilities",
- "serverless/development/error-handling",
- "serverless/development/benchmarking",
- "serverless/development/optimization",
- "serverless/development/logs",
- "serverless/development/dual-mode-worker",
- "serverless/development/ssh-into-workers",
- "serverless/development/environment-variables"
+ "serverless/development/overview",
+ "serverless/development/local-testing",
+ "serverless/development/error-handling",
+ "serverless/development/validation",
+ "serverless/development/cleanup",
+ "serverless/development/benchmarking",
+ "serverless/development/optimization",
+ "serverless/development/logs",
+ "serverless/development/dual-mode-worker",
+ "serverless/development/ssh-into-workers",
+ "serverless/development/environment-variables"
]
}
]
diff --git a/serverless/development/benchmarking.mdx b/serverless/development/benchmarking.mdx
index e16b3c27..5bd56ed5 100644
--- a/serverless/development/benchmarking.mdx
+++ b/serverless/development/benchmarking.mdx
@@ -1,15 +1,15 @@
---
-title: "Benchmarking"
+title: "Benchmark your workers"
sidebarTitle: "Benchmarking"
-description: "Benchmark your Serverless workers and measure delay and execution times."
+description: "Measure the performance of your Serverless workers and identify bottlenecks."
---
-Understanding your worker's performance helps you choose the right GPU and [optimize your code](/serverless/development/optimization). You can measure two key metrics:
+Benchmarking your Serverless workers helps you identify bottlenecks and [optimize your code](/serverless/development/optimization) for performance and cost. Performance is measured by two key metrics:
- **Delay time**: The time spent waiting for a worker to become available. This includes the cold start time if a new worker needs to be spun up.
- - **Execution time**: The time the GPU takes to actually process the request once the worker has received the job.
+ - **Execution time**: The time the GPU takes to process the request once the worker has received the job.
-### Send a test request
+## Send a test request
To gather initial metrics, use `curl` to send a request to your endpoint. This will initiate the job and return a request ID that you can use to poll for status.
diff --git a/serverless/development/cleanup.mdx b/serverless/development/cleanup.mdx
index e69de29b..f5be4870 100644
--- a/serverless/development/cleanup.mdx
+++ b/serverless/development/cleanup.mdx
@@ -0,0 +1,88 @@
+---
+title: "Clean up temporary files"
+sidebarTitle: "Clean up files"
+description: "Manage disk space by automatically removing temporary files."
+---
+
+The Runpod SDK's `clean()` function helps maintain the health of your Serverless worker by removing temporary files and folders after processing completes. This is particularly important for workers that download large assets or generate temporary artifacts, as accumulated data can lead to `DiskQuotaExceeded` errors over time.
+
+## Import the `clean()` function
+
+To use the `clean()` function, import it from the `utils.rp_cleanup` module:
+
+```python
+from runpod.serverless.utils.rp_cleanup import clean
+```
+
+## Default behavior
+
+When called without arguments, `clean()` targets a specific set of default directories for removal:
+
+ - `input_objects/`
+ - `output_objects/`
+ - `job_files/`
+ - `output.zip`
+
+These are standard locations used by various SDK operations, and cleaning them ensures a fresh state for the next request.
+
+## Custom cleanup
+
+If your handler generates files in non-standard directories, you can override the default behavior by passing a list of folder names to the `folder_list` parameter.
+
+```python
+clean(folder_list=["temp_images", "cache", "downloads"])
+```
+
+## Use `clean()` in your handler
+
+You should integrate cleanup logic into your handler's lifecycle, typically within a `finally` block or right before returning the result.
+
+```python
+import runpod
+from runpod.serverless.utils.rp_cleanup import clean
+import requests
+import os
+
+
+def download_image(url, save_path):
+ response = requests.get(url)
+ if response.status_code == 200:
+ with open(save_path, "wb") as file:
+ file.write(response.content)
+ return True
+ return False
+
+
+def handler(event):
+ try:
+ image_url = event["input"]["image_url"]
+
+ # Create a temporary directory
+ os.makedirs("temp_images", exist_ok=True)
+ image_path = "temp_images/downloaded_image.jpg"
+
+ # Download the image
+ if not download_image(image_url, image_path):
+ raise Exception("Failed to download image")
+
+ # Process the image (your code here)
+ result = f"Processed image from: {image_url}"
+
+ # Cleanup specific folders after processing
+ clean(folder_list=["temp_images"])
+
+ return {"output": result}
+ except Exception as e:
+ # Attempt cleanup even if an error occurs
+ clean(folder_list=["temp_images"])
+ return {"error": str(e)}
+
+
+runpod.serverless.start({"handler": handler})
+```
+
+## Best practices
+
+To ensure reliability, always call `clean()` at the end of your handler execution. We recommend wrapping your cleanup calls in a `try...except` or `finally` block so that disk space is recovered even if your main processing logic fails.
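+
+As a minimal sketch of that pattern (the `process()` function is a placeholder for your own logic), cleanup in a `finally` block runs whether the job succeeds or fails:
+
+```python
+import runpod
+from runpod.serverless.utils.rp_cleanup import clean
+
+
+def process(job_input):
+    # Placeholder for your own processing logic.
+    return f"Processed: {job_input}"
+
+
+def handler(event):
+    try:
+        return {"output": process(event["input"])}
+    except Exception as e:
+        return {"error": str(e)}
+    finally:
+        # Runs on success and failure alike, so temporary files never accumulate.
+        clean(folder_list=["temp_images"])
+
+
+runpod.serverless.start({"handler": handler})
+```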
+
+Be cautious when adding custom folders to the cleanup list to avoid accidentally deleting persistent data, and consider logging cleanup actions during development to verify that the correct paths are being targeted.
\ No newline at end of file
diff --git a/serverless/development/concurrency.mdx b/serverless/development/concurrency.mdx
deleted file mode 100644
index e69de29b..00000000
diff --git a/serverless/development/debugger.mdx b/serverless/development/debugger.mdx
deleted file mode 100644
index e69de29b..00000000
diff --git a/serverless/development/dual-mode-worker.mdx b/serverless/development/dual-mode-worker.mdx
index d476f00f..ddb54a95 100644
--- a/serverless/development/dual-mode-worker.mdx
+++ b/serverless/development/dual-mode-worker.mdx
@@ -410,12 +410,4 @@ Congratulations! You've successfully built, deployed, and tested a dual-mode Ser
-This iterative loop (write your handler, update the Docker image, test in Pod mode, then deploy to Serverless) enables you to rapidly develop and debug your Serverless workers.
-
-## Next steps
-
-Now that you've mastered the dual-mode development workflow, you can learn how to:
-
-* [Test your handlers locally before deploying to Serverless.](/serverless/development/local-testing)
-* [Manage and use environment variables for your endpoints.](/serverless/development/environment-variables)
-* [SSH into your workers for debugging.](/serverless/development/ssh-into-workers)
\ No newline at end of file
+This iterative loop (write your handler, update the Docker image, test in Pod mode, then deploy to Serverless) enables you to rapidly develop and debug your Serverless workers.
\ No newline at end of file
diff --git a/serverless/development/environment-variables.mdx b/serverless/development/environment-variables.mdx
index 01d5ed65..6a40cb33 100644
--- a/serverless/development/environment-variables.mdx
+++ b/serverless/development/environment-variables.mdx
@@ -246,8 +246,3 @@ Use different approaches for secrets vs configuration:
- **Secrets**: Only set as runtime variables in the Runpod console.
- **Configuration**: Can use build-time defaults with runtime overrides.
-## Next steps
-
-- [Local testing](/serverless/development/local-testing) - Test your handler with different environment variables locally.
-- [Pod-first development](/serverless/development/pod-first-development) - Use environment variables to control Pod vs Serverless mode.
-- [Optimization](/serverless/development/optimization) - Configure your workers for different environments with CI/CD.
diff --git a/serverless/development/error-handling.mdx b/serverless/development/error-handling.mdx
index aaf28122..836c2924 100644
--- a/serverless/development/error-handling.mdx
+++ b/serverless/development/error-handling.mdx
@@ -8,7 +8,7 @@ Robust error handling is essential for production Serverless workers. It prevent
## Basic error handling
-The simplest way to handle errors is to wrap your handler logic in a try-except block. This ensures that even if your logic fails, the worker remains stable and returns a readable error message.
+The simplest way to handle errors is to wrap your handler logic in a `try...except` block. This ensures that even if your logic fails, the worker remains stable and returns a readable error message.
```python
import runpod
@@ -31,7 +31,7 @@ runpod.serverless.start({"handler": handler})
## Structured error responses
-For more complex applications, you should return consistent error objects. This allows the client consuming your API to programmatically handle different types of errors, such as validation failures versus unexpected server errors.
+For more complex applications, you should return consistent error objects. This allows the client consuming your API to programmatically handle different types of errors, such as [validation failures](/serverless/development/validation) versus unexpected server errors.
```python
import runpod
diff --git a/serverless/development/local-testing.mdx b/serverless/development/local-testing.mdx
index 89b63279..c792cb5c 100644
--- a/serverless/development/local-testing.mdx
+++ b/serverless/development/local-testing.mdx
@@ -197,12 +197,4 @@ This command:
- Starts the local API server on port 8080.
- Uses 4 concurrent workers.
- Sets the log level to `DEBUG` for maximum information.
-- Enables the debugger for troubleshooting.
-
-## Next steps
-
-Once you've tested your handler locally, learn about:
-
-- [SDK utilities](/serverless/development/sdk-utilities): Helper functions for validation and cleanup.
-- [Pod-first development](/serverless/development/pod-first-development): Develop on a Pod before deploying to Serverless.
-- [Logs](/serverless/development/logs): Understand logging in production.
+- Enables the debugger for troubleshooting.
\ No newline at end of file
diff --git a/serverless/development/optimization.mdx b/serverless/development/optimization.mdx
index b677f3fd..9490b955 100644
--- a/serverless/development/optimization.mdx
+++ b/serverless/development/optimization.mdx
@@ -1,10 +1,10 @@
---
title: "Optimize your workers"
-sidebarTitle: "Optimization guide"
+sidebarTitle: "Optimization"
description: "Implement strategies to reduce latency and cost for your Serverless workers."
---
-Optimizing your Serverless workers involves a cycle of measuring performance (e.g. with [benchmarking](/serverless/development/benchmarking)), identifying bottlenecks, and tuning your [endpoint configurations](/serverless/endpoints/endpoint-configurations). This guide covers specific strategies to reduce startup times and improve throughput.
+Optimizing your Serverless workers involves a cycle of measuring performance with [benchmarking](/serverless/development/benchmarking), identifying bottlenecks, and tuning your [endpoint configurations](/serverless/endpoints/endpoint-configurations). This guide covers specific strategies to reduce startup times and improve throughput.
## Optimization overview
diff --git a/serverless/development/overview.mdx b/serverless/development/overview.mdx
index c7514f85..3aa61544 100644
--- a/serverless/development/overview.mdx
+++ b/serverless/development/overview.mdx
@@ -1,10 +1,10 @@
---
title: "Serverless development"
sidebarTitle: "Overview"
-description: "Build, test, and deploy Serverless workers to production."
+description: "Test, debug, and optimize your Serverless applications."
---
-When developing for Runpod Serverless, you'll typically start by writing handler functions, test them locally, and then deploy to production. This guide introduces the development workflow and tools that help you build, test, and deploy Serverless workers effectively.
+When developing for Runpod Serverless, you'll typically start by writing handler functions, testing them locally, and then deploying them to production. This guide introduces the development workflow and tools that help you test, debug, and optimize your Serverless applications effectively.
## Development lifecycle
@@ -17,7 +17,7 @@ When your handler is working correctly, package it into a Docker image and deplo
%%{init: {'theme':'base', 'themeVariables': { 'primaryColor':'#5D29F0','primaryTextColor':'#fff','primaryBorderColor':'#874BFF','lineColor':'#AE6DFF','secondaryColor':'#AE6DFF','tertiaryColor':'#FCB1FF','edgeLabelBackground':'#AE6DFF', 'fontSize':'15px','fontFamily':'font-inter'}}}%%
flowchart TD
- Start([Write handler function]) --> Test[Test locally with SDK]
+ Start([Write handler function]) --> Test[Test handler locally with the Runpod SDK]
Test --> Check{Tests pass?}
@@ -25,13 +25,13 @@ flowchart TD
Fix --> Test
- Check -->|" Yes "| Package[Package Docker image]
+ Check -->|" Yes "| Package[Package worker as a Docker image]
- Package --> Deploy[Deploy to serverless endpoint]
+ Package --> Deploy[Deploy worker image to Runpod Serverless]
- subgraph Production [Production environment]
- Deploy --> Running[Auto-scaling execution]
- Running --> Monitor[Monitor logs & metrics]
+ subgraph Production [Production]
+ Deploy --> Running[Workers auto-scale based on demand]
+ Running --> Monitor[Monitor logs and metrics, SSH into workers for live debugging]
end
Monitor -.-> Start
@@ -54,9 +54,7 @@ flowchart TD
For faster iteration and debugging of GPU-intensive applications, you can develop on a Pod first before deploying to Serverless. This "Pod-first" workflow gives you direct access to the GPU environment with tools like Jupyter Notebooks and SSH, letting you iterate faster than deploying repeatedly to Serverless. Learn more in [Pod-first development](/serverless/development/dual-mode-worker).
-## Development features
-
-### Local testing environment
+## Local testing
The Runpod SDK provides a comprehensive local testing environment:
@@ -67,16 +65,31 @@ The Runpod SDK provides a comprehensive local testing environment:
Learn more in [Local testing](/serverless/development/local-testing).
-### SDK utilities
+## Error handling
+
+Implement robust error handling to ensure your workers remain stable and return useful error messages.
+
+Learn more in [Error handling](/serverless/development/error-handling).
+
+## SDK utilities
The Runpod SDK includes helper functions to make your handlers more robust:
- **Input validation**: Validate request data against a schema.
- **Cleanup utilities**: Automatically remove temporary files after processing.
-Learn more in [SDK utilities](/serverless/development/sdk-utilities).
+Learn more in [Validate inputs](/serverless/development/validation) and [Clean up files](/serverless/development/cleanup).
+
+## Benchmarking and optimization
+
+Optimize your workers for performance and cost:
+
+- **Benchmark response times**: Measure cold start and execution time.
+- **Optimize your workers**: Reduce startup and execution times.
+
+Learn more in the [Benchmarking](/serverless/development/benchmarking) and [Optimization](/serverless/development/optimization) guides.
-### Pod-first development
+## Pod-first development
For faster iteration and debugging of GPU-intensive applications, develop on a Pod first, then deploy the same Docker image to Serverless. This workflow provides:
@@ -86,7 +99,7 @@ For faster iteration and debugging of GPU-intensive applications, develop on a P
Learn more in [Pod-first development](/serverless/development/dual-mode-worker).
-### Debugging and observability
+## Debugging and observability
Runpod provides several tools for debugging and monitoring:
@@ -94,27 +107,10 @@ Runpod provides several tools for debugging and monitoring:
- **Metrics**: Monitor execution time, delay time, and resource usage.
- **SSH access**: Connect directly to running workers for live debugging.
-Learn more in [Logs](/serverless/development/logs) and [SSH access](/serverless/development/ssh-into-workers).
+Learn more in [Logs and monitoring](/serverless/development/logs) and [Connect to workers with SSH](/serverless/development/ssh-into-workers).
## Environment variables
Use environment variables to configure your workers without hardcoding credentials or settings in your code. Environment variables are set in the Runpod console and are available to your handler at runtime.
Learn more in [Environment variables](/serverless/development/environment-variables).
-
-## Benchmarking and optimization
-
-Optimize your workers for performance and cost:
-
-- **Benchmark response times**: Measure cold start and execution time.
-- **Error handling**: Implement robust error handling in your handler.
-
-Learn more in [Benchmarking and optimization](/serverless/development/optimization).
-
-## Next steps
-
-Start by learning how to test your handler locally:
-
-- [Local testing](/serverless/development/local-testing)
-- [SDK utilities](/serverless/development/sdk-utilities)
-- [Pod-first development](/serverless/development/pod-first-development)
diff --git a/serverless/development/sdk-utilities.mdx b/serverless/development/sdk-utilities.mdx
index a35f75f6..620b7048 100644
--- a/serverless/development/sdk-utilities.mdx
+++ b/serverless/development/sdk-utilities.mdx
@@ -175,11 +175,3 @@ clean(folder_list=["temp_images", "cache", "downloads"])
- Use try-except blocks to handle errors during cleanup.
- Be cautious when adding custom folders to the cleanup list.
- Consider logging cleanup actions for debugging purposes.
-
-## Next steps
-
-Learn about other development tools:
-
-- [Local testing](/serverless/development/local-testing): Test your handler before deploying.
-- [Pod-first development](/serverless/development/pod-first-development): Develop on a Pod before deploying to Serverless.
-- [Environment variables](/serverless/development/environment-variables): Configure your workers without hardcoding credentials.
diff --git a/serverless/development/ssh-into-workers.mdx b/serverless/development/ssh-into-workers.mdx
index 1b389c2f..719bc56f 100644
--- a/serverless/development/ssh-into-workers.mdx
+++ b/serverless/development/ssh-into-workers.mdx
@@ -1,14 +1,14 @@
---
-title: "SSH into running workers"
+title: "Connect to workers with SSH"
sidebarTitle: "SSH into workers"
-description: "Connect to your Serverless workers via SSH for debugging and troubleshooting."
+description: "SSH into running workers for debugging and troubleshooting."
---
-SSH into running workers to debug endpoints in development and production. By connecting to a worker, you can inspect logs, file systems, and environment variables in real-time.
+You can connect directly to running workers via SSH for debugging and troubleshooting. By connecting to a worker, you can inspect logs, file systems, and environment variables in real-time.
## Generate an SSH key and add it to your Runpod account
-Before you can SSH into a worker, you'll need to generate an SSH key and add it to your Runpod account.
+Before you can connect to a worker, you'll need to generate an SSH key and add it to your Runpod account.
diff --git a/serverless/development/test-response-times.mdx b/serverless/development/test-response-times.mdx
deleted file mode 100644
index e69de29b..00000000
diff --git a/serverless/development/validation.mdx b/serverless/development/validation.mdx
new file mode 100644
index 00000000..f419d31c
--- /dev/null
+++ b/serverless/development/validation.mdx
@@ -0,0 +1,103 @@
+---
+title: "Validate inputs"
+sidebarTitle: "Validate inputs"
+description: "Validate handler inputs using the Runpod SDK schema validator."
+---
+
+The Runpod SDK includes a built-in validation utility that ensures your handler receives data in the correct format before processing begins. Validating inputs early helps catch errors immediately and prevents your worker from crashing due to unexpected or malformed data types.
+
+## Import the validator
+
+To use the validation features, import the `validate` function from the utils module:
+
+```python
+from runpod.serverless.utils.rp_validator import validate
+```
+
+## Define a schema
+
+You define your validation rules using a dictionary where each key represents an expected input field. The schema specifies each field's data type, whether it's required, and any constraints on its value.
+
+```python
+schema = {
+ "text": {
+ "type": str,
+ "required": True,
+ },
+ "max_length": {
+ "type": int,
+ "required": False,
+ "default": 100,
+ "constraints": lambda x: x > 0,
+ },
+}
+```
+
+The schema supports several configuration keys:
+- `type` (required): Expected input type (e.g., `str`, `int`, `float`, `bool`).
+- `required` (default: `False`): Whether the field is required.
+- `default` (default: `None`): Default value if input is not provided.
+- `constraints` (optional): A lambda function that returns `True` or `False` to validate the value (see the example below).
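+
+For example, the `constraints` callable can express any check that returns a boolean. A hypothetical `mode` field limited to two accepted values might look like this:
+
+```python
+schema = {
+    "mode": {
+        "type": str,
+        "required": False,
+        "default": "fast",
+        # Reject anything other than the two supported modes.
+        "constraints": lambda x: x in ["fast", "quality"],
+    },
+}
+```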
+
+## Validate input in your handler
+
+When implementing validation in your handler, pass the input object and your schema to the `validate` function. The function returns a dictionary containing either an `errors` key or a `validated_input` key.
+
+```python
+import runpod
+from runpod.serverless.utils.rp_validator import validate
+
+schema = {
+ "text": {
+ "type": str,
+ "required": True,
+ },
+ "max_length": {
+ "type": int,
+ "required": False,
+ "default": 100,
+ "constraints": lambda x: x > 0,
+ },
+}
+
+
+def handler(event):
+ try:
+ # Validate the input against the schema
+ validated_input = validate(event["input"], schema)
+
+ # Check for validation errors
+ if "errors" in validated_input:
+ return {"error": validated_input["errors"]}
+
+ # Access the sanitized inputs
+ text = validated_input["validated_input"]["text"]
+ max_length = validated_input["validated_input"]["max_length"]
+
+ result = text[:max_length]
+ return {"output": result}
+ except Exception as e:
+ return {"error": str(e)}
+
+
+runpod.serverless.start({"handler": handler})
+```
+
+## Test the validator
+
+You can test your validation logic locally without deploying. Save your handler code and run it via the command line with the `--test_input` flag.
+
+```sh
+python your_handler.py --test_input '{"input": {"text": "Hello, world!", "max_length": 5}}'
+```
+
+Alternatively, you can define your test case in a JSON file and pass it to the handler to simulate a real request.
+
+```json test_input.json
+{
+ "input": {
+ "text": "The quick brown fox jumps over the lazy dog",
+ "max_length": 50
+ }
+}
+```
\ No newline at end of file
diff --git a/serverless/development/validator.mdx b/serverless/development/validator.mdx
deleted file mode 100644
index e69de29b..00000000
diff --git a/serverless/overview.mdx b/serverless/overview.mdx
index 087ad5c7..4fd1fe9a 100644
--- a/serverless/overview.mdx
+++ b/serverless/overview.mdx
@@ -180,6 +180,10 @@ When deploying models on Serverless endpoints, follow this order of preference:
 3. [Use network volumes](/serverless/storage/network-volumes): You can use network volumes to store models and other files that need to persist between workers. Models loaded from network storage are slower than cached or baked models, so you should only use this option when the preceding approaches don't fit your needs.
+## Development lifecycle
+
+When developing Serverless applications, you'll typically start by writing a handler function, testing it locally, and then deploying it to production. To learn more about testing, error handling, monitoring, and optimizing your Serverless applications, see [Serverless development](/serverless/development/overview).
+
## Next steps
Ready to get started with Runpod Serverless?
From 8ab88c3bc1692dbfb201d115c2c73f67aaa46265 Mon Sep 17 00:00:00 2001
From: Mo King
Date: Fri, 12 Dec 2025 11:58:22 -0500
Subject: [PATCH 5/7] rm sdk-utilities
---
serverless/development/sdk-utilities.mdx | 177 -----------------------
1 file changed, 177 deletions(-)
delete mode 100644 serverless/development/sdk-utilities.mdx
diff --git a/serverless/development/sdk-utilities.mdx b/serverless/development/sdk-utilities.mdx
deleted file mode 100644
index 620b7048..00000000
--- a/serverless/development/sdk-utilities.mdx
+++ /dev/null
@@ -1,177 +0,0 @@
----
-title: "SDK utilities"
-description: "Use helper functions to validate inputs and clean up temporary files."
----
-
-The Runpod SDK includes helper functions to make your handlers more robust and easier to maintain. These utilities handle common tasks like input validation and cleanup.
-
-## Input validation
-
-The validator utility ensures your handler receives the correct input format before processing. This helps catch errors early and prevents issues from unexpected or malformed inputs.
-
-### Import the validator
-
-```python
-from runpod.serverless.utils.rp_validator import validate
-```
-
-### Define a schema
-
-Define your schema as a dictionary with validation rules for each input field:
-
-```python
-schema = {
- "text": {
- "type": str,
- "required": True,
- },
- "max_length": {
- "type": int,
- "required": False,
- "default": 100,
- "constraints": lambda x: x > 0,
- },
-}
-```
-
-Schema properties:
-- `type` (required): Expected input type (e.g., `str`, `int`, `float`, `bool`).
-- `required` (default: `False`): Whether the field is required.
-- `default` (default: `None`): Default value if input is not provided.
-- `constraints` (optional): A lambda function that returns `True` or `False` to validate the value.
-
-### Validate input in your handler
-
-```python
-import runpod
-from runpod.serverless.utils.rp_validator import validate
-
-schema = {
- "text": {
- "type": str,
- "required": True,
- },
- "max_length": {
- "type": int,
- "required": False,
- "default": 100,
- "constraints": lambda x: x > 0,
- },
-}
-
-
-def handler(event):
- try:
- validated_input = validate(event["input"], schema)
- if "errors" in validated_input:
- return {"error": validated_input["errors"]}
-
- text = validated_input["validated_input"]["text"]
- max_length = validated_input["validated_input"]["max_length"]
-
- result = text[:max_length]
- return {"output": result}
- except Exception as e:
- return {"error": str(e)}
-
-
-runpod.serverless.start({"handler": handler})
-```
-
-### Test the validator
-
-Save your handler as `your_handler.py` and test it:
-
-```sh
-python your_handler.py --test_input '{"input": {"text": "Hello, world!", "max_length": 5}}'
-```
-
-Or create a `test_input.json` file:
-
-```json test_input.json
-{
- "input": {
- "text": "The quick brown fox jumps over the lazy dog",
- "max_length": 50
- }
-}
-```
-
-## Cleanup utility
-
-The cleanup utility removes temporary files and folders after your handler completes processing. This prevents disk space issues from accumulating temporary data.
-
-### Import the cleanup function
-
-```python
-from runpod.serverless.utils.rp_cleanup import clean
-```
-
-### Default behavior
-
-By default, `clean()` removes these directories and files:
-- `input_objects/`
-- `output_objects/`
-- `job_files/`
-- `output.zip`
-
-### Use cleanup in your handler
-
-```python
-import runpod
-from runpod.serverless.utils.rp_cleanup import clean
-import requests
-import os
-
-
-def download_image(url, save_path):
- response = requests.get(url)
- if response.status_code == 200:
- with open(save_path, "wb") as file:
- file.write(response.content)
- return True
- return False
-
-
-def handler(event):
- try:
- image_url = event["input"]["image_url"]
-
- # Create a temporary directory
- os.makedirs("temp_images", exist_ok=True)
- image_path = "temp_images/downloaded_image.jpg"
-
- # Download the image
- if not download_image(image_url, image_path):
- raise Exception("Failed to download image")
-
- # Process the image (your code here)
- result = f"Processed image from: {image_url}"
-
- # Cleanup after processing
- clean(folder_list=["temp_images"])
-
- return {"output": result}
- except Exception as e:
- # Attempt cleanup even on error
- clean(folder_list=["temp_images"])
- return {"error": str(e)}
-
-
-runpod.serverless.start({"handler": handler})
-```
-
-### Custom cleanup
-
-Specify additional folders to remove by passing a list to `clean()`:
-
-```python
-clean(folder_list=["temp_images", "cache", "downloads"])
-```
-
-### Best practices
-
-- Call `clean()` at the end of your handler to ensure proper cleanup.
-- Use try-except blocks to handle errors during cleanup.
-- Be cautious when adding custom folders to the cleanup list.
-- Consider logging cleanup actions for debugging purposes.
From 030159691d10f7e1cb1832401851c40303901615 Mon Sep 17 00:00:00 2001
From: Mo King
Date: Fri, 12 Dec 2025 12:51:52 -0500
Subject: [PATCH 6/7] Add tip to public endpoints
---
hub/public-endpoints.mdx | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/hub/public-endpoints.mdx b/hub/public-endpoints.mdx
index 517313b5..6d9073bf 100644
--- a/hub/public-endpoints.mdx
+++ b/hub/public-endpoints.mdx
@@ -10,6 +10,10 @@ description: "Test and deploy production-ready AI models using Public Endpoints.
Runpod Public Endpoints provide instant access to state-of-the-art AI models through simple API calls, with an API playground available through the [Runpod Hub](/hub/overview).
+
+Public Endpoints are pre-deployed models hosted by Runpod. If you want to deploy your own AI/ML APIs, use [Runpod Serverless](/serverless/overview).
+
+
## Available models
For a list of available models and model-specific parameters, see the [Public Endpoint model reference](/hub/public-endpoint-reference).
From 9476895a83ba70212f8dba21e289d7edc49c3c0e Mon Sep 17 00:00:00 2001
From: Mo King
Date: Fri, 12 Dec 2025 13:20:22 -0500
Subject: [PATCH 7/7] Add guide for choosing a compute service
---
docs.json | 1 +
get-started/product-overview.mdx | 121 +++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+)
create mode 100644 get-started/product-overview.mdx
diff --git a/docs.json b/docs.json
index 052ef8f7..5214b28f 100644
--- a/docs.json
+++ b/docs.json
@@ -39,6 +39,7 @@
"overview",
"get-started",
"get-started/concepts",
+ "get-started/product-overview",
"get-started/manage-accounts",
"get-started/api-keys",
"get-started/connect-to-runpod"
diff --git a/get-started/product-overview.mdx b/get-started/product-overview.mdx
new file mode 100644
index 00000000..86aedbf5
--- /dev/null
+++ b/get-started/product-overview.mdx
@@ -0,0 +1,121 @@
+---
+title: "Choose the right compute service"
+sidebarTitle: "Choose a compute service"
+description: "Find the right compute solution for your AI/ML workload."
+---
+
+Runpod provides several compute options designed for different stages of the AI lifecycle, from exploration and development to production scaling. Choosing the right option depends on your specific requirements regarding scalability, persistence, and infrastructure management.
+
+## Product overview
+
+Use this decision matrix to identify the best Runpod solution for your workload:
+
+| If you want to... | Use... | Because it... |
+| :--- | :--- | :--- |
+| **Call a standard model API** (Llama 3, Flux) without managing infrastructure | [Public Endpoints](/hub/public-endpoints) | Provides instant APIs for using popular models with usage-based pricing. |
+| **Serve a custom model** that scales automatically with traffic | [Serverless](/serverless/overview) | Handles GPU/CPU auto-scaling and charges only for active compute time. |
+| **Develop code**, debug, or train models interactively | [Pods](/pods/overview) | Gives you a persistent GPU/CPU environment with full terminal/SSH access, similar to a cloud VPS. |
+| **Train massive models** across multiple GPU nodes | [Instant Clusters](/instant-clusters) | Provides pre-configured high-bandwidth interconnects for distributed training workloads. |
+
+## Detailed breakdown
+
+### [Serverless](/serverless/overview): Create custom AI/ML APIs
+
+Serverless is designed for deployment. It abstracts away the underlying infrastructure, allowing you to define a Worker (a Docker container) that spins up on demand to handle incoming API requests.
+
+**Key characteristics:**
+
+- **Auto-scaling:** Scales from zero to hundreds of workers based on request volume.
+- **Stateless:** Workers are ephemeral; they spin up, process a request, and spin down.
+- **Billing:** Pay-per-second of compute time. No cost when idle.
+- **Best for:** Production inference, sporadic workloads, and scalable microservices.
+
+### [Pods](/pods/overview): Train and fine-tune models using a persistent GPU environment
+
+Pods provide a persistent computing environment. When you deploy a Pod, you are renting a specific GPU instance that stays active until you stop or terminate it. This is equivalent to renting a virtual machine with a GPU attached.
+
+**Key characteristics:**
+
+- **Persistent:** Your environment, installed packages, and running processes persist as long as the Pod is active.
+- **Interactive:** Full access via SSH, JupyterLab, or VSCode Server.
+- **Billing:** Pay-per-minute (or hourly) for the reserved time, regardless of usage.
+- **Best for:** Model training, fine-tuning, debugging code, exploring datasets, and long-running background tasks that do not require auto-scaling.
+
+### [Public Endpoints](/hub/public-endpoints): Instant access to popular models
+
+Public Endpoints are Runpod-managed Serverless endpoints hosting popular community models. They require zero configuration and allow you to integrate AI capabilities into your application immediately.
+
+**Key characteristics:**
+
+- **Zero setup:** No Dockerfiles or infrastructure configuration required.
+- **Standard APIs:** OpenAI-compatible inputs for LLMs; standard JSON inputs for image generation.
+- **Billing:** Pay-per-token (text) or pay-per-generation (image/video).
+- **Best for:** Rapid prototyping, applications using standard open-source models, and users who do not need custom model weights.
+
+### [Instant Clusters](/instant-clusters): For distributed workloads
+
+Instant Clusters allow you to provision multiple GPU/CPU nodes networked together with high-speed interconnects (up to 3200 Gbps).
+
+**Key characteristics:**
+
+- **Multi-node:** Orchestrated groups of 2 to 8+ nodes.
+- **High performance:** Optimized for low-latency inter-node communication (NCCL).
+- **Best for:** Distributed training (FSDP, DeepSpeed), fine-tuning large language models (70B+ parameters), and HPC simulations.
+
+## Workflow examples
+
+### Develop-to-deploy cycle
+
+**Goal:** Build a custom AI application from scratch and ship it to production.
+
+1. **Interactive development:** You deploy a single [Pod](/pods/overview) with a GPU to act as your cloud workstation. You connect via VSCode or JupyterLab to write code, install dependencies, and debug your inference logic in real-time.
+2. **Containerization:** Once your code is working, you use the Pod to build a Docker image containing your application and dependencies, pushing it to a container registry.
+3. **Production deployment:** You deploy that Docker image as a [Serverless Endpoint](/serverless/overview). Your application is now ready to handle production traffic, automatically scaling workers up during spikes and down to zero when idle.
+
+### Distributed training pipeline
+
+**Goal:** Fine-tune a massive LLM (70B+) and serve it immediately without moving data.
+
+1. **Multi-node training:** You spin up an [Instant Cluster](/instant-clusters) with 8x H100 GPUs to fine-tune a Llama-3-70B model using FSDP or DeepSpeed.
+2. **Unified storage:** Throughout training, checkpoints and the final model weights are saved directly to a [network volume](/storage/network-volumes) attached to the cluster.
+3. **Instant serving:** You deploy a [vLLM Serverless worker](/serverless/vllm/overview) and mount that *same* network volume. The endpoint reads the model weights directly from storage, allowing you to serve your newly trained model via API minutes after training finishes.
+
+### Startup MVP
+
+**Goal:** Launch a GenAI avatar app quickly with minimal DevOps overhead.
+
+1. **Prototype with Public Endpoints:** You validate your product idea using the [Flux Public Endpoint](/hub/public-endpoints) to generate images. This requires zero infrastructure setup; you simply pay per image generated.
+2. **Scale with Serverless:** As you grow, you need a unique art style. You fine-tune a model and deploy it as a [Serverless Endpoint](/serverless/overview). This allows your app to handle traffic spikes automatically while scaling down to zero costs during quiet hours.
+
+### Interactive research loop
+
+**Goal:** Experiment with new model architectures using large datasets.
+
+1. **Explore on a Pod:** Spin up a single-GPU [Pod](/pods/overview) with JupyterLab enabled. Mount a [network volume](/storage/network-volumes) to hold your 2TB dataset.
+2. **Iterate code:** Write and debug your training loop interactively in the Pod. If the process crashes, the Pod restarts quickly, and your data on the network volume remains safe.
+3. **Scale up:** Once the code is stable, you don't need to move the data. You terminate the single Pod and spin up an [Instant Cluster](/instant-clusters) attached to that *same* network volume to run the full training job across multiple nodes.
+
+### Hybrid inference pipeline
+
+**Goal:** Run a complex pipeline involving both lightweight logic and heavy GPU inference.
+
+1. **Orchestration:** Your main application runs on a cheap CPU Pod or external cloud function. It handles user authentication, request validation, and business logic.
+2. **Heavy lifting:** When a valid request comes in, your app calls a [Serverless Endpoint](/serverless/overview) hosting a large LLM (e.g., Llama-3-70B) specifically for the inference step.
+3. **Async handoff:** The Serverless worker processes the request and uploads the result directly to [S3-compatible storage](/serverless/storage/overview), returning a signed URL to your main app. This keeps your API response lightweight and fast.
+
+### Batch processing job
+
+**Goal:** Process 10,000 video files overnight for a media company.
+
+1. **Queue requests:** Your backend pushes 10,000 job payloads to a [Serverless Endpoint](/serverless/overview) configured as an asynchronous queue (see the sketch after this list).
+2. **Auto-scale:** The endpoint detects the queue depth and automatically spins up 50 concurrent workers (e.g., L4 GPUs) to process the videos in parallel.
+3. **Cost optimization:** As the queue drains, the workers scale down to zero automatically. You pay only for the exact GPU seconds used to process the videos, with no idle server costs.
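+
+A minimal sketch of step 1 using the Runpod Python SDK (the API key, endpoint ID, and input fields are placeholders; your worker defines the actual payload shape):
+
+```python
+import runpod
+
+runpod.api_key = "YOUR_API_KEY"  # Placeholder
+endpoint = runpod.Endpoint("YOUR_ENDPOINT_ID")  # Placeholder
+
+video_urls = [f"https://example.com/videos/{i}.mp4" for i in range(10_000)]  # Illustrative
+
+# Queue every video as an asynchronous job; the endpoint scales workers to drain the queue.
+jobs = [endpoint.run({"input": {"video_url": url}}) for url in video_urls]
+
+# Check on a job later; fetch results with .output() once each job completes.
+print(jobs[0].status())
+```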
+
+### Enterprise fine-tuning factory
+
+**Goal:** Regularly fine-tune models on new customer data automatically.
+
+1. **Data ingestion:** Customer data is uploaded to a shared [network volume](/storage/network-volumes).
+2. **Programmatic training:** A script uses the [Runpod API](/api-reference/pods/POST/pods) to spin up a fresh On-Demand Pod.
+3. **Execution:** The Pod mounts the volume, runs the training script, saves the new model weights back to the volume, and then [terminates itself](/pods/manage-pods#terminate-a-pod) via API call to stop billing immediately.
+4. **Hot reload:** A separate Serverless endpoint is triggered to reload the new weights from the volume (or [update the cached model](/serverless/endpoints/model-caching)), making the new model available for inference immediately.
\ No newline at end of file