From 84a67c6e9291d8d604e93fccf207b7ab93169789 Mon Sep 17 00:00:00 2001 From: dgokeeffe Date: Wed, 4 Feb 2026 15:07:49 +1100 Subject: [PATCH 1/3] feat: Add adb-coding-assistants-cluster module Add Terraform module for deploying Claude Code CLI on Databricks clusters with MLflow tracing integration. Features: - Claude Code CLI installation with Node.js runtime - Databricks authentication integration via proxy endpoints - MLflow tracing for Claude Code sessions - VS Code/Cursor Remote SSH support - Token refresh helpers and cron automation - Databricks skills for common patterns - Network dependency validation script - Minimal installation option for constrained environments The module includes init scripts that: - Install Claude Code CLI and dependencies - Configure authentication via DATABRICKS_TOKEN - Set up bashrc helpers for token management - Support profile-based Azure authentication - Disable experimental betas for stability Co-authored-by: Cursor --- .gitignore | 5 + README.md | 2 + .../adb-coding-assistants-cluster/Makefile | 7 + .../adb-coding-assistants-cluster/README.md | 374 +++++++++ .../adb-coding-assistants-cluster/main.tf | 21 + .../adb-coding-assistants-cluster/outputs.tf | 58 ++ .../providers.tf | 91 +++ .../terraform.tfvars.example | 88 ++ .../variables.tf | 106 +++ .../adb-coding-assistants-cluster/versions.tf | 3 + .../adb-coding-assistants-cluster/Makefile | 7 + .../adb-coding-assistants-cluster/README.md | 461 +++++++++++ modules/adb-coding-assistants-cluster/main.tf | 83 ++ .../adb-coding-assistants-cluster/outputs.tf | 34 + .../scripts/README.md | 353 ++++++++ .../scripts/check-network-deps.sh | 238 ++++++ .../scripts/install-claude-minimal.sh | 66 ++ .../scripts/install-claude.sh | 760 ++++++++++++++++++ .../scripts/vscode-setup.sh | 248 ++++++ .../variables.tf | 94 +++ .../adb-coding-assistants-cluster/versions.tf | 10 + 21 files changed, 3109 insertions(+) create mode 100644 examples/adb-coding-assistants-cluster/Makefile create mode 
100644 examples/adb-coding-assistants-cluster/README.md create mode 100644 examples/adb-coding-assistants-cluster/main.tf create mode 100644 examples/adb-coding-assistants-cluster/outputs.tf create mode 100644 examples/adb-coding-assistants-cluster/providers.tf create mode 100644 examples/adb-coding-assistants-cluster/terraform.tfvars.example create mode 100644 examples/adb-coding-assistants-cluster/variables.tf create mode 100644 examples/adb-coding-assistants-cluster/versions.tf create mode 100644 modules/adb-coding-assistants-cluster/Makefile create mode 100644 modules/adb-coding-assistants-cluster/README.md create mode 100644 modules/adb-coding-assistants-cluster/main.tf create mode 100644 modules/adb-coding-assistants-cluster/outputs.tf create mode 100644 modules/adb-coding-assistants-cluster/scripts/README.md create mode 100755 modules/adb-coding-assistants-cluster/scripts/check-network-deps.sh create mode 100755 modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh create mode 100755 modules/adb-coding-assistants-cluster/scripts/install-claude.sh create mode 100755 modules/adb-coding-assistants-cluster/scripts/vscode-setup.sh create mode 100644 modules/adb-coding-assistants-cluster/variables.tf create mode 100644 modules/adb-coding-assistants-cluster/versions.tf diff --git a/.gitignore b/.gitignore index a248d4ca..4f282ee0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ *.tfstate.lock. 
*.terraform.lock.hcl +# Terraform plan files +*.plan + # logs *.log @@ -22,6 +25,8 @@ # Ignored Terraform files *gitignore*.tf +terraform.tfvars +!terraform.tfvars.example # Ignore Mac .DS_Store files .DS_Store diff --git a/README.md b/README.md index 68baa2f6..8c0ad6f6 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ The folder `examples` contains the following Terraform implementation examples : | Azure | [adb-uc](examples/adb-uc/) | ADB Unity Catalog Process | | Azure | [adb-unity-catalog-basic-demo](examples/adb-unity-catalog-basic-demo/) | ADB Unity Catalog end-to-end demo including UC metastore setup, Users/groups sync from AAD to databricks account, UC Catalog, External locations, Schemas, & Access Grants | | Azure | [adb-overwatch](examples/adb-overwatch/) | Overwatch multi-workspace deployment on Azure | +| Azure | [adb-coding-assistants-cluster](examples/adb-coding-assistants-cluster/) | Databricks cluster with Claude Code CLI for AI-assisted development | | AWS | [aws-workspace-basic](examples/aws-workspace-basic/) | Provisioning AWS Databricks E2 | | AWS | [aws-workspace-with-firewall](examples/aws-workspace-with-firewall/) | Provisioning AWS Databricks E2 with an AWS Firewall | | AWS | [aws-exfiltration-protection](examples/aws-exfiltration-protection/) | An implementation of [Data Exfiltration Protection on AWS](https://www.databricks.com/blog/2021/02/02/data-exfiltration-protection-with-databricks-on-aws.html) | @@ -82,6 +83,7 @@ The folder `modules` contains the following Terraform modules : | Azure | [adb-overwatch-main-ws](modules/adb-overwatch-main-ws/) | Main Overwatch workspace deployment | | Azure | [adb-overwatch-ws-to-monitor](modules/adb-overwatch-ws-to-monitor/) | Overwatch deployment on the Azure workspace to monitor | | Azure | [adb-overwatch-analysis](modules/adb-overwatch-analysis/) | Overwatch analysis notebooks deployment on Azure | +| Azure | [adb-coding-assistants-cluster](modules/adb-coding-assistants-cluster/) | 
Databricks cluster with Claude Code CLI for AI-assisted development | | AWS | [aws-workspace-basic](modules/aws-workspace-basic/) | Provisioning AWS Databricks E2 | | AWS | [aws-databricks-base-infra](modules/aws-databricks-base-infra/) | Provisioning AWS Infrastructure to be used for the deployment of a Databricks E2 workspace | | AWS | [aws-databricks-unity-catalog](modules/aws-databricks-unity-catalog/) | Provisioning the AWS Infrastructure and setting up the metastore for Databricks Unity Catalog | diff --git a/examples/adb-coding-assistants-cluster/Makefile b/examples/adb-coding-assistants-cluster/Makefile new file mode 100644 index 00000000..653039d8 --- /dev/null +++ b/examples/adb-coding-assistants-cluster/Makefile @@ -0,0 +1,7 @@ +.PHONY: docs test_docs + +docs: + terraform-docs -c ../../.terraform-docs.yml . + +test_docs: + terraform-docs -c ../../.terraform-docs.yml --output-check . diff --git a/examples/adb-coding-assistants-cluster/README.md b/examples/adb-coding-assistants-cluster/README.md new file mode 100644 index 00000000..6dfaa8d7 --- /dev/null +++ b/examples/adb-coding-assistants-cluster/README.md @@ -0,0 +1,374 @@ +# Provisioning Databricks Cluster with Claude Code CLI + +This example uses the [adb-coding-assistants-cluster](../../modules/adb-coding-assistants-cluster) module. + +This template provides an example deployment of a Databricks cluster pre-configured with Claude Code CLI for AI-assisted development directly on the cluster. + +## What Gets Deployed + +* Unity Catalog Volume for init script storage +* Databricks cluster with Claude Code CLI auto-installed on startup +* MLflow experiment for tracing Claude Code sessions +* Bash helper functions for easy usage + +## How to use + +> **Note** +> A detailed module README with full configuration options can be found in [modules/adb-coding-assistants-cluster](../../modules/adb-coding-assistants-cluster) + +1. 
Reference this module using one of the different [module source types](https://developer.hashicorp.com/terraform/language/modules/sources) +2. Copy `terraform.tfvars.example` to `terraform.tfvars` +3. Update `terraform.tfvars` with your values: + - `databricks_resource_id`: Your Azure Databricks workspace resource ID + - `cluster_name`: Name for your cluster + - `catalog_name`: Unity Catalog name to use +4. (Optional) Customize cluster configuration in `terraform.tfvars` (node type, autoscaling, etc.) +5. (Optional) Configure your [remote backend](https://developer.hashicorp.com/terraform/language/settings/backends/azurerm) +6. Run `terraform init` to initialize terraform and get provider ready +7. Run `terraform plan` to review the resources that will be created +8. Run `terraform apply` to create the resources + +## Prerequisites + +- Databricks workspace with Unity Catalog enabled +- Unity Catalog with an existing catalog and schema +- **Unity Catalog metastore must have a root storage credential configured** (required for volumes) +- Permission to create clusters +- (For Azure) Authenticated via `az login` or environment variables +- Databricks Runtime 14.3 LTS or higher recommended + +> **Note**: If you encounter an error about missing root storage credential, you need to configure the metastore's root storage credential first. See [Databricks documentation](https://docs.databricks.com/api-explorer/workspace/metastores/update) for details. + +## Post-Deployment + +After the cluster starts, you can connect via SSH to use Claude Code and other development tools. + +### 1. 
Configure SSH Tunnel + +Use the Databricks CLI to set up SSH access to your new cluster: + +```bash +# Authenticate if needed +databricks auth login --host https://your-workspace-url.cloud.databricks.com + +# Set up SSH config (replace 'claude-dev' with your preferred alias) +databricks ssh setup --name claude-dev +# Select your cluster from the list when prompted +``` + +This creates an entry in your `~/.ssh/config` file. + +### 2. Connect via VSCode or Cursor + +1. Install the **Remote - SSH** extension in VSCode or Cursor. +2. Open the Command Palette (`Cmd+Shift+P` / `Ctrl+Shift+P`). +3. Select **Remote-SSH: Connect to Host**. +4. Choose `claude-dev` (or the alias you created). +5. Select **Linux** as the platform. +6. Once connected, open your persistent workspace folder: `/Workspace/Users/<your-email>/`. + +> **Important: Work Storage Location** +> ⚠️ **DO NOT use Databricks Repos (`/Repos/...`) for active development work.** Repos folders can be unreliable for persistent storage and may lose uncommitted changes during cluster restarts or sync operations. +> +> ✅ **Use `/Workspace/Users/<your-email>/` instead.** This location provides reliable persistent storage. You can use regular git commands to manage version control (see "Using Git in /Workspace" section below). + +### 3. Launch Claude Code + +Open the terminal in your remote VSCode/Cursor session and run: + +```bash +# 1. Load environment variables and helpers +source ~/.bashrc + +# 2. Enable MLflow tracing (optional but recommended) +claude-tracing-enable + +# 3. Start Claude Code +claude +``` + +**First-time setup tips:** +- Claude will ask for file permissions; use `Shift+Tab` to auto-allow edits in the current directory. +- If you need to refresh credentials, run `claude-refresh-token`. + +### 4. Remote Web App Development (Port Forwarding) + +VSCode and Cursor automatically forward ports. For example, to run a Streamlit app: + +1.
Create `app.py`: + ```python + import streamlit as st + st.title("Databricks Remote App") + st.write("Running on cluster!") + ``` +2. Run it: + ```bash + streamlit run app.py --server.port 8501 + ``` +3. Click "Open in Browser" in the popup notification to view it at `localhost:8501`. + +### 5. Using the Databricks Python Interpreter + +You don't need to configure a virtual environment. Databricks manages it for you. + +1. In the remote terminal, find the python path: + ```bash + echo $DATABRICKS_VIRTUAL_ENV + # Output example: /local_disk0/.ephemeral_nfs/envs/pythonEnv-xxxx/bin/python + ``` +2. In VSCode/Cursor, open the Command Palette and select **Python: Select Interpreter**. +3. Paste the path from above. + +### 6. Persistent Sessions with tmux + +To keep your agent running even if you disconnect: + +```bash +# Start a new session +tmux new -s claude-session + +# Detach (Ctrl+B, then D) +# Reattach later +tmux attach -t claude-session +``` + +This allows you to leave long-running tasks (like "Build a data pipeline") executing on the cluster while you are offline. + +### 7. Using Git in /Workspace + +Since `/Workspace` doesn't have native Repos integration, use standard git commands: + +```bash +# Navigate to your workspace directory +cd /Workspace/Users/<your-email>/ + +# Option 1: Clone an existing repository +git clone https://github.com/your-org/your-repo.git +cd your-repo + +# Option 2: Initialize a new repository +mkdir my-project && cd my-project +git init +git remote add origin https://github.com/your-org/your-repo.git + +# Configure git (first time only) +git config user.name "Your Name" +git config user.email "your.email@company.com" + +# Regular git workflow +git add . +git commit -m "Your commit message" +git push origin main +``` + +**Git Authentication Options:** + +1.
**Personal Access Token (PAT)** - Recommended: + ```bash + # GitHub: Create at https://github.com/settings/tokens + # Use token as password when prompted + git clone https://github.com/your-org/repo.git + ``` + +2. **SSH Keys**: + ```bash + # Generate SSH key on the cluster + ssh-keygen -t ed25519 -C "your.email@company.com" + + # Add to GitHub: Copy output and add at https://github.com/settings/keys + cat ~/.ssh/id_ed25519.pub + + # Clone using SSH + git clone git@github.com:your-org/repo.git + ``` + +3. **Git Credential Manager**: + ```bash + # Store credentials to avoid repeated prompts + git config --global credential.helper store + ``` + +## Helper Commands + +### Claude CLI Commands + +| Command | Purpose | +|---------|---------| +| `check-claude` | Verify Claude CLI installation and configuration | +| `claude-debug` | Show detailed Claude configuration | +| `claude-refresh-token` | Regenerate Claude settings from environment | +| `claude-token-status` | Check token freshness and auto-refresh status | +| `claude-tracing-enable` | Enable MLflow tracing for Claude sessions | +| `claude-tracing-status` | Check tracing status | +| `claude-tracing-disable` | Disable tracing | + +### Git Workspace Commands + +| Command | Purpose | +|---------|---------| +| `git-workspace-init` | Interactive setup for git in /Workspace (clone or init) | +| `git-workspace-check` | Verify location and check for uncommitted/unpushed changes | +| `git-workspace-setup-auth` | Configure git authentication (PAT, SSH, or credential helper) | + +These helpers warn you if working in `/Repos` and ensure your work is backed up in git. 
+ +### VS Code/Cursor Remote Commands + +| Command | Purpose | +|---------|---------| +| `claude-vscode-setup` | Show Remote SSH setup instructions | +| `claude-vscode-env` | Get Python interpreter path for IDE | +| `claude-vscode-check` | Verify Remote SSH configuration | +| `claude-vscode-config` | Generate settings.json snippet | + +## Offline Installation + +For air-gapped or restricted network environments, use the separate offline module: [`adb-coding-assistants-cluster-offline`](../../modules/adb-coding-assistants-cluster-offline/README.md). See the [Offline Installation Guide](../../modules/adb-coding-assistants-cluster-offline/scripts/OFFLINE-INSTALLATION.md) for detailed instructions. + +## Configuration Examples + +### Single-Node Development Cluster + +```hcl +cluster_mode = "SINGLE_NODE" +num_workers = 0 +node_type_id = "Standard_D8pds_v6" +``` + +### Autoscaling Production Cluster + +```hcl +cluster_mode = "STANDARD" +num_workers = null # Enable autoscaling +min_workers = 2 +max_workers = 8 +node_type_id = "Standard_D8pds_v6" +``` + +## Authentication + +This example uses Databricks unified authentication. Authentication can be provided via: + +1. **Azure CLI** (recommended for local development): + ```bash + az login + terraform apply + ``` + +2. **Environment Variables** (recommended for CI/CD): + ```bash + export DATABRICKS_HOST="https://adb-xxx.azuredatabricks.net" + export DATABRICKS_TOKEN="dapi..." + terraform apply + ``` + +3. **Configuration Profile**: + ```bash + export DATABRICKS_CONFIG_PROFILE="my-profile" + terraform apply + ``` + +For more details on authentication, see the [Databricks unified authentication documentation](https://docs.databricks.com/dev-tools/auth/unified-auth.html). + +## Troubleshooting + +### Init Script Fails + +Check cluster event logs in the Databricks UI under **Compute** → **Your Cluster** → **Event Log**. 
+ +Common issues: +- Network connectivity to download packages +- Unity Catalog volume permissions +- Insufficient cluster permissions + +### Claude Not Found After Login + +```bash +# Reload bashrc +source ~/.bashrc + +# Verify PATH +check-claude +``` + +### Authentication Issues + +```bash +# Check environment variables +check-claude + +# Regenerate configuration +claude-refresh-token +``` + +## Additional Resources + +- [Module Documentation](../../modules/adb-coding-assistants-cluster/README.md) +- [Offline Module Documentation](../../modules/adb-coding-assistants-cluster-offline/README.md) +- [Offline Installation Guide](../../modules/adb-coding-assistants-cluster-offline/scripts/OFFLINE-INSTALLATION.md) +- [Scripts Documentation](../../modules/adb-coding-assistants-cluster/scripts/README.md) +- [Databricks Init Scripts Documentation](https://docs.databricks.com/clusters/init-scripts.html) +- [Unity Catalog Volumes Documentation](https://docs.databricks.com/data-governance/unity-catalog/volumes.html) + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0 | +| [azurerm](#requirement\_azurerm) | >=4.31.0 | +| [databricks](#requirement\_databricks) | >=1.81.1 | + +## Providers + +| Name | Version | +|------|---------| +| [azurerm](#provider\_azurerm) | 4.57.0 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [claude\_cluster](#module\_claude\_cluster) | ../../modules/adb-coding-assistants-cluster | n/a | + +## Resources + +| Name | Type | +|------|------| +| [azurerm_client_config.current](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/client_config) | data source | +| [azurerm_databricks_workspace.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/databricks_workspace) | data source | +| 
[azurerm_resource_group.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [catalog\_name](#input\_catalog\_name) | Unity Catalog name for the volume | `string` | n/a | yes | +| [cluster\_name](#input\_cluster\_name) | Name of the Databricks cluster | `string` | n/a | yes | +| [databricks\_resource\_id](#input\_databricks\_resource\_id) | The Azure resource ID for the Databricks workspace. Format: /subscriptions/{subscription-id}/resourceGroups/{resource-group}/providers/Microsoft.Databricks/workspaces/{workspace-name} | `string` | n/a | yes | +| [autotermination\_minutes](#input\_autotermination\_minutes) | Minutes of inactivity before cluster auto-terminates | `number` | `30` | no | +| [cluster\_mode](#input\_cluster\_mode) | Cluster mode: STANDARD or SINGLE\_NODE | `string` | `"STANDARD"` | no | +| [init\_script\_source\_path](#input\_init\_script\_source\_path) | Local path to the init script | `string` | `null` | no | +| [max\_workers](#input\_max\_workers) | Maximum number of workers for autoscaling | `number` | `3` | no | +| [min\_workers](#input\_min\_workers) | Minimum number of workers for autoscaling | `number` | `1` | no | +| [mlflow\_experiment\_name](#input\_mlflow\_experiment\_name) | MLflow experiment name for Claude Code tracing | `string` | `"/Workspace/Shared/claude-code-tracing"` | no | +| [node\_type\_id](#input\_node\_type\_id) | Node type for the cluster. Default is Standard_D8pds_v6 (modern, premium SSD + local NVMe). If unavailable in your region, consider Standard_DS13_v2 as fallback. 
| `string` | `"Standard_D8pds_v6"` | no | +| [num\_workers](#input\_num\_workers) | Number of worker nodes (null for autoscaling) | `number` | `null` | no | +| [schema\_name](#input\_schema\_name) | Schema name for the volume | `string` | `"default"` | no | +| [spark\_version](#input\_spark\_version) | Databricks Runtime version | `string` | `"17.3.x-cpu-ml-scala2.13"` | no | +| [tags](#input\_tags) | Custom tags for the cluster | `map(string)` |
{
"Environment": "dev",
"Purpose": "coding-assistants"
}
| no | +| [volume\_name](#input\_volume\_name) | Volume name to store init scripts | `string` | `"coding_assistants"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [cluster\_id](#output\_cluster\_id) | The ID of the created cluster | +| [cluster\_name](#output\_cluster\_name) | Name of the created cluster | +| [cluster\_url](#output\_cluster\_url) | URL to access the cluster in Databricks UI | +| [init\_script\_path](#output\_init\_script\_path) | Path to the init script in the volume | +| [mlflow\_experiment\_name](#output\_mlflow\_experiment\_name) | MLflow experiment name for tracing | +| [setup\_instructions](#output\_setup\_instructions) | Instructions for using the cluster | +| [volume\_full\_name](#output\_volume\_full\_name) | Full name of the volume | +| [volume\_path](#output\_volume\_path) | Path to the volume containing init scripts | + diff --git a/examples/adb-coding-assistants-cluster/main.tf b/examples/adb-coding-assistants-cluster/main.tf new file mode 100644 index 00000000..cd2519e6 --- /dev/null +++ b/examples/adb-coding-assistants-cluster/main.tf @@ -0,0 +1,21 @@ +# Cluster with Claude Code CLI coding assistant +# Provider configuration is in providers.tf +module "claude_cluster" { + source = "../../modules/adb-coding-assistants-cluster" + + cluster_name = var.cluster_name + catalog_name = var.catalog_name + schema_name = var.schema_name + volume_name = var.volume_name + init_script_source_path = var.init_script_source_path + spark_version = var.spark_version + node_type_id = var.node_type_id + autotermination_minutes = var.autotermination_minutes + num_workers = var.num_workers + min_workers = var.min_workers + max_workers = var.max_workers + mlflow_experiment_name = var.mlflow_experiment_name + cluster_mode = var.cluster_mode + tags = var.tags +} + diff --git a/examples/adb-coding-assistants-cluster/outputs.tf b/examples/adb-coding-assistants-cluster/outputs.tf new file mode 100644 index 00000000..6c3cce86 --- 
/dev/null +++ b/examples/adb-coding-assistants-cluster/outputs.tf @@ -0,0 +1,58 @@ +output "cluster_id" { + description = "The ID of the created cluster" + value = module.claude_cluster.cluster_id +} + +output "cluster_url" { + description = "URL to access the cluster in Databricks UI" + value = module.claude_cluster.cluster_url +} + +output "cluster_name" { + description = "Name of the created cluster" + value = module.claude_cluster.cluster_name +} + +output "volume_path" { + description = "Path to the volume containing init scripts" + value = module.claude_cluster.volume_path +} + +output "volume_full_name" { + description = "Full name of the volume" + value = module.claude_cluster.volume_full_name +} + +output "init_script_path" { + description = "Path to the init script in the volume" + value = module.claude_cluster.init_script_path +} + +output "mlflow_experiment_name" { + description = "MLflow experiment name for tracing" + value = module.claude_cluster.mlflow_experiment_name +} + +output "setup_instructions" { + description = "Instructions for using the cluster" + value = <<-EOT + Cluster deployed successfully! + + 1. Access cluster: ${module.claude_cluster.cluster_url} + 2. Wait for cluster to start (init script runs automatically) + 3. Open a notebook or terminal + 4. Run: source ~/.bashrc + 5. Verify: check-claude + 6. 
Start using: claude "your question" + + MLflow Experiment: ${module.claude_cluster.mlflow_experiment_name} + + Helper commands: + - check-claude: Verify installation status + - claude-debug: Show configuration details + - claude-refresh-token: Update authentication + - claude-tracing-enable: Enable MLflow tracing + - claude-tracing-status: Check tracing status + - claude-tracing-disable: Disable tracing + EOT +} diff --git a/examples/adb-coding-assistants-cluster/providers.tf b/examples/adb-coding-assistants-cluster/providers.tf new file mode 100644 index 00000000..26c208e7 --- /dev/null +++ b/examples/adb-coding-assistants-cluster/providers.tf @@ -0,0 +1,91 @@ +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = ">=4.31.0" + } + databricks = { + source = "databricks/databricks" + version = ">=1.81.1" + } + external = { + source = "hashicorp/external" + version = ">=2.3.0" + } + } +} + +# Determine authentication approach based on variables provided +locals { + # Use profile-based auth if profile is specified + use_profile_auth = var.databricks_profile != null + + # For Azure resource ID approach + resource_regex = var.databricks_resource_id != null ? "(?i)subscriptions/(.+)/resourceGroups/(.+)/providers/Microsoft.Databricks/workspaces/(.+)" : "" + subscription_id_from_resource = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[0] : null + resource_group = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[1] : null + databricks_workspace_name = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[2] : null +} + +# Get Azure subscription ID from Azure CLI or environment variable when not provided via resource ID +# This is needed for the Azure provider even when using profile-based Databricks auth +data "external" "azure_subscription" { + count = local.subscription_id_from_resource == null ? 
1 : 0 + program = ["bash", "-c", "SUBSCRIPTION_ID=$(az account show --query id -o tsv 2>/dev/null || echo $${ARM_SUBSCRIPTION_ID:-}); echo \"{\\\"id\\\":\\\"$${SUBSCRIPTION_ID:-}\\\"}\""] +} + +locals { + # Use subscription ID from resource ID, or from Azure CLI/environment, or null (provider will try to auto-detect) + # try() wrapper is required: coalesce() raises an error when all arguments are null + subscription_id = try(coalesce( + local.subscription_id_from_resource, + try(data.external.azure_subscription[0].result.id != "" ? data.external.azure_subscription[0].result.id : null, null) + ), null) +} + +# Data source to get current Azure client configuration (only for Azure resource ID approach) +data "azurerm_client_config" "current" { + count = local.use_profile_auth ? 0 : 1 +} + +# Data source to get the resource group (only for Azure resource ID approach) +data "azurerm_resource_group" "this" { + count = local.use_profile_auth ? 0 : 1 + name = local.resource_group +} + +# Configure the Azure Provider +# When using profile-based auth, subscription_id is not needed (provider will auto-detect if Azure CLI is configured) +# When using Azure resource ID approach, subscription_id is extracted from the resource ID +provider "azurerm" { + subscription_id = local.subscription_id + features {} + # azurerm 4.x replaced skip_provider_registration with resource_provider_registrations + resource_provider_registrations = local.use_profile_auth ? "none" : "core" + + # Allow provider to work without explicit subscription_id when using profile auth + # It will attempt to auto-detect from Azure CLI or environment variables +} + +# Data source to get the Databricks workspace (only for Azure resource ID approach) +data "azurerm_databricks_workspace" "this" { + count = local.use_profile_auth ? 0 : 1 + name = local.databricks_workspace_name + resource_group_name = local.resource_group +} + +# Configure the Databricks Provider +# Two authentication approaches supported: +# +# 1. Profile-based (Recommended - Simple and cloud-agnostic): +# Set databricks_profile variable to your ~/.databrickscfg profile name +# Example: databricks_profile = "dok" +# +# 2.
Azure resource ID (Azure-specific): +# Set databricks_resource_id to your Azure Databricks workspace resource ID +# Requires Azure CLI authentication (az login) +# +# See: https://docs.databricks.com/dev-tools/auth/unified-auth.html +provider "databricks" { + profile = var.databricks_profile + host = local.use_profile_auth ? null : data.azurerm_databricks_workspace.this[0].workspace_url +} + diff --git a/examples/adb-coding-assistants-cluster/terraform.tfvars.example b/examples/adb-coding-assistants-cluster/terraform.tfvars.example new file mode 100644 index 00000000..32296df2 --- /dev/null +++ b/examples/adb-coding-assistants-cluster/terraform.tfvars.example @@ -0,0 +1,88 @@ +# Example terraform.tfvars file for Claude Code CLI Cluster +# Copy this to terraform.tfvars and customize for your environment + +#============================================================================= +# AUTHENTICATION - Choose ONE approach +#============================================================================= + +# OPTION 1: Profile-based (Recommended - Simple and cloud-agnostic) +# Uses your ~/.databrickscfg profile +databricks_profile = "my-profile" # Replace with your profile name from ~/.databrickscfg + +# OPTION 2: Azure Resource ID (Azure-specific) +# Comment out databricks_profile above and uncomment below to use Azure resource ID +# databricks_resource_id = "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg/providers/Microsoft.Databricks/workspaces/my-workspace" + +#============================================================================= +# REQUIRED VARIABLES +#============================================================================= + +cluster_name = "claude-coding-assistant" +catalog_name = "main" + +#============================================================================= +# OPTIONAL VARIABLES +#============================================================================= + +# Unity Catalog configuration +schema_name = 
"default" +volume_name = "coding_assistants" + +# Cluster configuration +spark_version = "17.3.x-cpu-ml-scala2.13" +node_type_id = "Standard_D8pds_v6" # Azure: Standard_D8pds_v6 (8 vCPU, 32 GB RAM, Premium SSD + local NVMe). Fallback: Standard_DS13_v2 if unavailable in region +autotermination_minutes = 30 + +# Cluster mode options: +# - "SINGLE_NODE": Cost-effective for individual development (recommended) +# - "STANDARD": Multi-node for team environments +cluster_mode = "SINGLE_NODE" + +# Worker configuration (ignored if cluster_mode = "SINGLE_NODE") +num_workers = 0 # Set to null to enable autoscaling, or a specific number + +# Autoscaling configuration (only used if num_workers = null) +min_workers = 1 +max_workers = 3 + +# MLflow experiment for Claude Code tracing +mlflow_experiment_name = "/Workspace/Shared/claude-code-tracing" + +# Optional: Custom init script path (defaults to bundled script) +# init_script_source_path = "./custom-install-claude.sh" + +# Custom tags +tags = { + Environment = "development" + Purpose = "ai-coding" + Owner = "data-engineering" + CostCenter = "engineering" +} + +#============================================================================= +# CLOUD-SPECIFIC NODE TYPES REFERENCE +#============================================================================= + +# Azure VM Types (Premium SSD): +# Modern Dpdsv6-series (Cobalt 100 processor, Premium SSD + local NVMe): +# - Standard_D4pds_v6 (4 cores, 16 GB RAM) - Cost-effective +# - Standard_D8pds_v6 (8 cores, 32 GB RAM) - Recommended default (modern) +# - Standard_D16pds_v6 (16 cores, 64 GB RAM) - For larger workloads +# Note: Dpdsv6-series may have limited regional availability +# +# DS-series (Premium SSD, widely available): +# - Standard_DS3_v2 (4 cores, 14 GB RAM) - Cost-effective for development +# - Standard_DS4_v2 (8 cores, 28 GB RAM) - Good for medium workloads +# - Standard_DS13_v2 (8 cores, 56 GB RAM) - Good fallback if Dpdsv6 unavailable +# - Standard_DS5_v2 (16 cores, 56 
GB RAM) - More CPU, same RAM as DS13_v2 +# - Standard_DS14_v2 (16 cores, 112 GB RAM) - For large-scale workloads +# +# AWS: +# - i3.xlarge (4 cores, 30.5 GB RAM) - Recommended for single-node +# - i3.2xlarge (8 cores, 61 GB RAM) - For larger workloads +# - r5.xlarge (4 cores, 32 GB RAM) - Memory-optimized +# +# GCP: +# - n1-highmem-4 (4 cores, 26 GB RAM) - Recommended for single-node +# - n1-highmem-8 (8 cores, 52 GB RAM) - For larger workloads +# - n2-standard-4 (4 cores, 16 GB RAM) - Cost-optimized diff --git a/examples/adb-coding-assistants-cluster/variables.tf b/examples/adb-coding-assistants-cluster/variables.tf new file mode 100644 index 00000000..4dc0f3ac --- /dev/null +++ b/examples/adb-coding-assistants-cluster/variables.tf @@ -0,0 +1,106 @@ +variable "databricks_profile" { + description = "Databricks CLI profile name from ~/.databrickscfg (recommended for simple, cloud-agnostic authentication). If set, databricks_resource_id is ignored." + type = string + default = null +} + +variable "databricks_resource_id" { + description = "The Azure resource ID for the Databricks workspace (Azure-specific approach). Format: /subscriptions/{subscription-id}/resourceGroups/{resource-group}/providers/Microsoft.Databricks/workspaces/{workspace-name}. Only used if databricks_profile is not set." + type = string + default = null + + validation { + condition = var.databricks_profile != null || var.databricks_resource_id != null + error_message = "Either databricks_profile or databricks_resource_id must be set. Recommended: use databricks_profile for simpler configuration." 
+ } +} + +variable "cluster_name" { + description = "Name of the Databricks cluster" + type = string +} + +variable "catalog_name" { + description = "Unity Catalog name for the volume" + type = string +} + +variable "schema_name" { + description = "Schema name for the volume" + type = string + default = "default" +} + +variable "volume_name" { + description = "Volume name to store init scripts" + type = string + default = "coding_assistants" +} + +variable "init_script_source_path" { + description = "Local path to the init script" + type = string + default = null +} + +variable "spark_version" { + description = "Databricks Runtime version" + type = string + default = "17.3.x-cpu-ml-scala2.13" +} + +variable "node_type_id" { + description = "Node type for the cluster. Default is Standard_D8pds_v6 (modern, premium SSD + local NVMe). If unavailable in your region, consider Standard_DS13_v2 as fallback." + type = string + default = "Standard_D8pds_v6" +} + +variable "autotermination_minutes" { + description = "Minutes of inactivity before cluster auto-terminates" + type = number + default = 30 +} + +variable "num_workers" { + description = "Number of worker nodes (null for autoscaling)" + type = number + default = null +} + +variable "min_workers" { + description = "Minimum number of workers for autoscaling" + type = number + default = 1 +} + +variable "max_workers" { + description = "Maximum number of workers for autoscaling" + type = number + default = 3 +} + +variable "mlflow_experiment_name" { + description = "MLflow experiment name for Claude Code tracing" + type = string + default = "/Workspace/Shared/claude-code-tracing" +} + +variable "cluster_mode" { + description = "Cluster mode: STANDARD or SINGLE_NODE" + type = string + default = "STANDARD" + + validation { + condition = contains(["STANDARD", "SINGLE_NODE"], var.cluster_mode) + error_message = "cluster_mode must be either STANDARD or SINGLE_NODE" + } +} + +variable "tags" { + description = "Custom tags for 
the cluster" + type = map(string) + default = { + Environment = "dev" + Purpose = "coding-assistants" + } +} diff --git a/examples/adb-coding-assistants-cluster/versions.tf b/examples/adb-coding-assistants-cluster/versions.tf new file mode 100644 index 00000000..7117131f --- /dev/null +++ b/examples/adb-coding-assistants-cluster/versions.tf @@ -0,0 +1,3 @@ +terraform { + required_version = ">= 1.0" +} diff --git a/modules/adb-coding-assistants-cluster/Makefile b/modules/adb-coding-assistants-cluster/Makefile new file mode 100644 index 00000000..653039d8 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/Makefile @@ -0,0 +1,7 @@ +.PHONY: docs test_docs + +docs: + terraform-docs -c ../../.terraform-docs.yml . + +test_docs: + terraform-docs -c ../../.terraform-docs.yml --output-check . diff --git a/modules/adb-coding-assistants-cluster/README.md b/modules/adb-coding-assistants-cluster/README.md new file mode 100644 index 00000000..8838f877 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/README.md @@ -0,0 +1,461 @@ +# Provisioning Databricks Cluster with Claude Code CLI + +This module deploys a Databricks cluster pre-configured with Claude Code CLI for AI-assisted development directly on Databricks. 
+ +## Module content + +This module can be used to deploy the following: + +* Unity Catalog Volume for secure init script storage +* Init script with Claude Code CLI installation +* Databricks cluster with automatic AI coding assistant setup +* MLflow experiment configuration for tracing +* Helper bash functions for cluster users + +## Features + +- ✅ **Zero-configuration AI coding tools** on cluster startup +- ✅ **Unity Catalog Volumes** for secure script storage (Databricks recommended practice) +- ✅ **MLflow tracing** integration for Claude Code sessions +- ✅ **Flexible cluster configuration** (single-node or autoscaling) + +> **Note**: For offline/air-gapped environments, use the separate [`adb-coding-assistants-cluster-offline`](../adb-coding-assistants-cluster-offline/README.md) module. + +## Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ Unity Catalog Volume │ +│ /Volumes//// │ +│ └── install-claude.sh │ +└─────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────┐ +│ Databricks Cluster (on startup) │ +│ │ +│ 1. Executes init script from volume │ +│ 2. Installs Node.js, OpenCode, Claude CLI │ +│ 3. Configures bashrc with helper functions │ +│ 4. 
Auto-generates configs on user login │ +└─────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────┐ +│ User Login │ +│ │ +│ • DATABRICKS_TOKEN available from environment │ +│ • Configs auto-generate: │ +│ - ~/.claude/settings.json │ +│ - ~/.opencode/config.json │ +│ • Commands ready: claude, opencode │ +└─────────────────────────────────────────────────────┘ +``` + +## Prerequisites + +- Databricks workspace with Unity Catalog enabled +- Databricks Runtime 13.3 LTS or higher (recommended for Unity Catalog volumes) +- Databricks Terraform provider >= 1.40.0 +- Unity Catalog with an existing catalog and schema +- **Unity Catalog metastore must have a root storage credential configured** (required for volumes) + +> **Note**: If you encounter an error about missing root storage credential, you need to configure the metastore's root storage credential first. See [Databricks documentation](https://docs.databricks.com/api-explorer/workspace/metastores/update) for details. 
+ +## Usage + +### Basic Example + +```hcl +module "coding_cluster" { + source = "./modules/adb-coding-assistants-cluster" + + cluster_name = "ai-dev-cluster" + catalog_name = "main" + schema_name = "default" + + # init_script_source_path is optional - module includes the script +} +``` + +### Single-Node Cluster + +```hcl +module "single_node_cluster" { + source = "./modules/adb-coding-assistants-cluster" + + cluster_name = "ai-dev-single-node" + catalog_name = "main" + schema_name = "default" + + cluster_mode = "SINGLE_NODE" + num_workers = 0 +} +``` + +### Autoscaling Cluster + +```hcl +module "autoscaling_cluster" { + source = "./modules/adb-coding-assistants-cluster" + + cluster_name = "ai-dev-autoscaling" + catalog_name = "main" + schema_name = "default" + + min_workers = 2 + max_workers = 8 + + tags = { + Environment = "production" + Team = "data-science" + } +} +``` + +### Complete Example + +```hcl +module "coding_cluster" { + source = "./modules/adb-coding-assistants-cluster" + + # Cluster configuration + cluster_name = "ai-development-cluster" + spark_version = "17.3.x-cpu-ml-scala2.13" + node_type_id = "Standard_D8pds_v6" + autotermination_minutes = 60 + + # Volume configuration + catalog_name = "main" + schema_name = "default" + volume_name = "coding_assistants" + + # Init script (optional - uses bundled script by default) + # init_script_source_path = "/path/to/custom/script.sh" + + # MLflow configuration + mlflow_experiment_name = "/Users/me@company.com/my-claude-traces" + + # Autoscaling + min_workers = 1 + max_workers = 5 + + # Tags + tags = { + Environment = "development" + Project = "ai-assisted-coding" + CostCenter = "engineering" + ManagedBy = "terraform" + } +} +``` + +## Init Script Storage Best Practices + +According to [Databricks documentation](https://docs.databricks.com/aws/en/init-scripts/): + +> **Databricks Runtime 13.3 LTS and above with Unity Catalog** +> Store init scripts in Unity Catalog volumes. + +### Why Unity Catalog Volumes? + +1.
**Governance**: Full Unity Catalog ACL support +2. **Security**: Identity-based access control +3. **Portability**: Works across AWS, Azure, and GCP +4. **Versioning**: Easy to manage and update scripts +5. **No DBFS**: Recommended alternative to legacy DBFS storage + +### Init Script Identity + +- **Single-user access mode**: Uses assigned principal's identity +- **Standard access mode**: Uses cluster owner's identity +- **Volume access**: Governed by Unity Catalog permissions + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0 | +| [databricks](#requirement\_databricks) | >= 1.40.0 | + +## Providers + +| Name | Version | +|------|---------| +| [databricks](#provider\_databricks) | 1.102.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [databricks_cluster.coding_assistants](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/cluster) | resource | +| [databricks_file.init_script](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/file) | resource | +| [databricks_volume.init_scripts](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/volume) | resource | +| [databricks_current_user.me](https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/current_user) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [catalog\_name](#input\_catalog\_name) | Unity Catalog name for the volume | `string` | n/a | yes | +| [cluster\_name](#input\_cluster\_name) | Name of the Databricks cluster | `string` | n/a | yes | +| [autotermination\_minutes](#input\_autotermination\_minutes) | Minutes of inactivity before cluster auto-terminates | `number` | `30` | no | +| [cluster\_mode](#input\_cluster\_mode) | Cluster mode: STANDARD or SINGLE\_NODE | `string` | `"STANDARD"` | no | 
+| [init\_script\_source\_path](#input\_init\_script\_source\_path) | Local path to the init script | `string` | `null` | no | +| [max\_workers](#input\_max\_workers) | Maximum number of workers for autoscaling | `number` | `3` | no | +| [min\_workers](#input\_min\_workers) | Minimum number of workers for autoscaling | `number` | `1` | no | +| [mlflow\_experiment\_name](#input\_mlflow\_experiment\_name) | MLflow experiment name for Claude Code tracing | `string` | `"/Workspace/Shared/claude-code-tracing"` | no | +| [node\_type\_id](#input\_node\_type\_id) | Node type for the cluster. Default is Standard_D8pds_v6 (modern, premium SSD + local NVMe). If unavailable in your region, consider Standard_DS13_v2 as fallback. | `string` | `"Standard_D8pds_v6"` | no | +| [num\_workers](#input\_num\_workers) | Number of worker nodes (null for autoscaling) | `number` | `null` | no | +| [schema\_name](#input\_schema\_name) | Schema name for the volume | `string` | `"default"` | no | +| [spark\_version](#input\_spark\_version) | Databricks Runtime version | `string` | `"17.3.x-cpu-ml-scala2.13"` | no | +| [tags](#input\_tags) | Custom tags for the cluster | `map(string)` |
{
"Environment": "dev",
"Purpose": "coding-assistants"
}
| no | +| [volume\_name](#input\_volume\_name) | Volume name to store init scripts | `string` | `"coding_assistants"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [cluster\_id](#output\_cluster\_id) | The ID of the created cluster | +| [cluster\_name](#output\_cluster\_name) | Name of the created cluster | +| [cluster\_url](#output\_cluster\_url) | URL to access the cluster in Databricks UI | +| [init\_script\_path](#output\_init\_script\_path) | Path to the init script in the volume | +| [mlflow\_experiment\_name](#output\_mlflow\_experiment\_name) | MLflow experiment name for tracing | +| [volume\_full\_name](#output\_volume\_full\_name) | Full name of the volume | +| [volume\_path](#output\_volume\_path) | Path to the volume containing init scripts | + + +## Post-Deployment Usage + +### On the Cluster + +After the cluster starts, users can: + +```bash +# Check installation status +check-claude + +# Debug Claude configuration +claude-debug + +# Use Claude Code +claude "Analyze the customer churn data" + +# Enable MLflow tracing +claude-tracing-enable + +# Check tracing status +claude-tracing-status +``` + +### Persistent Work Storage + +**IMPORTANT: Do not use Databricks Repos (`/Repos/...`) for active development work.** + +Databricks Repos folders can be unreliable for persistent storage and may lose uncommitted changes during cluster restarts or sync operations. Instead: + +✅ **Use `/Workspace/Users//` for all development work** + +This location provides reliable persistent storage across cluster restarts. 
Use the provided git helpers to manage version control: + +```bash +# Navigate to your workspace +cd /Workspace/Users/$(whoami)/ + +# Set up git (interactive helper) +git-workspace-init + +# Check git status and location +git-workspace-check + +# Configure git authentication +git-workspace-setup-auth +``` + +The git helpers will: +- Warn if you're working in `/Repos` (unreliable location) +- Help you clone existing repos or initialize new ones +- Check for uncommitted or unpushed changes +- Guide you through authentication setup (PAT, SSH, or credential helper) + +### Helper Commands + +The init script installs these helper commands in `~/.bashrc`: + +#### Claude CLI Commands + +| Command | Purpose | +|---------|---------| +| `check-claude` | Verify installation and configuration | +| `claude-debug` | Show detailed Claude CLI configuration | +| `claude-refresh-token` | Regenerate Claude settings | +| `claude-token-status` | Check token freshness and auto-refresh status | +| `claude-tracing-enable` | Enable MLflow tracing | +| `claude-tracing-status` | Check tracing status | +| `claude-tracing-disable` | Disable MLflow tracing | + +#### Git Workspace Commands + +| Command | Purpose | +|---------|---------| +| `git-workspace-init` | Interactive git setup in /Workspace (clone or init) | +| `git-workspace-check` | Check location and uncommitted/unpushed changes | +| `git-workspace-setup-auth` | Configure git authentication (PAT/SSH/credential helper) | + +#### VS Code/Cursor Remote Commands + +| Command | Purpose | +|---------|---------| +| `claude-vscode-setup` | Show Remote SSH setup guide | +| `claude-vscode-env` | Get Python interpreter path | +| `claude-vscode-check` | Verify Remote SSH configuration | +| `claude-vscode-config` | Generate settings.json snippet | + +## Cluster Access Modes + +### Single-User Access Mode + +```hcl +# Automatically configured by the module +data_security_mode = "SINGLE_USER" +single_user_name = 
data.databricks_current_user.me.user_name +``` + +### Standard Access Mode + +For standard access mode, you must: +1. Set up an allowlist for init scripts +2. Grant permissions to the volume + +See [Allowlist documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/allowlist). + +## Troubleshooting + +### Init Script Fails + +Check cluster logs: +```bash +# Enable cluster log delivery in cluster config +# Then view: /cluster-logs//init_scripts/ +``` + +### Commands Not Found + +```bash +# Reload bashrc +source ~/.bashrc + +# Check PATH +echo $PATH | grep -E "(claude|opencode)" + +# Verify installation +check-claude +``` + +### Authentication Issues + +```bash +# Check environment variables +claude-debug + +# Verify token is set +echo $DATABRICKS_TOKEN + +# Regenerate configs +claude-refresh-token +opencode-refresh-config +``` + +### Script Size Limit + +Init scripts must be < 64KB. If exceeded: +- Break into multiple scripts +- Remove unnecessary comments +- Compress/optimize script + +## Security Considerations + +### Volume Permissions + +Ensure appropriate Unity Catalog permissions: + +```sql +-- Grant read access to volume +GRANT READ VOLUME ON VOLUME .. TO ; + +-- For standard access mode, add to allowlist +-- (Requires admin access) +``` + +### Token Security + +- Tokens are **never hardcoded** in configs +- Read from environment: `$DATABRICKS_TOKEN` +- Configs regenerate per session +- Settings files are user-readable only (`~/.claude/`, `~/.opencode/`) + +## Maintenance + +### Updating the Init Script + +1. Update the local init script file +2. Run `terraform apply` to upload new version +3.
Restart clusters to apply changes + +```bash +terraform apply -target=module.coding_cluster.databricks_file.init_script +``` + +### Updating Cluster Configuration + +```bash +# Update variables in your config +# Then apply +terraform apply + +# Restart cluster for changes to take effect +``` + +## Cost Optimization + +- Use `autotermination_minutes` to automatically shut down idle clusters +- Use single-node mode for development: `cluster_mode = "SINGLE_NODE"` +- Enable autoscaling to scale down during low usage +- Consider spot instances (if supported by your cloud provider) + +## Limitations + +- Init scripts must be < 64KB +- Init script failures cause cluster launch to fail +- Requires Databricks Runtime 13.3 LTS+ for Unity Catalog volumes +- Standard access mode requires admin-configured allowlist + +## References + +- [Databricks Init Scripts Documentation](https://docs.databricks.com/init-scripts/) +- [Unity Catalog Volumes](https://docs.databricks.com/volumes/) +- [Databricks Terraform Provider](https://registry.terraform.io/providers/databricks/databricks/latest/docs) +- [Cluster Configuration](https://docs.databricks.com/compute/configure) + +## License + +This module is provided as-is for use with Databricks workspaces. + +## Contributing + +To contribute improvements to this module: +1. Test changes in an isolated Databricks workspace +2. Run `terraform validate` and `terraform fmt` +3. Update documentation for any new variables or outputs +4. 
Submit pull request with clear description of changes + +## Support + +For issues related to: +- **Module**: Open an issue in this repository +- **Init Script**: See the init script documentation +- **Databricks Platform**: Contact Databricks support +- **Claude/OpenCode**: Contact Anthropic or OpenCode support respectively diff --git a/modules/adb-coding-assistants-cluster/main.tf b/modules/adb-coding-assistants-cluster/main.tf new file mode 100644 index 00000000..578c1a20 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/main.tf @@ -0,0 +1,83 @@ +# Data source to get current user +data "databricks_current_user" "me" {} + +# Local value for init script path +locals { + init_script_path = var.init_script_source_path != null ? var.init_script_source_path : "${path.module}/scripts/install-claude.sh" +} + +# Create or reference the volume for init scripts +resource "databricks_volume" "init_scripts" { + name = var.volume_name + catalog_name = var.catalog_name + schema_name = var.schema_name + volume_type = "MANAGED" + comment = "Volume for Claude Code CLI init scripts" + + lifecycle { + ignore_changes = [owner] + } +} + +# Upload the init script to the volume +resource "databricks_file" "init_script" { + source = local.init_script_path + path = "${databricks_volume.init_scripts.volume_path}/install-claude.sh" +} + +# Create the cluster with init script +resource "databricks_cluster" "coding_assistants" { + cluster_name = var.cluster_name + spark_version = var.spark_version + node_type_id = var.node_type_id + autotermination_minutes = var.autotermination_minutes + data_security_mode = "SINGLE_USER" + single_user_name = data.databricks_current_user.me.user_name + + # Autoscaling or fixed size + # Autoscaling is not supported for single-node clusters + dynamic "autoscale" { + for_each = var.cluster_mode == "STANDARD" && var.num_workers == null ? 
[1] : [] + content { + min_workers = var.min_workers + max_workers = var.max_workers + } + } + + # For single-node clusters, num_workers must be 0 (driver-only) + # For standard clusters, use the provided num_workers value + num_workers = var.cluster_mode == "SINGLE_NODE" ? 0 : var.num_workers + + # Single node configuration + # According to Databricks docs: single-node clusters run Spark locally with no worker nodes + spark_conf = var.cluster_mode == "SINGLE_NODE" ? { + "spark.databricks.cluster.profile" = "singleNode" + "spark.master" = "local[*]" + } : {} + + custom_tags = merge( + var.tags, + { + "ManagedBy" = "Terraform" + }, + var.cluster_mode == "SINGLE_NODE" ? { + "ResourceClass" = "SingleNode" + } : {} + ) + + # Environment variables for Claude Code CLI + spark_env_vars = { + MLFLOW_EXPERIMENT_NAME = var.mlflow_experiment_name + } + + # Init script configuration + init_scripts { + volumes { + destination = "${databricks_volume.init_scripts.volume_path}/install-claude.sh" + } + } + + depends_on = [ + databricks_file.init_script + ] +} diff --git a/modules/adb-coding-assistants-cluster/outputs.tf b/modules/adb-coding-assistants-cluster/outputs.tf new file mode 100644 index 00000000..c19dca1a --- /dev/null +++ b/modules/adb-coding-assistants-cluster/outputs.tf @@ -0,0 +1,34 @@ +output "cluster_id" { + description = "The ID of the created cluster" + value = databricks_cluster.coding_assistants.id +} + +output "cluster_url" { + description = "URL to access the cluster in Databricks UI" + value = databricks_cluster.coding_assistants.url +} + +output "cluster_name" { + description = "Name of the created cluster" + value = databricks_cluster.coding_assistants.cluster_name +} + +output "volume_path" { + description = "Path to the volume containing init scripts" + value = databricks_volume.init_scripts.volume_path +} + +output "volume_full_name" { + description = "Full name of the volume" + value = "${var.catalog_name}.${var.schema_name}.${var.volume_name}" +} + 
+output "init_script_path" { + description = "Path to the init script in the volume" + value = databricks_file.init_script.path +} + +output "mlflow_experiment_name" { + description = "MLflow experiment name for tracing" + value = var.mlflow_experiment_name +} diff --git a/modules/adb-coding-assistants-cluster/scripts/README.md b/modules/adb-coding-assistants-cluster/scripts/README.md new file mode 100644 index 00000000..ae644a3a --- /dev/null +++ b/modules/adb-coding-assistants-cluster/scripts/README.md @@ -0,0 +1,353 @@ +# Claude Code CLI Installation Scripts + +This directory contains installation scripts for Claude Code CLI on Databricks clusters. + +## Scripts Overview + +| Script | Purpose | Network Required | +|--------|---------|------------------| +| `install-claude.sh` | Online installation (default) | ✅ Yes | + +> **Note**: For offline/air-gapped installations, use the separate [`adb-coding-assistants-cluster-offline`](../adb-coding-assistants-cluster-offline/README.md) module. 
+ +## Quick Start + +### Online Installation (Default) + +For clusters with internet access: + +```hcl +resource "databricks_cluster" "claude_cluster" { + cluster_name = "claude-coding-assistant" + spark_version = data.databricks_spark_version.latest_lts.id + node_type_id = "Standard_D8pds_v6" + autotermination_minutes = 60 + num_workers = 0 + + init_scripts { + dbfs { + destination = "dbfs:/init-scripts/install-claude.sh" + } + } +} +``` + + +## What Gets Installed + +The script installs: + +- ✅ **Node.js 20.x** - Required runtime for Claude CLI +- ✅ **Claude Code CLI** - AI coding assistant +- ✅ **MLflow** - For tracing Claude interactions +- ✅ **System tools** - curl, wget, git, jq +- ✅ **Bash helpers** - Convenience functions for using Claude + +## Helper Commands + +After installation, these commands are available: + +```bash +# Verify installation +check-claude + +# Show debug info +claude-debug + +# Refresh authentication +claude-refresh-token + +# Token management +claude-token-status # Check token freshness +claude-setup-token-refresh # Enable automatic hourly refresh +claude-remove-token-refresh # Disable automatic refresh + +# Enable MLflow tracing +claude-tracing-enable + +# Check tracing status +claude-tracing-status + +# Disable tracing +claude-tracing-disable + +# VS Code/Cursor Remote SSH helpers +claude-vscode-setup # Show setup guide +claude-vscode-env # Get Python virtual environment path +claude-vscode-check # Verify VS Code/Cursor setup +claude-vscode-config # Generate VS Code settings.json snippet +``` + +## VS Code/Cursor Remote SSH Setup + +For remote development using VS Code or Cursor, follow these steps: + +### Quick Setup + +1. **Get Python interpreter path** (after SSH connection): + ```bash + claude-vscode-env + # Or manually: echo $DATABRICKS_VIRTUAL_ENV + ``` + +2. **Show complete setup guide**: + ```bash + claude-vscode-setup + ``` + +3. 
**Generate VS Code settings**: + ```bash + claude-vscode-config + ``` + +### Detailed Steps + +#### 1. Install Remote SSH Extension + +- **VS Code**: Install "Remote - SSH" extension from marketplace +- **Cursor**: Built-in Remote SSH extension (already included) + +#### 2. Configure Default Extensions + +Open Command Palette (`Cmd+Shift+P` / `Ctrl+Shift+P`): +- Type: `Remote-SSH: Settings` +- Or manually edit `settings.json`: + +```json +{ + "remote.SSH.defaultExtensions": [ + "ms-python.python", + "ms-toolsai.jupyter" + ] +} +``` + +#### 3. Connect to Cluster + +- Command Palette → `Remote-SSH: Connect to Host` +- Enter your cluster SSH connection details + +#### 4. Select Python Interpreter + +After connecting: + +1. Run `claude-vscode-env` to get the Python path +2. Command Palette → `Python: Select Interpreter` +3. Enter or browse to: `/databricks/python*/pythonEnv-*/bin/python` + +**Important**: Always select the `pythonEnv-xxx` interpreter for full Databricks Runtime library access. + +#### 5. Verify Setup + +```bash +# Check setup status +claude-vscode-check + +# Test in a Python file +import pyspark +import pandas +import mlflow +print("Setup successful!") +``` + +### Important Notes + +- **IPYNB notebooks** and **`*.py` Databricks notebooks** have access to Databricks globals (`dbutils`, `spark`, etc.)
+- **Regular Python `*.py` files** do NOT have access to Databricks globals +- Always select the `pythonEnv-xxx` interpreter for full Databricks Runtime library access + +### Standalone Helper Script + +A standalone helper script is also available: + +```bash +# Show setup guide +./scripts/vscode-setup.sh --guide + +# Get Python interpreter path +./scripts/vscode-setup.sh --env + +# Check current setup +./scripts/vscode-setup.sh --check + +# Generate settings.json +./scripts/vscode-setup.sh --settings +``` + +## Usage Examples + +```bash +# Interactive mode +claude + +# One-shot query +echo "Write a Python function to reverse a string" | claude --print + +# From file +claude < prompt.txt + +# With streaming +claude --stream < task.md +``` + +## Internet dependencies (online mode) + +The online installer requires access to: + +| Domain | Purpose | +|--------|---------| +| `claude.ai` | Claude CLI installer script | +| `storage.googleapis.com` | Claude CLI binaries (GCS bucket) | +| `deb.nodesource.com` | Node.js repository | +| `archive.ubuntu.com` | APT packages (x86_64) | +| `ports.ubuntu.com` | APT packages (ARM64) | +| `registry.npmjs.org` | NPM packages | +| `pypi.org` | Python package index | +| `files.pythonhosted.org` | Python package downloads | +| `raw.githubusercontent.com` | Databricks skills | +| `${DATABRICKS_HOST}` | Databricks API endpoints | + +> **Tip**: Run `./scripts/check-network-deps.sh` to verify all dependencies are accessible before installation. + +## Firewall configuration + +If using a firewall, allow HTTPS (443) and HTTP (80) to these domains, or use the offline installation method. 
+ +## Environment Variables + +### Standard Variables (Set automatically by Databricks) + +- `DATABRICKS_HOST` - Workspace URL +- `DATABRICKS_TOKEN` - Authentication token + +### Optional Configuration + +- `MLFLOW_EXPERIMENT_NAME` - Custom experiment name (default: `/Workspace/Shared/claude-code-tracing`) + +## Architecture Support + +The installer supports: + +- ✅ **amd64** (x86_64) - Default +- ✅ **arm64** (aarch64) - Auto-detected + +## Network dependency checker + +Before installation, you can verify that all required domains are accessible using the network dependency checker: + +```bash +# Standard check +./scripts/check-network-deps.sh + +# Detailed output with HTTP status codes +./scripts/check-network-deps.sh --verbose +``` + +Example output: +``` +=== Claude Code Network Dependency Check === + +Checking required domains... + +[OK] claude.ai +[OK] storage.googleapis.com +[OK] deb.nodesource.com +[OK] archive.ubuntu.com +[OK] ports.ubuntu.com +[OK] registry.npmjs.org +[OK] pypi.org +[OK] files.pythonhosted.org +[OK] raw.githubusercontent.com + +---------------------------------------- +Result: 9/9 dependencies reachable + +SUCCESS: All dependencies are accessible +``` + +If any dependencies fail, the script provides troubleshooting guidance: +``` +[OK] claude.ai +[FAIL] deb.nodesource.com - Connection timed out +... 
+Result: 8/9 dependencies reachable + +FAILED: Some dependencies are not accessible + +Troubleshooting tips: + - Check firewall rules allow HTTPS (443) to the failed domains + - Verify proxy settings if behind a corporate proxy + - For air-gapped environments, use the offline installation module +``` + +## Troubleshooting + +### Installation fails during cluster startup + +Check the init script logs: +```bash +cat /tmp/init-script-claude.log +``` + +### Claude command not found + +Reload bashrc: +```bash +source ~/.bashrc +``` + +### Authentication errors + +Refresh token: +```bash +claude-refresh-token +``` + +### Installation works but Claude fails + +Check configuration: +```bash +check-claude +claude-debug +``` + +## File structure + +``` +scripts/ +├── install-claude.sh # Online installer +├── check-network-deps.sh # Network dependency checker +└── README.md # This file +``` + +> **Offline Installation**: See the [`adb-coding-assistants-cluster-offline`](../adb-coding-assistants-cluster-offline/README.md) module for offline/air-gapped installation support. + +## Version Compatibility + +- **Databricks Runtime**: 13.0+ LTS recommended +- **Python**: 3.9+ (included in DBR) +- **Node.js**: 20.x (installed by script) +- **MLflow**: 3.4+ (installed by script) + +## Security Notes + +### Authentication +- Uses Databricks personal access tokens (auto-configured) +- Tokens are ephemeral and cluster-scoped +- No long-lived credentials stored + +### Network Security +- All traffic uses HTTPS +- Authentication via `ANTHROPIC_AUTH_TOKEN` environment variable +- Custom headers for Databricks integration + + +## Support + +- **Claude CLI Issues**: [Claude AI Documentation](https://claude.ai/docs) +- **Databricks Issues**: Contact Databricks Support +- **Script Issues**: Open issue in repository + +## License + +See repository LICENSE file. 
diff --git a/modules/adb-coding-assistants-cluster/scripts/check-network-deps.sh b/modules/adb-coding-assistants-cluster/scripts/check-network-deps.sh new file mode 100755 index 00000000..3ef4bec2 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/scripts/check-network-deps.sh @@ -0,0 +1,238 @@ +#!/bin/bash +# +# Network Dependency Checker for Claude Code Installation +# +# Verifies connectivity to all required domains before running install-claude.sh. +# Run this script to diagnose network/firewall issues in restricted environments. +# +# Usage: +# ./check-network-deps.sh # Standard check +# ./check-network-deps.sh --verbose # Detailed output +# + +set -euo pipefail + +# ============================================================================ +# Configuration +# ============================================================================ + +CONNECT_TIMEOUT=5 +VERBOSE=false + +# Color codes (disabled if not a terminal) +if [[ -t 1 ]]; then + GREEN='\033[0;32m' + RED='\033[0;31m' + YELLOW='\033[0;33m' + BOLD='\033[1m' + NC='\033[0m' # No Color +else + GREEN='' + RED='' + YELLOW='' + BOLD='' + NC='' +fi + +# Dependencies to check: "domain|purpose|test_url" +DEPENDENCIES=( + "claude.ai|CLI installer script|https://claude.ai/install.sh" + "storage.googleapis.com|Claude CLI binaries|https://storage.googleapis.com/" + "deb.nodesource.com|Node.js repo|https://deb.nodesource.com/setup_20.x" + "archive.ubuntu.com|APT packages (x86)|http://archive.ubuntu.com/ubuntu/" + "ports.ubuntu.com|APT packages (ARM)|http://ports.ubuntu.com/ubuntu-ports/" + "registry.npmjs.org|NPM packages|https://registry.npmjs.org/" + "pypi.org|Python packages|https://pypi.org/simple/mlflow/" + "files.pythonhosted.org|Package downloads|https://files.pythonhosted.org/" + "raw.githubusercontent.com|Databricks skills|https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/main/README.md" +) + +# ============================================================================ +# Functions 
+# ============================================================================ + +usage() { + cat </dev/null; then + host "$domain" &>/dev/null + elif command -v nslookup &>/dev/null; then + nslookup "$domain" &>/dev/null + elif command -v getent &>/dev/null; then + getent hosts "$domain" &>/dev/null + else + # Fall back to ping for DNS resolution + ping -c 1 -W 2 "$domain" &>/dev/null + fi +} + +check_url() { + local url=$1 + local http_code + + http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + --connect-timeout "$CONNECT_TIMEOUT" \ + --max-time $((CONNECT_TIMEOUT * 2)) \ + -L "$url" 2>/dev/null || echo "000") + + echo "$http_code" +} + +check_dependency() { + local entry=$1 + local domain purpose test_url + + IFS='|' read -r domain purpose test_url <<< "$entry" + + # Check DNS first + if ! check_dns "$domain"; then + log_fail "$domain - DNS resolution failed" + log_verbose "Purpose: $purpose" + log_verbose "Test URL: $test_url" + return 1 + fi + + # Check HTTP connectivity + local http_code + http_code=$(check_url "$test_url") + + if [[ "$http_code" =~ ^(2[0-9]{2}|3[0-9]{2})$ ]]; then + log_ok "$domain" + log_verbose "Purpose: $purpose" + log_verbose "HTTP status: $http_code" + log_verbose "Test URL: $test_url" + return 0 + else + case "$http_code" in + 000) + log_fail "$domain - Connection timed out" + ;; + 400) + # 400 is common for API endpoints at root - domain is reachable + log_ok "$domain" + log_verbose "Purpose: $purpose" + log_verbose "HTTP status: $http_code (API endpoint - root returns 400)" + log_verbose "Test URL: $test_url" + return 0 + ;; + 403) + log_fail "$domain - Access forbidden (HTTP 403)" + ;; + 404) + # 404 means domain is reachable, just URL changed + log_ok "$domain" + log_verbose "Purpose: $purpose" + log_verbose "HTTP status: $http_code (domain reachable)" + log_verbose "Test URL: $test_url" + return 0 + ;; + *) + log_fail "$domain - HTTP $http_code" + ;; + esac + log_verbose "Purpose: $purpose" + log_verbose "Test URL: $test_url" 
+ return 1 + fi +} + +# ============================================================================ +# Main +# ============================================================================ + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --verbose|-v) + VERBOSE=true + shift + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Check for curl +if ! command -v curl &>/dev/null; then + echo "Error: curl is required but not installed" + exit 1 +fi + +echo "" +echo -e "${BOLD}=== Claude Code Network Dependency Check ===${NC}" +echo "" +echo "Checking required domains..." +echo "" + +pass_count=0 +fail_count=0 +total=${#DEPENDENCIES[@]} + +for dep in "${DEPENDENCIES[@]}"; do + if check_dependency "$dep"; then + ((pass_count++)) + else + ((fail_count++)) + fi +done + +echo "" +echo "----------------------------------------" +echo -e "Result: ${BOLD}${pass_count}/${total}${NC} dependencies reachable" + +if [[ $fail_count -gt 0 ]]; then + echo "" + echo -e "${RED}FAILED: Some dependencies are not accessible${NC}" + echo "" + echo "Troubleshooting tips:" + echo " - Check firewall rules allow HTTPS (443) to the failed domains" + echo " - Verify proxy settings if behind a corporate proxy" + echo " - For air-gapped environments, use the offline installation module" + echo "" + exit 1 +else + echo "" + echo -e "${GREEN}SUCCESS: All dependencies are accessible${NC}" + echo "" + exit 0 +fi diff --git a/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh b/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh new file mode 100755 index 00000000..5679d84e --- /dev/null +++ b/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# +# Databricks Cluster Init Script - Claude Code CLI (Minimal Version) +# Installs Claude Code CLI with basic configuration only +# + +set -euo pipefail +export DEBIAN_FRONTEND=noninteractive + 
+LOG_FILE="/tmp/init-script-claude.log" +log() { + echo "[$(date '+%H:%M:%S')] $1" | tee -a "$LOG_FILE" +} + +# Install system dependencies +log "Installing system dependencies..." +sudo apt-get update -qq -y >> "$LOG_FILE" 2>&1 +sudo apt-get install -y -qq curl git >> "$LOG_FILE" 2>&1 || log "Warning: Some packages failed to install" + +# Install Node.js 20.x +if ! command -v node >/dev/null 2>&1; then + log "Installing Node.js 20.x..." + curl -fsSL --max-time 300 --retry 3 https://deb.nodesource.com/setup_20.x | sudo -E bash - >> "$LOG_FILE" 2>&1 + sudo apt-get install -y -qq nodejs >> "$LOG_FILE" 2>&1 + log "Node.js installed: $(node --version)" +else + log "Node.js already installed: $(node --version)" +fi + +# Install Claude Code CLI +if ! command -v claude >/dev/null 2>&1; then + log "Installing Claude Code CLI..." + curl -fsSL https://claude.ai/install.sh | bash >> "$LOG_FILE" 2>&1 + log "Claude Code CLI installed" +else + log "Claude Code CLI already installed" +fi + +# Add basic configuration to bashrc +log "Configuring bashrc..." + +# Remove old Claude section if it exists +if [ -f "$HOME/.bashrc" ]; then + sed -i '/### CLAUDE_CODE_MINIMAL_START ###/,/### CLAUDE_CODE_MINIMAL_END ###/d' "$HOME/.bashrc" || true +fi + +# Add Claude to PATH and set environment variables +cat >> "$HOME/.bashrc" <<'BASHRC_EOF' + +### CLAUDE_CODE_MINIMAL_START ### +# Claude Code CLI - Minimal Setup +export PATH="$HOME/.claude/bin:$HOME/.local/bin:$PATH" + +# Set Anthropic environment variables for Claude CLI +if [ -n "$DATABRICKS_TOKEN" ] && [ -n "$DATABRICKS_HOST" ]; then + export ANTHROPIC_AUTH_TOKEN="$DATABRICKS_TOKEN" + export ANTHROPIC_BASE_URL="${DATABRICKS_HOST}/serving-endpoints/anthropic" + export ANTHROPIC_MODEL="databricks-claude-sonnet-4-5" + export ANTHROPIC_CUSTOM_HEADERS="x-databricks-disable-beta-headers: true" + export CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS=1 +fi +### CLAUDE_CODE_MINIMAL_END ### +BASHRC_EOF + +log "Configuration complete. 
Log file: $LOG_FILE" +log "After cluster starts, run: source ~/.bashrc" diff --git a/modules/adb-coding-assistants-cluster/scripts/install-claude.sh b/modules/adb-coding-assistants-cluster/scripts/install-claude.sh new file mode 100755 index 00000000..52891fc2 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/scripts/install-claude.sh @@ -0,0 +1,760 @@ +#!/bin/bash +# +# Databricks Cluster Init Script - Claude Code CLI +# Installs Claude Code CLI with MLflow tracing +# +# Note: For offline/air-gapped installations, use the adb-coding-assistants-cluster-offline module instead +# + +set -uo pipefail +export DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a CI=true + +L="/tmp/init-script-claude.log" +log() { echo "[$(date '+%H:%M:%S')] $1" | tee -a "$L"; } +cmd_exists() { command -v "$1" >/dev/null 2>&1; } + +# Install Claude Code CLI +install_claude() { + if cmd_exists claude; then + log "[OK] Claude Code already installed" + return 0 + fi + + log "Installing Claude Code CLI..." + if curl -fsSL https://claude.ai/install.sh | bash &>>$L; then + log "[OK] Claude Code installation completed" + return 0 + else + log "[WARN] Claude Code installation failed (will be available after manual install)" + return 1 + fi +} + +# Install Node.js (required for Claude Code CLI) +install_nodejs() { + if cmd_exists node && cmd_exists npm; then + log "[OK] Node.js already installed ($(node --version))" + return 0 + fi + + log "Installing Node.js 20.x..." 
+ if curl -fsSL --max-time 300 --retry 3 https://deb.nodesource.com/setup_20.x | sudo -E bash - &>>$L; then + if sudo apt-get update -qq -y &>>$L && sudo apt-get install -y -qq nodejs &>>$L; then + if cmd_exists node && cmd_exists npm; then + log "[OK] Node.js/npm installed successfully ($(node --version))" + return 0 + fi + fi + fi + + log "[WARN] Node.js installation failed (Claude Code CLI will not work)" + return 1 +} + +# Add helper functions to bashrc +setup_bashrc() { + local START_MARKER="### CLAUDE_CODE_HELPERS_START ###" + local END_MARKER="### CLAUDE_CODE_HELPERS_END ###" + + # Backup bashrc + [ -f "$HOME/.bashrc" ] && cp "$HOME/.bashrc" "$HOME/.bashrc.backup-$(date +%s)" + + # Remove any existing Claude sections (between markers) + if [ -f "$HOME/.bashrc" ]; then + if grep -q "$START_MARKER" "$HOME/.bashrc" 2>/dev/null; then + log "Removing old bashrc helpers..." + # Remove everything between START and END markers (inclusive) + sed -i "/$START_MARKER/,/$END_MARKER/d" "$HOME/.bashrc" + fi + fi + + W="${DATABRICKS_HOST}" + E="${MLFLOW_EXPERIMENT_NAME:-/Workspace/Shared/claude-code-tracing}" + + log "Adding helpers to bashrc..." + + cat >> "$HOME/.bashrc" <<'EOF' + +### CLAUDE_CODE_HELPERS_START ### +# Claude Code CLI Setup (auto-generated - do not edit manually) +export PATH="$HOME/.claude/bin:$HOME/.local/bin:$PATH" + +# Claude Code MLflow tracing helpers +export DATABRICKS_HOST="${DATABRICKS_HOST:-WS_PH}" +export MLFLOW_EXPERIMENT_NAME="${MLFLOW_EXPERIMENT_NAME:-EXP_PH}" + +# Set Anthropic environment variables for Claude CLI +# NOTE: These env vars are the PRIMARY authentication method and take precedence +# over settings.json. They are always fresh because they're set on every login. +# The settings.json file serves as a fallback for cases where env vars aren't set. +# Using ANTHROPIC_AUTH_TOKEN only (not ANTHROPIC_API_KEY) to avoid auth conflicts. 
+if [ -n "$DATABRICKS_TOKEN" ] && [ -n "$DATABRICKS_HOST" ]; then + export ANTHROPIC_AUTH_TOKEN="$DATABRICKS_TOKEN" + export ANTHROPIC_BASE_URL="${DATABRICKS_HOST}/serving-endpoints/anthropic" + export ANTHROPIC_MODEL="databricks-claude-sonnet-4-5" + export ANTHROPIC_CUSTOM_HEADERS="x-databricks-disable-beta-headers: true" + export CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS=1 +fi + +# Internal function to generate Claude settings (single source of truth) +_generate_claude_config() { + local config_file="$HOME/.claude/settings.json" + + cat > "$config_file" </dev/null 2>&1; then + if ! jq empty "$config_file" 2>/dev/null; then + echo "[WARN] Claude settings JSON validation failed" >&2 + return 1 + fi + fi + + # Store token hash for change detection + if [ -n "$DATABRICKS_TOKEN" ]; then + echo -n "$DATABRICKS_TOKEN" | sha256sum | cut -d' ' -f1 > "$HOME/.claude/.token_hash" 2>/dev/null || true + fi + + return 0 +} + +# Check if token has changed and refresh if needed +_check_and_refresh_token() { + if [ -z "$DATABRICKS_TOKEN" ] || [ -z "$DATABRICKS_HOST" ]; then + return 0 # Skip if token not available + fi + + local config_file="$HOME/.claude/settings.json" + local token_hash_file="$HOME/.claude/.token_hash" + + # Calculate current token hash + local current_hash + current_hash=$(echo -n "$DATABRICKS_TOKEN" | sha256sum | cut -d' ' -f1 2>/dev/null || echo "") + + if [ -z "$current_hash" ]; then + return 0 # Skip if hash calculation failed + fi + + # Check if token has changed + if [ -f "$token_hash_file" ]; then + local stored_hash + stored_hash=$(cat "$token_hash_file" 2>/dev/null || echo "") + if [ "$current_hash" = "$stored_hash" ]; then + return 0 # Token unchanged, no refresh needed + fi + fi + + # Token changed or first time - refresh config + mkdir -p "$HOME/.claude" + if _generate_claude_config >/dev/null 2>&1; then + # Only show message if in interactive shell (not cron) + if [ -t 0 ]; then + echo "[OK] Claude Code token refreshed automatically" + fi + return 0 + 
fi + + return 1 +} + +# Auto-generate Claude settings from environment on first login +# NOTE: settings.json acts as a FALLBACK - env vars (set above) are the primary method. +# This is only generated if the file doesn't exist, to provide authentication when +# env vars might not be present (e.g., in some non-standard shell environments). +if [ ! -f "$HOME/.claude/settings.json" ] && [ -n "$DATABRICKS_TOKEN" ] && [ -n "$DATABRICKS_HOST" ]; then + mkdir -p "$HOME/.claude" + if _generate_claude_config; then + echo "[OK] Claude Code settings.json created (fallback - env vars take precedence)" + else + echo "[WARN] Failed to generate Claude settings (run claude-refresh-token to retry)" + fi +fi + +# Auto-refresh token on shell login if it has changed +# This ensures settings.json stays in sync with DATABRICKS_TOKEN +if [ -n "$DATABRICKS_TOKEN" ] && [ -n "$DATABRICKS_HOST" ]; then + _check_and_refresh_token +fi + +# Regenerate Claude settings from current environment +claude-refresh-token() { + if [ -z "$DATABRICKS_TOKEN" ] || [ -z "$DATABRICKS_HOST" ]; then + echo "[WARN] DATABRICKS_TOKEN and DATABRICKS_HOST must be set" + echo " On Databricks clusters, these should be automatically available" + return 1 + fi + + mkdir -p "$HOME/.claude" + if _generate_claude_config; then + echo "[OK] Claude Code settings updated with:" + echo " DATABRICKS_HOST: $DATABRICKS_HOST" + echo " DATABRICKS_TOKEN: ${DATABRICKS_TOKEN:0:20}..." 
+ else + echo "[WARN] Failed to update Claude settings" + return 1 + fi +} + +# Setup cron job for periodic token refresh (runs hourly) +claude-setup-token-refresh() { + local cron_cmd="[ -n \"\$DATABRICKS_TOKEN\" ] && [ -n \"\$DATABRICKS_HOST\" ] && source \"\$HOME/.bashrc\" && _check_and_refresh_token >/dev/null 2>&1" + local cron_job="0 * * * * $cron_cmd" + local cron_file="$HOME/.claude/token-refresh-cron" + + # Create cron wrapper script + mkdir -p "$HOME/.claude" + cat > "$cron_file" <<'CRON_SCRIPT' +#!/bin/bash +# Auto-generated cron script for Claude token refresh +# This script is called by cron to refresh the Claude token periodically + +# Source bashrc to get functions +if [ -f "$HOME/.bashrc" ]; then + source "$HOME/.bashrc" >/dev/null 2>&1 +fi + +# Check and refresh token if needed +_check_and_refresh_token +CRON_SCRIPT + chmod +x "$cron_file" + + # Check if cron job already exists + if crontab -l 2>/dev/null | grep -q "token-refresh-cron"; then + echo "[OK] Token refresh cron job already configured" + return 0 + fi + + # Add cron job + (crontab -l 2>/dev/null; echo "0 * * * * $cron_file") | crontab - + if [ $? 
-eq 0 ]; then + echo "[OK] Token refresh cron job configured (runs hourly)" + echo " To remove: crontab -e" + else + echo "[WARN] Failed to setup cron job (may require cron service)" + return 1 + fi +} + +# Remove token refresh cron job +claude-remove-token-refresh() { + if crontab -l 2>/dev/null | grep -q "token-refresh-cron"; then + crontab -l 2>/dev/null | grep -v "token-refresh-cron" | crontab - + echo "[OK] Token refresh cron job removed" + else + echo "[INFO] No token refresh cron job found" + fi +} + +# Check token freshness status +claude-token-status() { + if [ -z "$DATABRICKS_TOKEN" ] || [ -z "$DATABRICKS_HOST" ]; then + echo "[WARN] DATABRICKS_TOKEN and DATABRICKS_HOST must be set" + return 1 + fi + + local config_file="$HOME/.claude/settings.json" + local token_hash_file="$HOME/.claude/.token_hash" + + echo "=== Claude Token Status ===" + echo "" + + # Check if config file exists + if [ -f "$config_file" ]; then + echo "[OK] Settings file: $config_file" + local file_age + file_age=$(stat -c %Y "$config_file" 2>/dev/null || stat -f %m "$config_file" 2>/dev/null || echo "0") + local current_time + current_time=$(date +%s) + local age_hours + age_hours=$(( (current_time - file_age) / 3600 )) + echo " Last updated: ${age_hours} hour(s) ago" + else + echo "[ERROR] Settings file: missing" + fi + + echo "" + + # Check token hash + if [ -f "$token_hash_file" ]; then + local current_hash + current_hash=$(echo -n "$DATABRICKS_TOKEN" | sha256sum | cut -d' ' -f1 2>/dev/null || echo "") + local stored_hash + stored_hash=$(cat "$token_hash_file" 2>/dev/null || echo "") + if [ "$current_hash" = "$stored_hash" ] && [ -n "$current_hash" ]; then + echo "[OK] Token: matches stored hash (up to date)" + else + echo "[WARN] Token: differs from stored hash (needs refresh)" + echo " Run: claude-refresh-token" + fi + else + echo "[INFO] Token hash: not stored (will be created on next refresh)" + fi + + echo "" + + # Check cron job + if crontab -l 2>/dev/null | grep -q 
"token-refresh-cron"; then + echo "[OK] Auto-refresh: enabled (hourly cron job)" + else + echo "[INFO] Auto-refresh: disabled" + echo " Enable with: claude-setup-token-refresh" + fi +} + +claude-tracing-enable() { + if [ -z "$DATABRICKS_TOKEN" ] || [ -z "$DATABRICKS_HOST" ]; then + echo "[WARN] DATABRICKS_TOKEN and DATABRICKS_HOST must be set" + echo " On Databricks clusters, these should be automatically available" + return 1 + fi + + if ! command -v mlflow >/dev/null 2>&1; then + echo "[WARN] MLflow is not installed" + return 1 + fi + + # Create experiment if it doesn't exist + python3 </dev/null 2>&1; then + echo "[OK] Claude Code CLI: $(which claude)" + claude --version 2>&1 | head -1 || echo " (version check failed)" + else + echo "[ERROR] Claude Code CLI: not found" + [ -f "$HOME/.claude/bin/claude" ] && echo " Binary exists at: $HOME/.claude/bin/claude" + [ -f "$HOME/.local/bin/claude" ] && echo " Binary exists at: $HOME/.local/bin/claude" + fi + echo "" + + # Check configs + echo "Configuration files:" + if [ -f "$HOME/.claude/settings.json" ]; then + echo " [OK] Claude settings: $HOME/.claude/settings.json" + echo " Preview: $(head -3 $HOME/.claude/settings.json | tail -1)" + else + echo " [ERROR] Claude settings: missing" + fi + echo "" + + # Check environment + echo "Environment variables:" + [ -n "$DATABRICKS_HOST" ] && echo " [OK] DATABRICKS_HOST: ${DATABRICKS_HOST}" || echo " [ERROR] DATABRICKS_HOST: not set" + [ -n "$DATABRICKS_TOKEN" ] && echo " [OK] DATABRICKS_TOKEN: ${DATABRICKS_TOKEN:0:20}..." || echo " [ERROR] DATABRICKS_TOKEN: not set" + [ -n "$ANTHROPIC_API_KEY" ] && echo " [OK] ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:0:20}..." || echo " [ERROR] ANTHROPIC_API_KEY: not set" + [ -n "$ANTHROPIC_AUTH_TOKEN" ] && echo " [OK] ANTHROPIC_AUTH_TOKEN: ${ANTHROPIC_AUTH_TOKEN:0:20}..." 
|| echo " [ERROR] ANTHROPIC_AUTH_TOKEN: not set" + [ -n "$ANTHROPIC_BASE_URL" ] && echo " [OK] ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL}" || echo " [ERROR] ANTHROPIC_BASE_URL: not set" + [ -n "$ANTHROPIC_MODEL" ] && echo " [OK] ANTHROPIC_MODEL: ${ANTHROPIC_MODEL}" || echo " [ERROR] ANTHROPIC_MODEL: not set" + [ -n "$ANTHROPIC_CUSTOM_HEADERS" ] && echo " [OK] ANTHROPIC_CUSTOM_HEADERS: ${ANTHROPIC_CUSTOM_HEADERS}" || echo " [ERROR] ANTHROPIC_CUSTOM_HEADERS: not set" + echo "" + + # Check MLflow + if command -v mlflow >/dev/null 2>&1; then + echo "[OK] MLflow: $(mlflow --version 2>&1)" + else + echo "[ERROR] MLflow: not found" + fi + echo "" + + # Test Claude authentication + echo "Testing Claude CLI authentication:" + if command -v claude >/dev/null 2>&1; then + if [ -n "$ANTHROPIC_API_KEY" ] || [ -n "$ANTHROPIC_AUTH_TOKEN" ]; then + echo " [OK] Authentication configured via environment variables" + echo " Test with: echo 'what is 1+1?' | claude --print" + else + echo " [WARN] ANTHROPIC_API_KEY/ANTHROPIC_AUTH_TOKEN not set" + echo " Run: source ~/.bashrc" + fi + fi + echo "" + + # VS Code/Cursor Remote SSH info + echo "VS Code/Cursor Remote SSH:" + local venv_path + venv_path=$(claude-vscode-env 2>/dev/null) + if [ $? -eq 0 ] && [ -n "$venv_path" ]; then + echo " [OK] Python virtual environment: $venv_path" + echo " Run 'claude-vscode-setup' for setup instructions" + else + echo " [INFO] Run 'claude-vscode-setup' for Remote SSH setup guide" + fi + echo "" + + echo "Run 'source ~/.bashrc' if commands are still not found" +} + +claude-debug() { + echo "=== Claude CLI Debug Info ===" + echo "" + echo "Settings file:" + [ -f "$HOME/.claude/settings.json" ] && cat "$HOME/.claude/settings.json" || echo " Missing!" 
+ echo "" + echo "Environment:" + env | grep -E "ANTHROPIC|DATABRICKS" || echo " No relevant env vars" + echo "" + echo "Claude config directory:" + ls -la "$HOME/.claude/" 2>/dev/null || echo " Directory doesn't exist" +} + +# VS Code/Cursor Remote SSH helpers +claude-vscode-env() { + # Show the Databricks virtual environment path for VS Code/Cursor + if [ -n "$DATABRICKS_VIRTUAL_ENV" ]; then + echo "$DATABRICKS_VIRTUAL_ENV" + else + # Try to find pythonEnv-* directories + local python_envs + python_envs=$(find /databricks/python* -maxdepth 1 -type d -name "pythonEnv-*" 2>/dev/null | head -1) + if [ -n "$python_envs" ]; then + echo "$python_envs" + else + echo "[WARN] DATABRICKS_VIRTUAL_ENV not set and pythonEnv-* not found" + echo " Try: echo \$DATABRICKS_VIRTUAL_ENV" + return 1 + fi + fi +} + +claude-vscode-setup() { + echo "=== VS Code/Cursor Remote SSH Setup Guide ===" + echo "" + echo "1. Install Remote SSH Extension" + echo " - VS Code: Install 'Remote - SSH' extension" + echo " - Cursor: Built-in Remote SSH extension (already included)" + echo "" + echo "2. Configure Default Extensions" + echo " Open Command Palette (Cmd+Shift+P / Ctrl+Shift+P):" + echo " -> Remote-SSH: Settings" + echo "" + echo " Or edit settings.json and add:" + echo "" + cat <<'VSCODE_SETTINGS' + "remote.SSH.defaultExtensions": [ + "ms-Python.python", + "ms-toolsai.jupyter" + ] +VSCODE_SETTINGS + echo "" + echo "3. Connect to Cluster" + echo " - Command Palette -> Remote-SSH: Connect to Host" + echo " - Enter your cluster SSH connection details" + echo "" + echo "4. Select Python Interpreter" + echo " After connecting, run this command to get the Python path:" + echo "" + echo " $ claude-vscode-env" + echo "" + local venv_path + venv_path=$(claude-vscode-env 2>/dev/null) + if [ $? 
-eq 0 ] && [ -n "$venv_path" ]; then + echo " Current virtual environment:" + echo " $venv_path" + echo "" + echo " Then in VS Code/Cursor:" + echo " - Command Palette -> Python: Select Interpreter" + echo " - Paste the path above or browse to it" + else + echo " Run 'echo \$DATABRICKS_VIRTUAL_ENV' to find the path" + fi + echo "" + echo "5. Important Notes" + echo " * IPYNB notebooks and *.py Databricks notebooks have access to" + echo " Databricks globals (dbutils, spark, etc.)" + echo " * Regular Python *.py files do NOT have access to Databricks globals" + echo " * Always select the pythonEnv-xxx interpreter for full Databricks" + echo " Runtime library access" + echo "" + echo "6. Verify Setup" + echo " Run: claude-vscode-check" +} + +claude-vscode-check() { + echo "=== VS Code/Cursor Remote SSH Setup Check ===" + echo "" + + # Check for virtual environment + local venv_path + venv_path=$(claude-vscode-env 2>/dev/null) + if [ $? -eq 0 ] && [ -n "$venv_path" ]; then + echo "[OK] Python Virtual Environment:" + echo " $venv_path" + if [ -d "$venv_path/bin" ]; then + echo " [OK] Virtual environment directory exists" + if [ -f "$venv_path/bin/python" ]; then + echo " [OK] Python executable found" + echo " Python version: $($venv_path/bin/python --version 2>&1 || echo 'unknown')" + else + echo " [WARN] Python executable not found" + fi + else + echo " [WARN] Virtual environment directory not found" + fi + else + echo "[ERROR] Python Virtual Environment: Not found" + echo " Run: echo \$DATABRICKS_VIRTUAL_ENV" + fi + echo "" + + # Check for Python + if command -v python3 >/dev/null 2>&1; then + echo "[OK] Python3 available: $(which python3)" + echo " Version: $(python3 --version 2>&1)" + else + echo "[ERROR] Python3 not found in PATH" + fi + echo "" + + # Check for Databricks runtime libraries + echo "Databricks Runtime Libraries:" + python3 <<'PYTHON_CHECK' +import sys +libraries = ['pyspark', 'pandas', 'numpy', 'mlflow', 'databricks'] +found = [] +missing = [] + 
+for lib in libraries: + try: + __import__(lib) + found.append(lib) + except ImportError: + missing.append(lib) + +if found: + print(f" [OK] Available: {', '.join(found)}") +if missing: + print(f" [WARN] Missing: {', '.join(missing)}") + +# Check for Databricks globals (only available in notebooks) +try: + import dbutils + print(" [OK] dbutils available (notebook context)") +except: + print(" [INFO] dbutils not available (normal for .py files)") +PYTHON_CHECK + + echo "" + echo "VS Code/Cursor Configuration:" + echo " Run 'claude-vscode-setup' for setup instructions" + echo " Run 'claude-vscode-env' to get Python interpreter path" +} + +claude-vscode-config() { + # Generate VS Code settings.json snippet + local venv_path + venv_path=$(claude-vscode-env 2>/dev/null) + + echo "=== VS Code/Cursor settings.json Configuration ===" + echo "" + echo "Add this to your VS Code/Cursor settings.json:" + echo "" + echo "{" + echo " \"remote.SSH.defaultExtensions\": [" + echo " \"ms-Python.python\"," + echo " \"ms-toolsai.jupyter\"" + echo " ]" + if [ $? -eq 0 ] && [ -n "$venv_path" ]; then + echo "," + echo " \"python.defaultInterpreterPath\": \"$venv_path/bin/python\"" + fi + echo "}" + echo "" + if [ $? -eq 0 ] && [ -n "$venv_path" ]; then + echo "Python interpreter path:" + echo " $venv_path/bin/python" + echo "" + echo "To set this in VS Code/Cursor:" + echo " 1. Command Palette -> Python: Select Interpreter" + echo " 2. 
Enter interpreter path: $venv_path/bin/python" + else + echo "To find Python interpreter path, run:" + echo " claude-vscode-env" + fi +} +### CLAUDE_CODE_HELPERS_END ### +EOF + + sed -i "s|WS_PH|$W|g; s|EXP_PH|$E|g" "$HOME/.bashrc" + log "[OK] Bashrc helpers added" + log " Experiment: $E" +} + +# Install Databricks skills for Claude Code +install_databricks_skills() { + local skills_dir="$HOME/.claude/skills" + local repo_url="https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/main/databricks-skills" + + # Core skills to install (curated list for most common use cases) + local core_skills=( + "databricks-config" + "databricks-python-sdk" + "databricks-unity-catalog" + "databricks-jobs" + "asset-bundles" + "databricks-app-python" + "model-serving" + "mlflow-evaluation" + "aibi-dashboards" + "spark-declarative-pipelines" + ) + + log "Installing Databricks skills for Claude Code..." + + # Create skills directory + mkdir -p "$skills_dir" + + local installed=0 + local failed=0 + + for skill in "${core_skills[@]}"; do + local skill_dir="$skills_dir/$skill" + + # Skip if already exists + if [ -d "$skill_dir" ] && [ -f "$skill_dir/SKILL.md" ]; then + log " [INFO] Skill '$skill' already installed" + installed=$((installed + 1)) + continue + fi + + # Create skill directory + mkdir -p "$skill_dir" + + # Download SKILL.md (required) + if curl -sSL -f "${repo_url}/${skill}/SKILL.md" -o "$skill_dir/SKILL.md" 2>>$L; then + log " [OK] Installed skill: $skill" + installed=$((installed + 1)) + else + log " [WARN] Failed to download skill: $skill" + rm -rf "$skill_dir" + failed=$((failed + 1)) + fi + done + + if [ $installed -gt 0 ]; then + log "[OK] Databricks skills installed: $installed skills" + [ $failed -gt 0 ] && log "[WARN] Failed to install: $failed skills" + return 0 + else + log "[WARN] No Databricks skills installed" + return 1 + fi +} + +# Main installation +main() { + log "Starting installation..." 
+ + # Install system dependencies (curl, git, jq - commonly used by Claude Code) + log "Installing system dependencies..." + if sudo apt-get update -qq -y &>>$L; then + if sudo apt-get install -y -qq curl git jq &>>$L; then + log "[OK] System dependencies installed (curl, git, jq)" + else + log "[WARN] Some system dependencies failed to install" + fi + else + log "[WARN] apt-get update failed" + fi + + # Install MLflow with Databricks support + log "Installing MLflow with Databricks support..." + if pip install --quiet --upgrade "mlflow[databricks]>=3.4" &>>$L; then + log "[OK] MLflow installed successfully" + else + log "[WARN] MLflow installation failed (tracing features will not work)" + fi + + # Install tools (continue even if some fail) + install_nodejs || log "[WARN] Node.js installation skipped or failed" + install_claude || log "[WARN] Claude Code installation skipped or failed" + + # Install Databricks skills for Claude Code + install_databricks_skills || log "[WARN] Databricks skills installation incomplete" + + # Configure tools + if setup_bashrc; then + log "[OK] Bashrc configuration completed" + else + log "[WARN] Bashrc configuration failed" + fi + + log "" + log "=== Installation Summary ===" + log "Installation complete. Full log: $L" + log "" + log "Installed components:" + log " - Claude Code CLI" + log " - Node.js runtime" + log " - MLflow with Databricks support" + log " - Databricks skills (patterns and best practices)" + log "" + log "Next steps (on cluster login):" + log " 1. Run: source ~/.bashrc" + log " 2. Verify: check-claude" + log " 3. 
Use: claude command" + log "" + log "Databricks skills installed in: ~/.claude/skills/" + log "Skills available: databricks-config, python-sdk, unity-catalog," + log " jobs, asset-bundles, apps, model-serving, mlflow, dashboards, pipelines" + log "" + log "Helper commands:" + log " - check-claude: Verify installation status" + log " - claude-debug: Show Claude CLI configuration details" + log " - claude-refresh-token: Regenerate Claude settings" + log " - claude-token-status: Check token freshness and auto-refresh status" + log " - claude-setup-token-refresh: Enable hourly automatic token refresh (optional)" + log " - claude-remove-token-refresh: Disable automatic token refresh" + log " - claude-tracing-enable/disable/status: Manage MLflow tracing" + log " - claude-vscode-setup: Show VS Code/Cursor Remote SSH setup guide" + log " - claude-vscode-env: Get Python virtual environment path" + log " - claude-vscode-check: Verify VS Code/Cursor setup" + log " - claude-vscode-config: Generate VS Code settings.json snippet" + return 0 +} + +main +exit 0 diff --git a/modules/adb-coding-assistants-cluster/scripts/vscode-setup.sh b/modules/adb-coding-assistants-cluster/scripts/vscode-setup.sh new file mode 100755 index 00000000..89b21ada --- /dev/null +++ b/modules/adb-coding-assistants-cluster/scripts/vscode-setup.sh @@ -0,0 +1,248 @@ +#!/bin/bash +# +# VS Code/Cursor Remote SSH Setup Helper for Databricks Clusters +# This script helps configure VS Code or Cursor for remote development on Databricks clusters +# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { echo -e "${BLUE}ℹ${NC} $1"; } +log_success() { echo -e "${GREEN}✓${NC} $1"; } +log_warning() { echo -e "${YELLOW}⚠${NC} $1"; } +log_error() { echo -e "${RED}✗${NC} $1"; } + +# Find Databricks Python virtual environment +find_python_env() { + if [ -n "${DATABRICKS_VIRTUAL_ENV:-}" ]; then + echo 
"$DATABRICKS_VIRTUAL_ENV" + return 0 + fi + + # Try to find pythonEnv-* directories + local python_envs + python_envs=$(find /databricks/python* -maxdepth 1 -type d -name "pythonEnv-*" 2>/dev/null | head -1) + if [ -n "$python_envs" ]; then + echo "$python_envs" + return 0 + fi + + return 1 +} + +# Show setup instructions +show_setup_guide() { + echo "==========================================" + echo "VS Code/Cursor Remote SSH Setup Guide" + echo "==========================================" + echo "" + + echo "1. Install Remote SSH Extension" + echo " • VS Code: Install 'Remote - SSH' extension from marketplace" + echo " • Cursor: Built-in Remote SSH extension (already included)" + echo "" + + echo "2. Configure Default Extensions" + echo " Open Command Palette (Cmd+Shift+P / Ctrl+Shift+P):" + echo " → Type: Remote-SSH: Settings" + echo "" + echo " Or manually edit settings.json:" + echo "" + echo " {" + echo " \"remote.SSH.defaultExtensions\": [" + echo " \"ms-Python.python\"," + echo " \"ms-toolsai.jupyter\"" + echo " ]" + echo " }" + echo "" + + echo "3. Connect to Cluster" + echo " • Command Palette → Remote-SSH: Connect to Host" + echo " • Enter your cluster SSH connection details" + echo " • Format: user@hostname or use SSH config entry" + echo "" + + echo "4. 
Select Python Interpreter" + local venv_path + if venv_path=$(find_python_env 2>/dev/null); then + echo " ✓ Found Python virtual environment:" + echo " $venv_path" + echo "" + echo " In VS Code/Cursor:" + echo " • Command Palette → Python: Select Interpreter" + echo " • Enter interpreter path:" + echo " $venv_path/bin/python" + echo "" + echo " Or copy this path:" + echo " $venv_path/bin/python" + else + echo " ⚠ Could not auto-detect Python virtual environment" + echo " Run this command to find it:" + echo " echo \$DATABRICKS_VIRTUAL_ENV" + echo "" + echo " Then in VS Code/Cursor:" + echo " • Command Palette → Python: Select Interpreter" + echo " • Paste the path from above" + fi + echo "" + + echo "5. Important Notes" + echo " • IPYNB notebooks and *.py Databricks notebooks have access to" + echo " Databricks globals (dbutils, spark, etc.)" + echo " • Regular Python *.py files do NOT have access to Databricks globals" + echo " • Always select the pythonEnv-xxx interpreter for full Databricks" + echo " Runtime library access (pyspark, pandas, numpy, mlflow, etc.)" + echo "" + + echo "6. 
Verify Setup" + echo " After connecting, verify Python interpreter:" + echo " • Command Palette → Python: Select Interpreter" + echo " • Should show: pythonEnv-xxx/bin/python" + echo "" + echo " Test in a Python file:" + echo " import pyspark" + echo " import pandas" + echo " print('Setup successful!')" +} + +# Generate VS Code settings.json snippet +generate_settings() { + local venv_path + venv_path=$(find_python_env 2>/dev/null || echo "") + + echo "{" + echo " \"remote.SSH.defaultExtensions\": [" + echo " \"ms-Python.python\"," + echo " \"ms-toolsai.jupyter\"" + echo " ]" + if [ -n "$venv_path" ]; then + echo "," + echo " \"python.defaultInterpreterPath\": \"$venv_path/bin/python\"" + fi + echo "}" +} + +# Check current setup +check_setup() { + echo "==========================================" + echo "VS Code/Cursor Setup Check" + echo "==========================================" + echo "" + + # Check for virtual environment + local venv_path + if venv_path=$(find_python_env 2>/dev/null); then + log_success "Python Virtual Environment found:" + echo " $venv_path" + + if [ -d "$venv_path/bin" ]; then + log_success "Virtual environment directory exists" + if [ -f "$venv_path/bin/python" ]; then + log_success "Python executable found" + echo " Python version: $($venv_path/bin/python --version 2>&1 || echo 'unknown')" + else + log_warning "Python executable not found" + fi + else + log_warning "Virtual environment directory not found" + fi + else + log_error "Python Virtual Environment not found" + echo " Run: echo \$DATABRICKS_VIRTUAL_ENV" + fi + echo "" + + # Check for Python + if command -v python3 >/dev/null 2>&1; then + log_success "Python3 available: $(which python3)" + echo " Version: $(python3 --version 2>&1)" + else + log_error "Python3 not found in PATH" + fi + echo "" + + # Check for Databricks runtime libraries + echo "Databricks Runtime Libraries:" + python3 <<'PYTHON_CHECK' +import sys +libraries = ['pyspark', 'pandas', 'numpy', 'mlflow'] +found = [] 
+missing = [] + +for lib in libraries: + try: + __import__(lib) + found.append(lib) + except ImportError: + missing.append(lib) + +if found: + print(f" ✓ Available: {', '.join(found)}") +if missing: + print(f" ⚠ Missing: {', '.join(missing)}") + +# Check for Databricks globals (only available in notebooks) +try: + import dbutils + print(" ✓ dbutils available (notebook context)") +except: + print(" ℹ dbutils not available (normal for .py files)") +PYTHON_CHECK + + echo "" + echo "Next steps:" + echo " • Run this script with --guide to see setup instructions" + echo " • Run this script with --settings to generate settings.json" +} + +# Main +main() { + case "${1:-}" in + --guide|-g) + show_setup_guide + ;; + --settings|-s) + generate_settings + ;; + --check|-c) + check_setup + ;; + --env|-e) + find_python_env || { + log_error "Could not find Python virtual environment" + echo "Try: echo \$DATABRICKS_VIRTUAL_ENV" + exit 1 + } + ;; + --help|-h|"") + echo "VS Code/Cursor Remote SSH Setup Helper" + echo "" + echo "Usage: $0 [OPTION]" + echo "" + echo "Options:" + echo " --guide, -g Show complete setup guide" + echo " --settings, -s Generate VS Code settings.json snippet" + echo " --check, -c Check current setup status" + echo " --env, -e Show Python virtual environment path" + echo " --help, -h Show this help message" + echo "" + echo "Examples:" + echo " $0 --guide # Show setup instructions" + echo " $0 --env # Get Python interpreter path" + echo " $0 --check # Verify setup" + ;; + *) + log_error "Unknown option: $1" + echo "Run '$0 --help' for usage information" + exit 1 + ;; + esac +} + +main "$@" diff --git a/modules/adb-coding-assistants-cluster/variables.tf b/modules/adb-coding-assistants-cluster/variables.tf new file mode 100644 index 00000000..cb751f24 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/variables.tf @@ -0,0 +1,94 @@ +variable "cluster_name" { + description = "Name of the Databricks cluster" + type = string +} + +variable "catalog_name" { + 
description = "Unity Catalog catalog name for the volume. The metastore must have a root storage credential configured." + type = string +} + +variable "schema_name" { + description = "Schema name for the volume" + type = string + default = "default" +} + +variable "volume_name" { + description = "Volume name to store init scripts" + type = string + default = "coding_assistants" +} + +variable "init_script_source_path" { + description = "Local path to the init script" + type = string + default = null +} + +variable "spark_version" { + description = "Databricks Runtime version" + type = string + default = "17.3.x-cpu-ml-scala2.13" +} + +variable "node_type_id" { + description = "Node type for the cluster. Default is Standard_D8pds_v6 (modern, premium SSD + local NVMe). If unavailable in your region, consider Standard_DS13_v2 as fallback." + type = string + default = "Standard_D4ds_v5" +} + +variable "autotermination_minutes" { + description = "Minutes of inactivity before cluster auto-terminates" + type = number + default = 30 +} + +variable "num_workers" { + description = "Number of worker nodes (null for autoscaling). For SINGLE_NODE clusters, this is automatically set to 0." + type = number + default = null +} + +variable "min_workers" { + description = "Minimum number of workers for autoscaling" + type = number + default = 1 +} + +variable "max_workers" { + description = "Maximum number of workers for autoscaling" + type = number + default = 3 +} + +variable "mlflow_experiment_name" { + description = "MLflow experiment name for Claude Code tracing" + type = string + default = "/Workspace/Shared/claude-code-tracing" +} + +variable "cluster_mode" { + description = <<-EOT + Cluster mode: STANDARD or SINGLE_NODE. + - STANDARD: Multi-node cluster with worker nodes (supports autoscaling) + - SINGLE_NODE: Single-node cluster with no worker nodes (driver-only, runs Spark locally). 
+ For SINGLE_NODE clusters, num_workers is automatically set to 0 and autoscaling is disabled. + EOT + type = string + default = "STANDARD" + + validation { + condition = contains(["STANDARD", "SINGLE_NODE"], var.cluster_mode) + error_message = "cluster_mode must be either STANDARD or SINGLE_NODE" + } +} + +variable "tags" { + description = "Custom tags for the cluster" + type = map(string) + default = { + Environment = "dev" + Purpose = "coding-assistants" + } +} diff --git a/modules/adb-coding-assistants-cluster/versions.tf b/modules/adb-coding-assistants-cluster/versions.tf new file mode 100644 index 00000000..07223296 --- /dev/null +++ b/modules/adb-coding-assistants-cluster/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + databricks = { + source = "databricks/databricks" + version = ">= 1.40.0" + } + } +} From 2f9526eab9466f97ee558ed33676da1b76712086 Mon Sep 17 00:00:00 2001 From: dgokeeffe Date: Thu, 5 Feb 2026 13:42:48 +1100 Subject: [PATCH 2/3] fix: address Copilot review comments on PR #227 - Add safe expansion for DATABRICKS_HOST to prevent crash under set -u - Remove unused local variables in claude-setup-token-refresh() - Fix broken $? 
check in claude-vscode-config() by capturing exit code - Use python3 -m pip instead of bare pip for safer execution - Add supply-chain verification comment to minimal installer - Fix node_type_id description to match actual default value - Update scripts README with all available scripts - Remove wget from system tools list (not installed) - Remove OpenCode references throughout documentation Co-Authored-By: Claude Opus 4.5 --- .../adb-coding-assistants-cluster/README.md | 108 +++++++++++++----- .../scripts/README.md | 13 ++- .../scripts/install-claude-minimal.sh | 2 + .../scripts/install-claude.sh | 14 +-- .../variables.tf | 2 +- 5 files changed, 99 insertions(+), 40 deletions(-) diff --git a/modules/adb-coding-assistants-cluster/README.md b/modules/adb-coding-assistants-cluster/README.md index 8838f877..a699ee71 100644 --- a/modules/adb-coding-assistants-cluster/README.md +++ b/modules/adb-coding-assistants-cluster/README.md @@ -24,32 +24,84 @@ This module can be used to deploy the following: ## Architecture ``` -┌─────────────────────────────────────────────────────┐ -│ Unity Catalog Volume │ -│ /Volumes//// │ -│ └── install-claude.sh │ -└─────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────┐ -│ Databricks Cluster (on startup) │ -│ │ -│ 1. Executes init script from volume │ -│ 2. Installs Node.js, OpenCode, Claude CLI │ -│ 3. Configures bashrc with helper functions │ -│ 4. 
Auto-generates configs on user login │ -└─────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────┐ -│ User Login │ -│ │ -│ • DATABRICKS_TOKEN available from environment │ -│ • Configs auto-generate: │ -│ - ~/.claude/settings.json │ -│ - ~/.opencode/config.json │ -│ • Commands ready: claude, opencode │ -└─────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ LOCAL MACHINE │ +│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ +│ │ Local Terminal │ │ Databricks CLI │ │ VS Code / Cursor │ │ +│ │ │ │ (SSH Setup) │ │ (Remote SSH Ext) │ │ +│ └────────┬─────────┘ └────────┬─────────┘ └────────┬─────────┘ │ +└───────────┼─────────────────────┼─────────────────────┼─────────────────────────┘ + │ │ │ + │ ┌────────────────┴─────────────────────┘ + │ │ 3. databricks ssh setup + ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ CONNECTION LAYER │ +│ ┌───────────────────────────────────────────────────────────────────────────┐ │ +│ │ SSH Tunnel ─────────────────────────────────────────────────────────────│ │ +│ │ Port Forwarding (8501 for Streamlit, etc.) │ │ +│ └───────────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────┬───────────────────────────────────────┘ + │ 4. VS Code connects + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ DATABRICKS WORKSPACE │ +│ │ +│ ┌─────────────────────────────┐ ┌─────────────────────────────────────┐ │ +│ │ UNITY CATALOG │ │ SINGLE-NODE DEV CLUSTER │ │ +│ │ ┌───────────────────────┐ │ │ │ │ +│ │ │ Catalog │ │ │ ┌───────────────────────────────┐ │ │ +│ │ │ └── Schema │ │ │ │ Init Script Execution │ │ │ +│ │ │ └── Volume │──┼──────│──│ (on cluster startup) │ │ │ +│ │ │ └── init │ │ 2. 
│ └───────────────┬───────────────┘ │ │ +│ │ │ script │ │ │ │ installs │ │ +│ │ └───────────────────────┘ │ │ ▼ │ │ +│ │ ▲ │ │ ┌───────────────────────────────┐ │ │ +│ │ │ 1. Deploy via │ │ │ DRIVER NODE │ │ │ +│ │ │ Terraform │ │ │ ┌────────────┬────────────┐ │ │ │ +│ └─────────┼───────────────────┘ │ │ │Claude Code │ Node.js │ │ │ │ +│ │ │ │ │ CLI │ Runtime │ │ │ │ +│ │ │ │ ├────────────┼────────────┤ │ │ │ +│ │ │ │ │ Databricks │ Bash │ │ │ │ +│ │ │ │ │ Python │ Helpers │ │ │ │ +│ │ │ │ ├────────────┴────────────┤ │ │ │ +│ │ │ │ │ tmux (persistent) │ │ │ │ +│ │ │ │ └─────────────────────────┘ │ │ │ +│ │ │ └───────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ │ ┌───────────────────────────────┐ │ │ +│ │ │ │ Workspace Storage │ │ │ +│ │ │ │ /Workspace/Users// │ │ │ +│ │ │ └───────────────┬───────────────┘ │ │ +│ │ │ │ 5. git commit │ │ +│ │ └──────────────────┼──────────────────┘ │ +└────────────┼─────────────────────────────────────────────┼──────────────────────┘ + │ │ + │ ┌────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ EXTERNAL SERVICES │ +│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ +│ │ Claude AI API │ │ MLflow │ │ GitHub / Git │ │ +│ │ (Anthropic) │ │ (Session Tracing)│ │ (Version Control)│ │ +│ └──────────────────┘ └──────────────────┘ └──────────────────┘ │ +│ ▲ ▲ ▲ │ +│ │ 6. API calls │ 7. Trace sessions │ │ +│ └──────────────────────┴──────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ ``` +### Workflow + +1. **Terraform deploys** the Unity Catalog volume and cluster configuration +2. **Init script runs** on cluster startup, installing Claude Code CLI and tools +3. **User configures SSH** via `databricks ssh setup` command +4. **VS Code connects** via Remote SSH extension to the cluster +5. **Code is stored** in `/Workspace/Users//` and committed to Git +6. 
**Claude Code CLI** calls the Claude AI API for coding assistance
+7. **Sessions traced** to MLflow for observability
+
 ## Prerequisites
 
 - Databricks workspace with Unity Catalog enabled
@@ -349,7 +401,7 @@ Check cluster logs:
 source ~/.bashrc
 
 # Check PATH
-echo $PATH | grep -E "(claude|opencode)"
+echo $PATH | grep claude
 
 # Verify installation
 check-coding-assistants
@@ -366,7 +418,6 @@ echo $DATABRICKS_TOKEN
 
 # Regenerate configs
 claude-refresh-token
-opencode-refresh-config
 ```
 
 ### Script Size Limit
@@ -395,7 +447,7 @@ GRANT READ VOLUME ON VOLUME <catalog>.<schema>.<volume> TO <principal>;
 - Tokens are **never hardcoded** in configs
 - Read from environment: `$DATABRICKS_TOKEN`
 - Configs regenerate per session
-- Settings files are user-readable only (`~/.claude/`, `~/.opencode/`)
+- Settings files are user-readable only (`~/.claude/`)
 
 ## Maintenance
 
@@ -458,4 +510,4 @@ For issues related to:
 - **Module**: Open an issue in this repository
 - **Init Script**: See the init script documentation
 - **Databricks Platform**: Contact Databricks support
-- **Claude/OpenCode**: Contact Anthropic or OpenCode support respectively
+- **Claude**: Contact Anthropic support
diff --git a/modules/adb-coding-assistants-cluster/scripts/README.md b/modules/adb-coding-assistants-cluster/scripts/README.md
index ae644a3a..d4ddcecd 100644
--- a/modules/adb-coding-assistants-cluster/scripts/README.md
+++ b/modules/adb-coding-assistants-cluster/scripts/README.md
@@ -6,7 +6,10 @@ This directory contains installation scripts for Claude Code CLI on Databricks c
 
 | Script | Purpose | Network Required |
 |--------|---------|------------------|
-| `install-claude.sh` | Online installation (default) | ✅ Yes |
+| `install-claude.sh` | Full online installation with MLflow tracing | Yes |
+| `install-claude-minimal.sh` | Minimal installation (Claude CLI only) | Yes |
+| `vscode-setup.sh` | VS Code/Cursor Remote SSH helper | No |
+| `check-network-deps.sh` | Network connectivity preflight check | Yes |
 
 >
**Note**: For offline/air-gapped installations, use the separate [`adb-coding-assistants-cluster-offline`](../adb-coding-assistants-cluster-offline/README.md) module. @@ -40,7 +43,7 @@ The script installs: - ✅ **Node.js 20.x** - Required runtime for Claude CLI - ✅ **Claude Code CLI** - AI coding assistant - ✅ **MLflow** - For tracing Claude interactions -- ✅ **System tools** - curl, wget, git, jq +- ✅ **System tools** - curl, git, jq - ✅ **Bash helpers** - Convenience functions for using Claude ## Helper Commands @@ -315,9 +318,11 @@ claude-debug ``` scripts/ -├── install-claude.sh # Online installer +├── install-claude.sh # Full online installer with MLflow +├── install-claude-minimal.sh # Minimal installer (Claude CLI only) +├── vscode-setup.sh # VS Code/Cursor Remote SSH helper ├── check-network-deps.sh # Network dependency checker -└── README.md # This file +└── README.md # This file ``` > **Offline Installation**: See the [`adb-coding-assistants-cluster-offline`](../adb-coding-assistants-cluster-offline/README.md) module for offline/air-gapped installation support. diff --git a/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh b/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh index 5679d84e..3a365444 100755 --- a/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh +++ b/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh @@ -28,6 +28,8 @@ else fi # Install Claude Code CLI +# Note: Uses official Anthropic installer. For supply-chain verification, +# consider npm install @anthropic-ai/claude-code instead. if ! command -v claude >/dev/null 2>&1; then log "Installing Claude Code CLI..." 
curl -fsSL https://claude.ai/install.sh | bash >> "$LOG_FILE" 2>&1 diff --git a/modules/adb-coding-assistants-cluster/scripts/install-claude.sh b/modules/adb-coding-assistants-cluster/scripts/install-claude.sh index 52891fc2..26f12eb1 100755 --- a/modules/adb-coding-assistants-cluster/scripts/install-claude.sh +++ b/modules/adb-coding-assistants-cluster/scripts/install-claude.sh @@ -68,7 +68,7 @@ setup_bashrc() { fi fi - W="${DATABRICKS_HOST}" + W="${DATABRICKS_HOST:-}" E="${MLFLOW_EXPERIMENT_NAME:-/Workspace/Shared/claude-code-tracing}" log "Adding helpers to bashrc..." @@ -206,8 +206,6 @@ claude-refresh-token() { # Setup cron job for periodic token refresh (runs hourly) claude-setup-token-refresh() { - local cron_cmd="[ -n \"\$DATABRICKS_TOKEN\" ] && [ -n \"\$DATABRICKS_HOST\" ] && source \"\$HOME/.bashrc\" && _check_and_refresh_token >/dev/null 2>&1" - local cron_job="0 * * * * $cron_cmd" local cron_file="$HOME/.claude/token-refresh-cron" # Create cron wrapper script @@ -586,8 +584,10 @@ PYTHON_CHECK claude-vscode-config() { # Generate VS Code settings.json snippet local venv_path + local venv_rc venv_path=$(claude-vscode-env 2>/dev/null) - + venv_rc=$? + echo "=== VS Code/Cursor settings.json Configuration ===" echo "" echo "Add this to your VS Code/Cursor settings.json:" @@ -597,13 +597,13 @@ claude-vscode-config() { echo " \"ms-Python.python\"," echo " \"ms-toolsai.jupyter\"" echo " ]" - if [ $? -eq 0 ] && [ -n "$venv_path" ]; then + if [ $venv_rc -eq 0 ] && [ -n "$venv_path" ]; then echo "," echo " \"python.defaultInterpreterPath\": \"$venv_path/bin/python\"" fi echo "}" echo "" - if [ $? -eq 0 ] && [ -n "$venv_path" ]; then + if [ $venv_rc -eq 0 ] && [ -n "$venv_path" ]; then echo "Python interpreter path:" echo " $venv_path/bin/python" echo "" @@ -702,7 +702,7 @@ main() { # Install MLflow with Databricks support log "Installing MLflow with Databricks support..." 
- if pip install --quiet --upgrade "mlflow[databricks]>=3.4" &>>$L; then + if python3 -m pip install --quiet --upgrade "mlflow[databricks]>=3.4" &>>$L; then log "[OK] MLflow installed successfully" else log "[WARN] MLflow installation failed (tracing features will not work)" diff --git a/modules/adb-coding-assistants-cluster/variables.tf b/modules/adb-coding-assistants-cluster/variables.tf index cb751f24..b3987843 100644 --- a/modules/adb-coding-assistants-cluster/variables.tf +++ b/modules/adb-coding-assistants-cluster/variables.tf @@ -33,7 +33,7 @@ variable "spark_version" { } variable "node_type_id" { - description = "Node type for the cluster. Default is Standard_D8pds_v6 (modern, premium SSD + local NVMe). If unavailable in your region, consider Standard_DS13_v2 as fallback." + description = "Node type for the cluster. Default is Standard_D4ds_v5 (modern, premium SSD). If unavailable in your region, consider Standard_DS13_v2 as fallback." type = string default = "Standard_D4ds_v5" } From 5604c8226b3bb67c047be72a8ddaf841c86233d4 Mon Sep 17 00:00:00 2001 From: dgokeeffe Date: Fri, 20 Feb 2026 22:15:18 +1100 Subject: [PATCH 3/3] refactor: inline module resources into example, remove standalone module Per review feedback, not every example needs a corresponding module. Inlined all resources from modules/adb-coding-assistants-cluster/ directly into examples/adb-coding-assistants-cluster/ to make it self-contained. Copied scripts into the example directory and updated all references. 
Co-Authored-By: Claude Opus 4.6 --- README.md | 1 - .../adb-coding-assistants-cluster/README.md | 31 +- .../adb-coding-assistants-cluster/main.tf | 99 +++- .../adb-coding-assistants-cluster/outputs.tf | 24 +- .../providers.tf | 8 +- .../scripts/README.md | 0 .../scripts/check-network-deps.sh | 0 .../scripts/install-claude-minimal.sh | 0 .../scripts/install-claude.sh | 0 .../scripts/vscode-setup.sh | 0 .../adb-coding-assistants-cluster/Makefile | 7 - .../adb-coding-assistants-cluster/README.md | 513 ------------------ modules/adb-coding-assistants-cluster/main.tf | 83 --- .../adb-coding-assistants-cluster/outputs.tf | 34 -- .../variables.tf | 94 ---- .../adb-coding-assistants-cluster/versions.tf | 10 - 16 files changed, 108 insertions(+), 796 deletions(-) rename {modules => examples}/adb-coding-assistants-cluster/scripts/README.md (100%) rename {modules => examples}/adb-coding-assistants-cluster/scripts/check-network-deps.sh (100%) rename {modules => examples}/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh (100%) rename {modules => examples}/adb-coding-assistants-cluster/scripts/install-claude.sh (100%) rename {modules => examples}/adb-coding-assistants-cluster/scripts/vscode-setup.sh (100%) delete mode 100644 modules/adb-coding-assistants-cluster/Makefile delete mode 100644 modules/adb-coding-assistants-cluster/README.md delete mode 100644 modules/adb-coding-assistants-cluster/main.tf delete mode 100644 modules/adb-coding-assistants-cluster/outputs.tf delete mode 100644 modules/adb-coding-assistants-cluster/variables.tf delete mode 100644 modules/adb-coding-assistants-cluster/versions.tf diff --git a/README.md b/README.md index 8c0ad6f6..dcfda364 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,6 @@ The folder `modules` contains the following Terraform modules : | Azure | [adb-overwatch-main-ws](modules/adb-overwatch-main-ws/) | Main Overwatch workspace deployment | | Azure | [adb-overwatch-ws-to-monitor](modules/adb-overwatch-ws-to-monitor/) | 
Overwatch deployment on the Azure workspace to monitor | | Azure | [adb-overwatch-analysis](modules/adb-overwatch-analysis/) | Overwatch analysis notebooks deployment on Azure | -| Azure | [adb-coding-assistants-cluster](modules/adb-coding-assistants-cluster/) | Databricks cluster with Claude Code CLI for AI-assisted development | | AWS | [aws-workspace-basic](modules/aws-workspace-basic/) | Provisioning AWS Databricks E2 | | AWS | [aws-databricks-base-infra](modules/aws-databricks-base-infra/) | Provisioning AWS Infrastructure to be used for the deployment of a Databricks E2 workspace | | AWS | [aws-databricks-unity-catalog](modules/aws-databricks-unity-catalog/) | Provisioning the AWS Infrastructure and setting up the metastore for Databricks Unity Catalog | diff --git a/examples/adb-coding-assistants-cluster/README.md b/examples/adb-coding-assistants-cluster/README.md index 6dfaa8d7..2d0ff7bd 100644 --- a/examples/adb-coding-assistants-cluster/README.md +++ b/examples/adb-coding-assistants-cluster/README.md @@ -1,8 +1,6 @@ # Provisioning Databricks Cluster with Claude Code CLI -This example uses the [adb-coding-assistants-cluster](../../modules/adb-coding-assistants-cluster) module. - -This template provides an example deployment of a Databricks cluster pre-configured with Claude Code CLI for AI-assisted development directly on the cluster. +This template provides a self-contained deployment of a Databricks cluster pre-configured with Claude Code CLI for AI-assisted development directly on the cluster. ## What Gets Deployed @@ -13,20 +11,16 @@ This template provides an example deployment of a Databricks cluster pre-configu ## How to use -> **Note** -> A detailed module README with full configuration options can be found in [modules/adb-coding-assistants-cluster](../../modules/adb-coding-assistants-cluster) - -1. Reference this module using one of the different [module source types](https://developer.hashicorp.com/terraform/language/modules/sources) -2. 
Copy `terraform.tfvars.example` to `terraform.tfvars` -3. Update `terraform.tfvars` with your values: +1. Copy `terraform.tfvars.example` to `terraform.tfvars` +2. Update `terraform.tfvars` with your values: - `databricks_resource_id`: Your Azure Databricks workspace resource ID - `cluster_name`: Name for your cluster - `catalog_name`: Unity Catalog name to use -4. (Optional) Customize cluster configuration in `terraform.tfvars` (node type, autoscaling, etc.) -5. (Optional) Configure your [remote backend](https://developer.hashicorp.com/terraform/language/settings/backends/azurerm) -6. Run `terraform init` to initialize terraform and get provider ready -7. Run `terraform plan` to review the resources that will be created -8. Run `terraform apply` to create the resources +3. (Optional) Customize cluster configuration in `terraform.tfvars` (node type, autoscaling, etc.) +4. (Optional) Configure your [remote backend](https://developer.hashicorp.com/terraform/language/settings/backends/azurerm) +5. Run `terraform init` to initialize terraform and get provider ready +6. Run `terraform plan` to review the resources that will be created +7. 
Run `terraform apply` to create the resources ## Prerequisites @@ -303,10 +297,7 @@ claude-refresh-token ## Additional Resources -- [Module Documentation](../../modules/adb-coding-assistants-cluster/README.md) -- [Offline Module Documentation](../../modules/adb-coding-assistants-cluster-offline/README.md) -- [Offline Installation Guide](../../modules/adb-coding-assistants-cluster-offline/scripts/OFFLINE-INSTALLATION.md) -- [Scripts Documentation](../../modules/adb-coding-assistants-cluster/scripts/README.md) +- [Scripts Documentation](scripts/README.md) - [Databricks Init Scripts Documentation](https://docs.databricks.com/clusters/init-scripts.html) - [Unity Catalog Volumes Documentation](https://docs.databricks.com/data-governance/unity-catalog/volumes.html) @@ -327,9 +318,7 @@ claude-refresh-token ## Modules -| Name | Source | Version | -|------|--------|---------| -| [claude\_cluster](#module\_claude\_cluster) | ../../modules/adb-coding-assistants-cluster | n/a | +No modules. ## Resources diff --git a/examples/adb-coding-assistants-cluster/main.tf b/examples/adb-coding-assistants-cluster/main.tf index cd2519e6..80fe58a3 100644 --- a/examples/adb-coding-assistants-cluster/main.tf +++ b/examples/adb-coding-assistants-cluster/main.tf @@ -1,21 +1,86 @@ # Cluster with Claude Code CLI coding assistant # Provider configuration is in providers.tf -module "claude_cluster" { - source = "../../modules/adb-coding-assistants-cluster" - - cluster_name = var.cluster_name - catalog_name = var.catalog_name - schema_name = var.schema_name - volume_name = var.volume_name - init_script_source_path = var.init_script_source_path - spark_version = var.spark_version - node_type_id = var.node_type_id - autotermination_minutes = var.autotermination_minutes - num_workers = var.num_workers - min_workers = var.min_workers - max_workers = var.max_workers - mlflow_experiment_name = var.mlflow_experiment_name - cluster_mode = var.cluster_mode - tags = var.tags + +# Data source to get current 
user +data "databricks_current_user" "me" {} + +# Local value for init script path +locals { + init_script_path = var.init_script_source_path != null ? var.init_script_source_path : "${path.root}/scripts/install-claude.sh" +} + +# Create or reference the volume for init scripts +resource "databricks_volume" "init_scripts" { + name = var.volume_name + catalog_name = var.catalog_name + schema_name = var.schema_name + volume_type = "MANAGED" + comment = "Volume for Claude Code CLI init scripts" + + lifecycle { + ignore_changes = [owner] + } +} + +# Upload the init script to the volume +resource "databricks_file" "init_script" { + source = local.init_script_path + path = "${databricks_volume.init_scripts.volume_path}/install-claude.sh" } +# Create the cluster with init script +resource "databricks_cluster" "coding_assistants" { + cluster_name = var.cluster_name + spark_version = var.spark_version + node_type_id = var.node_type_id + autotermination_minutes = var.autotermination_minutes + data_security_mode = "SINGLE_USER" + single_user_name = data.databricks_current_user.me.user_name + + # Autoscaling or fixed size + # Autoscaling is not supported for single-node clusters + dynamic "autoscale" { + for_each = var.cluster_mode == "STANDARD" && var.num_workers == null ? [1] : [] + content { + min_workers = var.min_workers + max_workers = var.max_workers + } + } + + # For single-node clusters, num_workers must be 0 (driver-only) + # For standard clusters, use the provided num_workers value + num_workers = var.cluster_mode == "SINGLE_NODE" ? 0 : var.num_workers + + # Single node configuration + # According to Databricks docs: single-node clusters run Spark locally with no worker nodes + spark_conf = var.cluster_mode == "SINGLE_NODE" ? { + "spark.databricks.cluster.profile" = "singleNode" + "spark.master" = "local[*]" + } : {} + + custom_tags = merge( + var.tags, + { + "ManagedBy" = "Terraform" + }, + var.cluster_mode == "SINGLE_NODE" ? 
{ + "ResourceClass" = "SingleNode" + } : {} + ) + + # Environment variables for Claude Code CLI + spark_env_vars = { + MLFLOW_EXPERIMENT_NAME = var.mlflow_experiment_name + } + + # Init script configuration + init_scripts { + volumes { + destination = "${databricks_volume.init_scripts.volume_path}/install-claude.sh" + } + } + + depends_on = [ + databricks_file.init_script + ] +} diff --git a/examples/adb-coding-assistants-cluster/outputs.tf b/examples/adb-coding-assistants-cluster/outputs.tf index 6c3cce86..b54b8269 100644 --- a/examples/adb-coding-assistants-cluster/outputs.tf +++ b/examples/adb-coding-assistants-cluster/outputs.tf @@ -1,52 +1,52 @@ output "cluster_id" { description = "The ID of the created cluster" - value = module.claude_cluster.cluster_id + value = databricks_cluster.coding_assistants.id } output "cluster_url" { description = "URL to access the cluster in Databricks UI" - value = module.claude_cluster.cluster_url + value = databricks_cluster.coding_assistants.url } output "cluster_name" { description = "Name of the created cluster" - value = module.claude_cluster.cluster_name + value = databricks_cluster.coding_assistants.cluster_name } output "volume_path" { description = "Path to the volume containing init scripts" - value = module.claude_cluster.volume_path + value = databricks_volume.init_scripts.volume_path } output "volume_full_name" { description = "Full name of the volume" - value = module.claude_cluster.volume_full_name + value = "${var.catalog_name}.${var.schema_name}.${var.volume_name}" } output "init_script_path" { description = "Path to the init script in the volume" - value = module.claude_cluster.init_script_path + value = databricks_file.init_script.path } output "mlflow_experiment_name" { description = "MLflow experiment name for tracing" - value = module.claude_cluster.mlflow_experiment_name + value = var.mlflow_experiment_name } output "setup_instructions" { description = "Instructions for using the cluster" value = <<-EOT 
Cluster deployed successfully! - - 1. Access cluster: ${module.claude_cluster.cluster_url} + + 1. Access cluster: ${databricks_cluster.coding_assistants.url} 2. Wait for cluster to start (init script runs automatically) 3. Open a notebook or terminal 4. Run: source ~/.bashrc 5. Verify: check-claude 6. Start using: claude "your question" - - MLflow Experiment: ${module.claude_cluster.mlflow_experiment_name} - + + MLflow Experiment: ${var.mlflow_experiment_name} + Helper commands: - check-claude: Verify installation status - claude-debug: Show configuration details diff --git a/examples/adb-coding-assistants-cluster/providers.tf b/examples/adb-coding-assistants-cluster/providers.tf index 26c208e7..4d7800f3 100644 --- a/examples/adb-coding-assistants-cluster/providers.tf +++ b/examples/adb-coding-assistants-cluster/providers.tf @@ -21,10 +21,10 @@ locals { use_profile_auth = var.databricks_profile != null # For Azure resource ID approach - resource_regex = var.databricks_resource_id != null ? "(?i)subscriptions/(.+)/resourceGroups/(.+)/providers/Microsoft.Databricks/workspaces/(.+)" : "" + resource_regex = var.databricks_resource_id != null ? "(?i)subscriptions/(.+)/resourceGroups/(.+)/providers/Microsoft.Databricks/workspaces/(.+)" : "" subscription_id_from_resource = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[0] : null - resource_group = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[1] : null - databricks_workspace_name = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[2] : null + resource_group = var.databricks_resource_id != null ? regex(local.resource_regex, var.databricks_resource_id)[1] : null + databricks_workspace_name = var.databricks_resource_id != null ? 
regex(local.resource_regex, var.databricks_resource_id)[2] : null } # Get Azure subscription ID from Azure CLI or environment variable when not provided via resource ID @@ -60,7 +60,7 @@ provider "azurerm" { subscription_id = local.subscription_id features {} skip_provider_registration = local.use_profile_auth - + # Allow provider to work without explicit subscription_id when using profile auth # It will attempt to auto-detect from Azure CLI or environment variables } diff --git a/modules/adb-coding-assistants-cluster/scripts/README.md b/examples/adb-coding-assistants-cluster/scripts/README.md similarity index 100% rename from modules/adb-coding-assistants-cluster/scripts/README.md rename to examples/adb-coding-assistants-cluster/scripts/README.md diff --git a/modules/adb-coding-assistants-cluster/scripts/check-network-deps.sh b/examples/adb-coding-assistants-cluster/scripts/check-network-deps.sh similarity index 100% rename from modules/adb-coding-assistants-cluster/scripts/check-network-deps.sh rename to examples/adb-coding-assistants-cluster/scripts/check-network-deps.sh diff --git a/modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh b/examples/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh similarity index 100% rename from modules/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh rename to examples/adb-coding-assistants-cluster/scripts/install-claude-minimal.sh diff --git a/modules/adb-coding-assistants-cluster/scripts/install-claude.sh b/examples/adb-coding-assistants-cluster/scripts/install-claude.sh similarity index 100% rename from modules/adb-coding-assistants-cluster/scripts/install-claude.sh rename to examples/adb-coding-assistants-cluster/scripts/install-claude.sh diff --git a/modules/adb-coding-assistants-cluster/scripts/vscode-setup.sh b/examples/adb-coding-assistants-cluster/scripts/vscode-setup.sh similarity index 100% rename from modules/adb-coding-assistants-cluster/scripts/vscode-setup.sh rename 
to examples/adb-coding-assistants-cluster/scripts/vscode-setup.sh diff --git a/modules/adb-coding-assistants-cluster/Makefile b/modules/adb-coding-assistants-cluster/Makefile deleted file mode 100644 index 653039d8..00000000 --- a/modules/adb-coding-assistants-cluster/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -.PHONY: docs test_docs - -docs: - terraform-docs -c ../../.terraform-docs.yml . - -test_docs: - terraform-docs -c ../../.terraform-docs.yml --output-check . diff --git a/modules/adb-coding-assistants-cluster/README.md b/modules/adb-coding-assistants-cluster/README.md deleted file mode 100644 index a699ee71..00000000 --- a/modules/adb-coding-assistants-cluster/README.md +++ /dev/null @@ -1,513 +0,0 @@ -# Provisioning Databricks Cluster with Claude Code CLI - -This module deploys a Databricks cluster pre-configured with Claude Code CLI for AI-assisted development directly on Databricks. - -## Module content - -This module can be used to deploy the following: - -* Unity Catalog Volume for secure init script storage -* Init script with Claude Code CLI installation -* Databricks cluster with automatic AI coding assistant setup -* MLflow experiment configuration for tracing -* Helper bash functions for cluster users - -## Features - -- ✅ **Zero-configuration AI coding tools** on cluster startup -- ✅ **Unity Catalog Volumes** for secure script storage (Databricks recommended practice) -- ✅ **MLflow tracing** integration for Claude Code sessions -- ✅ **Flexible cluster configuration** (single-node or autoscaling) - -> **Note**: For offline/air-gapped environments, use the separate [`adb-coding-assistants-cluster-offline`](../adb-coding-assistants-cluster-offline/README.md) module. 
- -## Architecture - -``` -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ LOCAL MACHINE │ -│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ -│ │ Local Terminal │ │ Databricks CLI │ │ VS Code / Cursor │ │ -│ │ │ │ (SSH Setup) │ │ (Remote SSH Ext) │ │ -│ └────────┬─────────┘ └────────┬─────────┘ └────────┬─────────┘ │ -└───────────┼─────────────────────┼─────────────────────┼─────────────────────────┘ - │ │ │ - │ ┌────────────────┴─────────────────────┘ - │ │ 3. databricks ssh setup - ▼ ▼ -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ CONNECTION LAYER │ -│ ┌───────────────────────────────────────────────────────────────────────────┐ │ -│ │ SSH Tunnel ─────────────────────────────────────────────────────────────│ │ -│ │ Port Forwarding (8501 for Streamlit, etc.) │ │ -│ └───────────────────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────┬───────────────────────────────────────┘ - │ 4. VS Code connects - ▼ -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ DATABRICKS WORKSPACE │ -│ │ -│ ┌─────────────────────────────┐ ┌─────────────────────────────────────┐ │ -│ │ UNITY CATALOG │ │ SINGLE-NODE DEV CLUSTER │ │ -│ │ ┌───────────────────────┐ │ │ │ │ -│ │ │ Catalog │ │ │ ┌───────────────────────────────┐ │ │ -│ │ │ └── Schema │ │ │ │ Init Script Execution │ │ │ -│ │ │ └── Volume │──┼──────│──│ (on cluster startup) │ │ │ -│ │ │ └── init │ │ 2. │ └───────────────┬───────────────┘ │ │ -│ │ │ script │ │ │ │ installs │ │ -│ │ └───────────────────────┘ │ │ ▼ │ │ -│ │ ▲ │ │ ┌───────────────────────────────┐ │ │ -│ │ │ 1. 
Deploy via │ │ │ DRIVER NODE │ │ │ -│ │ │ Terraform │ │ │ ┌────────────┬────────────┐ │ │ │ -│ └─────────┼───────────────────┘ │ │ │Claude Code │ Node.js │ │ │ │ -│ │ │ │ │ CLI │ Runtime │ │ │ │ -│ │ │ │ ├────────────┼────────────┤ │ │ │ -│ │ │ │ │ Databricks │ Bash │ │ │ │ -│ │ │ │ │ Python │ Helpers │ │ │ │ -│ │ │ │ ├────────────┴────────────┤ │ │ │ -│ │ │ │ │ tmux (persistent) │ │ │ │ -│ │ │ │ └─────────────────────────┘ │ │ │ -│ │ │ └───────────────────────────────┘ │ │ -│ │ │ │ │ -│ │ │ ┌───────────────────────────────┐ │ │ -│ │ │ │ Workspace Storage │ │ │ -│ │ │ │ /Workspace/Users// │ │ │ -│ │ │ └───────────────┬───────────────┘ │ │ -│ │ │ │ 5. git commit │ │ -│ │ └──────────────────┼──────────────────┘ │ -└────────────┼─────────────────────────────────────────────┼──────────────────────┘ - │ │ - │ ┌────────────────────────┘ - │ │ - ▼ ▼ -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ EXTERNAL SERVICES │ -│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ -│ │ Claude AI API │ │ MLflow │ │ GitHub / Git │ │ -│ │ (Anthropic) │ │ (Session Tracing)│ │ (Version Control)│ │ -│ └──────────────────┘ └──────────────────┘ └──────────────────┘ │ -│ ▲ ▲ ▲ │ -│ │ 6. API calls │ 7. Trace sessions │ │ -│ └──────────────────────┴──────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────────────────┘ -``` - -### Workflow - -1. **Terraform deploys** the Unity Catalog volume and cluster configuration -2. **Init script runs** on cluster startup, installing Claude Code CLI and tools -3. **User configures SSH** via `databricks ssh setup` command -4. **VS Code connects** via Remote SSH extension to the cluster -5. **Code is stored** in `/Workspace/Users//` and committed to Git -6. **Claude Code CLI** calls the Claude AI API for coding assistance -7. 
**Sessions traced** to MLflow for observability - -## Prerequisites - -- Databricks workspace with Unity Catalog enabled -- Databricks Runtime 13.3 LTS or higher (recommended for Unity Catalog volumes) -- Databricks Terraform provider >= 1.40.0 -- Unity Catalog with an existing catalog and schema -- **Unity Catalog metastore must have a root storage credential configured** (required for volumes) - -> **Note**: If you encounter an error about missing root storage credential, you need to configure the metastore's root storage credential first. See [Databricks documentation](https://docs.databricks.com/api-explorer/workspace/metastores/update) for details. - -## Usage - -### Basic Example - -```hcl -module "coding_cluster" { - source = "./modules/coding-assistants-cluster" - - cluster_name = "ai-dev-cluster" - catalog_name = "main" - schema_name = "default" - - # init_script_source_path is optional - module includes the script -} -``` - -### Single-Node Cluster - -```hcl -module "single_node_cluster" { - source = "./modules/coding-assistants-cluster" - - cluster_name = "ai-dev-single-node" - catalog_name = "main" - schema_name = "default" - - cluster_mode = "SINGLE_NODE" - num_workers = 0 -} -``` - -### Autoscaling Cluster - -```hcl -module "autoscaling_cluster" { - source = "./modules/coding-assistants-cluster" - - cluster_name = "ai-dev-autoscaling" - catalog_name = "main" - schema_name = "default" - - min_workers = 2 - max_workers = 8 - - tags = { - Environment = "production" - Team = "data-science" - } -} -``` - -### Complete Example - -```hcl -module "coding_cluster" { - source = "./modules/coding-assistants-cluster" - - # Cluster configuration - cluster_name = "ai-development-cluster" - spark_version = "17.3.x-cpu-ml-scala2.13" - node_type_id = "Standard_D8pds_v6" - autotermination_minutes = 60 - - # Volume configuration - catalog_name = "main" - schema_name = "default" - volume_name = "coding_assistants" - - # Init script (optional - uses bundled script by 
default) - # init_script_source_path = "/path/to/custom/script.sh" - - # MLflow configuration - mlflow_experiment_name = "/Users/me@company.com/my-claude-traces" - - # Autoscaling - min_workers = 1 - max_workers = 5 - - # Tags - tags = { - Environment = "development" - Project = "ai-assisted-coding" - CostCenter = "engineering" - ManagedBy = "terraform" - } -} -``` - -## Init Script Storage Best Practices - -According to [Databricks documentation](https://docs.databricks.com/aws/en/init-scripts/): - -> **Databricks Runtime 13.3 LTS and above with Unity Catalog** -> Store init scripts in Unity Catalog volumes. - -### Why Unity Catalog Volumes? - -1. **Governance**: Full Unity Catalog ACL support -2. **Security**: Identity-based access control -3. **Portability**: Works across AWS, Azure, and GCP -4. **Versioning**: Easy to manage and update scripts -5. **No DBFS**: Recommended alternative to legacy DBFS storage - -### Init Script Identity - -- **Single-user access mode**: Uses assigned principal's identity -- **Standard access mode**: Uses cluster owner's identity -- **Volume access**: Governed by Unity Catalog permissions - - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 1.0 | -| [databricks](#requirement\_databricks) | >= 1.40.0 | - -## Providers - -| Name | Version | -|------|---------| -| [databricks](#provider\_databricks) | 1.102.0 | - -## Modules - -No modules. 
- -## Resources - -| Name | Type | -|------|------| -| [databricks_cluster.coding_assistants](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/cluster) | resource | -| [databricks_file.init_script](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/file) | resource | -| [databricks_volume.init_scripts](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/volume) | resource | -| [databricks_current_user.me](https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/current_user) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [catalog\_name](#input\_catalog\_name) | Unity Catalog name for the volume | `string` | n/a | yes | -| [cluster\_name](#input\_cluster\_name) | Name of the Databricks cluster | `string` | n/a | yes | -| [autotermination\_minutes](#input\_autotermination\_minutes) | Minutes of inactivity before cluster auto-terminates | `number` | `30` | no | -| [cluster\_mode](#input\_cluster\_mode) | Cluster mode: STANDARD or SINGLE\_NODE | `string` | `"STANDARD"` | no | -| [init\_script\_source\_path](#input\_init\_script\_source\_path) | Local path to the init script | `string` | `null` | no | -| [max\_workers](#input\_max\_workers) | Maximum number of workers for autoscaling | `number` | `3` | no | -| [min\_workers](#input\_min\_workers) | Minimum number of workers for autoscaling | `number` | `1` | no | -| [mlflow\_experiment\_name](#input\_mlflow\_experiment\_name) | MLflow experiment name for Claude Code tracing | `string` | `"/Workspace/Shared/claude-code-tracing"` | no | -| [node\_type\_id](#input\_node\_type\_id) | Node type for the cluster. Default is Standard_D8pds_v6 (modern, premium SSD + local NVMe). If unavailable in your region, consider Standard_DS13_v2 as fallback. 
| `string` | `"Standard_D8pds_v6"` | no | -| [num\_workers](#input\_num\_workers) | Number of worker nodes (null for autoscaling) | `number` | `null` | no | -| [schema\_name](#input\_schema\_name) | Schema name for the volume | `string` | `"default"` | no | -| [spark\_version](#input\_spark\_version) | Databricks Runtime version | `string` | `"17.3.x-cpu-ml-scala2.13"` | no | -| [tags](#input\_tags) | Custom tags for the cluster | `map(string)` |
{
"Environment": "dev",
"Purpose": "coding-assistants"
}
| no | -| [volume\_name](#input\_volume\_name) | Volume name to store init scripts | `string` | `"coding_assistants"` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [cluster\_id](#output\_cluster\_id) | The ID of the created cluster | -| [cluster\_name](#output\_cluster\_name) | Name of the created cluster | -| [cluster\_url](#output\_cluster\_url) | URL to access the cluster in Databricks UI | -| [init\_script\_path](#output\_init\_script\_path) | Path to the init script in the volume | -| [mlflow\_experiment\_name](#output\_mlflow\_experiment\_name) | MLflow experiment name for tracing | -| [volume\_full\_name](#output\_volume\_full\_name) | Full name of the volume | -| [volume\_path](#output\_volume\_path) | Path to the volume containing init scripts | - - -## Post-Deployment Usage - -### On the Cluster - -After the cluster starts, users can: - -```bash -# Check installation status -check-claude - -# Debug Claude configuration -claude-debug - -# Use Claude Code -claude "Analyze the customer churn data" - -# Enable MLflow tracing -claude-tracing-enable - -# Check tracing status -claude-tracing-status -``` - -### Persistent Work Storage - -**IMPORTANT: Do not use Databricks Repos (`/Repos/...`) for active development work.** - -Databricks Repos folders can be unreliable for persistent storage and may lose uncommitted changes during cluster restarts or sync operations. Instead: - -✅ **Use `/Workspace/Users//` for all development work** - -This location provides reliable persistent storage across cluster restarts. 
Use the provided git helpers to manage version control: - -```bash -# Navigate to your workspace -cd /Workspace/Users/$(whoami)/ - -# Set up git (interactive helper) -git-workspace-init - -# Check git status and location -git-workspace-check - -# Configure git authentication -git-workspace-setup-auth -``` - -The git helpers will: -- Warn if you're working in `/Repos` (unreliable location) -- Help you clone existing repos or initialize new ones -- Check for uncommitted or unpushed changes -- Guide you through authentication setup (PAT, SSH, or credential helper) - -### Helper Commands - -The init script installs these helper commands in `~/.bashrc`: - -#### Claude CLI Commands - -| Command | Purpose | -|---------|---------| -| `check-claude` | Verify installation and configuration | -| `claude-debug` | Show detailed Claude CLI configuration | -| `claude-refresh-token` | Regenerate Claude settings | -| `claude-token-status` | Check token freshness and auto-refresh status | -| `claude-tracing-enable` | Enable MLflow tracing | -| `claude-tracing-status` | Check tracing status | -| `claude-tracing-disable` | Disable MLflow tracing | - -#### Git Workspace Commands - -| Command | Purpose | -|---------|---------| -| `git-workspace-init` | Interactive git setup in /Workspace (clone or init) | -| `git-workspace-check` | Check location and uncommitted/unpushed changes | -| `git-workspace-setup-auth` | Configure git authentication (PAT/SSH/credential helper) | - -#### VS Code/Cursor Remote Commands - -| Command | Purpose | -|---------|---------| -| `claude-vscode-setup` | Show Remote SSH setup guide | -| `claude-vscode-env` | Get Python interpreter path | -| `claude-vscode-check` | Verify Remote SSH configuration | -| `claude-vscode-config` | Generate settings.json snippet | - -## Cluster Access Modes - -### Single-User Access Mode - -```hcl -# Automatically configured by the module -data_security_mode = "SINGLE_USER" -single_user_name = 
data.databricks_current_user.me.user_name -``` - -### Standard Access Mode - -For standard access mode, you must: -1. Set up an allowlist for init scripts -2. Grant permissions to the volume - -See [Allowlist documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/allowlist). - -## Troubleshooting - -### Init Script Fails - -Check cluster logs: -```bash -# Enable cluster log delivery in cluster config -# Then view: /cluster-logs//init_scripts/ -``` - -### Commands Not Found - -```bash -# Reload bashrc -source ~/.bashrc - -# Check PATH -echo $PATH | grep -E "(claude)" - -# Verify installation -check-coding-assistants -``` - -### Authentication Issues - -```bash -# Check environment variables -claude-debug - -# Verify token is set -echo $DATABRICKS_TOKEN - -# Regenerate configs -claude-refresh-token -claude-refresh-token -``` - -### Script Size Limit - -Init scripts must be < 64KB. If exceeded: -- Break into multiple scripts -- Remove unnecessary comments -- Compress/optimize script - -## Security Considerations - -### Volume Permissions - -Ensure appropriate Unity Catalog permissions: - -```sql --- Grant read access to volume -GRANT READ VOLUME ON VOLUME .. TO ; - --- For standard access mode, add to allowlist --- (Requires admin access) -``` - -### Token Security - -- Tokens are **never hardcoded** in configs -- Read from environment: `$DATABRICKS_TOKEN` -- Configs regenerate per session -- Settings files are user-readable only (`~/.claude/`) - -## Maintenance - -### Updating the Init Script - -1. Update the local init script file -2. Run `terraform apply` to upload new version -3. 
Restart clusters to apply changes - -```bash -terraform apply -target=module.coding_cluster.databricks_file.init_script -``` - -### Updating Cluster Configuration - -```bash -# Update variables in your config -# Then apply -terraform apply - -# Restart cluster for changes to take effect -``` - -## Cost Optimization - -- Use `autotermination_minutes` to automatically shut down idle clusters -- Use single-node mode for development: `cluster_mode = "SINGLE_NODE"` -- Enable autoscaling to scale down during low usage -- Consider spot instances (if supported by your cloud provider) - -## Limitations - -- Init scripts must be < 64KB -- Init script failures cause cluster launch to fail -- Requires Databricks Runtime 13.3 LTS+ for Unity Catalog volumes -- Standard access mode requires admin-configured allowlist - -## References - -- [Databricks Init Scripts Documentation](https://docs.databricks.com/init-scripts/) -- [Unity Catalog Volumes](https://docs.databricks.com/volumes/) -- [Databricks Terraform Provider](https://registry.terraform.io/providers/databricks/databricks/latest/docs) -- [Cluster Configuration](https://docs.databricks.com/compute/configure) - -## License - -This module is provided as-is for use with Databricks workspaces. - -## Contributing - -To contribute improvements to this module: -1. Test changes in an isolated Databricks workspace -2. Run `terraform validate` and `terraform fmt` -3. Update documentation for any new variables or outputs -4. 
Submit pull request with clear description of changes - -## Support - -For issues related to: -- **Module**: Open an issue in this repository -- **Init Script**: See the init script documentation -- **Databricks Platform**: Contact Databricks support -- **Claude**: Contact Anthropic support diff --git a/modules/adb-coding-assistants-cluster/main.tf b/modules/adb-coding-assistants-cluster/main.tf deleted file mode 100644 index 578c1a20..00000000 --- a/modules/adb-coding-assistants-cluster/main.tf +++ /dev/null @@ -1,83 +0,0 @@ -# Data source to get current user -data "databricks_current_user" "me" {} - -# Local value for init script path -locals { - init_script_path = var.init_script_source_path != null ? var.init_script_source_path : "${path.module}/scripts/install-claude.sh" -} - -# Create or reference the volume for init scripts -resource "databricks_volume" "init_scripts" { - name = var.volume_name - catalog_name = var.catalog_name - schema_name = var.schema_name - volume_type = "MANAGED" - comment = "Volume for Claude Code CLI init scripts" - - lifecycle { - ignore_changes = [owner] - } -} - -# Upload the init script to the volume -resource "databricks_file" "init_script" { - source = local.init_script_path - path = "${databricks_volume.init_scripts.volume_path}/install-claude.sh" -} - -# Create the cluster with init script -resource "databricks_cluster" "coding_assistants" { - cluster_name = var.cluster_name - spark_version = var.spark_version - node_type_id = var.node_type_id - autotermination_minutes = var.autotermination_minutes - data_security_mode = "SINGLE_USER" - single_user_name = data.databricks_current_user.me.user_name - - # Autoscaling or fixed size - # Autoscaling is not supported for single-node clusters - dynamic "autoscale" { - for_each = var.cluster_mode == "STANDARD" && var.num_workers == null ? 
[1] : [] - content { - min_workers = var.min_workers - max_workers = var.max_workers - } - } - - # For single-node clusters, num_workers must be 0 (driver-only) - # For standard clusters, use the provided num_workers value - num_workers = var.cluster_mode == "SINGLE_NODE" ? 0 : var.num_workers - - # Single node configuration - # According to Databricks docs: single-node clusters run Spark locally with no worker nodes - spark_conf = var.cluster_mode == "SINGLE_NODE" ? { - "spark.databricks.cluster.profile" = "singleNode" - "spark.master" = "local[*]" - } : {} - - custom_tags = merge( - var.tags, - { - "ManagedBy" = "Terraform" - }, - var.cluster_mode == "SINGLE_NODE" ? { - "ResourceClass" = "SingleNode" - } : {} - ) - - # Environment variables for Claude Code CLI - spark_env_vars = { - MLFLOW_EXPERIMENT_NAME = var.mlflow_experiment_name - } - - # Init script configuration - init_scripts { - volumes { - destination = "${databricks_volume.init_scripts.volume_path}/install-claude.sh" - } - } - - depends_on = [ - databricks_file.init_script - ] -} diff --git a/modules/adb-coding-assistants-cluster/outputs.tf b/modules/adb-coding-assistants-cluster/outputs.tf deleted file mode 100644 index c19dca1a..00000000 --- a/modules/adb-coding-assistants-cluster/outputs.tf +++ /dev/null @@ -1,34 +0,0 @@ -output "cluster_id" { - description = "The ID of the created cluster" - value = databricks_cluster.coding_assistants.id -} - -output "cluster_url" { - description = "URL to access the cluster in Databricks UI" - value = databricks_cluster.coding_assistants.url -} - -output "cluster_name" { - description = "Name of the created cluster" - value = databricks_cluster.coding_assistants.cluster_name -} - -output "volume_path" { - description = "Path to the volume containing init scripts" - value = databricks_volume.init_scripts.volume_path -} - -output "volume_full_name" { - description = "Full name of the volume" - value = "${var.catalog_name}.${var.schema_name}.${var.volume_name}" -} - 
-output "init_script_path" { - description = "Path to the init script in the volume" - value = databricks_file.init_script.path -} - -output "mlflow_experiment_name" { - description = "MLflow experiment name for tracing" - value = var.mlflow_experiment_name -} diff --git a/modules/adb-coding-assistants-cluster/variables.tf b/modules/adb-coding-assistants-cluster/variables.tf deleted file mode 100644 index b3987843..00000000 --- a/modules/adb-coding-assistants-cluster/variables.tf +++ /dev/null @@ -1,94 +0,0 @@ -variable "cluster_name" { - description = "Name of the Databricks cluster" - type = string -} - -variable "catalog_name" { - description = "Unity Catalog catalog name for the volume. The metastore must have a root storage credential configured." - type = string -} - -variable "schema_name" { - description = "Schema name for the volume" - type = string - default = "default" -} - -variable "volume_name" { - description = "Volume name to store init scripts" - type = string - default = "coding_assistants" -} - -variable "init_script_source_path" { - description = "Local path to the init script" - type = string - default = null -} - -variable "spark_version" { - description = "Databricks Runtime version" - type = string - default = "17.3.x-cpu-ml-scala2.13" -} - -variable "node_type_id" { - description = "Node type for the cluster. Default is Standard_D4ds_v5 (modern, premium SSD). If unavailable in your region, consider Standard_DS13_v2 as fallback." - type = string - default = "Standard_D4ds_v5" -} - -variable "autotermination_minutes" { - description = "Minutes of inactivity before cluster auto-terminates" - type = number - default = 30 -} - -variable "num_workers" { - description = "Number of worker nodes (null for autoscaling). For SINGLE_NODE clusters, this is automatically set to 0." 
- type = number - default = null -} - -variable "min_workers" { - description = "Minimum number of workers for autoscaling" - type = number - default = 1 -} - -variable "max_workers" { - description = "Maximum number of workers for autoscaling" - type = number - default = 3 -} - -variable "mlflow_experiment_name" { - description = "MLflow experiment name for Claude Code tracing" - type = string - default = "/Workspace/Shared/claude-code-tracing" -} - -variable "cluster_mode" { - description = <<-EOT - Cluster mode: STANDARD or SINGLE_NODE. - - STANDARD: Multi-node cluster with worker nodes (supports autoscaling) - - SINGLE_NODE: Single-node cluster with no worker nodes (driver-only, runs Spark locally). - For SINGLE_NODE clusters, num_workers is automatically set to 0 and autoscaling is disabled. - EOT - type = string - default = "STANDARD" - - validation { - condition = contains(["STANDARD", "SINGLE_NODE"], var.cluster_mode) - error_message = "cluster_mode must be either STANDARD or SINGLE_NODE" - } -} - -variable "tags" { - description = "Custom tags for the cluster" - type = map(string) - default = { - Environment = "dev" - Purpose = "coding-assistants" - } -} diff --git a/modules/adb-coding-assistants-cluster/versions.tf b/modules/adb-coding-assistants-cluster/versions.tf deleted file mode 100644 index 07223296..00000000 --- a/modules/adb-coding-assistants-cluster/versions.tf +++ /dev/null @@ -1,10 +0,0 @@ -terraform { - required_version = ">= 1.0" - - required_providers { - databricks = { - source = "databricks/databricks" - version = ">= 1.40.0" - } - } -}