diff --git a/examples/aws-managed-file-events/Makefile b/examples/aws-managed-file-events/Makefile new file mode 100644 index 00000000..653039d8 --- /dev/null +++ b/examples/aws-managed-file-events/Makefile @@ -0,0 +1,7 @@ +.PHONY: docs test_docs + +docs: + terraform-docs -c ../../.terraform-docs.yml . + +test_docs: + terraform-docs -c ../../.terraform-docs.yml --output-check . diff --git a/examples/aws-managed-file-events/README.md b/examples/aws-managed-file-events/README.md new file mode 100644 index 00000000..5c7aad9c --- /dev/null +++ b/examples/aws-managed-file-events/README.md @@ -0,0 +1,157 @@ +# Provisioning Databricks Managed File Events on AWS + +This example is using the [aws-managed-file-events](../../modules/aws-managed-file-events) module. + +This template provides a deployment of AWS infrastructure for Databricks Managed File Events, enabling file notification mode for Auto Loader with automatic S3 event notifications and SQS queues. + +## How to use + +1. Reference this module using one of the different [module source types](https://developer.hashicorp.com/terraform/language/modules/sources) +2. Add a `variables.tf` with the same content in [variables.tf](variables.tf) +3. Add a `terraform.tfvars` file and provide values to each defined variable +4. Configure authentication to your Databricks workspace and AWS account +5. Add a `output.tf` file +6. (Optional) Configure your [remote backend](https://developer.hashicorp.com/terraform/language/settings/backends/s3) +7. Run `terraform init` to initialize terraform and get provider ready +8. 
Run `terraform apply` to create the resources + +## Complete Example with All Options + +The following shows all available module options: + +```hcl +module "managed_file_events" { + source = "../../modules/aws-managed-file-events" + + # Required variables + prefix = var.prefix + region = var.region + aws_account_id = var.aws_account_id + databricks_account_id = var.databricks_account_id + + # S3 Configuration + create_bucket = true # Set to false to use existing bucket + existing_bucket_name = null # Required if create_bucket = false + bucket_name = "my-custom-bucket-name" # Custom bucket name (default: prefix-file-events) + s3_path_prefix = "data/incoming" # Path prefix within the bucket + force_destroy_bucket = false # Allow bucket deletion with objects + + # External Location Configuration + external_location_name = "my-external-location" # Custom name (default: prefix-file-events-location) + storage_credential_name = "my-storage-credential" # Custom name (default: prefix-file-events-credential) + + # Catalog Configuration (Optional) + create_catalog = true + catalog_name = "my_catalog" + catalog_owner = "data-engineers@company.com" + catalog_isolation_mode = "OPEN" # OPEN or ISOLATED + + # Grants Configuration + external_location_grants = [ + { + principal = "data-engineers@company.com" + privileges = ["READ_FILES", "WRITE_FILES"] + } + ] + + storage_credential_grants = [ + { + principal = "data-engineers@company.com" + privileges = ["CREATE_EXTERNAL_LOCATION"] + } + ] + + catalog_grants = [ + { + principal = "data-engineers@company.com" + privileges = ["USE_CATALOG", "CREATE_SCHEMA"] + }, + { + principal = "analysts@company.com" + privileges = ["USE_CATALOG"] + } + ] + + tags = { + Environment = "production" + ManagedBy = "terraform" + Project = "data-platform" + } +} +``` + +## Using with Auto Loader + +Once deployed, you can use Auto Loader with managed file events in your Databricks notebooks: + +```python +df = spark.readStream.format("cloudFiles") \ + 
.option("cloudFiles.format", "json") \ + .option("cloudFiles.useManagedFileEvents", "true") \ + .load("s3://your-bucket/path") +``` + +Or in Lakeflow Declarative Pipelines: + +```python +from pyspark import pipelines as dp + +@dp.table +def my_table(): + return spark.readStream.format("cloudFiles") \ + .option("cloudFiles.format", "json") \ + .option("cloudFiles.useManagedFileEvents", "true") \ + .load("/Volumes") # Ingesting from a volume that points to your S3 bucket will be more performant than the S3 location itself. +``` + +## Reference + +- [Databricks File Notification Mode Documentation](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/file-notification-mode) + + +## Requirements + +| Name | Version | +|------|---------| +| [aws](#requirement\_aws) | >= 5.0 | +| [databricks](#requirement\_databricks) | >= 1.65.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [managed\_file\_events](#module\_managed\_file\_events) | ../../modules/aws-managed-file-events | n/a | + +## Resources + +No resources. 
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [aws\_account\_id](#input\_aws\_account\_id) | (Required) AWS Account ID | `string` | n/a | yes |
+| [databricks\_account\_id](#input\_databricks\_account\_id) | (Required) Databricks Account ID | `string` | n/a | yes |
+| [databricks\_client\_id](#input\_databricks\_client\_id) | (Required) Databricks service principal client ID | `string` | n/a | yes |
+| [databricks\_client\_secret](#input\_databricks\_client\_secret) | (Required) Databricks service principal client secret | `string` | n/a | yes |
+| [databricks\_host](#input\_databricks\_host) | (Required) Databricks workspace URL (e.g., https://xxx.cloud.databricks.com) | `string` | n/a | yes |
+| [databricks\_pat\_token](#input\_databricks\_pat\_token) | (Required) Databricks personal access token | `string` | n/a | yes |
+| [prefix](#input\_prefix) | (Required) Prefix for resource naming | `string` | n/a | yes |
+| [region](#input\_region) | (Required) AWS region to deploy to | `string` | n/a | yes |
+| [aws\_profile](#input\_aws\_profile) | (Optional) AWS CLI profile name for authentication | `string` | `null` | no |
+| [tags](#input\_tags) | (Optional) Tags to add to created resources | `map(string)` | `{}` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [bucket\_name](#output\_bucket\_name) | Name of the S3 bucket |
+| [external\_location\_name](#output\_external\_location\_name) | Name of the external location |
+| [external\_location\_url](#output\_external\_location\_url) | S3 URL of the external location |
+| [iam\_role\_arn](#output\_iam\_role\_arn) | ARN of the IAM role |
+| [storage\_credential\_name](#output\_storage\_credential\_name) | Name of the storage credential |
+
diff --git a/examples/aws-managed-file-events/main.tf b/examples/aws-managed-file-events/main.tf
new file mode 100644
index 00000000..5385ee8f
--- /dev/null
+++ 
b/examples/aws-managed-file-events/main.tf @@ -0,0 +1,9 @@ +module "managed_file_events" { + source = "../../modules/aws-managed-file-events" + + prefix = var.prefix + region = var.region + aws_account_id = var.aws_account_id + databricks_account_id = var.databricks_account_id + tags = var.tags +} diff --git a/examples/aws-managed-file-events/outputs.tf b/examples/aws-managed-file-events/outputs.tf new file mode 100644 index 00000000..98a20434 --- /dev/null +++ b/examples/aws-managed-file-events/outputs.tf @@ -0,0 +1,24 @@ +output "storage_credential_name" { + description = "Name of the storage credential" + value = module.managed_file_events.storage_credential_name +} + +output "external_location_name" { + description = "Name of the external location" + value = module.managed_file_events.external_location_name +} + +output "external_location_url" { + description = "S3 URL of the external location" + value = module.managed_file_events.external_location_url +} + +output "bucket_name" { + description = "Name of the S3 bucket" + value = module.managed_file_events.bucket_name +} + +output "iam_role_arn" { + description = "ARN of the IAM role" + value = module.managed_file_events.iam_role_arn +} diff --git a/examples/aws-managed-file-events/providers.tf b/examples/aws-managed-file-events/providers.tf new file mode 100644 index 00000000..97f7ae5f --- /dev/null +++ b/examples/aws-managed-file-events/providers.tf @@ -0,0 +1,11 @@ +provider "aws" { + region = var.region + profile = var.aws_profile +} + +# Workspace-level Databricks provider +provider "databricks" { + host = var.databricks_host + client_id = var.databricks_client_id + client_secret = var.databricks_client_secret +} diff --git a/examples/aws-managed-file-events/terraform.tfvars b/examples/aws-managed-file-events/terraform.tfvars new file mode 100644 index 00000000..81fbff9c --- /dev/null +++ b/examples/aws-managed-file-events/terraform.tfvars @@ -0,0 +1,14 @@ +prefix = "my-project" +region = "us-west-2" 
+aws_account_id = "123456789012"
+aws_profile = "default"
+databricks_account_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+databricks_host = "https://my-workspace.cloud.databricks.com"
+databricks_pat_token = "dapixxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+databricks_client_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+databricks_client_secret = "dosexxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
+tags = {
+  Environment = "demo"
+  ManagedBy   = "terraform"
+}
diff --git a/examples/aws-managed-file-events/variables.tf b/examples/aws-managed-file-events/variables.tf
new file mode 100644
index 00000000..18f23d59
--- /dev/null
+++ b/examples/aws-managed-file-events/variables.tf
@@ -0,0 +1,55 @@
+variable "tags" {
+  default     = {}
+  type        = map(string)
+  description = "(Optional) Tags to add to created resources"
+}
+
+variable "prefix" {
+  type        = string
+  description = "(Required) Prefix for resource naming"
+}
+
+variable "region" {
+  type        = string
+  description = "(Required) AWS region to deploy to"
+}
+
+variable "aws_profile" {
+  type        = string
+  default     = null
+  description = "(Optional) AWS CLI profile name for authentication"
+}
+
+variable "aws_account_id" {
+  type        = string
+  description = "(Required) AWS Account ID"
+}
+
+variable "databricks_account_id" {
+  type        = string
+  description = "(Required) Databricks Account ID"
+}
+
+variable "databricks_host" {
+  type        = string
+  description = "(Required) Databricks workspace URL (e.g., https://xxx.cloud.databricks.com)"
+}
+
+variable "databricks_client_id" {
+  type        = string
+  description = "(Required) Databricks service principal client ID"
+}
+
+variable "databricks_client_secret" {
+  type        = string
+  sensitive   = true
+  description = "(Required) Databricks service principal client secret"
+}
+
+
+variable "databricks_pat_token" {
+  type        = string
+  sensitive   = true
+  description = "(Required) Databricks personal access token"
+}
+
diff --git a/examples/aws-managed-file-events/versions.tf 
b/examples/aws-managed-file-events/versions.tf new file mode 100644 index 00000000..3a00f938 --- /dev/null +++ b/examples/aws-managed-file-events/versions.tf @@ -0,0 +1,12 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + databricks = { + source = "databricks/databricks" + version = ">= 1.65.0" + } + } +} diff --git a/modules/aws-managed-file-events/Makefile b/modules/aws-managed-file-events/Makefile new file mode 100644 index 00000000..653039d8 --- /dev/null +++ b/modules/aws-managed-file-events/Makefile @@ -0,0 +1,7 @@ +.PHONY: docs test_docs + +docs: + terraform-docs -c ../../.terraform-docs.yml . + +test_docs: + terraform-docs -c ../../.terraform-docs.yml --output-check . diff --git a/modules/aws-managed-file-events/README.md b/modules/aws-managed-file-events/README.md new file mode 100644 index 00000000..868063cd --- /dev/null +++ b/modules/aws-managed-file-events/README.md @@ -0,0 +1,201 @@ +# Module aws-managed-file-events + +## Description + +This module creates the infrastructure needed for Databricks Managed File Events on AWS. Managed file events enables file notification mode for Auto Loader, allowing Databricks to automatically set up S3 event notifications and SQS queues for efficient file discovery. + +When enabled, Auto Loader can use `cloudFiles.useManagedFileEvents = true` for efficient incremental data ingestion without manually configuring S3 notifications. 
+ +## Features + +- Creates (or uses existing) S3 bucket for file storage +- Creates IAM role with permissions for S3 access, bucket notifications, and SQS management +- Creates Unity Catalog storage credential +- Creates external location with file events enabled and managed SQS +- Optionally creates a catalog using the external location +- Supports configurable grants for storage credential, external location, and catalog + +## Prerequisites + +- Unity Catalog enabled workspace with metastore assigned +- Databricks Runtime 14.3 LTS or above (for Auto Loader with managed file events) +- AWS account with permissions to create IAM roles and S3 buckets + +## Usage + +### Basic Usage (Create new bucket) + +```hcl +module "managed_file_events" { + source = "github.com/databricks/terraform-databricks-examples/modules/aws-managed-file-events" + + prefix = "my-project" + region = "us-west-2" + aws_account_id = "123456789012" + databricks_account_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + + tags = { + Environment = "production" + } +} +``` + +### Using Existing Bucket + +```hcl +module "managed_file_events" { + source = "github.com/databricks/terraform-databricks-examples/modules/aws-managed-file-events" + + prefix = "my-project" + region = "us-west-2" + aws_account_id = "123456789012" + databricks_account_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + + create_bucket = false + existing_bucket_name = "my-existing-bucket" + s3_path_prefix = "data/incoming" + + tags = { + Environment = "production" + } +} +``` + +### With Catalog Creation + +```hcl +module "managed_file_events" { + source = "github.com/databricks/terraform-databricks-examples/modules/aws-managed-file-events" + + prefix = "my-project" + region = "us-west-2" + aws_account_id = "123456789012" + databricks_account_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + + create_catalog = true + catalog_name = "file_events_catalog" + catalog_owner = "data_engineers" + + catalog_grants = [ + { + principal = 
"data_engineers" + privileges = ["USE_CATALOG", "CREATE_SCHEMA"] + } + ] + + tags = { + Environment = "production" + } +} +``` + +## Using with Auto Loader + +Once the module is deployed, you can use Auto Loader with managed file events: + +```python +df = spark.readStream.format("cloudFiles") \ + .option("cloudFiles.format", "json") \ + .option("cloudFiles.useManagedFileEvents", "true") \ + .load("s3://bucket/path") +``` + +Or in Lakeflow Declarative Pipelines: + +```python +@dlt.table +def my_table(): + return spark.readStream.format("cloudFiles") \ + .option("cloudFiles.format", "json") \ + .option("cloudFiles.useManagedFileEvents", "true") \ + .load("s3://bucket/path") +``` + +## Reference + +- [Databricks File Notification Mode Documentation](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/file-notification-mode) + + +## Requirements + +| Name | Version | +|------|---------| +| [aws](#requirement\_aws) | >= 5.0 | +| [databricks](#requirement\_databricks) | >= 1.65.0 | +| [time](#requirement\_time) | >=0.9.0 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 6.31.0 | +| [databricks](#provider\_databricks) | 1.105.0 | +| [time](#provider\_time) | 0.13.1 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [aws_iam_policy.unity_catalog](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.file_events_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.unity_catalog](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_s3_bucket.file_events](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket) | resource | +| [aws_s3_bucket_public_access_block.file_events](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_public_access_block) | resource | +| [aws_s3_bucket_server_side_encryption_configuration.file_events](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_server_side_encryption_configuration) | resource | +| [aws_s3_bucket_versioning.file_events](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_versioning) | resource | +| [databricks_catalog.file_events](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/catalog) | resource | +| [databricks_external_location.file_events](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/external_location) | resource | +| [databricks_grants.catalog](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grants) | resource | +| [databricks_grants.external_location](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grants) | resource | +| [databricks_grants.storage_credential](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grants) | resource | +| 
[databricks_storage_credential.file_events](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/storage_credential) | resource | +| [time_sleep.wait_role_creation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [aws_s3_bucket.existing](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/s3_bucket) | data source | +| [databricks_aws_unity_catalog_assume_role_policy.this](https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/aws_unity_catalog_assume_role_policy) | data source | +| [databricks_aws_unity_catalog_policy.this](https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/aws_unity_catalog_policy) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_account\_id](#input\_aws\_account\_id) | (Required) AWS account ID where the IAM role will be created | `string` | n/a | yes | +| [databricks\_account\_id](#input\_databricks\_account\_id) | (Required) Databricks Account ID | `string` | n/a | yes | +| [prefix](#input\_prefix) | (Required) Prefix to name the resources created by this module | `string` | n/a | yes | +| [region](#input\_region) | (Required) AWS region where the assets will be deployed | `string` | n/a | yes | +| [bucket\_name](#input\_bucket\_name) | (Optional) Name for the S3 bucket. If not provided, uses prefix-file-events | `string` | `null` | no | +| [catalog\_grants](#input\_catalog\_grants) | (Optional) List of grants for the catalog (if created) |
list(object({
principal = string
privileges = list(string)
}))
| `[]` | no | +| [catalog\_isolation\_mode](#input\_catalog\_isolation\_mode) | (Optional) Isolation mode for the catalog (OPEN or ISOLATED) | `string` | `"OPEN"` | no | +| [catalog\_name](#input\_catalog\_name) | (Optional) Name for the catalog. Required if create\_catalog is true | `string` | `null` | no | +| [catalog\_owner](#input\_catalog\_owner) | (Optional) Owner of the catalog | `string` | `null` | no | +| [create\_bucket](#input\_create\_bucket) | (Optional) Whether to create a new S3 bucket or use an existing one | `bool` | `true` | no | +| [create\_catalog](#input\_create\_catalog) | (Optional) Whether to create a catalog using this external location | `bool` | `false` | no | +| [existing\_bucket\_name](#input\_existing\_bucket\_name) | (Optional) Name of existing S3 bucket when create\_bucket is false | `string` | `null` | no | +| [external\_location\_grants](#input\_external\_location\_grants) | (Optional) List of grants for the external location |
list(object({
principal = string
privileges = list(string)
}))
| `[]` | no | +| [external\_location\_name](#input\_external\_location\_name) | (Optional) Name for the external location. If not provided, uses prefix-file-events-location | `string` | `null` | no | +| [force\_destroy\_bucket](#input\_force\_destroy\_bucket) | (Optional) Allow bucket destruction even with objects inside | `bool` | `false` | no | +| [s3\_path\_prefix](#input\_s3\_path\_prefix) | (Optional) Path prefix within the S3 bucket for the external location | `string` | `""` | no | +| [storage\_credential\_grants](#input\_storage\_credential\_grants) | (Optional) List of grants for the storage credential |
list(object({
principal = string
privileges = list(string)
}))
| `[]` | no | +| [storage\_credential\_name](#input\_storage\_credential\_name) | (Optional) Name for the storage credential. If not provided, uses prefix-file-events-credential | `string` | `null` | no | +| [tags](#input\_tags) | (Optional) Tags to be propagated across all AWS resources | `map(string)` | `{}` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [bucket\_arn](#output\_bucket\_arn) | ARN of the S3 bucket used for file events | +| [bucket\_name](#output\_bucket\_name) | Name of the S3 bucket used for file events | +| [catalog\_id](#output\_catalog\_id) | ID of the catalog (if created) | +| [catalog\_name](#output\_catalog\_name) | Name of the catalog (if created) | +| [external\_location\_id](#output\_external\_location\_id) | ID of the external location | +| [external\_location\_name](#output\_external\_location\_name) | Name of the external location | +| [external\_location\_url](#output\_external\_location\_url) | URL of the external location | +| [iam\_role\_arn](#output\_iam\_role\_arn) | ARN of the IAM role for file events access | +| [iam\_role\_name](#output\_iam\_role\_name) | Name of the IAM role for file events access | +| [s3\_url](#output\_s3\_url) | S3 URL for the external location | +| [storage\_credential\_id](#output\_storage\_credential\_id) | ID of the storage credential | +| [storage\_credential\_name](#output\_storage\_credential\_name) | Name of the storage credential | + diff --git a/modules/aws-managed-file-events/catalog.tf b/modules/aws-managed-file-events/catalog.tf new file mode 100644 index 00000000..2c49b6fa --- /dev/null +++ b/modules/aws-managed-file-events/catalog.tf @@ -0,0 +1,25 @@ +resource "databricks_catalog" "file_events" { + count = var.create_catalog ? 
1 : 0 + name = var.catalog_name + storage_root = databricks_external_location.file_events.url + comment = "Catalog with managed file events - Managed by Terraform" + isolation_mode = var.catalog_isolation_mode + owner = var.catalog_owner + + force_destroy = var.force_destroy_bucket + + depends_on = [databricks_external_location.file_events] +} + +resource "databricks_grants" "catalog" { + count = var.create_catalog && length(var.catalog_grants) > 0 ? 1 : 0 + catalog = databricks_catalog.file_events[0].name + + dynamic "grant" { + for_each = var.catalog_grants + content { + principal = grant.value.principal + privileges = grant.value.privileges + } + } +} diff --git a/modules/aws-managed-file-events/iam.tf b/modules/aws-managed-file-events/iam.tf new file mode 100644 index 00000000..d30a38b0 --- /dev/null +++ b/modules/aws-managed-file-events/iam.tf @@ -0,0 +1,37 @@ +data "databricks_aws_unity_catalog_assume_role_policy" "this" { + aws_account_id = var.aws_account_id + role_name = local.iam_role_name + external_id = databricks_storage_credential.file_events.aws_iam_role[0].external_id +} + +data "databricks_aws_unity_catalog_policy" "this" { + aws_account_id = var.aws_account_id + bucket_name = local.bucket_name + role_name = local.iam_role_name +} + +resource "aws_iam_policy" "unity_catalog" { + name = "${var.prefix}-file-events-policy" + policy = data.databricks_aws_unity_catalog_policy.this.json + tags = merge(var.tags, { + Name = "${var.prefix}-file-events IAM policy" + }) +} + +resource "aws_iam_role" "file_events_access" { + name = local.iam_role_name + assume_role_policy = data.databricks_aws_unity_catalog_assume_role_policy.this.json + tags = merge(var.tags, { + Name = "${var.prefix}-file-events IAM role" + }) +} + +resource "aws_iam_role_policy_attachment" "unity_catalog" { + role = aws_iam_role.file_events_access.name + policy_arn = aws_iam_policy.unity_catalog.arn +} + +resource "time_sleep" "wait_role_creation" { + depends_on = 
[aws_iam_role.file_events_access] + create_duration = "20s" +} diff --git a/modules/aws-managed-file-events/main.tf b/modules/aws-managed-file-events/main.tf new file mode 100644 index 00000000..f4e47b8d --- /dev/null +++ b/modules/aws-managed-file-events/main.tf @@ -0,0 +1,55 @@ +resource "databricks_storage_credential" "file_events" { + name = local.storage_credential_name + aws_iam_role { + role_arn = local.iam_role_arn + } + skip_validation = true + comment = "Storage credential for managed file events - Managed by Terraform" +} + +resource "databricks_grants" "storage_credential" { + count = length(var.storage_credential_grants) > 0 ? 1 : 0 + storage_credential = databricks_storage_credential.file_events.id + + dynamic "grant" { + for_each = var.storage_credential_grants + content { + principal = grant.value.principal + privileges = grant.value.privileges + } + } +} + +resource "databricks_external_location" "file_events" { + name = local.external_location_name + url = local.s3_url + credential_name = databricks_storage_credential.file_events.id + comment = "External location with managed file events - Managed by Terraform" + + enable_file_events = true + file_event_queue { + managed_sqs {} + } + + force_destroy = var.force_destroy_bucket + + depends_on = [ + aws_iam_role.file_events_access, + time_sleep.wait_role_creation, + aws_s3_bucket.file_events, + aws_s3_bucket_public_access_block.file_events + ] +} + +resource "databricks_grants" "external_location" { + count = length(var.external_location_grants) > 0 ? 
1 : 0 + external_location = databricks_external_location.file_events.id + + dynamic "grant" { + for_each = var.external_location_grants + content { + principal = grant.value.principal + privileges = grant.value.privileges + } + } +} diff --git a/modules/aws-managed-file-events/outputs.tf b/modules/aws-managed-file-events/outputs.tf new file mode 100644 index 00000000..ea1d8537 --- /dev/null +++ b/modules/aws-managed-file-events/outputs.tf @@ -0,0 +1,60 @@ +output "bucket_name" { + description = "Name of the S3 bucket used for file events" + value = local.bucket_name +} + +output "bucket_arn" { + description = "ARN of the S3 bucket used for file events" + value = var.create_bucket ? aws_s3_bucket.file_events[0].arn : data.aws_s3_bucket.existing[0].arn +} + +output "s3_url" { + description = "S3 URL for the external location" + value = local.s3_url +} + +output "iam_role_arn" { + description = "ARN of the IAM role for file events access" + value = aws_iam_role.file_events_access.arn +} + +output "iam_role_name" { + description = "Name of the IAM role for file events access" + value = aws_iam_role.file_events_access.name +} + +output "storage_credential_id" { + description = "ID of the storage credential" + value = databricks_storage_credential.file_events.id +} + +output "storage_credential_name" { + description = "Name of the storage credential" + value = databricks_storage_credential.file_events.name +} + +output "external_location_id" { + description = "ID of the external location" + value = databricks_external_location.file_events.id +} + +output "external_location_name" { + description = "Name of the external location" + value = databricks_external_location.file_events.name +} + +output "external_location_url" { + description = "URL of the external location" + value = databricks_external_location.file_events.url +} + + +output "catalog_id" { + description = "ID of the catalog (if created)" + value = var.create_catalog ? 
databricks_catalog.file_events[0].id : null +} + +output "catalog_name" { + description = "Name of the catalog (if created)" + value = var.create_catalog ? databricks_catalog.file_events[0].name : null +} diff --git a/modules/aws-managed-file-events/s3.tf b/modules/aws-managed-file-events/s3.tf new file mode 100644 index 00000000..e4065bbb --- /dev/null +++ b/modules/aws-managed-file-events/s3.tf @@ -0,0 +1,42 @@ +resource "aws_s3_bucket" "file_events" { + count = var.create_bucket ? 1 : 0 + bucket = local.bucket_name + force_destroy = var.force_destroy_bucket + tags = merge(var.tags, { + Name = local.bucket_name + }) +} + +resource "aws_s3_bucket_versioning" "file_events" { + count = var.create_bucket ? 1 : 0 + bucket = aws_s3_bucket.file_events[0].id + versioning_configuration { + status = "Disabled" + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "file_events" { + count = var.create_bucket ? 1 : 0 + bucket = aws_s3_bucket.file_events[0].bucket + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +resource "aws_s3_bucket_public_access_block" "file_events" { + count = var.create_bucket ? 1 : 0 + bucket = aws_s3_bucket.file_events[0].id + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + depends_on = [aws_s3_bucket.file_events] +} + +data "aws_s3_bucket" "existing" { + count = var.create_bucket ? 
0 : 1 + bucket = var.existing_bucket_name +} diff --git a/modules/aws-managed-file-events/variables.tf b/modules/aws-managed-file-events/variables.tf new file mode 100644 index 00000000..8decfef5 --- /dev/null +++ b/modules/aws-managed-file-events/variables.tf @@ -0,0 +1,132 @@ +variable "tags" { + type = map(string) + description = "(Optional) Tags to be propagated across all AWS resources" + default = {} +} + +variable "prefix" { + type = string + description = "(Required) Prefix to name the resources created by this module" +} + +variable "region" { + type = string + description = "(Required) AWS region where the assets will be deployed" +} + +variable "aws_account_id" { + type = string + description = "(Required) AWS account ID where the IAM role will be created" +} + +variable "databricks_account_id" { + type = string + description = "(Required) Databricks Account ID" +} + +variable "create_bucket" { + type = bool + description = "(Optional) Whether to create a new S3 bucket or use an existing one" + default = true +} + +variable "existing_bucket_name" { + type = string + description = "(Optional) Name of existing S3 bucket when create_bucket is false" + default = null +} + +variable "bucket_name" { + type = string + description = "(Optional) Name for the S3 bucket. If not provided, uses prefix-file-events" + default = null +} + +variable "s3_path_prefix" { + type = string + description = "(Optional) Path prefix within the S3 bucket for the external location" + default = "" +} + +variable "force_destroy_bucket" { + type = bool + description = "(Optional) Allow bucket destruction even with objects inside" + default = false +} + +variable "external_location_name" { + type = string + description = "(Optional) Name for the external location. If not provided, uses prefix-file-events-location" + default = null +} + +variable "storage_credential_name" { + type = string + description = "(Optional) Name for the storage credential. 
If not provided, uses prefix-file-events-credential" + default = null +} + +variable "create_catalog" { + type = bool + description = "(Optional) Whether to create a catalog using this external location" + default = false +} + +variable "catalog_name" { + type = string + description = "(Optional) Name for the catalog. Required if create_catalog is true" + default = null +} + +variable "catalog_owner" { + type = string + description = "(Optional) Owner of the catalog" + default = null +} + +variable "catalog_isolation_mode" { + type = string + description = "(Optional) Isolation mode for the catalog (OPEN or ISOLATED)" + default = "OPEN" + + validation { + condition = contains(["OPEN", "ISOLATED"], var.catalog_isolation_mode) + error_message = "catalog_isolation_mode must be either OPEN or ISOLATED" + } +} + +variable "external_location_grants" { + type = list(object({ + principal = string + privileges = list(string) + })) + description = "(Optional) List of grants for the external location" + default = [] +} + +variable "storage_credential_grants" { + type = list(object({ + principal = string + privileges = list(string) + })) + description = "(Optional) List of grants for the storage credential" + default = [] +} + +variable "catalog_grants" { + type = list(object({ + principal = string + privileges = list(string) + })) + description = "(Optional) List of grants for the catalog (if created)" + default = [] +} + +locals { + bucket_name = var.create_bucket ? (var.bucket_name != null ? var.bucket_name : "${var.prefix}-file-events") : var.existing_bucket_name + external_location_name = var.external_location_name != null ? var.external_location_name : "${var.prefix}-file-events-location" + storage_credential_name = var.storage_credential_name != null ? 
var.storage_credential_name : "${var.prefix}-file-events-credential" + iam_role_name = "${var.prefix}-file-events-access" + iam_role_arn = "arn:aws:iam::${var.aws_account_id}:role/${local.iam_role_name}" + s3_url = var.s3_path_prefix != "" ? "s3://${local.bucket_name}/${var.s3_path_prefix}" : "s3://${local.bucket_name}" +} diff --git a/modules/aws-managed-file-events/versions.tf b/modules/aws-managed-file-events/versions.tf new file mode 100644 index 00000000..9f731952 --- /dev/null +++ b/modules/aws-managed-file-events/versions.tf @@ -0,0 +1,18 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + + databricks = { + source = "databricks/databricks" + version = ">= 1.65.0" + } + + time = { + source = "hashicorp/time" + version = ">=0.9.0" + } + } +} \ No newline at end of file