From e2129482621fa51026b314ba37df432146129019 Mon Sep 17 00:00:00 2001 From: Alexander Laye Date: Fri, 11 Jul 2025 11:39:46 -0400 Subject: [PATCH 1/5] Add istio replication to operator move scripts to proper folder Add telemetry scripts try new docker image adjust dockerfiles change to local build dockerfile Add telemetry deployment option add final demo touches --- .github/dockerfiles/Dockerfile_docdb | 2 +- .github/dockerfiles/Dockerfile_gateway | 35 +- .../aks-fleet-deployment/dashboard.json | 904 ++++++++++++++++++ .../deploy-multi-region.sh | 8 +- .../aks-fleet-deployment/deploy-telemetry.sh | 429 +++++++++ .../documentdb-resource-crp.yaml | 4 +- .../aks-fleet-deployment/grafana-values.yaml | 106 ++ .../aks-fleet-deployment/otel-collector.yaml | 89 ++ .../prometheus-values.yaml | 51 + .../internal/lifecycle/lifecycle.go | 2 +- operator/src/internal/cnpg/cnpg_cluster.go | 5 +- .../controller/documentdb_controller.go | 2 +- .../scripts/multi-cloud-deployment/.gitignore | 1 + .../scripts/multi-cloud-deployment/README.md | 608 ++++++++++++ .../deploy-documentdb.sh | 431 +++++++++ .../multi-cloud-deployment/deploy-gke.sh | 100 ++ .../scripts/multi-cloud-deployment/deploy.sh | 628 ++++++++++++ .../multi-cloud-deployment/dns_failover.sh | 55 ++ .../documentdb-base.yaml | 99 ++ .../documentdb-cluster.yaml | 61 ++ .../multi-cloud-deployment/insert_test.py | 47 + .../scripts/multi-cloud-deployment/main.bicep | 74 ++ .../parameters.bicepparam | 8 + .../multi-cloud-deployment/read_test.py | 61 ++ 24 files changed, 3788 insertions(+), 22 deletions(-) create mode 100644 documentdb-playground/aks-fleet-deployment/dashboard.json create mode 100755 documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh create mode 100644 documentdb-playground/aks-fleet-deployment/grafana-values.yaml create mode 100644 documentdb-playground/aks-fleet-deployment/otel-collector.yaml create mode 100644 documentdb-playground/aks-fleet-deployment/prometheus-values.yaml create mode 100644 operator/src/scripts/multi-cloud-deployment/.gitignore create mode 100644 operator/src/scripts/multi-cloud-deployment/README.md create mode 100755 operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh create mode 100644 operator/src/scripts/multi-cloud-deployment/deploy-gke.sh create mode 100755 operator/src/scripts/multi-cloud-deployment/deploy.sh create mode 100755 operator/src/scripts/multi-cloud-deployment/dns_failover.sh create mode 100644 operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml create mode 100644 operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml create mode 100644 operator/src/scripts/multi-cloud-deployment/insert_test.py create mode 100644 operator/src/scripts/multi-cloud-deployment/main.bicep create mode 100644 operator/src/scripts/multi-cloud-deployment/parameters.bicepparam create mode 100644 operator/src/scripts/multi-cloud-deployment/read_test.py diff --git a/.github/dockerfiles/Dockerfile_docdb b/.github/dockerfiles/Dockerfile_docdb index f99a803f..b45ce721 100644 --- a/.github/dockerfiles/Dockerfile_docdb +++ b/.github/dockerfiles/Dockerfile_docdb @@ -138,7 +138,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 6B827C12C2D425E227EDCA75089EBE08314DF160) && \ apt-get update && \ apt-get install -qy \ - libproj22 \ + libproj-dev \ libxml2 \ libjson-c5 \ libgeos-c1v5 \ diff --git a/.github/dockerfiles/Dockerfile_gateway b/.github/dockerfiles/Dockerfile_gateway index 0a4f73cf..3ea714a7 100644 --- 
a/.github/dockerfiles/Dockerfile_gateway +++ b/.github/dockerfiles/Dockerfile_gateway @@ -26,11 +26,13 @@ USER documentdb WORKDIR /home/documentdb/code/ # Get the docuemntdb repository -RUN wget -P /tmp https://github.com/documentdb/documentdb/archive/refs/tags/v${DocumentDB_VERSION}.zip && \ - unzip /tmp/v${DocumentDB_VERSION}.zip -d /home/documentdb/code && \ - rm /tmp/v${DocumentDB_VERSION}.zip - +#RUN wget -P /tmp https://github.com/documentdb/documentdb/archive/refs/tags/v${DocumentDB_VERSION}.zip && \ + #unzip /tmp/v${DocumentDB_VERSION}.zip -d /home/documentdb/code && \ + #rm /tmp/v${DocumentDB_VERSION}.zip RUN sudo chown -R documentdb:documentdb /home/documentdb/ +# For local builds, copy the code over directly +COPY --chown=documentdb:documentdb . /home/documentdb/code/documentdb-${DocumentDB_VERSION} + WORKDIR /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw @@ -45,12 +47,17 @@ ENV DocumentDB_VERSION=${DocumentDB_VERSION} RUN apt-get update && \ apt-get install -y --no-install-recommends \ - jq openssl lsof sudo ca-certificates && \ + jq openssl lsof sudo ca-certificates postgresql-client && \ apt-get upgrade -y && \ rm -rf /var/lib/apt/lists/* ENV LANGUAGE=en_US.UTF-8 \ - TERM=xterm-256color + TERM=xterm-256color \ + OTEL_TRACING_ENABLED=true \ + OTEL_METRICS_ENABLED=true \ + OTEL_LOGGING_ENABLED=true \ + OTEL_LOGS_CONSOLE_ENABLED=true \ + PGHOST=localhost # ENV ENFORCE_SSL="true" \ # CERT_PATH="" \ @@ -73,14 +80,14 @@ RUN echo "%sudo ALL=(ALL:ALL) NOPASSWD: ALL" >> /etc/sudoers.d/no-pass-ask USER documentdb -RUN sudo mkdir /home/documentdb/gateway - -COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/target/debug/documentdb_gateway /home/documentdb/gateway/documentdb_gateway -COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/SetupConfiguration.json /home/documentdb/gateway/SetupConfiguration.json -COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/build_and_start_gateway.sh /home/documentdb/gateway/scripts/build_and_start_gateway.sh -COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/utils.sh /home/documentdb/gateway/scripts/utils.sh +RUN mkdir /home/documentdb/gateway +RUN mkdir -p /home/documentdb/gateway/pg_documentdb_gw/target/release-with-symbols/ -RUN sudo chown -R documentdb:documentdb /home/documentdb/gateway +COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/target/debug/documentdb_gateway /home/documentdb/gateway/pg_documentdb_gw/target/release-with-symbols/documentdb_gateway +COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/SetupConfiguration.json /home/documentdb/gateway/pg_documentdb_gw/SetupConfiguration.json +COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/build_and_start_gateway.sh /home/documentdb/gateway/scripts/build_and_start_gateway.sh +COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/emulator_entrypoint.sh /home/documentdb/gateway/scripts/emulator_entrypoint.sh +COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/utils.sh /home/documentdb/gateway/scripts/utils.sh WORKDIR /home/documentdb/gateway/scripts -#ENTRYPOINT ["/bin/bash", "-c", 
"/home/documentdb/gateway/scripts/emulator_entrypoint.sh \"$@\"", "--"] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "-c", "/home/documentdb/gateway/scripts/emulator_entrypoint.sh \"$@\"", "--"] diff --git a/documentdb-playground/aks-fleet-deployment/dashboard.json b/documentdb-playground/aks-fleet-deployment/dashboard.json new file mode 100644 index 00000000..76076364 --- /dev/null +++ b/documentdb-playground/aks-fleet-deployment/dashboard.json @@ -0,0 +1,904 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "rate(documentdb_mongodb_requests_total[2m])", + "legendFormat": "{{operation}}-{{region}}", + "refId": "A" + } + ], + "title": "Request Rate by Operation", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum(rate(documentdb_mongodb_requests_total[2m]))", + "refId": "A" + } + ], + "title": "Total Request Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum(documentdb_mongodb_requests_total)", + "refId": "A" + } + ], + "title": "Total Requests", + "type": "stat" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.50, sum by(le, operation) (rate(documentdb_mongodb_request_duration_milliseconds_bucket{phase=\"total\"}[2m])))", + "legendFormat": "p50 - {{operation}}", + "refId": "A" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le, operation) (rate(documentdb_mongodb_request_duration_milliseconds_bucket{phase=\"total\"}[2m])))", + "legendFormat": "p95 - {{operation}}", + "refId": "B" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.99, sum by(le, operation) (rate(documentdb_mongodb_request_duration_milliseconds_bucket{phase=\"total\"}[2m])))", + "legendFormat": "p99 - {{operation}}", + "refId": "C" + } + ], + "title": "Request Duration (p50, p95, p99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le, phase) (rate(documentdb_mongodb_request_duration_milliseconds_bucket[2m])))", + "legendFormat": "{{phase}}", + "refId": "A" + } + ], + "title": "Request Duration by Phase (p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(operation) (documentdb_mongodb_requests_total)", + "legendFormat": "{{operation}}-{{region}}", + "refId": "A" + } + ], + "title": "Requests by Operation", + "type": "piechart" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 7, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(collection) (documentdb_mongodb_requests_total{collection!=\"\"})", + "legendFormat": "{{collection}}", + "refId": "A" + } + ], + "title": "Requests by Collection", + "type": "piechart" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, 
+ "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum(rate(documentdb_mongodb_requests_total{is_error=\"true\"}[2m])) / sum(rate(documentdb_mongodb_requests_total[2m]))", + "legendFormat": "Error Rate", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le) (rate(documentdb_mongodb_request_size_bytes_bucket[2m])))", + "legendFormat": "Request Size (p95)", + "refId": "A" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le) (rate(documentdb_mongodb_response_size_bytes_bucket[2m])))", + "legendFormat": "Response Size (p95)", + "refId": "B" + } + ], + "title": "Request/Response Size (p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": 
false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(operation) (documentdb_mongodb_requests_total)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(operation) (rate(documentdb_mongodb_requests_total[5m]))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "B" + } + ], + "title": "Operations Summary", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "collection": true, + "environment": true, + "is_error": true, + "job": true, + "username": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "operation": 1 + }, + "renameByName": { + "Value #A": "Total", + "Value #B": "Rate", + "operation": "Operation" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": ["documentdb", "mongodb", "observability"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "DocumentDB Gateway - Overview", + "uid": "documentdb-gateway", + "version": 0, + "weekStart": "" +} diff --git a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh index 614c1753..5c54cb94 100755 --- a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh +++ b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh @@ -27,6 +27,8 @@ ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" # Set password from argument or environment variable DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" +DOCUMENTDB_IMAGE="${DOCUMENTDB_IMAGE:-ghcr.io/microsoft/documentdb/documentdb-local:16}" +GATEWAY_IMAGE="${GATEWAY_IMAGE:-${DOCUMENTDB_IMAGE}}" # If no password provided, generate a secure one if [ -z "$DOCUMENTDB_PASSWORD" ]; then @@ -90,7 +92,7 @@ echo "=======================================" for cluster in "${CLUSTER_ARRAY[@]}"; do echo "" - echo "Processing ConfigMap for $cluster..." + echo "Processing ConfigMaps for $cluster..." # Check if context exists if ! 
kubectl config get-contexts "$cluster" &>/dev/null; then @@ -108,7 +110,7 @@ for cluster in "${CLUSTER_ARRAY[@]}"; do if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then echo "✓ ConfigMap created/updated for $cluster" else - echo "✗ Failed to create ConfigMap for $cluster" + echo "✗ Failed to create ConfigMap cluster-name for $cluster" fi done @@ -183,6 +185,8 @@ TEMP_YAML=$(mktemp) # Use sed for safer substitution sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ + -e "s#{{DOCUMENTDB_IMAGE}}#$DOCUMENTDB_IMAGE#g" \ + -e "s#{{GATEWAY_IMAGE}}#$GATEWAY_IMAGE#g" \ "$SCRIPT_DIR/documentdb-resource-crp.yaml" | \ while IFS= read -r line; do if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then diff --git a/documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh b/documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh new file mode 100755 index 00000000..3c8f95fb --- /dev/null +++ b/documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh @@ -0,0 +1,429 @@ +#!/bin/bash + +# DocumentDB Telemetry Deployment Script +# This script deploys the OpenTelemetry Operator, per-cluster OTEL collectors, and a Prometheus + Grafana monitoring stack across the fleet member clusters + +set -e + +# Deployment options +SKIP_WAIT=true + +# Parse command line arguments +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --skip-wait Skip waiting for deployments to be ready" + echo " --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Deploy the telemetry stack (operator, collectors, Prometheus, Grafana)" +} + +while [[ $# -gt 0 ]]; do + case $1 in + --skip-wait) + SKIP_WAIT=true + shift + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log() { + echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" +} + +success() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] ✅${NC} $1" +} + +warn() { + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] ⚠️${NC} $1" +} + +error() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ❌${NC} $1" + exit 1 +} + +# Check prerequisites (Helm and required Helm repositories) +check_prerequisites() { + log "Checking prerequisites..." + + if ! helm version > /dev/null 2>&1; then + error "Helm is not installed. Please install Helm first." + fi + + # Add Prometheus Helm repo if not already added + if ! helm repo list | grep -q prometheus-community; then + log "Adding Prometheus Helm repository..." + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + fi + + # Add Grafana Helm repo if not already added + if ! helm repo list | grep -q grafana; then + log "Adding Grafana Helm repository..." + helm repo add grafana https://grafana.github.io/helm-charts + helm repo update + fi + + success "Prerequisites check completed" +} + +install_opentelemetry_operator() { + + kubectl config use-context hub + + log "Installing OpenTelemetry Operator (infrastructure component)..." + + # Check if already installed + if kubectl get deployment opentelemetry-operator-controller-manager -n opentelemetry-operator-system &> /dev/null; then + warn "OpenTelemetry Operator already installed. Skipping installation." + return 0 + fi + + # Install OpenTelemetry Operator on hub + log "Installing OpenTelemetry Operator from upstream..."
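+    # NOTE (assumption): the upstream operator manifest provisions its admission webhook
+    # certificates through cert-manager, so cert-manager is expected to already be installed
+    # on the hub cluster before this apply.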
+ kubectl apply -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml + + # Create ClusterResourcePlacement to deploy operator to all member clusters + log "Creating ClusterResourcePlacement for OpenTelemetry Operator..." + cat < /dev/null && pwd )" + DEPLOYMENT_DIR="$(dirname "$SCRIPT_DIR")/aks-fleet-deployment" + + if [ ! -f "$DEPLOYMENT_DIR/prometheus-values.yaml" ]; then + error "Prometheus values file not found: $DEPLOYMENT_DIR/prometheus-values.yaml" + fi + + helm upgrade --install prometheus prometheus-community/prometheus \ + --namespace $namespace \ + --values "$DEPLOYMENT_DIR/prometheus-values.yaml" \ + --wait --timeout=300s + + success "Prometheus deployed" +} + +# Deploy Grafana for a namespace +deploy_grafana() { + local namespace=$1 + + log "Deploying Grafana in namespace: $namespace" + + # Get the directory where this script is located + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + DEPLOYMENT_DIR="$(dirname "$SCRIPT_DIR")/aks-fleet-deployment" + + if [ ! -f "$DEPLOYMENT_DIR/grafana-values.yaml" ]; then + error "Grafana values file not found: $DEPLOYMENT_DIR/grafana-values.yaml" + fi + + helm upgrade --install grafana grafana/grafana \ + --namespace $namespace \ + --values "$DEPLOYMENT_DIR/grafana-values.yaml" \ + --wait --timeout=300s + + success "Grafana deployed" +} + +# Deploy OpenTelemetry collectors for each member +# TODO figure out how to do this with fleet, currently can't deploy without the operator running (opentelemetry-operator-webhook-service) +deploy_collectors() { + log "Deploying OpenTelemetry collector to each member cluster..." + + # Get the directory where this script is located + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + DEPLOYMENT_DIR="$(dirname "$SCRIPT_DIR")/aks-fleet-deployment" + + # Get member clusters and primary cluster from documentdb resource + MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") + + # Deploy to each member cluster + for cluster in $MEMBER_CLUSTERS; do + log "Waiting for OpenTelemetry Operator webhook service on cluster: $cluster" + kubectl --context "$cluster" wait --for=jsonpath='{.subsets[*].addresses[*].ip}' endpoints/opentelemetry-operator-webhook-service -n opentelemetry-operator-system --timeout=300s || warn "Webhook service not ready on $cluster, proceeding anyway..." + + log "Deploying OpenTelemetry Collector to cluster: $cluster" + sed "s/{{CLUSTER_NAME}}/$cluster/g" "$DEPLOYMENT_DIR/otel-collector.yaml" | kubectl --context "$cluster" apply -f - + done + success "All collectors deployed" +} + +# Deploy monitoring stack only on primary +deploy_monitoring_stack() { + + #primary=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}') + primary="azure-documentdb" + kubectl config use-context "$primary" + + log "Deploying monitoring stack to primary" + + deploy_prometheus documentdb-preview-ns + deploy_grafana documentdb-preview-ns + + success "All monitoring stacks deployed" +} + +# Create placeholder OTEL collector services on primary cluster for non-primary members +create_placeholder_prometheus_services() { + log "Creating placeholder OTEL collector services on primary cluster..." 
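+    # Why placeholders (inferred from this repo's Istio setup): each member's collector is exposed
+    # as a Service named "${cluster}-collector" in documentdb-preview-ns, and Prometheus on the
+    # primary scrapes "${cluster}-collector.documentdb-preview-ns.svc.cluster.local:8889"
+    # (see prometheus-values.yaml). Creating the same-named Service on the primary with a selector
+    # that matches nothing locally gives Prometheus a resolvable scrape target, while Istio's
+    # multi-cluster discovery supplies the endpoints from the remote cluster.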
+ + # Get primary cluster and all member clusters + #PRIMARY_CLUSTER=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}' 2>/dev/null || echo "") + PRIMARY_CLUSTER="azure-documentdb" + MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") + + if [ -z "$PRIMARY_CLUSTER" ] || [ -z "$MEMBER_CLUSTERS" ]; then + warn "Could not determine primary or member clusters, skipping placeholder services" + return 0 + fi + + # Deploy placeholder services on primary cluster for each non-primary member + for cluster in $MEMBER_CLUSTERS; do + if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then + log "Skipping primary cluster: $cluster" + continue + fi + + log "Creating placeholder OTEL collector service for $cluster on primary cluster" + cat </dev/null +apiVersion: v1 +kind: Service +metadata: + name: ${cluster}-collector + namespace: documentdb-preview-ns + labels: + app: otel-collector + cluster: ${cluster} +spec: + type: ClusterIP + ports: + - name: prometheus + port: 8889 + targetPort: 8889 + protocol: TCP + selector: + app: nonexistent-placeholder +EOF + if [ $? -eq 0 ]; then + success "Placeholder service ${cluster}-collector created on primary cluster" + else + warn "Failed to create placeholder service for $cluster on primary cluster" + fi + done + + success "Placeholder OTEL collector services created on primary cluster" +} + +# Create Fleet ServiceExport and MultiClusterService for OTEL collectors +create_service_exports_and_imports() { + log "Creating Fleet ServiceExport and MultiClusterService for OTEL collector endpoints..." + + # Get primary cluster and all member clusters + #PRIMARY_CLUSTER=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}' 2>/dev/null || echo "") + PRIMARY_CLUSTER="azure-documentdb" + MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") + + if [ -z "$PRIMARY_CLUSTER" ] || [ -z "$MEMBER_CLUSTERS" ]; then + warn "Could not determine primary or member clusters, skipping service export/import" + return 0 + fi + + # Create ServiceExport on each non-primary member cluster for their OTEL collector + for cluster in $MEMBER_CLUSTERS; do + if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then + log "Skipping ServiceExport on primary cluster: $cluster" + continue + fi + + log "Creating ServiceExport for documentdb-collector-collector on cluster: $cluster" + cat </dev/null +apiVersion: networking.fleet.azure.com/v1alpha1 +kind: MultiClusterService +metadata: + name: $cluster-collector + namespace: documentdb-preview-ns +spec: + serviceImport: + name: $cluster-collector +EOF + done + + # Create MultiClusterService on primary cluster to import all OTEL collector endpoints + + + success "Fleet ServiceExport and MultiClusterService resources created for OTEL collectors" +} + +# Wait for collectors to be ready +wait_for_collectors() { + log "Waiting for OpenTelemetry collectors to be ready..." 
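+    # NOTE: the waits below reference $SALES_NAMESPACE / $ACCOUNTS_NAMESPACE, which are not defined
+    # in this script (they look like leftovers from an earlier multi-tenant variant). main() does not
+    # call this function yet (SKIP_WAIT defaults to true), so update these before re-enabling waits.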
+ + # Wait for Sales collector + kubectl wait --for=condition=available deployment/documentdb-sales-collector-collector -n $SALES_NAMESPACE --timeout=300s + success "Sales collector is ready" + + # Wait for Accounts collector + kubectl wait --for=condition=available deployment/documentdb-accounts-collector-collector -n $ACCOUNTS_NAMESPACE --timeout=300s + success "Accounts collector is ready" +} + +# Wait for monitoring stacks to be ready +wait_for_monitoring_stacks() { + log "Waiting for monitoring stacks to be ready..." + + # Wait for Sales monitoring stack + kubectl wait --for=condition=available deployment/prometheus-sales-server -n $SALES_NAMESPACE --timeout=300s + kubectl wait --for=condition=available deployment/grafana-sales -n $SALES_NAMESPACE --timeout=300s + success "Sales monitoring stack is ready" + + # Wait for Accounts monitoring stack + kubectl wait --for=condition=available deployment/prometheus-accounts-server -n $ACCOUNTS_NAMESPACE --timeout=300s + kubectl wait --for=condition=available deployment/grafana-accounts -n $ACCOUNTS_NAMESPACE --timeout=300s + success "Accounts monitoring stack is ready" +} + +# Main execution +main() { + log "Starting DocumentDB Telemetry Deployment..." + log "=========================================================" + log "Configuration:" + log " Skip Wait: $SKIP_WAIT" + log "" + + check_prerequisites + + install_opentelemetry_operator + + CROSS_CLOUD_STRATEGY=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.crossCloudNetworkingStrategy}' 2>/dev/null || echo "") + + deploy_collectors $CROSS_CLOUD_STRATEGY + + deploy_monitoring_stack + + # Only create placeholder services if using Istio networking + CROSS_CLOUD_STRATEGY=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.crossCloudNetworkingStrategy}' 2>/dev/null || echo "") + if [ "$CROSS_CLOUD_STRATEGY" = "Istio" ]; then + log "Cross-cloud networking strategy is Istio. Creating placeholder services..." + create_placeholder_prometheus_services + elif [ "$CROSS_CLOUD_STRATEGY" = "AzureFleet" ]; then + log "Cross-cloud networking strategy is AzureFleet. Creating Fleet ServiceExports and MultiClusterServices..." + create_service_exports_and_imports + else + log "Cross-cloud networking strategy is '$CROSS_CLOUD_STRATEGY'. Skipping placeholder services and service exports."
+ fi + + if [[ "$SKIP_WAIT" == "false" ]]; then + error "Wait not yet implemented" + #wait_for_collectors + #wait_for_monitoring_stacks + fi +} + +# Run main function +main "$@" \ No newline at end of file diff --git a/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml b/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml index 2087d324..4ce41e7c 100644 --- a/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml +++ b/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml @@ -26,8 +26,8 @@ metadata: spec: nodeCount: 1 instancesPerNode: 1 - documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16 - gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16 + documentDBImage: {{DOCUMENTDB_IMAGE}} + gatewayImage: {{GATEWAY_IMAGE}} resource: storage: pvcSize: 10Gi diff --git a/documentdb-playground/aks-fleet-deployment/grafana-values.yaml b/documentdb-playground/aks-fleet-deployment/grafana-values.yaml new file mode 100644 index 00000000..8cea36e4 --- /dev/null +++ b/documentdb-playground/aks-fleet-deployment/grafana-values.yaml @@ -0,0 +1,106 @@ +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: http://prometheus-server.documentdb-preview-ns.svc.cluster.local + access: proxy + isDefault: true + +adminPassword: admin123 + +service: + type: LoadBalancer + port: 3000 + +ingress: + enabled: false + +persistence: + enabled: true + size: 1Gi + +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + +dashboards: + default: + documentdb-overview: + json: | + { + "dashboard": { + "id": null, + "title": "DocumentDB Overview", + "tags": ["documentdb"], + "timezone": "browser", + "panels": [ + { + "id": 1, + "title": "CPU Usage", + "type": "graph", + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{tenant=\"documentdb-preview\",container!=\"POD\",container!=\"\",name!=\"\"}[5m]) * 100", + "legendFormat": "{{pod}} - {{container}}" + } + ], + "gridPos": {"h": 9, "w": 12, "x": 0, "y": 0}, + "yAxes": [{"unit": "percent"}] + }, + { + "id": 2, + "title": "Memory Usage", + "type": "graph", + "targets": [ + { + "expr": "container_memory_usage_bytes{tenant=\"documentdb-preview\",container!=\"POD\",container!=\"\",name!=\"\"} / 1024 / 1024", + "legendFormat": "{{pod}} - {{container}}" + } + ], + "gridPos": {"h": 9, "w": 12, "x": 12, "y": 0}, + "yAxes": [{"unit": "bytes"}] + }, + { + "id": 3, + "title": "Pod Status", + "type": "stat", + "targets": [ + { + "expr": "count(container_memory_usage_bytes{tenant=\"documentdb-preview\",container!=\"POD\",container!=\"\",name!=\"\"})", + "legendFormat": "Running Containers" + } + ], + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 9} + }, + { + "id": 4, + "title": "Network I/O", + "type": "graph", + "targets": [ + { + "expr": "rate(container_network_receive_bytes_total{tenant=\"documentdb-preview\"}[5m])", + "legendFormat": "{{pod}} RX" + }, + { + "expr": "rate(container_network_transmit_bytes_total{tenant=\"documentdb-preview\"}[5m])", + "legendFormat": "{{pod}} TX" + } + ], + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 9} + } + ], + "time": {"from": "now-1h", "to": "now"}, + "refresh": "30s" + } + } diff --git a/documentdb-playground/aks-fleet-deployment/otel-collector.yaml b/documentdb-playground/aks-fleet-deployment/otel-collector.yaml 
new file mode 100644 index 00000000..9fce06ff --- /dev/null +++ b/documentdb-playground/aks-fleet-deployment/otel-collector.yaml @@ -0,0 +1,89 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: documentdb-preview-ns +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: +- apiGroups: [""] + resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods"] + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics", "/metrics/cadvisor"] + verbs: ["get"] +- apiGroups: ["apps"] + resources: ["daemonsets", "deployments", "replicasets"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: +- kind: ServiceAccount + name: otel-collector + namespace: documentdb-preview-ns +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: {{CLUSTER_NAME}} + namespace: documentdb-preview-ns +spec: + mode: deployment # Single pod per namespace, not DaemonSet + replicas: 1 + serviceAccount: otel-collector + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + timeout: 10s + send_batch_size: 1024 + + memory_limiter: + check_interval: 1s + limit_mib: 512 + + attributes: + actions: + - key: service.name + action: upsert + value: documentdb-gateway + + exporters: + # Prometheus for metrics + prometheus: + endpoint: "0.0.0.0:8889" + namespace: documentdb + const_labels: + environment: demo + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + + pprof: + endpoint: 0.0.0.0:1777 + + service: + extensions: [health_check, pprof] + + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [prometheus] \ No newline at end of file diff --git a/documentdb-playground/aks-fleet-deployment/prometheus-values.yaml b/documentdb-playground/aks-fleet-deployment/prometheus-values.yaml new file mode 100644 index 00000000..eac090b4 --- /dev/null +++ b/documentdb-playground/aks-fleet-deployment/prometheus-values.yaml @@ -0,0 +1,51 @@ +server: + persistentVolume: + size: 10Gi + retention: 15d + service: + type: LoadBalancer + ingress: + enabled: false + +alertmanager: + enabled: false + +prometheus-node-exporter: + enabled: false + +prometheus-pushgateway: + enabled: false + +kube-state-metrics: + enabled: false + +global: + scrape_interval: 15s + evaluation_interval: 15s + +serverFiles: + prometheus.yml: + scrape_configs: + - job_name: 'otel-collector-azure' + static_configs: + - targets: ['azure-documentdb-collector.documentdb-preview-ns.svc.cluster.local:8889'] + labels: + service: 'documentdb-gateway' + region: 'azure' + - job_name: 'otel-collector-aws' + static_configs: + - targets: ['aws-documentdb-collector.documentdb-preview-ns.svc.cluster.local:8889'] + labels: + service: 'documentdb-gateway' + region: 'aws' + - job_name: 'otel-collector-westus' + static_configs: + - targets: ['gcp-documentdb-collector.documentdb-preview-ns.svc.cluster.local:8889'] + labels: + service: 'documentdb-gateway' + region: 'gcp' + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] \ No newline at end of file diff --git a/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go 
b/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go index 0fbf40c3..b9a21015 100644 --- a/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go +++ b/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go @@ -133,7 +133,7 @@ func (impl Implementation) reconcileMetadata( envVars := []corev1.EnvVar{ { Name: "OTEL_EXPORTER_OTLP_ENDPOINT", - Value: "http://localhost:4412", + Value: "http://" + cluster.Name + "-collector." + cluster.Namespace + ".svc.cluster.local:4317", }, } diff --git a/operator/src/internal/cnpg/cnpg_cluster.go b/operator/src/internal/cnpg/cnpg_cluster.go index 5761160b..59844bdf 100644 --- a/operator/src/internal/cnpg/cnpg_cluster.go +++ b/operator/src/internal/cnpg/cnpg_cluster.go @@ -62,7 +62,10 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu }, InheritedMetadata: getInheritedMetadataLabels(documentdb.Name), Plugins: func() []cnpgv1.PluginConfiguration { - params := map[string]string{"gatewayImage": gatewayImage} + params := map[string]string{ + "gatewayImage": gatewayImage, + "documentDbCredentialSecret": credentialSecretName, + } // If TLS is ready, surface secret name to plugin so it can mount certs. if documentdb.Status.TLS != nil && documentdb.Status.TLS.Ready && documentdb.Status.TLS.SecretName != "" { params["gatewayTLSSecret"] = documentdb.Status.TLS.SecretName diff --git a/operator/src/internal/controller/documentdb_controller.go b/operator/src/internal/controller/documentdb_controller.go index c0e9b126..adb93492 100644 --- a/operator/src/internal/controller/documentdb_controller.go +++ b/operator/src/internal/controller/documentdb_controller.go @@ -97,7 +97,7 @@ func (r *DocumentDBReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Check if the DocumentDB Service already exists for this instance foundService, err := util.UpsertService(ctx, r.Client, ddbService) if err != nil { - logger.Info("Failed to create DocumentDB Service; Requeuing.") + logger.Error(err, "Failed to create DocumentDB Service; Requeuing.") return ctrl.Result{RequeueAfter: RequeueAfterShort}, nil } diff --git a/operator/src/scripts/multi-cloud-deployment/.gitignore b/operator/src/scripts/multi-cloud-deployment/.gitignore new file mode 100644 index 00000000..1503cc8a --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/.gitignore @@ -0,0 +1 @@ +certs \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/README.md b/operator/src/scripts/multi-cloud-deployment/README.md new file mode 100644 index 00000000..dfd87b56 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/README.md @@ -0,0 +1,608 @@ +# Multi-Cloud DocumentDB Deployment + +This directory contains templates and scripts for deploying DocumentDB across multiple cloud providers (Azure AKS, Google GKE, and AWS EKS) with cross-cloud replication using Istio service mesh and AKS Fleet for resource propagation. 
+ +## Architecture + +- **Fleet Resource**: Deployed in East US 2 (management hub for resource propagation) +- **Multi-Cloud Clusters**: + - **AKS**: Single member cluster in configurable region (default: eastus2) + - **GKE**: Cluster in us-central1-a + - **EKS**: Cluster in us-west-2 +- **Network**: + - AKS: Uses default Azure CNI + - GKE: Default GKE networking + - EKS: Default EKS networking with NLB for cross-cloud connectivity +- **Service Mesh**: Istio multi-cluster mesh for cross-cloud service discovery +- **VM Size**: Standard_DS3_v2 for AKS, e2-standard-4 for GKE, m5.large for EKS (configurable) +- **Node Count**: 1-2 nodes per cluster for cost optimization +- **Kubernetes Version**: Uses region default GA version (configurable) +- **DocumentDB**: Multi-cloud deployment with primary/replica architecture and Istio-based replication + +## Prerequisites + +- **Azure**: Azure CLI installed and logged in (`az login`) +- **GCP**: Google Cloud SDK installed and logged in (`gcloud auth login`) + - gke-gcloud-auth-plugin: `sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin` +- **AWS**: AWS CLI installed and configured (`aws configure`) + - eksctl installed for EKS cluster management +- **Kubernetes Tools**: + - kubectl installed + - kubelogin for Azure AD authentication: `az aks install-cli` + - Helm 3.x installed +- **Other Tools**: + - jq for JSON processing: `brew install jq` (macOS) or `apt-get install jq` (Linux) + - openssl for password generation +- **Permissions**: + - Azure: Contributor access to the subscription + - GCP: Container Admin, Compute Network Admin, and Service Account User roles + - AWS: Sufficient IAM permissions to create EKS clusters and IAM roles +- **Quotas**: Sufficient quota in target regions for clusters + +## Quick Start + +### Deploy Everything (One Command) + +```bash +./deploy.sh +``` + +This single script will: +1. **Deploy Infrastructure**: + - Create Azure resource group + - Deploy AKS Fleet resource + - Deploy AKS member cluster + - Deploy GKE cluster (in parallel) + - Deploy EKS cluster with EBS CSI driver and AWS Load Balancer Controller +2. **Configure Multi-Cloud Mesh**: + - Join GKE and EKS clusters to the AKS Fleet + - Install cert-manager on all clusters + - Set up Istio multi-cluster service mesh with shared root CA + - Configure cross-cloud networking with east-west gateways +3. **Deploy DocumentDB Operator**: + - Install DocumentDB operator on hub cluster + - Propagate base resources (CRDs, RBAC) to all member clusters via Fleet +4. 
**Set Up Access**: + - Configure kubectl contexts for all clusters + - Set up RBAC access for Fleet + +### Deploy DocumentDB Database + +After the infrastructure is deployed: + +```bash +# With auto-generated password +./deploy-documentdb.sh + +# With custom password +./deploy-documentdb.sh "MySecureP@ssw0rd" + +# Disable Azure DNS creation (for testing) +ENABLE_AZURE_DNS=false ./deploy-documentdb.sh +``` + +This will: +- Create cluster identification ConfigMaps on each member cluster +- Select a primary cluster (defaults to EKS cluster) +- Deploy DocumentDB with Istio-based cross-cloud replication +- Create Azure DNS zone with records for each cluster (if enabled) +- Create SRV record for MongoDB connection string +- Provide connection information and failover commands + +## Configuration + +### Infrastructure Configuration + +Edit `parameters.bicepparam` to customize AKS deployment: +- Hub cluster name (used for fleet naming) +- Hub region (fleet location) +- Member cluster name and region +- VM sizes +- Node counts +- Kubernetes version + +Or use environment variables for all clouds: + +```bash +# Azure AKS +export RESOURCE_GROUP="my-multi-cloud-rg" +export RG_LOCATION="eastus2" +export HUB_REGION="eastus2" +export AKS_CLUSTER_NAME="azure-documentdb" +export AKS_REGION="eastus2" +export HUB_VM_SIZE="Standard_D4s_v3" + +# Google GKE +export PROJECT_ID="my-gcp-project-id" +export GCP_USER="user@example.com" +export ZONE="us-central1-a" +export GKE_CLUSTER_NAME="gcp-documentdb" + +# AWS EKS +export EKS_CLUSTER_NAME="aws-documentdb" +export EKS_REGION="us-west-2" + +# DocumentDB Operator +export VERSION="200" # Operator version +export VALUES_FILE="/path/to/custom/values.yaml" # Optional Helm values + +./deploy.sh +``` + +### DocumentDB Configuration + +Edit `documentdb-cluster.yaml` to customize: +- Database size and instances +- Replication settings (primary cluster, HA mode) +- Cross-cloud networking strategy (Istio) +- Storage class per environment +- Service exposure type +- Log levels + +The template uses placeholders replaced at runtime: +- `{{DOCUMENTDB_PASSWORD}}`: The database password +- `{{PRIMARY_CLUSTER}}`: The selected primary cluster +- `{{CLUSTER_LIST}}`: YAML list of all clusters with their environments + +### Azure DNS Configuration + +```bash +export ENABLE_AZURE_DNS="true" # Enable/disable DNS creation +export AZURE_DNS_ZONE_NAME="my-documentdb-zone" # DNS zone name (default: resource group name) +export AZURE_DNS_PARENT_ZONE_RESOURCE_ID="/subscriptions/.../dnszones/parent.zone" +``` + +## Environment Variables + +The deployment scripts automatically set and export: +- `FLEET_ID`: Full resource ID of the AKS fleet +- `IDENTITY`: Your Azure AD user ID +- `DOCUMENTDB_PASSWORD`: Database password (when deploying DocumentDB) +- `RESOURCE_GROUP`: Resource group name (default: german-aks-fleet-rg) +- `PROJECT_ID`: GCP project ID (default: sanguine-office-475117-s6) +- `ZONE`: GCP zone (default: us-central1-a) +- `EKS_REGION`: AWS region (default: us-west-2) + +## kubectl Contexts + +After deployment, contexts are automatically configured for: +- `hub`: AKS Fleet hub cluster +- `azure-documentdb`: AKS member cluster (default name) +- `gcp-documentdb`: GKE cluster (default name) +- `aws-documentdb`: EKS cluster (default name) + +## Management + +### Check Deployment Status + +```bash +# Check operator status on hub +kubectl --context hub get deploy -n documentdb-operator + +# Check DocumentDB base resources propagation +kubectl --context hub get clusterresourceplacement 
documentdb-base -o wide
+
+# Check DocumentDB cluster resources propagation
+kubectl --context hub get clusterresourceplacement documentdb-crp -o wide
+
+# View specific cluster
+kubectl --context <cluster> get documentdb,pods -n documentdb-preview-ns
+```
+
+### Connect to Database
+
+#### Via Port-Forward (for testing)
+
+```bash
+# Connect to primary cluster
+kubectl --context <primary-cluster> port-forward \
+  -n documentdb-preview-ns svc/documentdb-service-<primary-cluster> 10260:10260
+
+mongosh localhost:10260 -u default_user -p <password> \
+  --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates
+```
+
+#### Via Azure DNS (production)
+
+When `ENABLE_AZURE_DNS=true`, use the MongoDB SRV connection string:
+
+```bash
+mongosh "mongodb+srv://default_user:<password>@_mongodb._tcp.<zone-name>.<parent-zone>/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256"
+```
+
+Example:
+```bash
+mongosh "mongodb+srv://default_user:mypassword@_mongodb._tcp.german-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256"
+```
+
+### Failover Operations
+
+Failover is performed using the DocumentDB kubectl plugin:
+
+```bash
+kubectl documentdb promote \
+  --documentdb documentdb-preview \
+  --namespace documentdb-preview-ns \
+  --hub-context hub \
+  --target-cluster <target-cluster> \
+  --cluster-context <target-cluster-context>
+```
+
+## Fleet Management
+
+```bash
+# Show AKS fleet details
+az fleet show --name <fleet-name> --resource-group $RESOURCE_GROUP
+
+# List fleet members (includes Azure members only, not cross-cloud)
+az fleet member list --fleet-name <fleet-name> --resource-group $RESOURCE_GROUP
+
+# Check all ClusterResourcePlacements
+kubectl --context hub get clusterresourceplacement
+
+# View base resources placement (CRDs, RBAC)
+kubectl --context hub describe clusterresourceplacement documentdb-base
+
+# View DocumentDB cluster placement
+kubectl --context hub describe clusterresourceplacement documentdb-crp
+
+# Check multi-cloud fleet membership (GKE and EKS)
+kubectl --context hub get membercluster
+```
+
+## Multi-Cloud Mesh Management
+
+### Verify Istio Installation
+
+```bash
+# Check Istio components on each cluster
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get pods -n istio-system
+  echo
+done
+
+# Verify east-west gateway services
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get svc -n istio-system istio-eastwestgateway
+  echo
+done
+```
+
+### Verify Cross-Cloud Connectivity
+
+```bash
+# Check remote secrets (for service discovery)
+kubectl --context azure-documentdb get secrets -n istio-system | grep "istio-remote-secret"
+kubectl --context gcp-documentdb get secrets -n istio-system | grep "istio-remote-secret"
+kubectl --context aws-documentdb get secrets -n istio-system | grep "istio-remote-secret"
+
+# Verify mesh network configuration
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get namespace istio-system --show-labels
+  echo
+done
+```
+
+## DocumentDB Management
+
+### Check Deployment Status
+
+```bash
+# Quick status across all clusters
+for c in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $c ==="
+  kubectl --context $c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet'
+  echo
+done
+
+# Check operator status on all clusters
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get deploy -n documentdb-operator
+  kubectl --context $cluster get pods -n documentdb-operator
+done
+```
+
+### Monitor Replication
+
+```bash
+# Watch ClusterResourcePlacement status
+watch 'kubectl --context hub get clusterresourceplacement documentdb-crp -o wide'
+
+# Monitor all DocumentDB instances
+watch 'for c in azure-documentdb gcp-documentdb aws-documentdb; do \
+  echo "=== $c ==="; \
+  kubectl --context $c get documentdb,pods -n documentdb-preview-ns; \
+  echo; \
+done'
+
+# Check DocumentDB service endpoints
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get svc -n documentdb-preview-ns
+  echo
+done
+```
+
+### Verify Cross-Cloud Replication
+
+```bash
+# Check WAL replica status in Istio mesh
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get pods -n documentdb-preview-ns -l component=wal-replica
+  echo
+done
+
+# Verify Istio sidecar injection
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get pods -n documentdb-preview-ns -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}'
+  echo
+done
```
+
+### Azure DNS Management
+
+```bash
+# List DNS records for DocumentDB
+az network dns record-set list \
+  --zone-name <zone-name> \
+  --resource-group $RESOURCE_GROUP \
+  --output table
+
+# Show SRV record for MongoDB connection
+az network dns record-set srv show \
+  --name "_mongodb._tcp" \
+  --zone-name <zone-name> \
+  --resource-group $RESOURCE_GROUP
+
+# Show A/CNAME records for each cluster
+for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
+  echo "=== $cluster ==="
+  az network dns record-set a show --name $cluster --zone-name <zone-name> --resource-group $RESOURCE_GROUP 2>/dev/null || \
+    az network dns record-set cname show --name $cluster --zone-name <zone-name> --resource-group $RESOURCE_GROUP 2>/dev/null || \
+    echo "Record not found"
+  echo
+done
+```
+
+## RBAC Management
+
+The deployment script automatically assigns the "Azure Kubernetes Fleet Manager RBAC Cluster Admin" role for AKS Fleet access. To manage RBAC:
+
+```bash
+# View current role assignment
+az role assignment list --assignee $IDENTITY --scope $FLEET_ID
+
+# Add another user
+az role assignment create --role "Azure Kubernetes Fleet Manager RBAC Cluster Admin" \
+  --assignee <user-or-object-id> --scope $FLEET_ID
+```
+
+For GCP and AWS, ensure you have appropriate IAM permissions configured via `gcloud` and `aws` CLI.
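+
+For example, equivalent access for another user on the GKE and EKS clusters can be granted with the cloud CLIs. This is a minimal sketch; the member email, account ID/ARN, and the `system:masters` mapping below are illustrative placeholders, not values from this repository:
+
+```bash
+# GKE: grant Kubernetes Engine Admin on the project (illustrative member)
+gcloud projects add-iam-policy-binding $PROJECT_ID \
+  --member="user:teammate@example.com" \
+  --role="roles/container.admin"
+
+# EKS: map an IAM user into the cluster's aws-auth ConfigMap (illustrative ARN)
+eksctl create iamidentitymapping \
+  --cluster $EKS_CLUSTER_NAME \
+  --region $EKS_REGION \
+  --arn arn:aws:iam::123456789012:user/teammate \
+  --group system:masters \
+  --username teammate
+```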
+ +## Troubleshooting + +### Authentication Issues + +**Azure AKS:** +```bash +# Get fleet credentials +az fleet get-credentials --resource-group $RESOURCE_GROUP --name + +# If web authentication is blocked, use Azure CLI +kubelogin convert-kubeconfig -l azurecli + +# Use admin credentials for member clusters +az aks get-credentials --resource-group $RESOURCE_GROUP --name --admin +``` + +**Google GKE:** +```bash +# Refresh credentials +gcloud container clusters get-credentials --zone + +# Verify authentication +gcloud auth list +gcloud config get-value account +``` + +**AWS EKS:** +```bash +# Update kubeconfig +aws eks update-kubeconfig --name --region + +# Verify IAM identity +aws sts get-caller-identity +``` + +### Resource Propagation Issues + +```bash +# Check ClusterResourcePlacement status +kubectl --context hub get clusterresourceplacement documentdb-base -o yaml +kubectl --context hub get clusterresourceplacement documentdb-crp -o yaml + +# Verify fleet members (Azure native) +az fleet member list --fleet-name --resource-group $RESOURCE_GROUP + +# Verify multi-cloud member clusters +kubectl --context hub get membercluster +kubectl --context hub describe membercluster + +# Check if resources reached target clusters +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do + echo "=== $cluster ===" + kubectl --context $cluster get documentdb -n documentdb-preview-ns + kubectl --context $cluster get pods -n documentdb-preview-ns + echo +done +``` + +### Istio Mesh Issues + +```bash +# Verify Istio installation +istioctl --context version + +# Check proxy status +istioctl --context proxy-status + +# Verify mesh configuration +istioctl --context analyze + +# Check east-west gateway connectivity +kubectl --context get svc -n istio-system istio-eastwestgateway + +# Verify remote secrets +kubectl --context get secrets -n istio-system | grep istio-remote-secret +``` + +### EKS-Specific Issues + +**EBS CSI Driver:** +```bash +# Check CSI driver status +kubectl --context aws-documentdb get pods -n kube-system -l app=ebs-csi-controller + +# Verify storage class +kubectl --context aws-documentdb get storageclass documentdb-storage +``` + +**AWS Load Balancer Controller:** +```bash +# Check controller status +kubectl --context aws-documentdb get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller + +# Verify subnet tags +VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) +aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" --query 'Subnets[].{ID:SubnetId,Tags:Tags}' --region $EKS_REGION +``` + +### DNS Issues + +```bash +# Verify DNS zone exists +az network dns zone show --name --resource-group $RESOURCE_GROUP + +# Check DNS records +az network dns record-set list --zone-name --resource-group $RESOURCE_GROUP + +# Test DNS resolution +nslookup .. +nslookup _mongodb._tcp.. 
-type=SRV +``` + +### Cross-Cloud Connectivity + +```bash +# Deploy test pod with network tools +kubectl --context azure-documentdb run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash + +# From within the pod, test connectivity to other clusters +# Using Istio service discovery +curl -v http://documentdb-service-gcp-documentdb.documentdb-preview-ns.svc.cluster.local:10260 +curl -v http://documentdb-service-aws-documentdb.documentdb-preview-ns.svc.cluster.local:10260 +``` + +### Debugging + +```bash +# Check operator logs on hub +kubectl --context hub logs -n documentdb-operator deployment/documentdb-operator --tail=100 + +# Check operator logs on member clusters +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do + echo "=== $cluster ===" + kubectl --context $cluster logs -n documentdb-operator deployment/documentdb-operator --tail=50 + echo +done + +# View DocumentDB resource status +kubectl --context describe documentdb documentdb-preview -n documentdb-preview-ns + +# Check Istio sidecar logs +kubectl --context logs -n documentdb-preview-ns -c istio-proxy +``` + +## Clean Up + +```bash +# Delete DocumentDB resources from all clusters +kubectl --context hub delete clusterresourceplacement documentdb-crp +kubectl --context hub delete namespace documentdb-preview-ns + +# Wait for namespace deletion to complete on all clusters +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do + kubectl --context $cluster wait --for=delete namespace/documentdb-preview-ns --timeout=60s || true +done + +# Delete base operator resources +kubectl --context hub delete clusterresourceplacement documentdb-base + +# Delete entire Azure resource group (includes AKS fleet and member) +az group delete --name $RESOURCE_GROUP --yes --no-wait + +# Delete GKE cluster +gcloud container clusters delete $GKE_CLUSTER_NAME \ + --zone $ZONE \ + --project $PROJECT_ID \ + --quiet + +# Delete EKS cluster (also deletes associated IAM roles and service accounts) +eksctl delete cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION + +# Delete Azure DNS zone (if created) +az network dns zone delete \ + --name \ + --resource-group $RESOURCE_GROUP \ + --yes + +# Clean up local kubectl contexts +kubectl config delete-context hub +kubectl config delete-context azure-documentdb +kubectl config delete-context gcp-documentdb +kubectl config delete-context aws-documentdb +``` + +## Scripts + +- **`deploy.sh`**: All-in-one multi-cloud deployment (AKS Fleet + GKE + EKS + cert-manager + Istio mesh + operator) +- **`deploy-documentdb.sh`**: Deploy multi-cloud DocumentDB with Istio-based replication and optional Azure DNS +- **`main.bicep`**: Bicep template for AKS Fleet and single member cluster +- **`parameters.bicepparam`**: Configuration parameters for AKS deployment +- **`documentdb-base.yaml`**: Fleet ClusterResourcePlacement for base resources (CRDs, RBAC, namespaces) +- **`documentdb-cluster.yaml`**: DocumentDB multi-cloud configuration template with Fleet ClusterResourcePlacement + +## Key Features + +- **Multi-Cloud Architecture**: Deploy across Azure AKS, Google GKE, and AWS EKS +- **Istio Service Mesh**: Cross-cloud service discovery and secure communication +- **Automated Mesh Setup**: Shared root CA, east-west gateways, and remote secrets +- **AKS Fleet Integration**: Resource propagation via ClusterResourcePlacement to all clouds +- **Cross-Cloud Replication**: DocumentDB replication using Istio for connectivity +- **Dynamic Discovery**: Automatically configures all clusters and generates 
failover commands +- **Azure DNS Integration**: Optional DNS zone creation with A/CNAME and SRV records for MongoDB +- **Cloud-Specific Configuration**: + - EKS: EBS CSI driver and AWS Load Balancer Controller + - GKE: Default persistent disk provisioner + - AKS: Azure Disk CSI driver +- **Parallel Deployment**: AKS, GKE, and EKS deployed concurrently for faster setup +- **Smart Defaults**: Sensible defaults with environment variable overrides + +## Additional Resources + +- [Azure AKS Fleet Documentation](https://learn.microsoft.com/en-us/azure/kubernetes-fleet/) +- [AKS Authentication Guide](https://learn.microsoft.com/en-us/azure/aks/kubelogin-authentication) +- [Fleet ClusterResourcePlacement API](https://learn.microsoft.com/en-us/azure/kubernetes-fleet/concepts-resource-propagation) +- [Istio Multi-Cluster Installation](https://istio.io/latest/docs/setup/install/multicluster/) +- [Istio Multi-Primary Multi-Network](https://istio.io/latest/docs/setup/install/multicluster/multi-primary_multi-network/) +- [Google GKE Documentation](https://cloud.google.com/kubernetes-engine/docs) +- [AWS EKS Documentation](https://docs.aws.amazon.com/eks/) +- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/) +- [eksctl Documentation](https://eksctl.io/) +- [DocumentDB Kubernetes Operator Documentation](../../README.md) \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh b/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh new file mode 100755 index 00000000..a1a0eefd --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh @@ -0,0 +1,431 @@ +#!/usr/bin/env bash +# filepath: /Users/geeichbe/Projects/documentdb-kubernetes-operator/scripts/aks-fleet-deployment/deploy-multi-region.sh +set -euo pipefail + +# Deploy multi-region DocumentDB using Fleet with Traffic Manager +# Usage: ./deploy-documentdb.sh [password] +# +# Environment variables: +# RESOURCE_GROUP: Azure resource group (default: german-aks-fleet-rg) +# DOCUMENTDB_PASSWORD: Database password (will be generated if not provided) +# ENABLE_AZURE_DNS: Enable Azure DNS creation (default: true) +# AZURE_DNS_ZONE_NAME: Azure DNS zone name (default: same as resource group) +# AZURE_DNS_PARENT_ZONE_RESOURCE_ID: Azure DNS parent zone resource ID (default: multi-cloud.pgmongo-dev.cosmos.windows-int.net) +# +# Examples: +# ./deploy-multi-region.sh +# ENABLE_AZURE_DNS=false ./deploy-multi-region.sh mypassword + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Resource group +RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" + +AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-azure-documentdb}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" +EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" + +# Azure DNS configuration +AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +AZURE_DNS_ZONE_FULL_NAME="${AZURE_DNS_ZONE_FULL_NAME:-}" +AZURE_DNS_ZONE_RG="${AZURE_DNS_ZONE_RG:-${RESOURCE_GROUP}}" +ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" + +# Set password from argument or environment variable +DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" 
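# Note: the expansion above prefers the first positional argument and falls back
# to the DOCUMENTDB_PASSWORD environment variable; if both are empty, a random
# password is generated below.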
+DOCUMENTDB_IMAGE="${DOCUMENTDB_IMAGE:-ghcr.io/microsoft/documentdb/documentdb-local:16}" +GATEWAY_IMAGE="${GATEWAY_IMAGE:-${DOCUMENTDB_IMAGE}}" + +# If no password provided, generate a secure one +if [ -z "$DOCUMENTDB_PASSWORD" ]; then + echo "No password provided. Generating a secure password..." + DOCUMENTDB_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-25) + echo "Generated password: $DOCUMENTDB_PASSWORD" + echo "(Save this password - you'll need it to connect to the database)" + echo "" +fi + +# Export for envsubst +export DOCUMENTDB_PASSWORD + + +# Convert to array and add GCP +CLUSTER_ARRAY=("$EKS_CLUSTER_NAME" "$AKS_CLUSTER_NAME" "$GKE_CLUSTER_NAME") +echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" +for cluster in "${CLUSTER_ARRAY[@]}"; do + echo " - $cluster" +done + +PRIMARY_CLUSTER=${CLUSTER_ARRAY[0]} +echo "" +echo "Selected primary cluster: $PRIMARY_CLUSTER" + +# Build the cluster list YAML with proper indentation +CLUSTER_LIST=$(cat </dev/null; then + echo "✗ Context $cluster not found, skipping" + continue + fi + + # Create or update the cluster-name ConfigMap + kubectl --context "$cluster" create configmap cluster-name \ + -n kube-system \ + --from-literal=name="$cluster" \ + --dry-run=client -o yaml | kubectl --context "$cluster" apply -f - + + # Verify the ConfigMap was created + if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then + echo "✓ ConfigMap created/updated for $cluster" + else + echo "✗ Failed to create ConfigMap for $cluster" + fi +done + +# Step 2: Deploy DocumentDB resources via Fleet +echo "" +echo "=======================================" +echo "Deploying DocumentDB multi-region configuration..." +echo "=======================================" + +# Determine hub context +HUB_CONTEXT="${HUB_CONTEXT:-hub}" +if ! kubectl config get-contexts "$HUB_CONTEXT" &>/dev/null; then + echo "Hub context not found, trying to find first member cluster..." + HUB_CONTEXT="${CLUSTER_ARRAY[0]}" + if [ -z "$HUB_CONTEXT" ]; then + echo "Error: No suitable context found. Please ensure you have credentials for the fleet." + exit 1 + fi +fi + +echo "Using hub context: $HUB_CONTEXT" + +# Check if resources already exist +EXISTING_RESOURCES="" +if kubectl --context "$HUB_CONTEXT" get namespace documentdb-preview-ns &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}namespace " +fi +if kubectl --context "$HUB_CONTEXT" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}secret " +fi +if kubectl --context "$HUB_CONTEXT" get documentdb documentdb-preview -n documentdb-preview-ns &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}documentdb " +fi +if kubectl --context "$HUB_CONTEXT" get clusterresourceplacement documentdb-crp &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}clusterresourceplacement " +fi + +if [ -n "$EXISTING_RESOURCES" ]; then + echo "" + echo "⚠️ Warning: The following resources already exist: $EXISTING_RESOURCES" + echo "" + echo "Options:" + echo "1. Delete existing resources and redeploy ()" + echo "2. Update existing deployment (preserve data)" + echo "3. Cancel" + echo "" + read -p "Choose an option (1/2/3): " CHOICE + + case $CHOICE in + 1) + echo "Deleting existing resources..." 
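      # Removing the ClusterResourcePlacement first stops Fleet from re-propagating
      # the namespace while it is being deleted from the member clusters.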
+ kubectl --context "$HUB_CONTEXT" delete clusterresourceplacement documentdb-crp --ignore-not-found=true + kubectl --context "$HUB_CONTEXT" delete namespace documentdb-preview-ns --ignore-not-found=true + echo "Waiting for namespace deletion to complete..." + for cluster in "${CLUSTER_ARRAY[@]}"; do + kubectl --context "$cluster" wait --for=delete namespace/documentdb-preview-ns --timeout=60s + done + ;; + 2) + echo "Updating existing deployment..." + ;; + 3) + echo "Cancelled." + exit 0 + ;; + *) + echo "Invalid choice. Cancelled." + exit 1 + ;; + esac +fi + +# Create a temporary file with substituted values +TEMP_YAML=$(mktemp) + +# Use sed for safer substitution +sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ + -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ + -e "s#{{DOCUMENTDB_IMAGE}}#$DOCUMENTDB_IMAGE#g" \ + -e "s#{{GATEWAY_IMAGE}}#$GATEWAY_IMAGE#g" \ + "$SCRIPT_DIR/documentdb-cluster.yaml" | \ +while IFS= read -r line; do + if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then + echo "$CLUSTER_LIST" + else + echo "$line" + fi +done > "$TEMP_YAML" + +# Debug: show the generated YAML section with clusterReplication +echo "" +echo "Generated configuration preview:" +echo "--------------------------------" +echo "Primary cluster: $PRIMARY_CLUSTER" +echo "Cluster list:" +echo "$CLUSTER_LIST" +echo "--------------------------------" + +# cat "$TEMP_YAML" + +# Apply the configuration +echo "" +echo "Applying DocumentDB multi-region configuration..." +kubectl --context "$HUB_CONTEXT" apply -f "$TEMP_YAML" + +# Clean up temp file +rm -f "$TEMP_YAML" + +# Check the ClusterResourcePlacement status +echo "" +echo "Checking ClusterResourcePlacement status..." +kubectl --context "$HUB_CONTEXT" get clusterresourceplacement documentdb-crp -o wide + +# Wait a bit for propagation +echo "" +echo "Waiting for resources to propagate to member clusters..." +sleep 10 + +# Step 3: Verify deployment on each member cluster +echo "" +echo "=======================================" +echo "Checking deployment status on member clusters..." +echo "=======================================" + +for cluster in "${CLUSTER_ARRAY[@]}"; do + echo "" + echo "=== $cluster ===" + + # Check if context exists + if ! 
kubectl config get-contexts "$cluster" &>/dev/null; then + echo "✗ Context not found, skipping" + continue + fi + + # Check ConfigMap + if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then + CLUSTER_ID=$(kubectl --context "$cluster" get configmap cluster-name -n kube-system -o jsonpath='{.data.name}') + echo "✓ Cluster identified as: $CLUSTER_ID" + else + echo "✗ Cluster identification ConfigMap not found" + fi + + # Check if namespace exists + if kubectl --context "$cluster" get namespace documentdb-preview-ns &>/dev/null; then + echo "✓ Namespace exists" + + # Check if secret exists + if kubectl --context "$cluster" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null; then + echo "✓ Secret exists" + else + echo "✗ Secret not found" + fi + + # Check if DocumentDB exists + if kubectl --context "$cluster" get documentdb documentdb-preview -n documentdb-preview-ns &>/dev/null; then + echo "✓ DocumentDB resource exists" + + # Get DocumentDB status + STATUS=$(kubectl --context "$cluster" get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") + echo " Status: $STATUS" + + # Check if this is the primary or replica + if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then + echo " Role: PRIMARY" + else + echo " Role: REPLICA" + fi + else + echo "✗ DocumentDB resource not found" + fi + + # Check pods + PODS=$(kubectl --context "$cluster" get pods -n documentdb-preview-ns --no-headers 2>/dev/null | wc -l || echo "0") + echo " Pods: $PODS" + + # Show pod status if any exist + if [ "$PODS" -gt 0 ]; then + kubectl --context "$cluster" get pods -n documentdb-preview-ns 2>/dev/null | head -5 + fi + else + echo "✗ Namespace not found (resources may still be propagating)" + fi +done + +# Step 4: Create Azure DNS zone for DocumentDB +if [ "$ENABLE_AZURE_DNS" = "true" ]; then + echo "" + echo "=======================================" + echo "Creating Azure DNS zone for DocumentDB..." + echo "=======================================" + + if [ -n "$AZURE_DNS_ZONE_FULL_NAME" ]; then + fullName="$AZURE_DNS_ZONE_FULL_NAME" + else + parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") + fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" + + # Create Azure DNS zone + if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$AZURE_DNS_ZONE_RG" &>/dev/null; then + echo "Azure DNS zone already exists, updating..." + else + az network dns zone create \ + --name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" + fi + fi + + # Wait for DocumentDB services to be ready and create endpoints + echo "" + echo "Waiting for DocumentDB services to be ready..." 
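  # The fixed sleep is only a head start; the per-cluster loop below still retries
  # for up to ~2 minutes while the cloud load balancers allocate an IP or hostname.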
+ sleep 30 + + # Create DNS records for each cluster + for cluster in "${CLUSTER_ARRAY[@]}"; do + echo "Creating DNS record: $cluster" + + # Create service name by concatenating documentdb-preview with cluster name (max 63 chars) + SERVICE_NAME="documentdb-service-${cluster}" + SERVICE_NAME="${SERVICE_NAME:0:63}" + + # Get the external IP of the DocumentDB service + EXTERNAL_IP="" + for attempt in {1..12}; do # Try for 2 minutes + EXTERNAL_IP=$(kubectl --context "$cluster" get svc "$SERVICE_NAME" -n documentdb-preview-ns -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "") + if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then + break + fi + EXTERNAL_HOSTNAME=$(kubectl --context "$cluster" get svc "$SERVICE_NAME" -n documentdb-preview-ns -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "") + if [ -n "$EXTERNAL_HOSTNAME" ] && [ "$EXTERNAL_HOSTNAME" != "" ]; then + break + fi + echo " Waiting for external IP for $cluster (service: $SERVICE_NAME, attempt $attempt/12)..." + sleep 10 + done + + if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then + echo " External IP for $cluster: $EXTERNAL_IP" + + # TODO Delete existing DNS record if it exists + az network dns record-set a delete \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --yes + + # Create DNS record + az network dns record-set a create \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --ttl 5 + az network dns record-set a add-record \ + --record-set-name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --ipv4-address "$EXTERNAL_IP" \ + --ttl 5 + + echo " ✓ Created DNS record $cluster" + elif [ -n "$EXTERNAL_HOSTNAME" ] && [ "$EXTERNAL_HOSTNAME" != "" ]; then + echo " External hostname for $cluster: $EXTERNAL_HOSTNAME" + + # TODO Delete existing DNS record if it exists + az network dns record-set cname delete \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --yes + + # Create DNS record + az network dns record-set cname create \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --ttl 5 + az network dns record-set cname set-record \ + --record-set-name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --cname "$EXTERNAL_HOSTNAME" \ + --ttl 5 + + echo " ✓ Created DNS record $cluster" + else + echo " ✗ Failed to get external IP for $cluster" + fi + done + + az network dns record-set srv delete \ + --name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --yes + + az network dns record-set srv create \ + --name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --ttl 5 + + mongoFQDN=$(az network dns record-set srv add-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$PRIMARY_CLUSTER.$fullName" | jq -r ".fqdn") + + echo "" + echo "✓ DNS zone created successfully!" 
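  # At this point the zone holds one A/CNAME record per cluster plus an SRV record
  # (_mongodb._tcp) that targets the current primary.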
+ echo " Zone Name: $fullName" + echo " MongoDB FQDN: $mongoFQDN" +fi + +echo "" +echo "Connection Information:" +echo " Username: docdb" +echo " Password: $DOCUMENTDB_PASSWORD" +echo "" +echo "To monitor the deployment:" +echo "watch 'kubectl --context $HUB_CONTEXT get clusterresourceplacement documentdb-crp -o wide'" +echo "" +echo "To check DocumentDB status across all clusters:" +# Create a space-separated string from the array +CLUSTER_STRING=$(IFS=' '; echo "${CLUSTER_ARRAY[*]}") +echo "for c in $CLUSTER_STRING; do echo \"=== \$c ===\"; kubectl --context \$c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet'; echo; done" \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh b/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh new file mode 100644 index 00000000..1b903152 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +PROJECT_ID="${PROJECT_ID:-gke-documentdb-demo}" +GKE_USER="${GKE_USER:-alexanderlaye57@gmail.com}" +CLUSTER_NAME="${CLUSTER_NAME:-gcp-documentdb}" +ZONE="${ZONE:-us-central1-a}" + +# one time +#gcloud projects create $PROJECT_ID +#sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin + +gcloud config set project $PROJECT_ID +gcloud config set account $USER +gcloud auth login --brief + +gcloud services enable container.googleapis.com +gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/container.admin" +gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/compute.networkAdmin" +gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/iam.serviceAccountUser" + +gcloud container clusters create "$CLUSTER_NAME" \ + --zone "$ZONE" \ + --num-nodes "2" \ + --machine-type "e2-standard-4" \ + --enable-ip-access \ + --project $PROJECT_ID + +gcloud container clusters get-credentials "$CLUSTER_NAME" \ + --location="$ZONE" +kubectl config rename-context "$(kubectl config current-context)" $CLUSTER_NAME + +helm repo add jetstack https://charts.jetstack.io +helm repo update +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.13.2 \ + --set installCRDs=true \ + --set prometheus.enabled=false \ + --set webhook.timeoutSeconds=30 + + +cat < /dev/null; then + echo "ERROR: Azure CLI not found. Please install Azure CLI first." >&2 + exit 1 + fi + + # Check kubectl + if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not found. Please install kubectl first." >&2 + exit 1 + fi + + # Check Helm + if ! command -v helm &> /dev/null; then + echo "ERROR: Helm not found. Please install Helm first." >&2 + exit 1 + fi + + # Check gcloud CLI + if ! command -v gcloud &> /dev/null; then + echo "ERROR: gcloud CLI not found. Please install Google Cloud SDK first." >&2 + exit 1 + fi + + # Check AWS CLI + if ! command -v aws &> /dev/null; then + echo "ERROR: AWS CLI not found. Please install AWS CLI first." >&2 + exit 1 + fi + + # Check eksctl + if ! command -v eksctl &> /dev/null; then + echo "ERROR: eksctl not found. Please install eksctl first." >&2 + exit 1 + fi + + # Check jq + if ! command -v jq &> /dev/null; then + echo "ERROR: jq not found. Please install jq first." >&2 + exit 1 + fi + + # Check Azure login + if ! az account show &> /dev/null; then + echo "ERROR: Not logged into Azure. Please run 'az login' first." 
>&2 + exit 1 + fi + + # Check gcloud login + gcloud config set account $GCP_USER + if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null | grep -q .; then + echo "ERROR: Not logged into Google Cloud. Please run 'gcloud auth login' first." >&2 + exit 1 + fi + + # Check AWS credentials + if ! aws sts get-caller-identity &> /dev/null; then + echo "ERROR: AWS credentials not configured. Please run 'aws configure' first." >&2 + exit 1 + fi + + echo "✅ All prerequisites met" +} + +wait_for_no_inprogress() { + local rg="$1" + echo "Checking for in-progress AKS operations in resource group '$rg'..." + local inprogress + inprogress=$(az aks list -g "$rg" -o json \ + | jq -r '.[] | select(.provisioningState != "Succeeded" and .provisioningState != null) | [.name, .provisioningState] | @tsv') + + if [ -z "$inprogress" ]; then + echo "No in-progress AKS operations detected." + return 0 + fi + + echo "Found clusters still provisioning:" + echo "$inprogress" | while IFS=$'\t' read -r name state; do echo " - $name: $state"; done + echo "Please re-run this script after the above operations complete." >&2 + return 1 +} + +# ============================================================================ +# Step 1: Deploy AKS Fleet Infrastructure +# ============================================================================ + +aks_fleet_deploy() { + echo "Creating or using resource group..." + EXISTING_RG_LOCATION=$(az group show --name "$RESOURCE_GROUP" --query location -o tsv 2>/dev/null || true) + if [ -n "$EXISTING_RG_LOCATION" ]; then + echo "Using existing resource group '$RESOURCE_GROUP' in location '$EXISTING_RG_LOCATION'" + RG_LOCATION="$EXISTING_RG_LOCATION" + else + az group create --name "$RESOURCE_GROUP" --location "$RG_LOCATION" + fi + + echo "Deploying AKS Fleet with Bicep..." + if ! wait_for_no_inprogress "$RESOURCE_GROUP"; then + echo "Exiting without changes due to in-progress operations." >&2 + exit 1 + fi + + PARAMS=( + --parameters "$TEMPLATE_DIR/parameters.bicepparam" + --parameters hubRegion="$HUB_REGION" + --parameters memberRegion="$AKS_REGION" + --parameters memberName="$AKS_CLUSTER_NAME" + ) + + if [ -n "$HUB_VM_SIZE" ]; then + echo "Overriding hubVmSize with: $HUB_VM_SIZE" + PARAMS+=( --parameters hubVmSize="$HUB_VM_SIZE" ) + fi + + DEPLOYMENT_NAME="aks-fleet-$(date +%s)" + az deployment group create \ + --name "$DEPLOYMENT_NAME" \ + --resource-group $RESOURCE_GROUP \ + --template-file "$TEMPLATE_DIR/main.bicep" \ + "${PARAMS[@]}" >/dev/null + + # Retrieve outputs + DEPLOYMENT_OUTPUT=$(az deployment group show \ + --resource-group $RESOURCE_GROUP \ + --name "$DEPLOYMENT_NAME" \ + --query "properties.outputs" -o json) + + FLEET_NAME=$(echo $DEPLOYMENT_OUTPUT | jq -r '.fleetName.value') + FLEET_ID_FROM_OUTPUT=$(echo $DEPLOYMENT_OUTPUT | jq -r '.fleetId.value') + AKS_CLUSTER_NAME=$(echo $DEPLOYMENT_OUTPUT | jq -r '.memberClusterName.value') + + SUBSCRIPTION_ID=$(az account show --query id -o tsv) + export FLEET_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.ContainerService/fleets/${FLEET_NAME}" + + # Set up RBAC + echo "Setting up RBAC access for Fleet..." + export IDENTITY=$(az ad signed-in-user show --query "id" --output tsv) + export ROLE="Azure Kubernetes Fleet Manager RBAC Cluster Admin" + echo "Assigning role '$ROLE' to user '$IDENTITY'..." 
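  # The '|| true' on the assignment below keeps re-runs idempotent when the role
  # assignment already exists on the fleet scope.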
+ az role assignment create --role "${ROLE}" --assignee ${IDENTITY} --scope ${FLEET_ID} >/dev/null 2>&1 || true + + # Fetch kubeconfig contexts + echo "Fetching kubeconfig contexts..." + az fleet get-credentials --resource-group "$RESOURCE_GROUP" --name "$FLEET_NAME" --overwrite-existing + + az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" --overwrite-existing +} + +# ============================================================================ +# Step 1.2: Deploy GKE Infrastructure +# ============================================================================ + +# TODO move this to a check at the top +# sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin + +# Create project if it doesn't exist +gke_deploy() { + if ! gcloud projects describe $PROJECT_ID &>/dev/null; then + gcloud projects create $PROJECT_ID + fi + + gcloud config set project $PROJECT_ID + + gcloud services enable container.googleapis.com + gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/container.admin" + gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/compute.networkAdmin" + gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/iam.serviceAccountUser" + + # Delete cluster if it exists + if gcloud container clusters describe "$GKE_CLUSTER_NAME" --zone "$ZONE" --project $PROJECT_ID &>/dev/null; then + gcloud container clusters delete "$GKE_CLUSTER_NAME" \ + --zone "$ZONE" \ + --project $PROJECT_ID \ + --quiet + fi + + gcloud container clusters create "$GKE_CLUSTER_NAME" \ + --zone "$ZONE" \ + --num-nodes "2" \ + --machine-type "e2-standard-4" \ + --enable-ip-access \ + --project $PROJECT_ID + + kubectl config delete-context "$GKE_CLUSTER_NAME" || true + kubectl config delete-cluster "$GKE_CLUSTER_NAME" || true + kubectl config delete-user "$GKE_CLUSTER_NAME" || true + gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" \ + --location="$ZONE" + fullName="gke_${PROJECT_ID}_${ZONE}_${GKE_CLUSTER_NAME}" + # Replace all occurrences of the generated name with GKE_CLUSTER_NAME in kubeconfig + sed -i "s|$fullName|$GKE_CLUSTER_NAME|g" ~/.kube/config +} + + +# ============================================================================ +# Step 1.3: Deploy EKS Infrastructure +# ============================================================================ + +eks_deploy() { + NODE_TYPE="m5.large" + + if eksctl get cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION &> /dev/null; then + echo "Cluster $EKS_CLUSTER_NAME already exists." + else + eksctl create cluster \ + --name $EKS_CLUSTER_NAME \ + --region $EKS_REGION \ + --node-type $NODE_TYPE \ + --nodes 2 \ + --nodes-min 2 \ + --nodes-max 2 \ + --managed \ + --with-oidc + fi + + eksctl create iamserviceaccount \ + --cluster $EKS_CLUSTER_NAME \ + --namespace kube-system \ + --name ebs-csi-controller-sa \ + --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy \ + --override-existing-serviceaccounts \ + --approve \ + --region $EKS_REGION + + # Install EBS CSI driver addon + eksctl create addon \ + --name aws-ebs-csi-driver \ + --cluster $EKS_CLUSTER_NAME \ + --region $EKS_REGION \ + --force + + # Wait for EBS CSI driver to be ready + echo "Waiting for EBS CSI driver to be ready..." 
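  # The short sleep gives the addon time to create the controller pods so the
  # 'kubectl wait' on the app=ebs-csi-controller label has something to match.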
+ sleep 5 + kubectl wait --for=condition=ready pod -l app=ebs-csi-controller -n kube-system --timeout=300s || echo "EBS CSI driver pods may still be starting" + + echo "Installing AWS Load Balancer Controller..." + + # Check if already installed + if helm list -n kube-system | grep -q aws-load-balancer-controller; then + echo "AWS Load Balancer Controller already installed. Skipping installation." + else + # Get VPC ID for the cluster + VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) + echo "Using VPC ID: $VPC_ID" + + # Verify subnet tags for Load Balancer Controller + echo "Verifying subnet tags for Load Balancer Controller..." + PUBLIC_SUBNETS=$(aws ec2 describe-subnets \ + --filters "Name=vpc-id,Values=$VPC_ID" "Name=map-public-ip-on-launch,Values=true" \ + --query 'Subnets[].SubnetId' --output text --region $EKS_REGION) + + PRIVATE_SUBNETS=$(aws ec2 describe-subnets \ + --filters "Name=vpc-id,Values=$VPC_ID" "Name=map-public-ip-on-launch,Values=false" \ + --query 'Subnets[].SubnetId' --output text --region $EKS_REGION) + + # Tag public subnets for internet-facing load balancers + if [ -n "$PUBLIC_SUBNETS" ]; then + echo "Tagging public subnets for internet-facing load balancers..." + for subnet in $PUBLIC_SUBNETS; do + aws ec2 create-tags --resources "$subnet" --tags Key=kubernetes.io/role/elb,Value=1 --region $EKS_REGION 2>/dev/null || true + echo "Tagged public subnet: $subnet" + done + fi + + # Tag private subnets for internal load balancers + if [ -n "$PRIVATE_SUBNETS" ]; then + echo "Tagging private subnets for internal load balancers..." + for subnet in $PRIVATE_SUBNETS; do + aws ec2 create-tags --resources "$subnet" --tags Key=kubernetes.io/role/internal-elb,Value=1 --region $EKS_REGION 2>/dev/null || true + echo "Tagged private subnet: $subnet" + done + fi + + # Download the official IAM policy (latest version) + echo "Downloading AWS Load Balancer Controller IAM policy (latest version)..." + curl -o /tmp/iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json + + # Get account ID + ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + + # Check if policy exists and create/update as needed + if aws iam get-policy --policy-arn arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy &>/dev/null; then + echo "IAM policy already exists, updating to latest version..." + # Delete and recreate to ensure we have the latest version + aws iam delete-policy --policy-arn arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy 2>/dev/null || true + sleep 5 # Wait for deletion to propagate + fi + + # Create IAM policy with latest permissions + echo "Creating IAM policy with latest permissions..." + aws iam create-policy \ + --policy-name AWSLoadBalancerControllerIAMPolicy \ + --policy-document file:///tmp/iam_policy.json 2>/dev/null || \ + echo "IAM policy already exists or was just created" + # Wait a moment for policy to be available + sleep 5 + + # Create IAM service account with proper permissions using eksctl + echo "Creating IAM service account with proper permissions..." 
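    # eksctl wires up IRSA here: it creates an IAM role with the policy above and
    # annotates the aws-load-balancer-controller service account so the controller
    # pods can call the ELB/EC2 APIs.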
+ eksctl create iamserviceaccount \ + --cluster=$EKS_CLUSTER_NAME \ + --namespace=kube-system \ + --name=aws-load-balancer-controller \ + --role-name "AmazonEKSLoadBalancerControllerRole-$EKS_CLUSTER_NAME" \ + --attach-policy-arn=arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy \ + --approve \ + --override-existing-serviceaccounts \ + --region=$EKS_REGION + + # Add EKS Helm repository + helm repo add eks https://aws.github.io/eks-charts + helm repo update eks + + # Install Load Balancer Controller using the existing service account + helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ + -n kube-system \ + --set clusterName=$EKS_CLUSTER_NAME \ + --set serviceAccount.create=false \ + --set serviceAccount.name=aws-load-balancer-controller \ + --set region=$EKS_REGION \ + --set vpcId=$VPC_ID + + # Wait for Load Balancer Controller to be ready + echo "Waiting for Load Balancer Controller to be ready..." + sleep 5 + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=aws-load-balancer-controller -n kube-system --timeout=300s || echo "Load Balancer Controller pods may still be starting" + + # Clean up temp file + rm -f /tmp/iam_policy.json + + echo "AWS Load Balancer Controller installed" + fi + + if kubectl get storageclass documentdb-storage &> /dev/null; then + echo "DocumentDB storage class already exists. Skipping creation." + else + kubectl apply -f - </dev/null || true +helm repo update >/dev/null 2>&1 + +for cluster in ${MEMBER_CLUSTER_NAMES[@]}; do + echo "Installing cert-manager on $cluster..." + kubectl config use-context "$cluster" 2>/dev/null + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set installCRDs=true \ + --wait --timeout=5m >/dev/null 2>&1 || echo " Warning: cert-manager installation issue on $cluster" + echo "✓ cert-manager installed on $cluster" +done + +echo "✅ cert-manager installed on all clusters" + +# ============================================================================ +# Step 5: Install Istio and setup mesh +# ============================================================================ + +# Create an issuer in istio-system namespace on hub +temp_dir=$(mktemp -d) +echo "Temporary directory created at: $temp_dir" + +# Check if istioctl is installed, if not install it to temp_dir +if ! command -v istioctl &> /dev/null; then + echo "istioctl not found, installing to $temp_dir..." 
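  # The pinned version is only used when istioctl is not already on PATH; the
  # downloaded binaries land under the temp directory and are added to PATH for
  # the rest of this run.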
+ ISTIO_VERSION="1.24.0" + curl -L https://istio.io/downloadIstio | ISTIO_VERSION=$ISTIO_VERSION TARGET_ARCH=x86_64 sh - -d "$temp_dir" >/dev/null 2>&1 + export PATH="$temp_dir/istio-$ISTIO_VERSION/bin:$PATH" + echo "✓ istioctl installed to $temp_dir/istio-$ISTIO_VERSION/bin" +else + echo "✓ istioctl already installed: $(which istioctl)" +fi + +if [ -z "$ISTIO_DIR" ]; then + git clone https://github.com/istio/istio.git "$temp_dir/istio" + export ISTIO_DIR="$temp_dir/istio" +fi +rm -rf "$TEMPLATE_DIR/certs" +mkdir $TEMPLATE_DIR/certs +pushd $TEMPLATE_DIR/certs +make -f "$ISTIO_DIR/tools/certs/Makefile.selfsigned.mk" root-ca +index=1 +for cluster in ${MEMBER_CLUSTER_NAMES[@]}; do + make -f "$ISTIO_DIR/tools/certs/Makefile.selfsigned.mk" "${cluster}-cacerts" + kubectl --context "$cluster" delete namespace/istio-system --wait=true --ignore-not-found=true + kubectl --context "$cluster" create namespace istio-system + kubectl --context "$cluster" wait --for=jsonpath='{.status.phase}'=Active namespace/istio-system --timeout=60s + # create certs + kubectl --context "$cluster" create secret generic cacerts -n istio-system \ + --from-file="${cluster}/ca-cert.pem" \ + --from-file="${cluster}/ca-key.pem" \ + --from-file="${cluster}/root-cert.pem" \ + --from-file="${cluster}/cert-chain.pem" + + kubectl --context="${cluster}" label namespace istio-system topology.istio.io/network=network${index} + + #install istio on each cluster + cat < $remoteSecretFile + for other_cluster in ${MEMBER_CLUSTER_NAMES[@]}; do + if [ "$cluster" = "$other_cluster" ]; then + continue + fi + kubectl apply -f $remoteSecretFile --context="${other_cluster}" + done +done + +popd + +# 5.1 add lb tags to istio ew gateway on aws +kubectl --context "$EKS_CLUSTER_NAME" -n istio-system annotate service istio-eastwestgateway \ + service.beta.kubernetes.io/aws-load-balancer-type="nlb" \ + service.beta.kubernetes.io/aws-load-balancer-scheme="internet-facing" \ + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled="true" \ + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type="ip" + +# ============================================================================ +# Step 6: Install DocumentDB Operator +# ============================================================================ + +CHART_DIR="$(cd "$TEMPLATE_DIR/../../.." && pwd)/documentdb-helm-chart" +CHART_PKG="$TEMPLATE_DIR/documentdb-operator-0.0.${VERSION}.tgz" + +# Apply cert-manager CRDs on hub +echo "Applying cert-manager CRDs on hub ($HUB_CONTEXT)..." 
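# Only the CRDs are applied on the hub; the full cert-manager Helm install runs
# on the member clusters earlier in this script.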
+kubectl --context "$HUB_CONTEXT" apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.crds.yaml #>/dev/null 2>&1 + +# Create documentdb-operator namespace with Istio injection on hub +cat </dev/null || echo "0") + DESIRED=$(kubectl --context "$cluster" get deploy documentdb-operator -n documentdb-operator -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") + echo " $cluster: $READY/$DESIRED replicas ready" +done + +# ============================================================================ +# Save environment variables and aliases +# ============================================================================ diff --git a/operator/src/scripts/multi-cloud-deployment/dns_failover.sh b/operator/src/scripts/multi-cloud-deployment/dns_failover.sh new file mode 100755 index 00000000..21608b8a --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/dns_failover.sh @@ -0,0 +1,55 @@ +#/bin/bash + +RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" +DOCUMENTDB_NAME="${DOCUMENTDB_NAME:-documentdb-preview}" +DOCUMENTDB_NAMESPACE="${DOCUMENTDB_NAMESPACE:-documentdb-preview-ns}" +HUB_CONTEXT="${HUB_CONTEXT:-hub}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" + +MEMBER_CLUSTERS=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.clusterList[].name") +PRIMARY_CLUSTER=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.primary") +TARGET_CLUSTER=$1 + +# Convert to array +CLUSTER_ARRAY=($MEMBER_CLUSTERS) +echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" +for cluster in "${CLUSTER_ARRAY[@]}"; do + echo " - $cluster" + if [ "$cluster" == "$PRIMARY_CLUSTER" ]; then + echo " (current primary)" + elif [ "$cluster" == "$TARGET_CLUSTER" ]; then + echo " (target primary)" + fi +done + + +dnsName=$(az network dns zone list --resource-group $RESOURCE_GROUP --query="[0].name" -o tsv) + +#delete old srv record +az network dns record-set srv remove-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$dnsName" \ + --resource-group "$RESOURCE_GROUP" \ + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$PRIMARY_CLUSTER.$dnsName" \ + --keep-empty-record-set + +#create new one +az network dns record-set srv add-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$dnsName" \ + --resource-group "$RESOURCE_GROUP" \ + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$TARGET_CLUSTER.$dnsName" + +echo "To initiate failover to $TARGET_CLUSTER run:" +echo "kubectl documentdb promote \\" +echo " --documentdb documentdb-preview \\" +echo " --namespace documentdb-preview-ns \\" +echo " --hub-context $HUB_CONTEXT \\" +echo " --target-cluster $TARGET_CLUSTER \\" +echo " --cluster-context $TARGET_CLUSTER" diff --git a/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml b/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml new file mode 100644 index 00000000..c33dacb6 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml @@ -0,0 +1,99 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: documentdb-base +spec: + resourceSelectors: + - group: "" + version: v1 + kind: Namespace + name: documentdb-operator + - group: "" + version: v1 + kind: Namespace + name: cnpg-system + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: 
documentdbs.db.microsoft.com + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: publications.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: poolers.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: clusterimagecatalogs.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: imagecatalogs.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: scheduledbackups.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: backups.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: subscriptions.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: databases.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: clusters.postgresql.cnpg.io + # RBAC roles and bindings + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cluster-role + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cloudnative-pg + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cloudnative-pg-edit + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cloudnative-pg-view + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: documentdb-operator-cluster-rolebinding + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: documentdb-operator-cloudnative-pg + - group: "admissionregistration.k8s.io" + version: v1 + kind: MutatingWebhookConfiguration + name: cnpg-mutating-webhook-configuration + - group: "admissionregistration.k8s.io" + version: v1 + kind: ValidatingWebhookConfiguration + name: cnpg-validating-webhook-configuration + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: wal-replica-manager + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: wal-replica-manager-binding + policy: + placementType: PickAll + strategy: + type: RollingUpdate \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml b/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml new file mode 100644 index 00000000..389c6991 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml @@ -0,0 +1,61 @@ +# Namespace definition +apiVersion: v1 +kind: Namespace +metadata: + name: documentdb-preview-ns + labels: + istio-injection: enabled + +--- + +apiVersion: v1 +kind: Secret +metadata: + name: documentdb-credentials + namespace: documentdb-preview-ns +type: Opaque +stringData: + username: docdb + password: {{DOCUMENTDB_PASSWORD}} + +--- + +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: documentdb-preview + namespace: documentdb-preview-ns +spec: + nodeCount: 1 + instancesPerNode: 1 + documentDBImage: {{DOCUMENTDB_IMAGE}} + gatewayImage: {{GATEWAY_IMAGE}} + resource: + storage: + pvcSize: 10Gi + clusterReplication: + highAvailability: true + primary: {{PRIMARY_CLUSTER}} + crossCloudNetworkingStrategy: Istio + clusterList: +{{CLUSTER_LIST}} + 
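    # {{CLUSTER_LIST}} and {{PRIMARY_CLUSTER}} are placeholders that
    # deploy-documentdb.sh substitutes with the configured member clusters.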
exposeViaService: + serviceType: LoadBalancer + logLevel: info + +--- + +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: documentdb-crp +spec: + resourceSelectors: + - group: "" + version: v1 + kind: Namespace + name: documentdb-preview-ns + policy: + placementType: PickAll + strategy: + type: RollingUpdate \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/insert_test.py b/operator/src/scripts/multi-cloud-deployment/insert_test.py new file mode 100644 index 00000000..9434868b --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/insert_test.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 + +import sys +import time +from pymongo import MongoClient, errors +from datetime import datetime + +if len(sys.argv) != 2: + print(f"Usage: python insert_test.py ") + sys.exit(1) + +connection_string = sys.argv[1] + +client = MongoClient(connection_string) + +db = client.testdb +collection = db.testcollection + +print(f"{'Inserted Document':<30} {'Insert Count':<15}") +print("-" * 77) +start_time = time.time() +end_time = start_time + (60 * 60) # 60 minutes +count = 0 +first_error_seen = False + +while time.time() < end_time: + write_result = "" + try: + doc = { + "count": count, + "message": f"Insert operation {count}" + } + result = collection.insert_one(doc) + write_result = result.inserted_id + count += 1 + print(f"{str(write_result):<30} {count:<15}") + except Exception as e: + if not first_error_seen: + print("Switching cloud to Azure") + first_error_seen = True + + time.sleep(.25) + +print(f"Completed {count} insert operations in 10 minutes") +final_read_count = collection.count_documents({}) +print(f"Final read count: {final_read_count}") +client.close() diff --git a/operator/src/scripts/multi-cloud-deployment/main.bicep b/operator/src/scripts/multi-cloud-deployment/main.bicep new file mode 100644 index 00000000..da8934b0 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/main.bicep @@ -0,0 +1,74 @@ +targetScope = 'resourceGroup' + +@description('Name of the Fleet Hub AKS cluster') +param hubClusterName string = 'aks-fleet-hub' + +@description('Location for the Fleet Hub') +param hubRegion string = 'eastus2' + +@description('Name for member cluster') +param memberName string = 'aks-fleet-member' + +@description('Location for member cluster') +param memberRegion string = 'eastus2' + +@description('Kubernetes version. Leave empty to use the region default GA version.') +param kubernetesVersion string = '' + +@description('VM size for cluster nodes') +param hubVmSize string = 'Standard_DS3_v2' + +@description('Number of nodes per cluster') +param nodeCount int = 1 + +var fleetName = '${hubClusterName}-fleet' + +// Optionally include kubernetesVersion in cluster properties +var maybeK8sVersion = empty(kubernetesVersion) ? 
{} : { kubernetesVersion: kubernetesVersion } + +// Fleet resource +resource fleet 'Microsoft.ContainerService/fleets@2025-03-01' = { + name: fleetName + location: hubRegion + properties: { + hubProfile: { + dnsPrefix: fleetName + } + } +} + +// Member AKS Cluster (using default Azure CNI without custom VNets) +resource memberCluster 'Microsoft.ContainerService/managedClusters@2023-10-01' = { + name: memberName + location: memberRegion + identity: { + type: 'SystemAssigned' + } + properties: union({ + dnsPrefix: 'member-${memberRegion}-dns' + agentPoolProfiles: [ + { + name: 'agentpool' + count: nodeCount + vmSize: hubVmSize + mode: 'System' + osType: 'Linux' + } + ] + }, maybeK8sVersion) +} + +// Member clusters fleet membership +resource memberFleetMembers 'Microsoft.ContainerService/fleets/members@2023-10-15' = { + name: memberName + parent: fleet + properties: { + clusterResourceId: memberCluster.id + } +} + +// Outputs +output fleetId string = fleet.id +output fleetName string = fleet.name +output memberClusterId string = memberCluster.id +output memberClusterName string = memberCluster.name diff --git a/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam b/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam new file mode 100644 index 00000000..c58e7310 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam @@ -0,0 +1,8 @@ +using './main.bicep' + +param hubClusterName = 'aks-fleet-hub' +param hubRegion = 'eastus2' +param memberRegion = 'eastus2' +param kubernetesVersion = '' +param nodeCount = 1 +param hubVmSize = 'Standard_DS3_v2' diff --git a/operator/src/scripts/multi-cloud-deployment/read_test.py b/operator/src/scripts/multi-cloud-deployment/read_test.py new file mode 100644 index 00000000..20b6eea8 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/read_test.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +import sys +import time +from pymongo import MongoClient, errors +from datetime import datetime + +if len(sys.argv) != 2: + print("Usage: python insert_test.py ") + sys.exit(1) + +connection_string = sys.argv[1] + +client = MongoClient(connection_string) + +db = client.testdb +collection = db.testcollection + +# Perform single insert operation +print(f"Performing initial insert operation...") +print(f"Using: {connection_string.split('@')[1] if '@' in connection_string else 'local'}") +print() +print(f"Starting read operations for 10 minutes...") +print(f"{'Timestamp':<20} {'Read Count':<15} {'Status':<20}") +print("-" * 80) + +start_time = time.time() +end_time = start_time + (10 * 60) # 10 minutes +read_count = 0 +error_count = 0 + +while time.time() < end_time: + timestamp = datetime.now().strftime("%H:%M:%S") + try: + count = collection.count_documents({}) + read_count += 1 + print(f"{timestamp:<20} {count:<15} {'Success':<20}") + except Exception as e: + error_count += 1 + print(f"{timestamp:<20} {'N/A':<15} {'ERROR':<20}") + print(f" Exception Type: {type(e).__name__}") + print(f" Exception Message: {str(e)}") + if hasattr(e, 'details'): + print(f" Details: {e.details}") + if hasattr(e, '__cause__'): + print(f" Cause: {e.__cause__}") + print() + + time.sleep(1) + +print() +print(f"Completed {read_count} successful read operations in 10 minutes") +print(f"Total errors: {error_count}") +try: + final_count = collection.count_documents({}) + print(f"Final document count: {final_count}") +except Exception as e: + print(f"ERROR reading final count:") + print(f" Exception Type: {type(e).__name__}") + print(f" 
Exception Message: {str(e)}") +client.close() From 098fc67d00f86f14d666636ce041c0cc6a49fd5d Mon Sep 17 00:00:00 2001 From: Alexander Laye Date: Tue, 11 Nov 2025 11:15:44 -0500 Subject: [PATCH 2/5] move files to proper folder move to new folder and add docs Signed-off-by: Alexander Laye adjust scripts fix deploy location add backup resource delete commented code revert to pulling the archive instead of using local docdb --- .github/dockerfiles/Dockerfile_gateway | 37 +- .../aks-fleet-deployment/grafana-values.yaml | 106 --- .../multi-cloud-deployment/README.md | 113 +++- .../deploy-documentdb.sh | 62 +- .../multi-cloud-deployment/deploy.sh | 13 +- .../multi-cloud-deployment/dns_failover.sh | 2 +- .../documentdb-base.yaml | 8 + .../documentdb-cluster.yaml | 6 +- .../multi-cloud-deployment/insert_test.py | 8 +- .../multi-cloud-deployment/main.bicep | 2 +- .../telemetry}/dashboard.json | 0 .../telemetry}/deploy-telemetry.sh | 23 +- .../telemetry/grafana-values.yaml | 22 + .../telemetry}/otel-collector.yaml | 0 .../telemetry}/prometheus-values.yaml | 0 .../scripts/multi-cloud-deployment/.gitignore | 1 - .../scripts/multi-cloud-deployment/README.md | 608 ----------------- .../deploy-documentdb.sh | 431 ------------ .../multi-cloud-deployment/deploy-gke.sh | 100 --- .../scripts/multi-cloud-deployment/deploy.sh | 628 ------------------ .../multi-cloud-deployment/dns_failover.sh | 55 -- .../documentdb-base.yaml | 99 --- .../documentdb-cluster.yaml | 61 -- .../multi-cloud-deployment/insert_test.py | 47 -- .../scripts/multi-cloud-deployment/main.bicep | 74 --- .../parameters.bicepparam | 8 - .../multi-cloud-deployment/read_test.py | 61 -- 27 files changed, 190 insertions(+), 2385 deletions(-) delete mode 100644 documentdb-playground/aks-fleet-deployment/grafana-values.yaml rename documentdb-playground/{aks-fleet-deployment => multi-cloud-deployment/telemetry}/dashboard.json (100%) rename documentdb-playground/{aks-fleet-deployment => multi-cloud-deployment/telemetry}/deploy-telemetry.sh (93%) create mode 100644 documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml rename documentdb-playground/{aks-fleet-deployment => multi-cloud-deployment/telemetry}/otel-collector.yaml (100%) rename documentdb-playground/{aks-fleet-deployment => multi-cloud-deployment/telemetry}/prometheus-values.yaml (100%) delete mode 100644 operator/src/scripts/multi-cloud-deployment/.gitignore delete mode 100644 operator/src/scripts/multi-cloud-deployment/README.md delete mode 100755 operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh delete mode 100644 operator/src/scripts/multi-cloud-deployment/deploy-gke.sh delete mode 100755 operator/src/scripts/multi-cloud-deployment/deploy.sh delete mode 100755 operator/src/scripts/multi-cloud-deployment/dns_failover.sh delete mode 100644 operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml delete mode 100644 operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml delete mode 100644 operator/src/scripts/multi-cloud-deployment/insert_test.py delete mode 100644 operator/src/scripts/multi-cloud-deployment/main.bicep delete mode 100644 operator/src/scripts/multi-cloud-deployment/parameters.bicepparam delete mode 100644 operator/src/scripts/multi-cloud-deployment/read_test.py diff --git a/.github/dockerfiles/Dockerfile_gateway b/.github/dockerfiles/Dockerfile_gateway index 3ea714a7..6ccdc926 100644 --- a/.github/dockerfiles/Dockerfile_gateway +++ b/.github/dockerfiles/Dockerfile_gateway @@ -26,13 +26,13 @@ USER documentdb WORKDIR 
/home/documentdb/code/ # Get the docuemntdb repository -#RUN wget -P /tmp https://github.com/documentdb/documentdb/archive/refs/tags/v${DocumentDB_VERSION}.zip && \ - #unzip /tmp/v${DocumentDB_VERSION}.zip -d /home/documentdb/code && \ - #rm /tmp/v${DocumentDB_VERSION}.zip -RUN sudo chown -R documentdb:documentdb /home/documentdb/ -# For local builds, copy the code over directly -COPY --chown=documentdb:documentdb . /home/documentdb/code/documentdb-${DocumentDB_VERSION} +RUN wget -P /tmp https://github.com/documentdb/documentdb/archive/refs/tags/v${DocumentDB_VERSION}.zip && \ + unzip /tmp/v${DocumentDB_VERSION}.zip -d /home/documentdb/code && \ + rm /tmp/v${DocumentDB_VERSION}.zip +# For local builds, copy the code over directly (uncomment the line below, and comment the wget line above) +# COPY --chown=documentdb:documentdb . /home/documentdb/code/documentdb-${DocumentDB_VERSION} +RUN sudo chown -R documentdb:documentdb /home/documentdb/ WORKDIR /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw @@ -47,17 +47,12 @@ ENV DocumentDB_VERSION=${DocumentDB_VERSION} RUN apt-get update && \ apt-get install -y --no-install-recommends \ - jq openssl lsof sudo ca-certificates postgresql-client && \ + jq openssl lsof sudo ca-certificates && \ apt-get upgrade -y && \ rm -rf /var/lib/apt/lists/* ENV LANGUAGE=en_US.UTF-8 \ - TERM=xterm-256color \ - OTEL_TRACING_ENABLED=true \ - OTEL_METRICS_ENABLED=true \ - OTEL_LOGGING_ENABLED=true \ - OTEL_LOGS_CONSOLE_ENABLED=true \ - PGHOST=localhost + TERM=xterm-256color # ENV ENFORCE_SSL="true" \ # CERT_PATH="" \ @@ -80,14 +75,14 @@ RUN echo "%sudo ALL=(ALL:ALL) NOPASSWD: ALL" >> /etc/sudoers.d/no-pass-ask USER documentdb -RUN mkdir /home/documentdb/gateway -RUN mkdir -p /home/documentdb/gateway/pg_documentdb_gw/target/release-with-symbols/ +RUN sudo mkdir /home/documentdb/gateway + +COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/target/debug/documentdb_gateway /home/documentdb/gateway/documentdb_gateway +COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/SetupConfiguration.json /home/documentdb/gateway/SetupConfiguration.json +COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/build_and_start_gateway.sh /home/documentdb/gateway/scripts/build_and_start_gateway.sh +COPY --from=stage /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/utils.sh /home/documentdb/gateway/scripts/utils.sh -COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/target/debug/documentdb_gateway /home/documentdb/gateway/pg_documentdb_gw/target/release-with-symbols/documentdb_gateway -COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/pg_documentdb_gw/SetupConfiguration.json /home/documentdb/gateway/pg_documentdb_gw/SetupConfiguration.json -COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/build_and_start_gateway.sh /home/documentdb/gateway/scripts/build_and_start_gateway.sh -COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/emulator_entrypoint.sh /home/documentdb/gateway/scripts/emulator_entrypoint.sh -COPY --from=stage --chown=documentdb:documentdb /home/documentdb/code/documentdb-${DocumentDB_VERSION}/scripts/utils.sh /home/documentdb/gateway/scripts/utils.sh +RUN sudo chown -R documentdb:documentdb 
/home/documentdb/gateway WORKDIR /home/documentdb/gateway/scripts -ENTRYPOINT ["/bin/bash", "-c", "/home/documentdb/gateway/scripts/emulator_entrypoint.sh \"$@\"", "--"] +#ENTRYPOINT ["/bin/bash", "-c", "/home/documentdb/gateway/scripts/emulator_entrypoint.sh \"$@\"", "--"] \ No newline at end of file diff --git a/documentdb-playground/aks-fleet-deployment/grafana-values.yaml b/documentdb-playground/aks-fleet-deployment/grafana-values.yaml deleted file mode 100644 index 8cea36e4..00000000 --- a/documentdb-playground/aks-fleet-deployment/grafana-values.yaml +++ /dev/null @@ -1,106 +0,0 @@ -datasources: - datasources.yaml: - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - url: http://prometheus-server.documentdb-preview-ns.svc.cluster.local - access: proxy - isDefault: true - -adminPassword: admin123 - -service: - type: LoadBalancer - port: 3000 - -ingress: - enabled: false - -persistence: - enabled: true - size: 1Gi - -dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/default - -dashboards: - default: - documentdb-overview: - json: | - { - "dashboard": { - "id": null, - "title": "DocumentDB Overview", - "tags": ["documentdb"], - "timezone": "browser", - "panels": [ - { - "id": 1, - "title": "CPU Usage", - "type": "graph", - "targets": [ - { - "expr": "rate(container_cpu_usage_seconds_total{tenant=\"documentdb-preview\",container!=\"POD\",container!=\"\",name!=\"\"}[5m]) * 100", - "legendFormat": "{{pod}} - {{container}}" - } - ], - "gridPos": {"h": 9, "w": 12, "x": 0, "y": 0}, - "yAxes": [{"unit": "percent"}] - }, - { - "id": 2, - "title": "Memory Usage", - "type": "graph", - "targets": [ - { - "expr": "container_memory_usage_bytes{tenant=\"documentdb-preview\",container!=\"POD\",container!=\"\",name!=\"\"} / 1024 / 1024", - "legendFormat": "{{pod}} - {{container}}" - } - ], - "gridPos": {"h": 9, "w": 12, "x": 12, "y": 0}, - "yAxes": [{"unit": "bytes"}] - }, - { - "id": 3, - "title": "Pod Status", - "type": "stat", - "targets": [ - { - "expr": "count(container_memory_usage_bytes{tenant=\"documentdb-preview\",container!=\"POD\",container!=\"\",name!=\"\"})", - "legendFormat": "Running Containers" - } - ], - "gridPos": {"h": 6, "w": 12, "x": 0, "y": 9} - }, - { - "id": 4, - "title": "Network I/O", - "type": "graph", - "targets": [ - { - "expr": "rate(container_network_receive_bytes_total{tenant=\"documentdb-preview\"}[5m])", - "legendFormat": "{{pod}} RX" - }, - { - "expr": "rate(container_network_transmit_bytes_total{tenant=\"documentdb-preview\"}[5m])", - "legendFormat": "{{pod}} TX" - } - ], - "gridPos": {"h": 6, "w": 12, "x": 12, "y": 9} - } - ], - "time": {"from": "now-1h", "to": "now"}, - "refresh": "30s" - } - } diff --git a/documentdb-playground/multi-cloud-deployment/README.md b/documentdb-playground/multi-cloud-deployment/README.md index 39f2ac56..788a2e98 100644 --- a/documentdb-playground/multi-cloud-deployment/README.md +++ b/documentdb-playground/multi-cloud-deployment/README.md @@ -108,7 +108,7 @@ Or use environment variables for all clouds: export RESOURCE_GROUP="my-multi-cloud-rg" export RG_LOCATION="eastus2" export HUB_REGION="eastus2" -export AKS_CLUSTER_NAME="aks-documentdb-cluster" +export AKS_CLUSTER_NAME="azure-documentdb" export AKS_REGION="eastus2" export HUB_VM_SIZE="Standard_D4s_v3" @@ -116,10 +116,10 @@ export HUB_VM_SIZE="Standard_D4s_v3" export 
PROJECT_ID="my-gcp-project-id" export GCP_USER="user@example.com" export ZONE="us-central1-a" -export GKE_CLUSTER_NAME="gke-documentdb-cluster" +export GKE_CLUSTER_NAME="gcp-documentdb" # AWS EKS -export EKS_CLUSTER_NAME="eks-documentdb-cluster" +export EKS_CLUSTER_NAME="aws-documentdb" export EKS_REGION="us-west-2" # DocumentDB Operator @@ -158,9 +158,9 @@ export AZURE_DNS_PARENT_ZONE_RESOURCE_ID="/subscriptions/.../dnszones/parent.zon After deployment, contexts are automatically configured for: - `hub`: AKS Fleet hub cluster -- `aks-documentdb-cluster`: AKS member cluster (default name) -- `gke-documentdb-cluster`: GKE cluster (default name) -- `eks-documentdb-cluster`: EKS cluster (default name) +- `azure-documentdb`: AKS member cluster (default name) +- `gcp-documentdb`: GKE cluster (default name) +- `aws-documentdb`: EKS cluster (default name) ## Management @@ -203,9 +203,60 @@ mongosh "mongodb+srv://default_user:@./?tls=tr Example: ```bash -mongosh "mongodb+srv://default_user:mypassword@german-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" +mongosh "mongodb+srv://default_user:mypassword@documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" ``` +### Observability and Telemetry + +The `telemetry` folder contains configuration files for setting up a comprehensive observability stack across your multi-cloud DocumentDB deployment: + +#### Components + +- **Prometheus**: Metrics collection and storage +- **Grafana**: Visualization and dashboards +- **OpenTelemetry Collector**: Unified telemetry collection (metrics, logs, traces) + +#### Deploy Telemetry Stack + +```bash +cd telemetry +./deploy-telemetry.sh +``` + +This script will: +1. Deploy OpenTelemetry Collector on all clusters +2. Install Prometheus on the azure-documentdb cluster +2. Install Grafana on the azure-documentdb cluster +4. 
Configure Prometheus to scrape DocumentDB metrics + +#### Access Grafana Dashboard + +```bash +# Port-forward to Grafana +kubectl --context hub port-forward -n monitoring svc/grafana 3000:80 + +# Open browser to http://localhost:3000 +# Default credentials: admin/admin (change on first login) +``` + +From there you can import dashboard.json + +#### Configuration Files + +- **`deploy-telemetry.sh`**: Automated deployment script for the entire observability stack +- **`prometheus-values.yaml`**: Prometheus Helm chart configuration +- **`grafana-values.yaml`**: Grafana Helm chart configuration with dashboard provisioning +- **`otel-collector.yaml`**: OpenTelemetry Collector configuration for metrics and logs +- **`dashboard.json`**: Pre-built Grafana dashboard for DocumentDB monitoring + +#### Custom Configuration + +Edit the values files to customize: +- Prometheus retention period and storage +- Grafana plugins and data sources +- OpenTelemetry Collector pipelines and exporters +- Dashboard refresh intervals and panels + ### Failover Operations Failover is performed using the DocumentDB kubectl plugin: @@ -238,14 +289,14 @@ kubectl --context hub get membercluster ```bash # Check Istio components on each cluster -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get pods -n istio-system echo done # Verify east-west gateway services -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get svc -n istio-system istio-eastwestgateway echo @@ -256,12 +307,12 @@ done ```bash # Check remote secrets (for service discovery) -kubectl --context aks-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret" -kubectl --context gke-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret" -kubectl --context eks-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret" +kubectl --context azure-documentdb get secrets -n istio-system | grep "istio-remote-secret" +kubectl --context gcp-documentdb get secrets -n istio-system | grep "istio-remote-secret" +kubectl --context aws-documentdb get secrets -n istio-system | grep "istio-remote-secret" # Verify mesh network configuration -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get namespace istio-system --show-labels echo @@ -274,14 +325,14 @@ done ```bash # Quick status across all clusters -for c in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for c in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $c ===" kubectl --context $c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet' echo done # Check operator status on all clusters -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get deploy -n documentdb-operator kubectl --context $cluster get pods -n documentdb-operator @@ -292,14 +343,14 @@ done ```bash # Monitor all DocumentDB instances -watch 'for c in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; 
do \ +watch 'for c in azure-documentdb gcp-documentdb aws-documentdb; do \ echo "=== $c ==="; \ kubectl --context $c get documentdb,pods -n documentdb-preview-ns; \ echo; \ done' # Check DocumentDB service endpoints -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get svc -n documentdb-preview-ns echo @@ -310,14 +361,14 @@ done ```bash # Check WAL replica status in Istio mesh -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get pods -n documentdb-preview-ns -l component=wal-replica echo done # Verify Istio sidecar injection -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get pods -n documentdb-preview-ns -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}' echo @@ -340,7 +391,7 @@ az network dns record-set srv show \ --resource-group $RESOURCE_GROUP # Show A/CNAME records for each cluster -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" az network dns record-set a show --name $cluster --zone-name --resource-group $RESOURCE_GROUP 2>/dev/null || \ az network dns record-set cname show --name $cluster --zone-name --resource-group $RESOURCE_GROUP 2>/dev/null || \ @@ -420,16 +471,16 @@ kubectl --context get secrets -n istio-system | grep istio-remote **EBS CSI Driver:** ```bash # Check CSI driver status -kubectl --context eks-documentdb-cluster get pods -n kube-system -l app=ebs-csi-controller +kubectl --context aws-documentdb get pods -n kube-system -l app=ebs-csi-controller # Verify storage class -kubectl --context eks-documentdb-cluster get storageclass documentdb-storage +kubectl --context aws-documentdb get storageclass documentdb-storage ``` **AWS Load Balancer Controller:** ```bash # Check controller status -kubectl --context eks-documentdb-cluster get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller +kubectl --context aws-documentdb get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller # Verify subnet tags VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) @@ -454,19 +505,19 @@ nslookup _mongodb._tcp.. 
-type=SRV ```bash # Deploy test pod with network tools -kubectl --context aks-documentdb-cluster run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash +kubectl --context azure-documentdb run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash # From within the pod, test connectivity to other clusters # Using Istio service discovery -curl -v http://documentdb-service-gke-documentdb-cluster.documentdb-preview-ns.svc.cluster.local:10260 -curl -v http://documentdb-service-eks-documentdb-cluster.documentdb-preview-ns.svc.cluster.local:10260 +curl -v http://documentdb-service-gcp-documentdb.documentdb-preview-ns.svc.cluster.local:10260 +curl -v http://documentdb-service-aws-documentdb.documentdb-preview-ns.svc.cluster.local:10260 ``` ### Debugging ```bash # Check operator logs on member clusters -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster logs -n documentdb-operator deployment/documentdb-operator --tail=50 echo @@ -487,7 +538,7 @@ kubectl --context hub delete clusterresourceplacement documentdb-crp kubectl --context hub delete namespace documentdb-preview-ns # Wait for namespace deletion to complete on all clusters -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do kubectl --context $cluster wait --for=delete namespace/documentdb-preview-ns --timeout=60s || true done @@ -514,9 +565,9 @@ az network dns zone delete \ # Clean up local kubectl contexts kubectl config delete-context hub -kubectl config delete-context aks-documentdb-cluster -kubectl config delete-context gke-documentdb-cluster -kubectl config delete-context eks-documentdb-cluster +kubectl config delete-context azure-documentdb +kubectl config delete-context gcp-documentdb +kubectl config delete-context aws-documentdb ``` ## Scripts diff --git a/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh b/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh index 7ce4e31d..ff3f32f8 100755 --- a/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh +++ b/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh @@ -6,7 +6,7 @@ set -euo pipefail # Usage: ./deploy-documentdb.sh [password] # # Environment variables: -# RESOURCE_GROUP: Azure resource group (default: german-aks-fleet-rg) +# RESOURCE_GROUP: Azure resource group (default: documentdb-aks-fleet-rg) # DOCUMENTDB_PASSWORD: Database password (will be generated if not provided) # ENABLE_AZURE_DNS: Enable Azure DNS creation (default: true) # AZURE_DNS_ZONE_NAME: Azure DNS zone name (default: same as resource group) @@ -20,19 +20,23 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Resource group -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" +RESOURCE_GROUP="${RESOURCE_GROUP:-documentdb-aks-fleet-rg}" -AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-aks-documentdb-cluster}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" -EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-eks-documentdb-cluster}" +AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-azure-documentdb}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" +EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" # Azure DNS configuration AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" 
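The environment block above in `deploy-documentdb.sh` drives the per-cluster DNS records that the connectivity checks in the README rely on. As a quick smoke test of those records, the sketch below resolves each cluster's entry and probes the gateway port; it assumes the zone FQDN matches the example used earlier in the README, the default cluster names are unchanged, and the gateway listens on 10260.

```bash
#!/usr/bin/env bash
# Minimal smoke test for the per-cluster DNS records created by deploy-documentdb.sh.
# ZONE_FQDN is an example value; substitute the zone name printed by the script.
ZONE_FQDN="documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net"
for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
  host="${cluster}.${ZONE_FQDN}"
  echo "=== ${host} ==="
  nslookup "${host}" || { echo "DNS lookup failed"; continue; }
  # /dev/tcp is a bash builtin path; the probe gives up after 5 seconds.
  if timeout 5 bash -c "exec 3<>/dev/tcp/${host}/10260"; then
    echo "gateway port 10260 reachable"
  else
    echo "gateway port 10260 NOT reachable"
  fi
done
```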
AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +AZURE_DNS_ZONE_FULL_NAME="${AZURE_DNS_ZONE_FULL_NAME:-}" +AZURE_DNS_ZONE_RG="${AZURE_DNS_ZONE_RG:-${RESOURCE_GROUP}}" ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" # Set password from argument or environment variable DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" +DOCUMENTDB_IMAGE="${DOCUMENTDB_IMAGE:-ghcr.io/microsoft/documentdb/documentdb-local:16}" +GATEWAY_IMAGE="${GATEWAY_IMAGE:-${DOCUMENTDB_IMAGE}}" # If no password provided, generate a secure one if [ -z "$DOCUMENTDB_PASSWORD" ]; then @@ -54,7 +58,7 @@ for cluster in "${CLUSTER_ARRAY[@]}"; do echo " - $cluster" done -PRIMARY_CLUSTER=${CLUSTER_ARRAY[0]} +PRIMARY_CLUSTER=${CLUSTER_ARRAY[1]} echo "" echo "Selected primary cluster: $PRIMARY_CLUSTER" @@ -175,6 +179,8 @@ TEMP_YAML=$(mktemp) # Use sed for safer substitution sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ + -e "s#{{DOCUMENTDB_IMAGE}}#$DOCUMENTDB_IMAGE#g" \ + -e "s#{{GATEWAY_IMAGE}}#$GATEWAY_IMAGE#g" \ "$SCRIPT_DIR/documentdb-cluster.yaml" | \ while IFS= read -r line; do if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then @@ -286,17 +292,21 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then echo "Creating Azure DNS zone for DocumentDB..." echo "=======================================" - parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") - fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" - - # Create Azure DNS zone - if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$RESOURCE_GROUP" &>/dev/null; then - echo "Azure DNS zone already exists, updating..." + if [ -n "$AZURE_DNS_ZONE_FULL_NAME" ]; then + fullName="$AZURE_DNS_ZONE_FULL_NAME" else - az network dns zone create \ - --name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ - --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" + parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") + fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" + + # Create Azure DNS zone + if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$AZURE_DNS_ZONE_RG" &>/dev/null; then + echo "Azure DNS zone already exists, updating..." 
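The hunk above introduces `AZURE_DNS_ZONE_FULL_NAME` and `AZURE_DNS_ZONE_RG` so the script can reuse a pre-created child zone (possibly in another resource group) instead of creating one under the parent zone. A minimal usage sketch, with illustrative placeholder values:

```bash
# Reuse an existing DNS zone that lives in a different resource group.
# Both values below are placeholders; substitute your own zone and group.
export AZURE_DNS_ZONE_FULL_NAME="documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net"
export AZURE_DNS_ZONE_RG="shared-dns-rg"
./deploy-documentdb.sh   # password is generated if not supplied
```

When `AZURE_DNS_ZONE_FULL_NAME` is set, the zone-creation branch is skipped and only the per-cluster records are written into that zone, using `AZURE_DNS_ZONE_RG` as the target resource group.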
+ else + az network dns zone create \ + --name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" + fi fi # Wait for DocumentDB services to be ready and create endpoints @@ -334,19 +344,19 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then az network dns record-set a delete \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --yes # Create DNS record az network dns record-set a create \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ttl 5 az network dns record-set a add-record \ --record-set-name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ipv4-address "$EXTERNAL_IP" \ --ttl 5 @@ -358,19 +368,19 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then az network dns record-set cname delete \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --yes # Create DNS record az network dns record-set cname create \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ttl 5 az network dns record-set cname set-record \ --record-set-name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --cname "$EXTERNAL_HOSTNAME" \ --ttl 5 @@ -383,19 +393,19 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then az network dns record-set srv delete \ --name "_mongodb._tcp" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --yes az network dns record-set srv create \ --name "_mongodb._tcp" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ttl 5 mongoFQDN=$(az network dns record-set srv add-record \ --record-set-name "_mongodb._tcp" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --priority 0 \ --weight 0 \ --port 10260 \ @@ -409,7 +419,7 @@ fi echo "" echo "Connection Information:" -echo " Username: default_user" +echo " Username: docdb" echo " Password: $DOCUMENTDB_PASSWORD" echo "" echo "To monitor the deployment:" diff --git a/documentdb-playground/multi-cloud-deployment/deploy.sh b/documentdb-playground/multi-cloud-deployment/deploy.sh index 04709061..186d5b62 100755 --- a/documentdb-playground/multi-cloud-deployment/deploy.sh +++ b/documentdb-playground/multi-cloud-deployment/deploy.sh @@ -8,7 +8,7 @@ set -euo pipefail # Configuration # ============================================================================ -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" +RESOURCE_GROUP="${RESOURCE_GROUP:-documentdb-aks-fleet-rg}" RG_LOCATION="${RG_LOCATION:-eastus2}" HUB_REGION="${HUB_REGION:-$RG_LOCATION}" TEMPLATE_DIR="$(dirname "$0")" @@ -16,16 +16,16 @@ HUB_VM_SIZE="${HUB_VM_SIZE:-}" VERSION="${VERSION:-200}" VALUES_FILE="${VALUES_FILE:-}" ISTIO_DIR="${ISTIO_DIR:-}" -AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-aks-documentdb-cluster}" +AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-azure-documentdb}" AKS_REGION="${AKS_REGION:-eastus2}" HUB_CONTEXT="${HUB_CONTEXT:-hub}" PROJECT_ID="${PROJECT_ID:-sanguine-office-475117-s6}" GCP_USER="${GCP_USER:-alexanderlaye59@gmail.com}" ZONE="${ZONE:-us-central1-a}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" 
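The connection-info change earlier in this hunk switches the default username from `default_user` to `docdb`, matching the credentials secret in `documentdb-cluster.yaml`; older mongosh examples in the README that still say `default_user` should be read with that substitution. A hedged connectivity check via the SRV record created above, reusing the example zone FQDN from the README:

```bash
# Hypothetical connectivity check using the new default username (docdb).
# The zone FQDN is the example one from the README; if the password contains
# URL-reserved characters it must be percent-encoded in the URI.
mongosh "mongodb+srv://docdb:${DOCUMENTDB_PASSWORD}@documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256"
```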
+GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" -EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-eks-documentdb-cluster}" +EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" EKS_REGION="${EKS_REGION:-us-west-2}" # ============================================================================ @@ -436,8 +436,9 @@ pushd $temp_dir git clone https://github.com/kubefleet-dev/kubefleet.git git clone https://github.com/Azure/fleet-networking.git pushd $temp_dir/kubefleet +git checkout d3f42486fa78874e33ba8e6e5e34636767f77b8f chmod +x hack/membership/joinMC.sh -hack/membership/joinMC.sh "v0.16.5" "$HUB_CONTEXT" "$GKE_CLUSTER_NAME" "$EKS_CLUSTER_NAME" +hack/membership/joinMC.sh "v0.16.9" "$HUB_CONTEXT" "$GKE_CLUSTER_NAME" "$EKS_CLUSTER_NAME" popd # TODO clean this up a bit @@ -568,7 +569,7 @@ kubectl --context "$EKS_CLUSTER_NAME" -n istio-system annotate service istio-eas # Step 6: Install DocumentDB Operator # ============================================================================ -CHART_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/operator/documentdb-helm-chart" +CHART_DIR="$(cd "$TEMPLATE_DIR/../../" && pwd)/operator/documentdb-helm-chart" CHART_PKG="$TEMPLATE_DIR/documentdb-operator-0.0.${VERSION}.tgz" # Apply cert-manager CRDs on hub diff --git a/documentdb-playground/multi-cloud-deployment/dns_failover.sh b/documentdb-playground/multi-cloud-deployment/dns_failover.sh index 1aa208f4..2542917a 100755 --- a/documentdb-playground/multi-cloud-deployment/dns_failover.sh +++ b/documentdb-playground/multi-cloud-deployment/dns_failover.sh @@ -4,7 +4,7 @@ RESOURCE_GROUP="${RESOURCE_GROUP:-documentdb-aks-fleet-rg}" DOCUMENTDB_NAME="${DOCUMENTDB_NAME:-documentdb-preview}" DOCUMENTDB_NAMESPACE="${DOCUMENTDB_NAMESPACE:-documentdb-preview-ns}" HUB_CONTEXT="${HUB_CONTEXT:-hub}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" MEMBER_CLUSTERS=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.clusterList[].name") PRIMARY_CLUSTER=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.primary") diff --git a/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml b/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml index ae7c6d7c..519e5c05 100644 --- a/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml +++ b/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml @@ -36,6 +36,14 @@ spec: version: v1 kind: CustomResourceDefinition name: scheduledbackups.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: backups.db.microsoft.com + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: scheduledbackups.db.microsoft.com - group: "apiextensions.k8s.io" version: v1 kind: CustomResourceDefinition diff --git a/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml b/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml index aff12c51..0e91d35b 100644 --- a/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml +++ b/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml @@ -15,7 +15,7 @@ metadata: namespace: documentdb-preview-ns type: Opaque stringData: - username: default_user + username: docdb password: {{DOCUMENTDB_PASSWORD}} --- @@ -28,8 +28,8 @@ metadata: spec: nodeCount: 1 
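The `documentdb-base.yaml` change above now propagates the DocumentDB backup CRDs alongside the CNPG ones. A quick way to confirm they landed on every member cluster, assuming the default kubectl context names used throughout these scripts:

```bash
# Verify the backup CRDs selected by documentdb-base.yaml reached each member cluster.
for cluster in azure-documentdb gcp-documentdb aws-documentdb; do
  echo "=== ${cluster} ==="
  kubectl --context "${cluster}" get crd \
    backups.db.microsoft.com scheduledbackups.db.microsoft.com
  echo
done
```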
instancesPerNode: 1 - documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16 - gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16 + documentDBImage: {{DOCUMENTDB_IMAGE}} + gatewayImage: {{GATEWAY_IMAGE}} resource: storage: pvcSize: 10Gi diff --git a/documentdb-playground/multi-cloud-deployment/insert_test.py b/documentdb-playground/multi-cloud-deployment/insert_test.py index 912f1ba2..9434868b 100644 --- a/documentdb-playground/multi-cloud-deployment/insert_test.py +++ b/documentdb-playground/multi-cloud-deployment/insert_test.py @@ -19,11 +19,11 @@ print(f"{'Inserted Document':<30} {'Insert Count':<15}") print("-" * 77) start_time = time.time() -end_time = start_time + (10 * 60) # 10 minutes +end_time = start_time + (60 * 60) # 60 minutes count = 0 +first_error_seen = False while time.time() < end_time: - failed = False write_result = "" try: doc = { @@ -36,10 +36,10 @@ print(f"{str(write_result):<30} {count:<15}") except Exception as e: if not first_error_seen: - #print("Promotion in progress") + print("Switching cloud to Azure") first_error_seen = True - time.sleep(1) + time.sleep(.25) print(f"Completed {count} insert operations in 10 minutes") final_read_count = collection.count_documents({}) diff --git a/documentdb-playground/multi-cloud-deployment/main.bicep b/documentdb-playground/multi-cloud-deployment/main.bicep index eb54d572..da8934b0 100644 --- a/documentdb-playground/multi-cloud-deployment/main.bicep +++ b/documentdb-playground/multi-cloud-deployment/main.bicep @@ -60,7 +60,7 @@ resource memberCluster 'Microsoft.ContainerService/managedClusters@2023-10-01' = // Member clusters fleet membership resource memberFleetMembers 'Microsoft.ContainerService/fleets/members@2023-10-15' = { - name: 'member-${memberRegion}-${uniqueString(resourceGroup().id, memberRegion)}' + name: memberName parent: fleet properties: { clusterResourceId: memberCluster.id diff --git a/documentdb-playground/aks-fleet-deployment/dashboard.json b/documentdb-playground/multi-cloud-deployment/telemetry/dashboard.json similarity index 100% rename from documentdb-playground/aks-fleet-deployment/dashboard.json rename to documentdb-playground/multi-cloud-deployment/telemetry/dashboard.json diff --git a/documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh similarity index 93% rename from documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh rename to documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh index 3c8f95fb..f27f8211 100755 --- a/documentdb-playground/aks-fleet-deployment/deploy-telemetry.sh +++ b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh @@ -184,16 +184,15 @@ deploy_prometheus() { log "Deploying Prometheus in namespace: $namespace" # Get the directory where this script is located - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - DEPLOYMENT_DIR="$(dirname "$SCRIPT_DIR")/aks-fleet-deployment" + SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - if [ ! -f "$DEPLOYMENT_DIR/prometheus-values.yaml" ]; then - error "Prometheus values file not found: $DEPLOYMENT_DIR/prometheus-values.yaml" + if [ ! 
-f "$SCRIPT_DIR/prometheus-values.yaml" ]; then + error "Prometheus values file not found: $SCRIPT_DIR/prometheus-values.yaml" fi helm upgrade --install prometheus prometheus-community/prometheus \ --namespace $namespace \ - --values "$DEPLOYMENT_DIR/prometheus-values.yaml" \ + --values "$SCRIPT_DIR/prometheus-values.yaml" \ --wait --timeout=300s success "Prometheus deployed" @@ -206,16 +205,15 @@ deploy_grafana() { log "Deploying Grafana in namespace: $namespace" # Get the directory where this script is located - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - DEPLOYMENT_DIR="$(dirname "$SCRIPT_DIR")/aks-fleet-deployment" + SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - if [ ! -f "$DEPLOYMENT_DIR/grafana-values.yaml" ]; then - error "Grafana values file not found: $DEPLOYMENT_DIR/grafana-values.yaml" + if [ ! -f "$SCRIPT_DIR/grafana-values.yaml" ]; then + error "Grafana values file not found: $SCRIPT_DIR/grafana-values.yaml" fi helm upgrade --install grafana grafana/grafana \ --namespace $namespace \ - --values "$DEPLOYMENT_DIR/grafana-values.yaml" \ + --values "$SCRIPT_DIR/grafana-values.yaml" \ --wait --timeout=300s success "Grafana deployed" @@ -227,8 +225,7 @@ deploy_collectors() { log "Deploying OpenTelemetry collector to each member cluster..." # Get the directory where this script is located - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - DEPLOYMENT_DIR="$(dirname "$SCRIPT_DIR")/aks-fleet-deployment" + SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" # Get member clusters and primary cluster from documentdb resource MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") @@ -239,7 +236,7 @@ deploy_collectors() { kubectl --context "$cluster" wait --for=jsonpath='{.subsets[*].addresses[*].ip}' endpoints/opentelemetry-operator-webhook-service -n opentelemetry-operator-system --timeout=300s || warn "Webhook service not ready on $cluster, proceeding anyway..." 
log "Deploying OpenTelemetry Collector to cluster: $cluster" - sed "s/{{CLUSTER_NAME}}/$cluster/g" "$DEPLOYMENT_DIR/otel-collector.yaml" | kubectl --context "$cluster" apply -f - + sed "s/{{CLUSTER_NAME}}/$cluster/g" "$SCRIPT_DIR/otel-collector.yaml" | kubectl --context "$cluster" apply -f - done success "All collectors deployed" } diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml new file mode 100644 index 00000000..276a065e --- /dev/null +++ b/documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml @@ -0,0 +1,22 @@ +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: http://prometheus-server.documentdb-preview-ns.svc.cluster.local + access: proxy + isDefault: true + +adminPassword: admin123 + +service: + type: LoadBalancer + port: 3000 + +ingress: + enabled: false + +persistence: + enabled: true + size: 1Gi diff --git a/documentdb-playground/aks-fleet-deployment/otel-collector.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml similarity index 100% rename from documentdb-playground/aks-fleet-deployment/otel-collector.yaml rename to documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml diff --git a/documentdb-playground/aks-fleet-deployment/prometheus-values.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml similarity index 100% rename from documentdb-playground/aks-fleet-deployment/prometheus-values.yaml rename to documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml diff --git a/operator/src/scripts/multi-cloud-deployment/.gitignore b/operator/src/scripts/multi-cloud-deployment/.gitignore deleted file mode 100644 index 1503cc8a..00000000 --- a/operator/src/scripts/multi-cloud-deployment/.gitignore +++ /dev/null @@ -1 +0,0 @@ -certs \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/README.md b/operator/src/scripts/multi-cloud-deployment/README.md deleted file mode 100644 index dfd87b56..00000000 --- a/operator/src/scripts/multi-cloud-deployment/README.md +++ /dev/null @@ -1,608 +0,0 @@ -# Multi-Cloud DocumentDB Deployment - -This directory contains templates and scripts for deploying DocumentDB across multiple cloud providers (Azure AKS, Google GKE, and AWS EKS) with cross-cloud replication using Istio service mesh and AKS Fleet for resource propagation. 
- -## Architecture - -- **Fleet Resource**: Deployed in East US 2 (management hub for resource propagation) -- **Multi-Cloud Clusters**: - - **AKS**: Single member cluster in configurable region (default: eastus2) - - **GKE**: Cluster in us-central1-a - - **EKS**: Cluster in us-west-2 -- **Network**: - - AKS: Uses default Azure CNI - - GKE: Default GKE networking - - EKS: Default EKS networking with NLB for cross-cloud connectivity -- **Service Mesh**: Istio multi-cluster mesh for cross-cloud service discovery -- **VM Size**: Standard_DS3_v2 for AKS, e2-standard-4 for GKE, m5.large for EKS (configurable) -- **Node Count**: 1-2 nodes per cluster for cost optimization -- **Kubernetes Version**: Uses region default GA version (configurable) -- **DocumentDB**: Multi-cloud deployment with primary/replica architecture and Istio-based replication - -## Prerequisites - -- **Azure**: Azure CLI installed and logged in (`az login`) -- **GCP**: Google Cloud SDK installed and logged in (`gcloud auth login`) - - gke-gcloud-auth-plugin: `sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin` -- **AWS**: AWS CLI installed and configured (`aws configure`) - - eksctl installed for EKS cluster management -- **Kubernetes Tools**: - - kubectl installed - - kubelogin for Azure AD authentication: `az aks install-cli` - - Helm 3.x installed -- **Other Tools**: - - jq for JSON processing: `brew install jq` (macOS) or `apt-get install jq` (Linux) - - openssl for password generation -- **Permissions**: - - Azure: Contributor access to the subscription - - GCP: Container Admin, Compute Network Admin, and Service Account User roles - - AWS: Sufficient IAM permissions to create EKS clusters and IAM roles -- **Quotas**: Sufficient quota in target regions for clusters - -## Quick Start - -### Deploy Everything (One Command) - -```bash -./deploy.sh -``` - -This single script will: -1. **Deploy Infrastructure**: - - Create Azure resource group - - Deploy AKS Fleet resource - - Deploy AKS member cluster - - Deploy GKE cluster (in parallel) - - Deploy EKS cluster with EBS CSI driver and AWS Load Balancer Controller -2. **Configure Multi-Cloud Mesh**: - - Join GKE and EKS clusters to the AKS Fleet - - Install cert-manager on all clusters - - Set up Istio multi-cluster service mesh with shared root CA - - Configure cross-cloud networking with east-west gateways -3. **Deploy DocumentDB Operator**: - - Install DocumentDB operator on hub cluster - - Propagate base resources (CRDs, RBAC) to all member clusters via Fleet -4. 
**Set Up Access**: - - Configure kubectl contexts for all clusters - - Set up RBAC access for Fleet - -### Deploy DocumentDB Database - -After the infrastructure is deployed: - -```bash -# With auto-generated password -./deploy-documentdb.sh - -# With custom password -./deploy-documentdb.sh "MySecureP@ssw0rd" - -# Disable Azure DNS creation (for testing) -ENABLE_AZURE_DNS=false ./deploy-documentdb.sh -``` - -This will: -- Create cluster identification ConfigMaps on each member cluster -- Select a primary cluster (defaults to EKS cluster) -- Deploy DocumentDB with Istio-based cross-cloud replication -- Create Azure DNS zone with records for each cluster (if enabled) -- Create SRV record for MongoDB connection string -- Provide connection information and failover commands - -## Configuration - -### Infrastructure Configuration - -Edit `parameters.bicepparam` to customize AKS deployment: -- Hub cluster name (used for fleet naming) -- Hub region (fleet location) -- Member cluster name and region -- VM sizes -- Node counts -- Kubernetes version - -Or use environment variables for all clouds: - -```bash -# Azure AKS -export RESOURCE_GROUP="my-multi-cloud-rg" -export RG_LOCATION="eastus2" -export HUB_REGION="eastus2" -export AKS_CLUSTER_NAME="azure-documentdb" -export AKS_REGION="eastus2" -export HUB_VM_SIZE="Standard_D4s_v3" - -# Google GKE -export PROJECT_ID="my-gcp-project-id" -export GCP_USER="user@example.com" -export ZONE="us-central1-a" -export GKE_CLUSTER_NAME="gcp-documentdb" - -# AWS EKS -export EKS_CLUSTER_NAME="aws-documentdb" -export EKS_REGION="us-west-2" - -# DocumentDB Operator -export VERSION="200" # Operator version -export VALUES_FILE="/path/to/custom/values.yaml" # Optional Helm values - -./deploy.sh -``` - -### DocumentDB Configuration - -Edit `documentdb-cluster.yaml` to customize: -- Database size and instances -- Replication settings (primary cluster, HA mode) -- Cross-cloud networking strategy (Istio) -- Storage class per environment -- Service exposure type -- Log levels - -The template uses placeholders replaced at runtime: -- `{{DOCUMENTDB_PASSWORD}}`: The database password -- `{{PRIMARY_CLUSTER}}`: The selected primary cluster -- `{{CLUSTER_LIST}}`: YAML list of all clusters with their environments - -### Azure DNS Configuration - -```bash -export ENABLE_AZURE_DNS="true" # Enable/disable DNS creation -export AZURE_DNS_ZONE_NAME="my-documentdb-zone" # DNS zone name (default: resource group name) -export AZURE_DNS_PARENT_ZONE_RESOURCE_ID="/subscriptions/.../dnszones/parent.zone" -``` - -## Environment Variables - -The deployment scripts automatically set and export: -- `FLEET_ID`: Full resource ID of the AKS fleet -- `IDENTITY`: Your Azure AD user ID -- `DOCUMENTDB_PASSWORD`: Database password (when deploying DocumentDB) -- `RESOURCE_GROUP`: Resource group name (default: german-aks-fleet-rg) -- `PROJECT_ID`: GCP project ID (default: sanguine-office-475117-s6) -- `ZONE`: GCP zone (default: us-central1-a) -- `EKS_REGION`: AWS region (default: us-west-2) - -## kubectl Contexts - -After deployment, contexts are automatically configured for: -- `hub`: AKS Fleet hub cluster -- `azure-documentdb`: AKS member cluster (default name) -- `gcp-documentdb`: GKE cluster (default name) -- `aws-documentdb`: EKS cluster (default name) - -## Management - -### Check Deployment Status - -```bash -# Check operator status on hub -kubectl --context hub get deploy -n documentdb-operator - -# Check DocumentDB base resources propagation -kubectl --context hub get clusterresourceplacement 
documentdb-base -o wide - -# Check DocumentDB cluster resources propagation -kubectl --context hub get clusterresourceplacement documentdb-crp -o wide - -# View specific cluster -kubectl --context get documentdb,pods -n documentdb-preview-ns -``` - -### Connect to Database - -#### Via Port-Forward (for testing) - -```bash -# Connect to primary cluster -kubectl --context port-forward \ - -n documentdb-preview-ns svc/documentdb-service- 10260:10260 - -mongosh localhost:10260 -u default_user -p \ - --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates -``` - -#### Via Azure DNS (production) - -When `ENABLE_AZURE_DNS=true`, use the MongoDB SRV connection string: - -```bash -mongosh "mongodb+srv://default_user:@_mongodb._tcp../?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" -``` - -Example: -```bash -mongosh "mongodb+srv://default_user:mypassword@_mongodb._tcp.german-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" -``` - -### Failover Operations - -Failover is performed using the DocumentDB kubectl plugin: - -```bash -kubectl documentdb promote \ - --documentdb documentdb-preview \ - --namespace documentdb-preview-ns \ - --hub-context hub \ - --target-cluster \ - --cluster-context -``` - -## Fleet Management - -```bash -# Show AKS fleet details -az fleet show --name --resource-group $RESOURCE_GROUP - -# List fleet members (includes Azure members only, not cross-cloud) -az fleet member list --fleet-name --resource-group $RESOURCE_GROUP - -# Check all ClusterResourcePlacements -kubectl --context hub get clusterresourceplacement - -# View base resources placement (CRDs, RBAC) -kubectl --context hub describe clusterresourceplacement documentdb-base - -# View DocumentDB cluster placement -kubectl --context hub describe clusterresourceplacement documentdb-crp - -# Check multi-cloud fleet membership (GKE and EKS) -kubectl --context hub get membercluster -``` - -## Multi-Cloud Mesh Management - -### Verify Istio Installation - -```bash -# Check Istio components on each cluster -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get pods -n istio-system - echo -done - -# Verify east-west gateway services -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get svc -n istio-system istio-eastwestgateway - echo -done -``` - -### Verify Cross-Cloud Connectivity - -```bash -# Check remote secrets (for service discovery) -kubectl --context azure-documentdb get secrets -n istio-system | grep "istio-remote-secret" -kubectl --context gcp-documentdb get secrets -n istio-system | grep "istio-remote-secret" -kubectl --context aws-documentdb get secrets -n istio-system | grep "istio-remote-secret" - -# Verify mesh network configuration -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get namespace istio-system --show-labels - echo -done -``` - -## DocumentDB Management - -### Check Deployment Status - -```bash -# Quick status across all clusters -for c in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $c ===" - kubectl --context $c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet' - echo -done - -# Check operator status on all clusters -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - 
kubectl --context $cluster get deploy -n documentdb-operator - kubectl --context $cluster get pods -n documentdb-operator -done -``` - -### Monitor Replication - -```bash -# Watch ClusterResourcePlacement status -watch 'kubectl --context hub get clusterresourceplacement documentdb-crp -o wide' - -# Monitor all DocumentDB instances -watch 'for c in azure-documentdb gcp-documentdb aws-documentdb; do \ - echo "=== $c ==="; \ - kubectl --context $c get documentdb,pods -n documentdb-preview-ns; \ - echo; \ -done' - -# Check DocumentDB service endpoints -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get svc -n documentdb-preview-ns - echo -done -``` - -### Verify Cross-Cloud Replication - -```bash -# Check WAL replica status in Istio mesh -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get pods -n documentdb-preview-ns -l component=wal-replica - echo -done - -# Verify Istio sidecar injection -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get pods -n documentdb-preview-ns -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}' - echo -done -``` - -### Azure DNS Management - -```bash -# List DNS records for DocumentDB -az network dns record-set list \ - --zone-name \ - --resource-group $RESOURCE_GROUP \ - --output table - -# Show SRV record for MongoDB connection -az network dns record-set srv show \ - --name "_mongodb._tcp" \ - --zone-name \ - --resource-group $RESOURCE_GROUP - -# Show A/CNAME records for each cluster -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - az network dns record-set a show --name $cluster --zone-name --resource-group $RESOURCE_GROUP 2>/dev/null || \ - az network dns record-set cname show --name $cluster --zone-name --resource-group $RESOURCE_GROUP 2>/dev/null || \ - echo "Record not found" - echo -done -``` - -## RBAC Management - -The deployment script automatically assigns the "Azure Kubernetes Fleet Manager RBAC Cluster Admin" role for AKS Fleet access. To manage RBAC: - -```bash -# View current role assignment -az role assignment list --assignee $IDENTITY --scope $FLEET_ID - -# Add another user -az role assignment create --role "Azure Kubernetes Fleet Manager RBAC Cluster Admin" \ - --assignee --scope $FLEET_ID -``` - -For GCP and AWS, ensure you have appropriate IAM permissions configured via `gcloud` and `aws` CLI. 
- -## Troubleshooting - -### Authentication Issues - -**Azure AKS:** -```bash -# Get fleet credentials -az fleet get-credentials --resource-group $RESOURCE_GROUP --name - -# If web authentication is blocked, use Azure CLI -kubelogin convert-kubeconfig -l azurecli - -# Use admin credentials for member clusters -az aks get-credentials --resource-group $RESOURCE_GROUP --name --admin -``` - -**Google GKE:** -```bash -# Refresh credentials -gcloud container clusters get-credentials --zone - -# Verify authentication -gcloud auth list -gcloud config get-value account -``` - -**AWS EKS:** -```bash -# Update kubeconfig -aws eks update-kubeconfig --name --region - -# Verify IAM identity -aws sts get-caller-identity -``` - -### Resource Propagation Issues - -```bash -# Check ClusterResourcePlacement status -kubectl --context hub get clusterresourceplacement documentdb-base -o yaml -kubectl --context hub get clusterresourceplacement documentdb-crp -o yaml - -# Verify fleet members (Azure native) -az fleet member list --fleet-name --resource-group $RESOURCE_GROUP - -# Verify multi-cloud member clusters -kubectl --context hub get membercluster -kubectl --context hub describe membercluster - -# Check if resources reached target clusters -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster get documentdb -n documentdb-preview-ns - kubectl --context $cluster get pods -n documentdb-preview-ns - echo -done -``` - -### Istio Mesh Issues - -```bash -# Verify Istio installation -istioctl --context version - -# Check proxy status -istioctl --context proxy-status - -# Verify mesh configuration -istioctl --context analyze - -# Check east-west gateway connectivity -kubectl --context get svc -n istio-system istio-eastwestgateway - -# Verify remote secrets -kubectl --context get secrets -n istio-system | grep istio-remote-secret -``` - -### EKS-Specific Issues - -**EBS CSI Driver:** -```bash -# Check CSI driver status -kubectl --context aws-documentdb get pods -n kube-system -l app=ebs-csi-controller - -# Verify storage class -kubectl --context aws-documentdb get storageclass documentdb-storage -``` - -**AWS Load Balancer Controller:** -```bash -# Check controller status -kubectl --context aws-documentdb get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller - -# Verify subnet tags -VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) -aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" --query 'Subnets[].{ID:SubnetId,Tags:Tags}' --region $EKS_REGION -``` - -### DNS Issues - -```bash -# Verify DNS zone exists -az network dns zone show --name --resource-group $RESOURCE_GROUP - -# Check DNS records -az network dns record-set list --zone-name --resource-group $RESOURCE_GROUP - -# Test DNS resolution -nslookup .. -nslookup _mongodb._tcp.. 
-type=SRV -``` - -### Cross-Cloud Connectivity - -```bash -# Deploy test pod with network tools -kubectl --context azure-documentdb run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash - -# From within the pod, test connectivity to other clusters -# Using Istio service discovery -curl -v http://documentdb-service-gcp-documentdb.documentdb-preview-ns.svc.cluster.local:10260 -curl -v http://documentdb-service-aws-documentdb.documentdb-preview-ns.svc.cluster.local:10260 -``` - -### Debugging - -```bash -# Check operator logs on hub -kubectl --context hub logs -n documentdb-operator deployment/documentdb-operator --tail=100 - -# Check operator logs on member clusters -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - echo "=== $cluster ===" - kubectl --context $cluster logs -n documentdb-operator deployment/documentdb-operator --tail=50 - echo -done - -# View DocumentDB resource status -kubectl --context describe documentdb documentdb-preview -n documentdb-preview-ns - -# Check Istio sidecar logs -kubectl --context logs -n documentdb-preview-ns -c istio-proxy -``` - -## Clean Up - -```bash -# Delete DocumentDB resources from all clusters -kubectl --context hub delete clusterresourceplacement documentdb-crp -kubectl --context hub delete namespace documentdb-preview-ns - -# Wait for namespace deletion to complete on all clusters -for cluster in azure-documentdb gcp-documentdb aws-documentdb; do - kubectl --context $cluster wait --for=delete namespace/documentdb-preview-ns --timeout=60s || true -done - -# Delete base operator resources -kubectl --context hub delete clusterresourceplacement documentdb-base - -# Delete entire Azure resource group (includes AKS fleet and member) -az group delete --name $RESOURCE_GROUP --yes --no-wait - -# Delete GKE cluster -gcloud container clusters delete $GKE_CLUSTER_NAME \ - --zone $ZONE \ - --project $PROJECT_ID \ - --quiet - -# Delete EKS cluster (also deletes associated IAM roles and service accounts) -eksctl delete cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION - -# Delete Azure DNS zone (if created) -az network dns zone delete \ - --name \ - --resource-group $RESOURCE_GROUP \ - --yes - -# Clean up local kubectl contexts -kubectl config delete-context hub -kubectl config delete-context azure-documentdb -kubectl config delete-context gcp-documentdb -kubectl config delete-context aws-documentdb -``` - -## Scripts - -- **`deploy.sh`**: All-in-one multi-cloud deployment (AKS Fleet + GKE + EKS + cert-manager + Istio mesh + operator) -- **`deploy-documentdb.sh`**: Deploy multi-cloud DocumentDB with Istio-based replication and optional Azure DNS -- **`main.bicep`**: Bicep template for AKS Fleet and single member cluster -- **`parameters.bicepparam`**: Configuration parameters for AKS deployment -- **`documentdb-base.yaml`**: Fleet ClusterResourcePlacement for base resources (CRDs, RBAC, namespaces) -- **`documentdb-cluster.yaml`**: DocumentDB multi-cloud configuration template with Fleet ClusterResourcePlacement - -## Key Features - -- **Multi-Cloud Architecture**: Deploy across Azure AKS, Google GKE, and AWS EKS -- **Istio Service Mesh**: Cross-cloud service discovery and secure communication -- **Automated Mesh Setup**: Shared root CA, east-west gateways, and remote secrets -- **AKS Fleet Integration**: Resource propagation via ClusterResourcePlacement to all clouds -- **Cross-Cloud Replication**: DocumentDB replication using Istio for connectivity -- **Dynamic Discovery**: Automatically configures all clusters and generates 
failover commands -- **Azure DNS Integration**: Optional DNS zone creation with A/CNAME and SRV records for MongoDB -- **Cloud-Specific Configuration**: - - EKS: EBS CSI driver and AWS Load Balancer Controller - - GKE: Default persistent disk provisioner - - AKS: Azure Disk CSI driver -- **Parallel Deployment**: AKS, GKE, and EKS deployed concurrently for faster setup -- **Smart Defaults**: Sensible defaults with environment variable overrides - -## Additional Resources - -- [Azure AKS Fleet Documentation](https://learn.microsoft.com/en-us/azure/kubernetes-fleet/) -- [AKS Authentication Guide](https://learn.microsoft.com/en-us/azure/aks/kubelogin-authentication) -- [Fleet ClusterResourcePlacement API](https://learn.microsoft.com/en-us/azure/kubernetes-fleet/concepts-resource-propagation) -- [Istio Multi-Cluster Installation](https://istio.io/latest/docs/setup/install/multicluster/) -- [Istio Multi-Primary Multi-Network](https://istio.io/latest/docs/setup/install/multicluster/multi-primary_multi-network/) -- [Google GKE Documentation](https://cloud.google.com/kubernetes-engine/docs) -- [AWS EKS Documentation](https://docs.aws.amazon.com/eks/) -- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/) -- [eksctl Documentation](https://eksctl.io/) -- [DocumentDB Kubernetes Operator Documentation](../../README.md) \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh b/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh deleted file mode 100755 index a1a0eefd..00000000 --- a/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env bash -# filepath: /Users/geeichbe/Projects/documentdb-kubernetes-operator/scripts/aks-fleet-deployment/deploy-multi-region.sh -set -euo pipefail - -# Deploy multi-region DocumentDB using Fleet with Traffic Manager -# Usage: ./deploy-documentdb.sh [password] -# -# Environment variables: -# RESOURCE_GROUP: Azure resource group (default: german-aks-fleet-rg) -# DOCUMENTDB_PASSWORD: Database password (will be generated if not provided) -# ENABLE_AZURE_DNS: Enable Azure DNS creation (default: true) -# AZURE_DNS_ZONE_NAME: Azure DNS zone name (default: same as resource group) -# AZURE_DNS_PARENT_ZONE_RESOURCE_ID: Azure DNS parent zone resource ID (default: multi-cloud.pgmongo-dev.cosmos.windows-int.net) -# -# Examples: -# ./deploy-multi-region.sh -# ENABLE_AZURE_DNS=false ./deploy-multi-region.sh mypassword - -# Get the directory where this script is located -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Resource group -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" - -AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-azure-documentdb}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" -EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" - -# Azure DNS configuration -AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" -AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" -AZURE_DNS_ZONE_FULL_NAME="${AZURE_DNS_ZONE_FULL_NAME:-}" -AZURE_DNS_ZONE_RG="${AZURE_DNS_ZONE_RG:-${RESOURCE_GROUP}}" -ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" - -# Set password from argument or environment variable -DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" 
-DOCUMENTDB_IMAGE="${DOCUMENTDB_IMAGE:-ghcr.io/microsoft/documentdb/documentdb-local:16}" -GATEWAY_IMAGE="${GATEWAY_IMAGE:-${DOCUMENTDB_IMAGE}}" - -# If no password provided, generate a secure one -if [ -z "$DOCUMENTDB_PASSWORD" ]; then - echo "No password provided. Generating a secure password..." - DOCUMENTDB_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-25) - echo "Generated password: $DOCUMENTDB_PASSWORD" - echo "(Save this password - you'll need it to connect to the database)" - echo "" -fi - -# Export for envsubst -export DOCUMENTDB_PASSWORD - - -# Convert to array and add GCP -CLUSTER_ARRAY=("$EKS_CLUSTER_NAME" "$AKS_CLUSTER_NAME" "$GKE_CLUSTER_NAME") -echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" -for cluster in "${CLUSTER_ARRAY[@]}"; do - echo " - $cluster" -done - -PRIMARY_CLUSTER=${CLUSTER_ARRAY[0]} -echo "" -echo "Selected primary cluster: $PRIMARY_CLUSTER" - -# Build the cluster list YAML with proper indentation -CLUSTER_LIST=$(cat </dev/null; then - echo "✗ Context $cluster not found, skipping" - continue - fi - - # Create or update the cluster-name ConfigMap - kubectl --context "$cluster" create configmap cluster-name \ - -n kube-system \ - --from-literal=name="$cluster" \ - --dry-run=client -o yaml | kubectl --context "$cluster" apply -f - - - # Verify the ConfigMap was created - if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then - echo "✓ ConfigMap created/updated for $cluster" - else - echo "✗ Failed to create ConfigMap for $cluster" - fi -done - -# Step 2: Deploy DocumentDB resources via Fleet -echo "" -echo "=======================================" -echo "Deploying DocumentDB multi-region configuration..." -echo "=======================================" - -# Determine hub context -HUB_CONTEXT="${HUB_CONTEXT:-hub}" -if ! kubectl config get-contexts "$HUB_CONTEXT" &>/dev/null; then - echo "Hub context not found, trying to find first member cluster..." - HUB_CONTEXT="${CLUSTER_ARRAY[0]}" - if [ -z "$HUB_CONTEXT" ]; then - echo "Error: No suitable context found. Please ensure you have credentials for the fleet." - exit 1 - fi -fi - -echo "Using hub context: $HUB_CONTEXT" - -# Check if resources already exist -EXISTING_RESOURCES="" -if kubectl --context "$HUB_CONTEXT" get namespace documentdb-preview-ns &>/dev/null 2>&1; then - EXISTING_RESOURCES="${EXISTING_RESOURCES}namespace " -fi -if kubectl --context "$HUB_CONTEXT" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null 2>&1; then - EXISTING_RESOURCES="${EXISTING_RESOURCES}secret " -fi -if kubectl --context "$HUB_CONTEXT" get documentdb documentdb-preview -n documentdb-preview-ns &>/dev/null 2>&1; then - EXISTING_RESOURCES="${EXISTING_RESOURCES}documentdb " -fi -if kubectl --context "$HUB_CONTEXT" get clusterresourceplacement documentdb-crp &>/dev/null 2>&1; then - EXISTING_RESOURCES="${EXISTING_RESOURCES}clusterresourceplacement " -fi - -if [ -n "$EXISTING_RESOURCES" ]; then - echo "" - echo "⚠️ Warning: The following resources already exist: $EXISTING_RESOURCES" - echo "" - echo "Options:" - echo "1. Delete existing resources and redeploy ()" - echo "2. Update existing deployment (preserve data)" - echo "3. Cancel" - echo "" - read -p "Choose an option (1/2/3): " CHOICE - - case $CHOICE in - 1) - echo "Deleting existing resources..." 
- kubectl --context "$HUB_CONTEXT" delete clusterresourceplacement documentdb-crp --ignore-not-found=true - kubectl --context "$HUB_CONTEXT" delete namespace documentdb-preview-ns --ignore-not-found=true - echo "Waiting for namespace deletion to complete..." - for cluster in "${CLUSTER_ARRAY[@]}"; do - kubectl --context "$cluster" wait --for=delete namespace/documentdb-preview-ns --timeout=60s - done - ;; - 2) - echo "Updating existing deployment..." - ;; - 3) - echo "Cancelled." - exit 0 - ;; - *) - echo "Invalid choice. Cancelled." - exit 1 - ;; - esac -fi - -# Create a temporary file with substituted values -TEMP_YAML=$(mktemp) - -# Use sed for safer substitution -sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ - -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ - -e "s#{{DOCUMENTDB_IMAGE}}#$DOCUMENTDB_IMAGE#g" \ - -e "s#{{GATEWAY_IMAGE}}#$GATEWAY_IMAGE#g" \ - "$SCRIPT_DIR/documentdb-cluster.yaml" | \ -while IFS= read -r line; do - if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then - echo "$CLUSTER_LIST" - else - echo "$line" - fi -done > "$TEMP_YAML" - -# Debug: show the generated YAML section with clusterReplication -echo "" -echo "Generated configuration preview:" -echo "--------------------------------" -echo "Primary cluster: $PRIMARY_CLUSTER" -echo "Cluster list:" -echo "$CLUSTER_LIST" -echo "--------------------------------" - -# cat "$TEMP_YAML" - -# Apply the configuration -echo "" -echo "Applying DocumentDB multi-region configuration..." -kubectl --context "$HUB_CONTEXT" apply -f "$TEMP_YAML" - -# Clean up temp file -rm -f "$TEMP_YAML" - -# Check the ClusterResourcePlacement status -echo "" -echo "Checking ClusterResourcePlacement status..." -kubectl --context "$HUB_CONTEXT" get clusterresourceplacement documentdb-crp -o wide - -# Wait a bit for propagation -echo "" -echo "Waiting for resources to propagate to member clusters..." -sleep 10 - -# Step 3: Verify deployment on each member cluster -echo "" -echo "=======================================" -echo "Checking deployment status on member clusters..." -echo "=======================================" - -for cluster in "${CLUSTER_ARRAY[@]}"; do - echo "" - echo "=== $cluster ===" - - # Check if context exists - if ! 
kubectl config get-contexts "$cluster" &>/dev/null; then - echo "✗ Context not found, skipping" - continue - fi - - # Check ConfigMap - if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then - CLUSTER_ID=$(kubectl --context "$cluster" get configmap cluster-name -n kube-system -o jsonpath='{.data.name}') - echo "✓ Cluster identified as: $CLUSTER_ID" - else - echo "✗ Cluster identification ConfigMap not found" - fi - - # Check if namespace exists - if kubectl --context "$cluster" get namespace documentdb-preview-ns &>/dev/null; then - echo "✓ Namespace exists" - - # Check if secret exists - if kubectl --context "$cluster" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null; then - echo "✓ Secret exists" - else - echo "✗ Secret not found" - fi - - # Check if DocumentDB exists - if kubectl --context "$cluster" get documentdb documentdb-preview -n documentdb-preview-ns &>/dev/null; then - echo "✓ DocumentDB resource exists" - - # Get DocumentDB status - STATUS=$(kubectl --context "$cluster" get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") - echo " Status: $STATUS" - - # Check if this is the primary or replica - if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then - echo " Role: PRIMARY" - else - echo " Role: REPLICA" - fi - else - echo "✗ DocumentDB resource not found" - fi - - # Check pods - PODS=$(kubectl --context "$cluster" get pods -n documentdb-preview-ns --no-headers 2>/dev/null | wc -l || echo "0") - echo " Pods: $PODS" - - # Show pod status if any exist - if [ "$PODS" -gt 0 ]; then - kubectl --context "$cluster" get pods -n documentdb-preview-ns 2>/dev/null | head -5 - fi - else - echo "✗ Namespace not found (resources may still be propagating)" - fi -done - -# Step 4: Create Azure DNS zone for DocumentDB -if [ "$ENABLE_AZURE_DNS" = "true" ]; then - echo "" - echo "=======================================" - echo "Creating Azure DNS zone for DocumentDB..." - echo "=======================================" - - if [ -n "$AZURE_DNS_ZONE_FULL_NAME" ]; then - fullName="$AZURE_DNS_ZONE_FULL_NAME" - else - parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") - fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" - - # Create Azure DNS zone - if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$AZURE_DNS_ZONE_RG" &>/dev/null; then - echo "Azure DNS zone already exists, updating..." - else - az network dns zone create \ - --name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" - fi - fi - - # Wait for DocumentDB services to be ready and create endpoints - echo "" - echo "Waiting for DocumentDB services to be ready..." 
- sleep 30 - - # Create DNS records for each cluster - for cluster in "${CLUSTER_ARRAY[@]}"; do - echo "Creating DNS record: $cluster" - - # Create service name by concatenating documentdb-preview with cluster name (max 63 chars) - SERVICE_NAME="documentdb-service-${cluster}" - SERVICE_NAME="${SERVICE_NAME:0:63}" - - # Get the external IP of the DocumentDB service - EXTERNAL_IP="" - for attempt in {1..12}; do # Try for 2 minutes - EXTERNAL_IP=$(kubectl --context "$cluster" get svc "$SERVICE_NAME" -n documentdb-preview-ns -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "") - if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then - break - fi - EXTERNAL_HOSTNAME=$(kubectl --context "$cluster" get svc "$SERVICE_NAME" -n documentdb-preview-ns -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "") - if [ -n "$EXTERNAL_HOSTNAME" ] && [ "$EXTERNAL_HOSTNAME" != "" ]; then - break - fi - echo " Waiting for external IP for $cluster (service: $SERVICE_NAME, attempt $attempt/12)..." - sleep 10 - done - - if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then - echo " External IP for $cluster: $EXTERNAL_IP" - - # TODO Delete existing DNS record if it exists - az network dns record-set a delete \ - --name "$cluster" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --yes - - # Create DNS record - az network dns record-set a create \ - --name "$cluster" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --ttl 5 - az network dns record-set a add-record \ - --record-set-name "$cluster" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --ipv4-address "$EXTERNAL_IP" \ - --ttl 5 - - echo " ✓ Created DNS record $cluster" - elif [ -n "$EXTERNAL_HOSTNAME" ] && [ "$EXTERNAL_HOSTNAME" != "" ]; then - echo " External hostname for $cluster: $EXTERNAL_HOSTNAME" - - # TODO Delete existing DNS record if it exists - az network dns record-set cname delete \ - --name "$cluster" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --yes - - # Create DNS record - az network dns record-set cname create \ - --name "$cluster" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --ttl 5 - az network dns record-set cname set-record \ - --record-set-name "$cluster" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --cname "$EXTERNAL_HOSTNAME" \ - --ttl 5 - - echo " ✓ Created DNS record $cluster" - else - echo " ✗ Failed to get external IP for $cluster" - fi - done - - az network dns record-set srv delete \ - --name "_mongodb._tcp" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --yes - - az network dns record-set srv create \ - --name "_mongodb._tcp" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --ttl 5 - - mongoFQDN=$(az network dns record-set srv add-record \ - --record-set-name "_mongodb._tcp" \ - --zone-name "$fullName" \ - --resource-group "$AZURE_DNS_ZONE_RG" \ - --priority 0 \ - --weight 0 \ - --port 10260 \ - --target "$PRIMARY_CLUSTER.$fullName" | jq -r ".fqdn") - - echo "" - echo "✓ DNS zone created successfully!" 
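# (Editor's note: illustrative sanity check, not part of the original script; it assumes
# dig and mongosh are installed locally and that the new zone has already propagated.)
# Resolve the freshly created SRV record and ping the primary through the gateway port:
dig +short "_mongodb._tcp.${fullName}" SRV
mongosh "mongodb+srv://docdb:${DOCUMENTDB_PASSWORD}@${fullName}/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" \
  --eval "db.runCommand({ ping: 1 })" || echo "MongoDB ping failed (DNS may still be propagating)"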
- echo " Zone Name: $fullName" - echo " MongoDB FQDN: $mongoFQDN" -fi - -echo "" -echo "Connection Information:" -echo " Username: docdb" -echo " Password: $DOCUMENTDB_PASSWORD" -echo "" -echo "To monitor the deployment:" -echo "watch 'kubectl --context $HUB_CONTEXT get clusterresourceplacement documentdb-crp -o wide'" -echo "" -echo "To check DocumentDB status across all clusters:" -# Create a space-separated string from the array -CLUSTER_STRING=$(IFS=' '; echo "${CLUSTER_ARRAY[*]}") -echo "for c in $CLUSTER_STRING; do echo \"=== \$c ===\"; kubectl --context \$c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet'; echo; done" \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh b/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh deleted file mode 100644 index 1b903152..00000000 --- a/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash - -PROJECT_ID="${PROJECT_ID:-gke-documentdb-demo}" -GKE_USER="${GKE_USER:-alexanderlaye57@gmail.com}" -CLUSTER_NAME="${CLUSTER_NAME:-gcp-documentdb}" -ZONE="${ZONE:-us-central1-a}" - -# one time -#gcloud projects create $PROJECT_ID -#sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin - -gcloud config set project $PROJECT_ID -gcloud config set account $USER -gcloud auth login --brief - -gcloud services enable container.googleapis.com -gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/container.admin" -gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/compute.networkAdmin" -gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/iam.serviceAccountUser" - -gcloud container clusters create "$CLUSTER_NAME" \ - --zone "$ZONE" \ - --num-nodes "2" \ - --machine-type "e2-standard-4" \ - --enable-ip-access \ - --project $PROJECT_ID - -gcloud container clusters get-credentials "$CLUSTER_NAME" \ - --location="$ZONE" -kubectl config rename-context "$(kubectl config current-context)" $CLUSTER_NAME - -helm repo add jetstack https://charts.jetstack.io -helm repo update -helm install cert-manager jetstack/cert-manager \ - --namespace cert-manager \ - --create-namespace \ - --version v1.13.2 \ - --set installCRDs=true \ - --set prometheus.enabled=false \ - --set webhook.timeoutSeconds=30 - - -cat < /dev/null; then - echo "ERROR: Azure CLI not found. Please install Azure CLI first." >&2 - exit 1 - fi - - # Check kubectl - if ! command -v kubectl &> /dev/null; then - echo "ERROR: kubectl not found. Please install kubectl first." >&2 - exit 1 - fi - - # Check Helm - if ! command -v helm &> /dev/null; then - echo "ERROR: Helm not found. Please install Helm first." >&2 - exit 1 - fi - - # Check gcloud CLI - if ! command -v gcloud &> /dev/null; then - echo "ERROR: gcloud CLI not found. Please install Google Cloud SDK first." >&2 - exit 1 - fi - - # Check AWS CLI - if ! command -v aws &> /dev/null; then - echo "ERROR: AWS CLI not found. Please install AWS CLI first." >&2 - exit 1 - fi - - # Check eksctl - if ! command -v eksctl &> /dev/null; then - echo "ERROR: eksctl not found. Please install eksctl first." >&2 - exit 1 - fi - - # Check jq - if ! command -v jq &> /dev/null; then - echo "ERROR: jq not found. Please install jq first." >&2 - exit 1 - fi - - # Check Azure login - if ! az account show &> /dev/null; then - echo "ERROR: Not logged into Azure. Please run 'az login' first." 
>&2 - exit 1 - fi - - # Check gcloud login - gcloud config set account $GCP_USER - if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null | grep -q .; then - echo "ERROR: Not logged into Google Cloud. Please run 'gcloud auth login' first." >&2 - exit 1 - fi - - # Check AWS credentials - if ! aws sts get-caller-identity &> /dev/null; then - echo "ERROR: AWS credentials not configured. Please run 'aws configure' first." >&2 - exit 1 - fi - - echo "✅ All prerequisites met" -} - -wait_for_no_inprogress() { - local rg="$1" - echo "Checking for in-progress AKS operations in resource group '$rg'..." - local inprogress - inprogress=$(az aks list -g "$rg" -o json \ - | jq -r '.[] | select(.provisioningState != "Succeeded" and .provisioningState != null) | [.name, .provisioningState] | @tsv') - - if [ -z "$inprogress" ]; then - echo "No in-progress AKS operations detected." - return 0 - fi - - echo "Found clusters still provisioning:" - echo "$inprogress" | while IFS=$'\t' read -r name state; do echo " - $name: $state"; done - echo "Please re-run this script after the above operations complete." >&2 - return 1 -} - -# ============================================================================ -# Step 1: Deploy AKS Fleet Infrastructure -# ============================================================================ - -aks_fleet_deploy() { - echo "Creating or using resource group..." - EXISTING_RG_LOCATION=$(az group show --name "$RESOURCE_GROUP" --query location -o tsv 2>/dev/null || true) - if [ -n "$EXISTING_RG_LOCATION" ]; then - echo "Using existing resource group '$RESOURCE_GROUP' in location '$EXISTING_RG_LOCATION'" - RG_LOCATION="$EXISTING_RG_LOCATION" - else - az group create --name "$RESOURCE_GROUP" --location "$RG_LOCATION" - fi - - echo "Deploying AKS Fleet with Bicep..." - if ! wait_for_no_inprogress "$RESOURCE_GROUP"; then - echo "Exiting without changes due to in-progress operations." >&2 - exit 1 - fi - - PARAMS=( - --parameters "$TEMPLATE_DIR/parameters.bicepparam" - --parameters hubRegion="$HUB_REGION" - --parameters memberRegion="$AKS_REGION" - --parameters memberName="$AKS_CLUSTER_NAME" - ) - - if [ -n "$HUB_VM_SIZE" ]; then - echo "Overriding hubVmSize with: $HUB_VM_SIZE" - PARAMS+=( --parameters hubVmSize="$HUB_VM_SIZE" ) - fi - - DEPLOYMENT_NAME="aks-fleet-$(date +%s)" - az deployment group create \ - --name "$DEPLOYMENT_NAME" \ - --resource-group $RESOURCE_GROUP \ - --template-file "$TEMPLATE_DIR/main.bicep" \ - "${PARAMS[@]}" >/dev/null - - # Retrieve outputs - DEPLOYMENT_OUTPUT=$(az deployment group show \ - --resource-group $RESOURCE_GROUP \ - --name "$DEPLOYMENT_NAME" \ - --query "properties.outputs" -o json) - - FLEET_NAME=$(echo $DEPLOYMENT_OUTPUT | jq -r '.fleetName.value') - FLEET_ID_FROM_OUTPUT=$(echo $DEPLOYMENT_OUTPUT | jq -r '.fleetId.value') - AKS_CLUSTER_NAME=$(echo $DEPLOYMENT_OUTPUT | jq -r '.memberClusterName.value') - - SUBSCRIPTION_ID=$(az account show --query id -o tsv) - export FLEET_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.ContainerService/fleets/${FLEET_NAME}" - - # Set up RBAC - echo "Setting up RBAC access for Fleet..." - export IDENTITY=$(az ad signed-in-user show --query "id" --output tsv) - export ROLE="Azure Kubernetes Fleet Manager RBAC Cluster Admin" - echo "Assigning role '$ROLE' to user '$IDENTITY'..." 
- az role assignment create --role "${ROLE}" --assignee ${IDENTITY} --scope ${FLEET_ID} >/dev/null 2>&1 || true - - # Fetch kubeconfig contexts - echo "Fetching kubeconfig contexts..." - az fleet get-credentials --resource-group "$RESOURCE_GROUP" --name "$FLEET_NAME" --overwrite-existing - - az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" --overwrite-existing -} - -# ============================================================================ -# Step 1.2: Deploy GKE Infrastructure -# ============================================================================ - -# TODO move this to a check at the top -# sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin - -# Create project if it doesn't exist -gke_deploy() { - if ! gcloud projects describe $PROJECT_ID &>/dev/null; then - gcloud projects create $PROJECT_ID - fi - - gcloud config set project $PROJECT_ID - - gcloud services enable container.googleapis.com - gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/container.admin" - gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/compute.networkAdmin" - gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/iam.serviceAccountUser" - - # Delete cluster if it exists - if gcloud container clusters describe "$GKE_CLUSTER_NAME" --zone "$ZONE" --project $PROJECT_ID &>/dev/null; then - gcloud container clusters delete "$GKE_CLUSTER_NAME" \ - --zone "$ZONE" \ - --project $PROJECT_ID \ - --quiet - fi - - gcloud container clusters create "$GKE_CLUSTER_NAME" \ - --zone "$ZONE" \ - --num-nodes "2" \ - --machine-type "e2-standard-4" \ - --enable-ip-access \ - --project $PROJECT_ID - - kubectl config delete-context "$GKE_CLUSTER_NAME" || true - kubectl config delete-cluster "$GKE_CLUSTER_NAME" || true - kubectl config delete-user "$GKE_CLUSTER_NAME" || true - gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" \ - --location="$ZONE" - fullName="gke_${PROJECT_ID}_${ZONE}_${GKE_CLUSTER_NAME}" - # Replace all occurrences of the generated name with GKE_CLUSTER_NAME in kubeconfig - sed -i "s|$fullName|$GKE_CLUSTER_NAME|g" ~/.kube/config -} - - -# ============================================================================ -# Step 1.3: Deploy EKS Infrastructure -# ============================================================================ - -eks_deploy() { - NODE_TYPE="m5.large" - - if eksctl get cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION &> /dev/null; then - echo "Cluster $EKS_CLUSTER_NAME already exists." - else - eksctl create cluster \ - --name $EKS_CLUSTER_NAME \ - --region $EKS_REGION \ - --node-type $NODE_TYPE \ - --nodes 2 \ - --nodes-min 2 \ - --nodes-max 2 \ - --managed \ - --with-oidc - fi - - eksctl create iamserviceaccount \ - --cluster $EKS_CLUSTER_NAME \ - --namespace kube-system \ - --name ebs-csi-controller-sa \ - --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy \ - --override-existing-serviceaccounts \ - --approve \ - --region $EKS_REGION - - # Install EBS CSI driver addon - eksctl create addon \ - --name aws-ebs-csi-driver \ - --cluster $EKS_CLUSTER_NAME \ - --region $EKS_REGION \ - --force - - # Wait for EBS CSI driver to be ready - echo "Waiting for EBS CSI driver to be ready..." 
- sleep 5 - kubectl wait --for=condition=ready pod -l app=ebs-csi-controller -n kube-system --timeout=300s || echo "EBS CSI driver pods may still be starting" - - echo "Installing AWS Load Balancer Controller..." - - # Check if already installed - if helm list -n kube-system | grep -q aws-load-balancer-controller; then - echo "AWS Load Balancer Controller already installed. Skipping installation." - else - # Get VPC ID for the cluster - VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) - echo "Using VPC ID: $VPC_ID" - - # Verify subnet tags for Load Balancer Controller - echo "Verifying subnet tags for Load Balancer Controller..." - PUBLIC_SUBNETS=$(aws ec2 describe-subnets \ - --filters "Name=vpc-id,Values=$VPC_ID" "Name=map-public-ip-on-launch,Values=true" \ - --query 'Subnets[].SubnetId' --output text --region $EKS_REGION) - - PRIVATE_SUBNETS=$(aws ec2 describe-subnets \ - --filters "Name=vpc-id,Values=$VPC_ID" "Name=map-public-ip-on-launch,Values=false" \ - --query 'Subnets[].SubnetId' --output text --region $EKS_REGION) - - # Tag public subnets for internet-facing load balancers - if [ -n "$PUBLIC_SUBNETS" ]; then - echo "Tagging public subnets for internet-facing load balancers..." - for subnet in $PUBLIC_SUBNETS; do - aws ec2 create-tags --resources "$subnet" --tags Key=kubernetes.io/role/elb,Value=1 --region $EKS_REGION 2>/dev/null || true - echo "Tagged public subnet: $subnet" - done - fi - - # Tag private subnets for internal load balancers - if [ -n "$PRIVATE_SUBNETS" ]; then - echo "Tagging private subnets for internal load balancers..." - for subnet in $PRIVATE_SUBNETS; do - aws ec2 create-tags --resources "$subnet" --tags Key=kubernetes.io/role/internal-elb,Value=1 --region $EKS_REGION 2>/dev/null || true - echo "Tagged private subnet: $subnet" - done - fi - - # Download the official IAM policy (latest version) - echo "Downloading AWS Load Balancer Controller IAM policy (latest version)..." - curl -o /tmp/iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json - - # Get account ID - ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) - - # Check if policy exists and create/update as needed - if aws iam get-policy --policy-arn arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy &>/dev/null; then - echo "IAM policy already exists, updating to latest version..." - # Delete and recreate to ensure we have the latest version - aws iam delete-policy --policy-arn arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy 2>/dev/null || true - sleep 5 # Wait for deletion to propagate - fi - - # Create IAM policy with latest permissions - echo "Creating IAM policy with latest permissions..." - aws iam create-policy \ - --policy-name AWSLoadBalancerControllerIAMPolicy \ - --policy-document file:///tmp/iam_policy.json 2>/dev/null || \ - echo "IAM policy already exists or was just created" - # Wait a moment for policy to be available - sleep 5 - - # Create IAM service account with proper permissions using eksctl - echo "Creating IAM service account with proper permissions..." 
- eksctl create iamserviceaccount \ - --cluster=$EKS_CLUSTER_NAME \ - --namespace=kube-system \ - --name=aws-load-balancer-controller \ - --role-name "AmazonEKSLoadBalancerControllerRole-$EKS_CLUSTER_NAME" \ - --attach-policy-arn=arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy \ - --approve \ - --override-existing-serviceaccounts \ - --region=$EKS_REGION - - # Add EKS Helm repository - helm repo add eks https://aws.github.io/eks-charts - helm repo update eks - - # Install Load Balancer Controller using the existing service account - helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ - -n kube-system \ - --set clusterName=$EKS_CLUSTER_NAME \ - --set serviceAccount.create=false \ - --set serviceAccount.name=aws-load-balancer-controller \ - --set region=$EKS_REGION \ - --set vpcId=$VPC_ID - - # Wait for Load Balancer Controller to be ready - echo "Waiting for Load Balancer Controller to be ready..." - sleep 5 - kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=aws-load-balancer-controller -n kube-system --timeout=300s || echo "Load Balancer Controller pods may still be starting" - - # Clean up temp file - rm -f /tmp/iam_policy.json - - echo "AWS Load Balancer Controller installed" - fi - - if kubectl get storageclass documentdb-storage &> /dev/null; then - echo "DocumentDB storage class already exists. Skipping creation." - else - kubectl apply -f - </dev/null || true -helm repo update >/dev/null 2>&1 - -for cluster in ${MEMBER_CLUSTER_NAMES[@]}; do - echo "Installing cert-manager on $cluster..." - kubectl config use-context "$cluster" 2>/dev/null - helm upgrade --install cert-manager jetstack/cert-manager \ - --namespace cert-manager \ - --create-namespace \ - --set installCRDs=true \ - --wait --timeout=5m >/dev/null 2>&1 || echo " Warning: cert-manager installation issue on $cluster" - echo "✓ cert-manager installed on $cluster" -done - -echo "✅ cert-manager installed on all clusters" - -# ============================================================================ -# Step 5: Install Istio and setup mesh -# ============================================================================ - -# Create an issuer in istio-system namespace on hub -temp_dir=$(mktemp -d) -echo "Temporary directory created at: $temp_dir" - -# Check if istioctl is installed, if not install it to temp_dir -if ! command -v istioctl &> /dev/null; then - echo "istioctl not found, installing to $temp_dir..." 
- ISTIO_VERSION="1.24.0" - curl -L https://istio.io/downloadIstio | ISTIO_VERSION=$ISTIO_VERSION TARGET_ARCH=x86_64 sh - -d "$temp_dir" >/dev/null 2>&1 - export PATH="$temp_dir/istio-$ISTIO_VERSION/bin:$PATH" - echo "✓ istioctl installed to $temp_dir/istio-$ISTIO_VERSION/bin" -else - echo "✓ istioctl already installed: $(which istioctl)" -fi - -if [ -z "$ISTIO_DIR" ]; then - git clone https://github.com/istio/istio.git "$temp_dir/istio" - export ISTIO_DIR="$temp_dir/istio" -fi -rm -rf "$TEMPLATE_DIR/certs" -mkdir $TEMPLATE_DIR/certs -pushd $TEMPLATE_DIR/certs -make -f "$ISTIO_DIR/tools/certs/Makefile.selfsigned.mk" root-ca -index=1 -for cluster in ${MEMBER_CLUSTER_NAMES[@]}; do - make -f "$ISTIO_DIR/tools/certs/Makefile.selfsigned.mk" "${cluster}-cacerts" - kubectl --context "$cluster" delete namespace/istio-system --wait=true --ignore-not-found=true - kubectl --context "$cluster" create namespace istio-system - kubectl --context "$cluster" wait --for=jsonpath='{.status.phase}'=Active namespace/istio-system --timeout=60s - # create certs - kubectl --context "$cluster" create secret generic cacerts -n istio-system \ - --from-file="${cluster}/ca-cert.pem" \ - --from-file="${cluster}/ca-key.pem" \ - --from-file="${cluster}/root-cert.pem" \ - --from-file="${cluster}/cert-chain.pem" - - kubectl --context="${cluster}" label namespace istio-system topology.istio.io/network=network${index} - - #install istio on each cluster - cat < $remoteSecretFile - for other_cluster in ${MEMBER_CLUSTER_NAMES[@]}; do - if [ "$cluster" = "$other_cluster" ]; then - continue - fi - kubectl apply -f $remoteSecretFile --context="${other_cluster}" - done -done - -popd - -# 5.1 add lb tags to istio ew gateway on aws -kubectl --context "$EKS_CLUSTER_NAME" -n istio-system annotate service istio-eastwestgateway \ - service.beta.kubernetes.io/aws-load-balancer-type="nlb" \ - service.beta.kubernetes.io/aws-load-balancer-scheme="internet-facing" \ - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled="true" \ - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type="ip" - -# ============================================================================ -# Step 6: Install DocumentDB Operator -# ============================================================================ - -CHART_DIR="$(cd "$TEMPLATE_DIR/../../.." && pwd)/documentdb-helm-chart" -CHART_PKG="$TEMPLATE_DIR/documentdb-operator-0.0.${VERSION}.tgz" - -# Apply cert-manager CRDs on hub -echo "Applying cert-manager CRDs on hub ($HUB_CONTEXT)..." 
-kubectl --context "$HUB_CONTEXT" apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.crds.yaml #>/dev/null 2>&1 - -# Create documentdb-operator namespace with Istio injection on hub -cat </dev/null || echo "0") - DESIRED=$(kubectl --context "$cluster" get deploy documentdb-operator -n documentdb-operator -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") - echo " $cluster: $READY/$DESIRED replicas ready" -done - -# ============================================================================ -# Save environment variables and aliases -# ============================================================================ diff --git a/operator/src/scripts/multi-cloud-deployment/dns_failover.sh b/operator/src/scripts/multi-cloud-deployment/dns_failover.sh deleted file mode 100755 index 21608b8a..00000000 --- a/operator/src/scripts/multi-cloud-deployment/dns_failover.sh +++ /dev/null @@ -1,55 +0,0 @@ -#/bin/bash - -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" -DOCUMENTDB_NAME="${DOCUMENTDB_NAME:-documentdb-preview}" -DOCUMENTDB_NAMESPACE="${DOCUMENTDB_NAMESPACE:-documentdb-preview-ns}" -HUB_CONTEXT="${HUB_CONTEXT:-hub}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" - -MEMBER_CLUSTERS=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.clusterList[].name") -PRIMARY_CLUSTER=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.primary") -TARGET_CLUSTER=$1 - -# Convert to array -CLUSTER_ARRAY=($MEMBER_CLUSTERS) -echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" -for cluster in "${CLUSTER_ARRAY[@]}"; do - echo " - $cluster" - if [ "$cluster" == "$PRIMARY_CLUSTER" ]; then - echo " (current primary)" - elif [ "$cluster" == "$TARGET_CLUSTER" ]; then - echo " (target primary)" - fi -done - - -dnsName=$(az network dns zone list --resource-group $RESOURCE_GROUP --query="[0].name" -o tsv) - -#delete old srv record -az network dns record-set srv remove-record \ - --record-set-name "_mongodb._tcp" \ - --zone-name "$dnsName" \ - --resource-group "$RESOURCE_GROUP" \ - --priority 0 \ - --weight 0 \ - --port 10260 \ - --target "$PRIMARY_CLUSTER.$dnsName" \ - --keep-empty-record-set - -#create new one -az network dns record-set srv add-record \ - --record-set-name "_mongodb._tcp" \ - --zone-name "$dnsName" \ - --resource-group "$RESOURCE_GROUP" \ - --priority 0 \ - --weight 0 \ - --port 10260 \ - --target "$TARGET_CLUSTER.$dnsName" - -echo "To initiate failover to $TARGET_CLUSTER run:" -echo "kubectl documentdb promote \\" -echo " --documentdb documentdb-preview \\" -echo " --namespace documentdb-preview-ns \\" -echo " --hub-context $HUB_CONTEXT \\" -echo " --target-cluster $TARGET_CLUSTER \\" -echo " --cluster-context $TARGET_CLUSTER" diff --git a/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml b/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml deleted file mode 100644 index c33dacb6..00000000 --- a/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml +++ /dev/null @@ -1,99 +0,0 @@ -apiVersion: placement.kubernetes-fleet.io/v1beta1 -kind: ClusterResourcePlacement -metadata: - name: documentdb-base -spec: - resourceSelectors: - - group: "" - version: v1 - kind: Namespace - name: documentdb-operator - - group: "" - version: v1 - kind: Namespace - name: cnpg-system - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: 
documentdbs.db.microsoft.com - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: publications.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: poolers.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: clusterimagecatalogs.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: imagecatalogs.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: scheduledbackups.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: backups.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: subscriptions.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: databases.postgresql.cnpg.io - - group: "apiextensions.k8s.io" - version: v1 - kind: CustomResourceDefinition - name: clusters.postgresql.cnpg.io - # RBAC roles and bindings - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRole - name: documentdb-operator-cluster-role - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRole - name: documentdb-operator-cloudnative-pg - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRole - name: documentdb-operator-cloudnative-pg-edit - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRole - name: documentdb-operator-cloudnative-pg-view - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRoleBinding - name: documentdb-operator-cluster-rolebinding - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRoleBinding - name: documentdb-operator-cloudnative-pg - - group: "admissionregistration.k8s.io" - version: v1 - kind: MutatingWebhookConfiguration - name: cnpg-mutating-webhook-configuration - - group: "admissionregistration.k8s.io" - version: v1 - kind: ValidatingWebhookConfiguration - name: cnpg-validating-webhook-configuration - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRole - name: wal-replica-manager - - group: "rbac.authorization.k8s.io" - version: v1 - kind: ClusterRoleBinding - name: wal-replica-manager-binding - policy: - placementType: PickAll - strategy: - type: RollingUpdate \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml b/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml deleted file mode 100644 index 389c6991..00000000 --- a/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Namespace definition -apiVersion: v1 -kind: Namespace -metadata: - name: documentdb-preview-ns - labels: - istio-injection: enabled - ---- - -apiVersion: v1 -kind: Secret -metadata: - name: documentdb-credentials - namespace: documentdb-preview-ns -type: Opaque -stringData: - username: docdb - password: {{DOCUMENTDB_PASSWORD}} - ---- - -apiVersion: db.microsoft.com/preview -kind: DocumentDB -metadata: - name: documentdb-preview - namespace: documentdb-preview-ns -spec: - nodeCount: 1 - instancesPerNode: 1 - documentDBImage: {{DOCUMENTDB_IMAGE}} - gatewayImage: {{GATEWAY_IMAGE}} - resource: - storage: - pvcSize: 10Gi - clusterReplication: - highAvailability: true - primary: {{PRIMARY_CLUSTER}} - crossCloudNetworkingStrategy: Istio - clusterList: -{{CLUSTER_LIST}} - 
exposeViaService: - serviceType: LoadBalancer - logLevel: info - ---- - -apiVersion: placement.kubernetes-fleet.io/v1beta1 -kind: ClusterResourcePlacement -metadata: - name: documentdb-crp -spec: - resourceSelectors: - - group: "" - version: v1 - kind: Namespace - name: documentdb-preview-ns - policy: - placementType: PickAll - strategy: - type: RollingUpdate \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/insert_test.py b/operator/src/scripts/multi-cloud-deployment/insert_test.py deleted file mode 100644 index 9434868b..00000000 --- a/operator/src/scripts/multi-cloud-deployment/insert_test.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import time -from pymongo import MongoClient, errors -from datetime import datetime - -if len(sys.argv) != 2: - print(f"Usage: python insert_test.py ") - sys.exit(1) - -connection_string = sys.argv[1] - -client = MongoClient(connection_string) - -db = client.testdb -collection = db.testcollection - -print(f"{'Inserted Document':<30} {'Insert Count':<15}") -print("-" * 77) -start_time = time.time() -end_time = start_time + (60 * 60) # 60 minutes -count = 0 -first_error_seen = False - -while time.time() < end_time: - write_result = "" - try: - doc = { - "count": count, - "message": f"Insert operation {count}" - } - result = collection.insert_one(doc) - write_result = result.inserted_id - count += 1 - print(f"{str(write_result):<30} {count:<15}") - except Exception as e: - if not first_error_seen: - print("Switching cloud to Azure") - first_error_seen = True - - time.sleep(.25) - -print(f"Completed {count} insert operations in 10 minutes") -final_read_count = collection.count_documents({}) -print(f"Final read count: {final_read_count}") -client.close() diff --git a/operator/src/scripts/multi-cloud-deployment/main.bicep b/operator/src/scripts/multi-cloud-deployment/main.bicep deleted file mode 100644 index da8934b0..00000000 --- a/operator/src/scripts/multi-cloud-deployment/main.bicep +++ /dev/null @@ -1,74 +0,0 @@ -targetScope = 'resourceGroup' - -@description('Name of the Fleet Hub AKS cluster') -param hubClusterName string = 'aks-fleet-hub' - -@description('Location for the Fleet Hub') -param hubRegion string = 'eastus2' - -@description('Name for member cluster') -param memberName string = 'aks-fleet-member' - -@description('Location for member cluster') -param memberRegion string = 'eastus2' - -@description('Kubernetes version. Leave empty to use the region default GA version.') -param kubernetesVersion string = '' - -@description('VM size for cluster nodes') -param hubVmSize string = 'Standard_DS3_v2' - -@description('Number of nodes per cluster') -param nodeCount int = 1 - -var fleetName = '${hubClusterName}-fleet' - -// Optionally include kubernetesVersion in cluster properties -var maybeK8sVersion = empty(kubernetesVersion) ? 
{} : { kubernetesVersion: kubernetesVersion } - -// Fleet resource -resource fleet 'Microsoft.ContainerService/fleets@2025-03-01' = { - name: fleetName - location: hubRegion - properties: { - hubProfile: { - dnsPrefix: fleetName - } - } -} - -// Member AKS Cluster (using default Azure CNI without custom VNets) -resource memberCluster 'Microsoft.ContainerService/managedClusters@2023-10-01' = { - name: memberName - location: memberRegion - identity: { - type: 'SystemAssigned' - } - properties: union({ - dnsPrefix: 'member-${memberRegion}-dns' - agentPoolProfiles: [ - { - name: 'agentpool' - count: nodeCount - vmSize: hubVmSize - mode: 'System' - osType: 'Linux' - } - ] - }, maybeK8sVersion) -} - -// Member clusters fleet membership -resource memberFleetMembers 'Microsoft.ContainerService/fleets/members@2023-10-15' = { - name: memberName - parent: fleet - properties: { - clusterResourceId: memberCluster.id - } -} - -// Outputs -output fleetId string = fleet.id -output fleetName string = fleet.name -output memberClusterId string = memberCluster.id -output memberClusterName string = memberCluster.name diff --git a/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam b/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam deleted file mode 100644 index c58e7310..00000000 --- a/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam +++ /dev/null @@ -1,8 +0,0 @@ -using './main.bicep' - -param hubClusterName = 'aks-fleet-hub' -param hubRegion = 'eastus2' -param memberRegion = 'eastus2' -param kubernetesVersion = '' -param nodeCount = 1 -param hubVmSize = 'Standard_DS3_v2' diff --git a/operator/src/scripts/multi-cloud-deployment/read_test.py b/operator/src/scripts/multi-cloud-deployment/read_test.py deleted file mode 100644 index 20b6eea8..00000000 --- a/operator/src/scripts/multi-cloud-deployment/read_test.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import time -from pymongo import MongoClient, errors -from datetime import datetime - -if len(sys.argv) != 2: - print("Usage: python insert_test.py ") - sys.exit(1) - -connection_string = sys.argv[1] - -client = MongoClient(connection_string) - -db = client.testdb -collection = db.testcollection - -# Perform single insert operation -print(f"Performing initial insert operation...") -print(f"Using: {connection_string.split('@')[1] if '@' in connection_string else 'local'}") -print() -print(f"Starting read operations for 10 minutes...") -print(f"{'Timestamp':<20} {'Read Count':<15} {'Status':<20}") -print("-" * 80) - -start_time = time.time() -end_time = start_time + (10 * 60) # 10 minutes -read_count = 0 -error_count = 0 - -while time.time() < end_time: - timestamp = datetime.now().strftime("%H:%M:%S") - try: - count = collection.count_documents({}) - read_count += 1 - print(f"{timestamp:<20} {count:<15} {'Success':<20}") - except Exception as e: - error_count += 1 - print(f"{timestamp:<20} {'N/A':<15} {'ERROR':<20}") - print(f" Exception Type: {type(e).__name__}") - print(f" Exception Message: {str(e)}") - if hasattr(e, 'details'): - print(f" Details: {e.details}") - if hasattr(e, '__cause__'): - print(f" Cause: {e.__cause__}") - print() - - time.sleep(1) - -print() -print(f"Completed {read_count} successful read operations in 10 minutes") -print(f"Total errors: {error_count}") -try: - final_count = collection.count_documents({}) - print(f"Final document count: {final_count}") -except Exception as e: - print(f"ERROR reading final count:") - print(f" Exception Type: {type(e).__name__}") - 
print(f" Exception Message: {str(e)}") -client.close() From 695d0859329f071efbe600f1d19987a3c60e6966 Mon Sep 17 00:00:00 2001 From: Alexander Laye Date: Wed, 14 Jan 2026 11:01:27 -0500 Subject: [PATCH 3/5] typo --- .../aks-fleet-deployment/deploy-multi-region.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh index 5c54cb94..73c5640e 100755 --- a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh +++ b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh @@ -92,7 +92,7 @@ echo "=======================================" for cluster in "${CLUSTER_ARRAY[@]}"; do echo "" - echo "Processing ConfigMaps for $cluster..." + echo "Processing ConfigMap for $cluster..." # Check if context exists if ! kubectl config get-contexts "$cluster" &>/dev/null; then @@ -110,7 +110,7 @@ for cluster in "${CLUSTER_ARRAY[@]}"; do if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then echo "✓ ConfigMap created/updated for $cluster" else - echo "✗ Failed to create ConfigMap cluster-name for $cluster" + echo "✗ Failed to create ConfigMap for $cluster" fi done From c29bc2d0b627a4d8a058c3395d1ca664a2a5ebcd Mon Sep 17 00:00:00 2001 From: Alexander Laye Date: Fri, 30 Jan 2026 11:30:40 -0500 Subject: [PATCH 4/5] newlines and dead code --- .../deploy-multi-region.sh | 2 +- .../deploy-documentdb.sh | 2 +- .../telemetry/deploy-telemetry.sh | 35 ++----------------- .../telemetry/otel-collector.yaml | 2 +- .../telemetry/prometheus-values.yaml | 2 +- 5 files changed, 6 insertions(+), 37 deletions(-) diff --git a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh index 73c5640e..c785f4e4 100755 --- a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh +++ b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh @@ -22,7 +22,7 @@ HUB_REGION="${HUB_REGION:-westus3}" # Azure DNS configuration AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" -AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-}" ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" # Set password from argument or environment variable diff --git a/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh b/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh index ff3f32f8..1130ec10 100755 --- a/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh +++ b/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh @@ -28,7 +28,7 @@ EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" # Azure DNS configuration AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" -AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-}" AZURE_DNS_ZONE_FULL_NAME="${AZURE_DNS_ZONE_FULL_NAME:-}" AZURE_DNS_ZONE_RG="${AZURE_DNS_ZONE_RG:-${RESOURCE_GROUP}}" 
ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh index f27f8211..d9a8914c 100755 --- a/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh +++ b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh @@ -244,8 +244,7 @@ deploy_collectors() { # Deploy monitoring stack only on primary deploy_monitoring_stack() { - #primary=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}') - primary="azure-documentdb" + primary=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}') kubectl config use-context "$primary" log "Deploying monitoring stack to primary" @@ -355,34 +354,6 @@ EOF success "Fleet ServiceExport and MultiClusterService resources created for OTEL collectors" } -# Wait for collectors to be ready -wait_for_collectors() { - log "Waiting for OpenTelemetry collectors to be ready..." - - # Wait for Sales collector - kubectl wait --for=condition=available deployment/documentdb-sales-collector-collector -n $SALES_NAMESPACE --timeout=300s - success "Sales collector is ready" - - # Wait for Accounts collector - kubectl wait --for=condition=available deployment/documentdb-accounts-collector-collector -n $ACCOUNTS_NAMESPACE --timeout=300s - success "Accounts collector is ready" -} - -# Wait for monitoring stacks to be ready -wait_for_monitoring_stacks() { - log "Waiting for monitoring stacks to be ready..." - - # Wait for Sales monitoring stack - kubectl wait --for=condition=available deployment/prometheus-sales-server -n $SALES_NAMESPACE --timeout=300s - kubectl wait --for=condition=available deployment/grafana-sales -n $SALES_NAMESPACE --timeout=300s - success "Sales monitoring stack is ready" - - # Wait for Accounts monitoring stack - kubectl wait --for=condition=available deployment/prometheus-accounts-server -n $ACCOUNTS_NAMESPACE --timeout=300s - kubectl wait --for=condition=available deployment/grafana-accounts -n $ACCOUNTS_NAMESPACE --timeout=300s - success "Accounts monitoring stack is ready" -} - # Main execution main() { log "Starting Multi-Tenant DocumentDB + Telemetry Deployment..." 
@@ -417,10 +388,8 @@ main() { if [[ "$SKIP_WAIT" == "false" ]]; then error "Wait not yet implemented" - #wait_for_collectors - #wait_for_monitoring_stacks fi } # Run main function -main "$@" \ No newline at end of file +main "$@" diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml index 9fce06ff..02e254a0 100644 --- a/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml +++ b/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml @@ -86,4 +86,4 @@ spec: metrics: receivers: [otlp] processors: [memory_limiter, batch] - exporters: [prometheus] \ No newline at end of file + exporters: [prometheus] diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml index eac090b4..6daad2d1 100644 --- a/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml +++ b/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml @@ -48,4 +48,4 @@ serverFiles: # Prometheus self-monitoring - job_name: 'prometheus' static_configs: - - targets: ['localhost:9090'] \ No newline at end of file + - targets: ['localhost:9090'] From 10260c2d366c903e71d538222c04cdc137defdb7 Mon Sep 17 00:00:00 2001 From: Alexander Laye Date: Mon, 2 Feb 2026 10:54:11 -0500 Subject: [PATCH 5/5] Copilot review Signed-off-by: Alexander Laye --- documentdb-playground/multi-cloud-deployment/README.md | 6 +++--- .../multi-cloud-deployment/insert_test.py | 2 +- .../multi-cloud-deployment/telemetry/deploy-telemetry.sh | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/documentdb-playground/multi-cloud-deployment/README.md b/documentdb-playground/multi-cloud-deployment/README.md index 788a2e98..c28e04fb 100644 --- a/documentdb-playground/multi-cloud-deployment/README.md +++ b/documentdb-playground/multi-cloud-deployment/README.md @@ -189,7 +189,7 @@ kubectl --context get documentdb,pods -n documentdb-preview-ns kubectl --context port-forward \ -n documentdb-preview-ns svc/documentdb-service- 10260:10260 -mongosh localhost:10260 -u default_user -p \ +mongosh localhost:10260 -u docdb -p \ --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates ``` @@ -198,12 +198,12 @@ mongosh localhost:10260 -u default_user -p \ When `ENABLE_AZURE_DNS=true`, use the MongoDB SRV connection string: ```bash -mongosh "mongodb+srv://default_user:@./?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" +mongosh "mongodb+srv://docdb:@./?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" ``` Example: ```bash -mongosh "mongodb+srv://default_user:mypassword@documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" +mongosh "mongodb+srv://docdb:mypassword@documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" ``` ### Observability and Telemetry diff --git a/documentdb-playground/multi-cloud-deployment/insert_test.py b/documentdb-playground/multi-cloud-deployment/insert_test.py index 9434868b..c4a9f209 100644 --- a/documentdb-playground/multi-cloud-deployment/insert_test.py +++ b/documentdb-playground/multi-cloud-deployment/insert_test.py @@ -41,7 +41,7 @@ time.sleep(.25) -print(f"Completed {count} insert operations in 10 
minutes") +print(f"Completed {count} insert operations in 60 minutes") final_read_count = collection.count_documents({}) print(f"Final read count: {final_read_count}") client.close() diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh index d9a8914c..cc9e66a7 100755 --- a/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh +++ b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh @@ -31,7 +31,7 @@ while [[ $# -gt 0 ]]; do exit 0 ;; *) - error "Unknown option: $1" + echo "Unknown option: $1" usage exit 1 ;; @@ -359,8 +359,6 @@ main() { log "Starting Multi-Tenant DocumentDB + Telemetry Deployment..." log "=========================================================" log "Configuration:" - log " Deploy DocumentDB: $DEPLOY_DOCUMENTDB" - log " Deploy Telemetry: $DEPLOY_TELEMETRY" log " Skip Wait: $SKIP_WAIT" log "" @@ -370,7 +368,7 @@ main() { CROSS_CLOUD_STRATEGY=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.crossCloudNetworkingStrategy}' 2>/dev/null || echo "") - deploy_collectors $CROSS_CLOUD_STRATEGY + deploy_collectors deploy_monitoring_stack @@ -380,7 +378,7 @@ main() { log "Cross-cloud networking strategy is Istio. Creating placeholder services..." create_placeholder_prometheus_services elif [ "$CROSS_CLOUD_STRATEGY" = "AzureFleet" ]; then - log "Cross-cloud networking strategy is Istio. Creating placeholder services..." + log "Cross-cloud networking strategy is AzureFleet. Creating placeholder services..." create_service_exports_and_imports else log "Cross-cloud networking strategy is '$CROSS_CLOUD_STRATEGY', not 'Istio'. Skipping placeholder services."