diff --git a/.github/dockerfiles/Dockerfile_docdb b/.github/dockerfiles/Dockerfile_docdb index f99a803f..b45ce721 100644 --- a/.github/dockerfiles/Dockerfile_docdb +++ b/.github/dockerfiles/Dockerfile_docdb @@ -138,7 +138,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 6B827C12C2D425E227EDCA75089EBE08314DF160) && \ apt-get update && \ apt-get install -qy \ - libproj22 \ + libproj-dev \ libxml2 \ libjson-c5 \ libgeos-c1v5 \ diff --git a/.github/dockerfiles/Dockerfile_gateway b/.github/dockerfiles/Dockerfile_gateway index 0a4f73cf..6ccdc926 100644 --- a/.github/dockerfiles/Dockerfile_gateway +++ b/.github/dockerfiles/Dockerfile_gateway @@ -29,6 +29,8 @@ WORKDIR /home/documentdb/code/ RUN wget -P /tmp https://github.com/documentdb/documentdb/archive/refs/tags/v${DocumentDB_VERSION}.zip && \ unzip /tmp/v${DocumentDB_VERSION}.zip -d /home/documentdb/code && \ rm /tmp/v${DocumentDB_VERSION}.zip +# For local builds, copy the code over directly (uncomment the line below, and comment the wget line above) +# COPY --chown=documentdb:documentdb . 
/home/documentdb/code/documentdb-${DocumentDB_VERSION} RUN sudo chown -R documentdb:documentdb /home/documentdb/ diff --git a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh index 614c1753..c785f4e4 100755 --- a/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh +++ b/documentdb-playground/aks-fleet-deployment/deploy-multi-region.sh @@ -22,11 +22,13 @@ HUB_REGION="${HUB_REGION:-westus3}" # Azure DNS configuration AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" -AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-}" ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" # Set password from argument or environment variable DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" +DOCUMENTDB_IMAGE="${DOCUMENTDB_IMAGE:-ghcr.io/microsoft/documentdb/documentdb-local:16}" +GATEWAY_IMAGE="${GATEWAY_IMAGE:-${DOCUMENTDB_IMAGE}}" # If no password provided, generate a secure one if [ -z "$DOCUMENTDB_PASSWORD" ]; then @@ -183,6 +185,8 @@ TEMP_YAML=$(mktemp) # Use sed for safer substitution sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ + -e "s#{{DOCUMENTDB_IMAGE}}#$DOCUMENTDB_IMAGE#g" \ + -e "s#{{GATEWAY_IMAGE}}#$GATEWAY_IMAGE#g" \ "$SCRIPT_DIR/documentdb-resource-crp.yaml" | \ while IFS= read -r line; do if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then diff --git a/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml b/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml index 2087d324..4ce41e7c 100644 --- a/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml +++ 
b/documentdb-playground/aks-fleet-deployment/documentdb-resource-crp.yaml @@ -26,8 +26,8 @@ metadata: spec: nodeCount: 1 instancesPerNode: 1 - documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16 - gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16 + documentDBImage: {{DOCUMENTDB_IMAGE}} + gatewayImage: {{GATEWAY_IMAGE}} resource: storage: pvcSize: 10Gi diff --git a/documentdb-playground/multi-cloud-deployment/README.md b/documentdb-playground/multi-cloud-deployment/README.md index 39f2ac56..c28e04fb 100644 --- a/documentdb-playground/multi-cloud-deployment/README.md +++ b/documentdb-playground/multi-cloud-deployment/README.md @@ -108,7 +108,7 @@ Or use environment variables for all clouds: export RESOURCE_GROUP="my-multi-cloud-rg" export RG_LOCATION="eastus2" export HUB_REGION="eastus2" -export AKS_CLUSTER_NAME="aks-documentdb-cluster" +export AKS_CLUSTER_NAME="azure-documentdb" export AKS_REGION="eastus2" export HUB_VM_SIZE="Standard_D4s_v3" @@ -116,10 +116,10 @@ export HUB_VM_SIZE="Standard_D4s_v3" export PROJECT_ID="my-gcp-project-id" export GCP_USER="user@example.com" export ZONE="us-central1-a" -export GKE_CLUSTER_NAME="gke-documentdb-cluster" +export GKE_CLUSTER_NAME="gcp-documentdb" # AWS EKS -export EKS_CLUSTER_NAME="eks-documentdb-cluster" +export EKS_CLUSTER_NAME="aws-documentdb" export EKS_REGION="us-west-2" # DocumentDB Operator @@ -158,9 +158,9 @@ export AZURE_DNS_PARENT_ZONE_RESOURCE_ID="/subscriptions/.../dnszones/parent.zon After deployment, contexts are automatically configured for: - `hub`: AKS Fleet hub cluster -- `aks-documentdb-cluster`: AKS member cluster (default name) -- `gke-documentdb-cluster`: GKE cluster (default name) -- `eks-documentdb-cluster`: EKS cluster (default name) +- `azure-documentdb`: AKS member cluster (default name) +- `gcp-documentdb`: GKE cluster (default name) +- `aws-documentdb`: EKS cluster (default name) ## Management @@ -189,7 +189,7 @@ kubectl --context get documentdb,pods -n 
documentdb-preview-ns kubectl --context port-forward \ -n documentdb-preview-ns svc/documentdb-service- 10260:10260 -mongosh localhost:10260 -u default_user -p \ +mongosh localhost:10260 -u docdb -p \ --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates ``` @@ -198,14 +198,65 @@ mongosh localhost:10260 -u default_user -p \ When `ENABLE_AZURE_DNS=true`, use the MongoDB SRV connection string: ```bash -mongosh "mongodb+srv://default_user:@./?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" +mongosh "mongodb+srv://docdb:@./?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" ``` Example: ```bash -mongosh "mongodb+srv://default_user:mypassword@german-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" +mongosh "mongodb+srv://docdb:mypassword@documentdb-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256" ``` +### Observability and Telemetry + +The `telemetry` folder contains configuration files for setting up a comprehensive observability stack across your multi-cloud DocumentDB deployment: + +#### Components + +- **Prometheus**: Metrics collection and storage +- **Grafana**: Visualization and dashboards +- **OpenTelemetry Collector**: Unified telemetry collection (metrics, logs, traces) + +#### Deploy Telemetry Stack + +```bash +cd telemetry +./deploy-telemetry.sh +``` + +This script will: +1. Deploy OpenTelemetry Collector on all clusters +2. Install Prometheus on the azure-documentdb cluster +3. Install Grafana on the azure-documentdb cluster +4. 
Configure Prometheus to scrape DocumentDB metrics + +#### Access Grafana Dashboard + +```bash +# Port-forward to Grafana +kubectl --context hub port-forward -n monitoring svc/grafana 3000:80 + +# Open browser to http://localhost:3000 +# Default credentials: admin/admin (change on first login) +``` + +From there you can import dashboard.json + +#### Configuration Files + +- **`deploy-telemetry.sh`**: Automated deployment script for the entire observability stack +- **`prometheus-values.yaml`**: Prometheus Helm chart configuration +- **`grafana-values.yaml`**: Grafana Helm chart configuration with dashboard provisioning +- **`otel-collector.yaml`**: OpenTelemetry Collector configuration for metrics and logs +- **`dashboard.json`**: Pre-built Grafana dashboard for DocumentDB monitoring + +#### Custom Configuration + +Edit the values files to customize: +- Prometheus retention period and storage +- Grafana plugins and data sources +- OpenTelemetry Collector pipelines and exporters +- Dashboard refresh intervals and panels + ### Failover Operations Failover is performed using the DocumentDB kubectl plugin: @@ -238,14 +289,14 @@ kubectl --context hub get membercluster ```bash # Check Istio components on each cluster -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get pods -n istio-system echo done # Verify east-west gateway services -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get svc -n istio-system istio-eastwestgateway echo @@ -256,12 +307,12 @@ done ```bash # Check remote secrets (for service discovery) -kubectl --context aks-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret" -kubectl --context gke-documentdb-cluster get 
secrets -n istio-system | grep "istio-remote-secret" -kubectl --context eks-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret" +kubectl --context azure-documentdb get secrets -n istio-system | grep "istio-remote-secret" +kubectl --context gcp-documentdb get secrets -n istio-system | grep "istio-remote-secret" +kubectl --context aws-documentdb get secrets -n istio-system | grep "istio-remote-secret" # Verify mesh network configuration -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get namespace istio-system --show-labels echo @@ -274,14 +325,14 @@ done ```bash # Quick status across all clusters -for c in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for c in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $c ===" kubectl --context $c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet' echo done # Check operator status on all clusters -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get deploy -n documentdb-operator kubectl --context $cluster get pods -n documentdb-operator @@ -292,14 +343,14 @@ done ```bash # Monitor all DocumentDB instances -watch 'for c in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do \ +watch 'for c in azure-documentdb gcp-documentdb aws-documentdb; do \ echo "=== $c ==="; \ kubectl --context $c get documentdb,pods -n documentdb-preview-ns; \ echo; \ done' # Check DocumentDB service endpoints -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get svc -n 
documentdb-preview-ns echo @@ -310,14 +361,14 @@ done ```bash # Check WAL replica status in Istio mesh -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get pods -n documentdb-preview-ns -l component=wal-replica echo done # Verify Istio sidecar injection -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster get pods -n documentdb-preview-ns -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}' echo @@ -340,7 +391,7 @@ az network dns record-set srv show \ --resource-group $RESOURCE_GROUP # Show A/CNAME records for each cluster -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" az network dns record-set a show --name $cluster --zone-name --resource-group $RESOURCE_GROUP 2>/dev/null || \ az network dns record-set cname show --name $cluster --zone-name --resource-group $RESOURCE_GROUP 2>/dev/null || \ @@ -420,16 +471,16 @@ kubectl --context get secrets -n istio-system | grep istio-remote **EBS CSI Driver:** ```bash # Check CSI driver status -kubectl --context eks-documentdb-cluster get pods -n kube-system -l app=ebs-csi-controller +kubectl --context aws-documentdb get pods -n kube-system -l app=ebs-csi-controller # Verify storage class -kubectl --context eks-documentdb-cluster get storageclass documentdb-storage +kubectl --context aws-documentdb get storageclass documentdb-storage ``` **AWS Load Balancer Controller:** ```bash # Check controller status -kubectl --context eks-documentdb-cluster get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller +kubectl --context 
aws-documentdb get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller # Verify subnet tags VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) @@ -454,19 +505,19 @@ nslookup _mongodb._tcp.. -type=SRV ```bash # Deploy test pod with network tools -kubectl --context aks-documentdb-cluster run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash +kubectl --context azure-documentdb run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash # From within the pod, test connectivity to other clusters # Using Istio service discovery -curl -v http://documentdb-service-gke-documentdb-cluster.documentdb-preview-ns.svc.cluster.local:10260 -curl -v http://documentdb-service-eks-documentdb-cluster.documentdb-preview-ns.svc.cluster.local:10260 +curl -v http://documentdb-service-gcp-documentdb.documentdb-preview-ns.svc.cluster.local:10260 +curl -v http://documentdb-service-aws-documentdb.documentdb-preview-ns.svc.cluster.local:10260 ``` ### Debugging ```bash # Check operator logs on member clusters -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do echo "=== $cluster ===" kubectl --context $cluster logs -n documentdb-operator deployment/documentdb-operator --tail=50 echo @@ -487,7 +538,7 @@ kubectl --context hub delete clusterresourceplacement documentdb-crp kubectl --context hub delete namespace documentdb-preview-ns # Wait for namespace deletion to complete on all clusters -for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do +for cluster in azure-documentdb gcp-documentdb aws-documentdb; do kubectl --context $cluster wait --for=delete namespace/documentdb-preview-ns --timeout=60s || true done @@ -514,9 +565,9 @@ az network dns zone delete \ # Clean up local kubectl contexts kubectl config delete-context hub -kubectl config 
delete-context aks-documentdb-cluster -kubectl config delete-context gke-documentdb-cluster -kubectl config delete-context eks-documentdb-cluster +kubectl config delete-context azure-documentdb +kubectl config delete-context gcp-documentdb +kubectl config delete-context aws-documentdb ``` ## Scripts diff --git a/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh b/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh index 7ce4e31d..1130ec10 100755 --- a/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh +++ b/documentdb-playground/multi-cloud-deployment/deploy-documentdb.sh @@ -6,7 +6,7 @@ set -euo pipefail # Usage: ./deploy-documentdb.sh [password] # # Environment variables: -# RESOURCE_GROUP: Azure resource group (default: german-aks-fleet-rg) +# RESOURCE_GROUP: Azure resource group (default: documentdb-aks-fleet-rg) # DOCUMENTDB_PASSWORD: Database password (will be generated if not provided) # ENABLE_AZURE_DNS: Enable Azure DNS creation (default: true) # AZURE_DNS_ZONE_NAME: Azure DNS zone name (default: same as resource group) @@ -20,19 +20,23 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Resource group -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" +RESOURCE_GROUP="${RESOURCE_GROUP:-documentdb-aks-fleet-rg}" -AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-aks-documentdb-cluster}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" -EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-eks-documentdb-cluster}" +AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-azure-documentdb}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" +EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" # Azure DNS configuration AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" 
-AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-}" +AZURE_DNS_ZONE_FULL_NAME="${AZURE_DNS_ZONE_FULL_NAME:-}" +AZURE_DNS_ZONE_RG="${AZURE_DNS_ZONE_RG:-${RESOURCE_GROUP}}" ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" # Set password from argument or environment variable DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" +DOCUMENTDB_IMAGE="${DOCUMENTDB_IMAGE:-ghcr.io/microsoft/documentdb/documentdb-local:16}" +GATEWAY_IMAGE="${GATEWAY_IMAGE:-${DOCUMENTDB_IMAGE}}" # If no password provided, generate a secure one if [ -z "$DOCUMENTDB_PASSWORD" ]; then @@ -54,7 +58,7 @@ for cluster in "${CLUSTER_ARRAY[@]}"; do echo " - $cluster" done -PRIMARY_CLUSTER=${CLUSTER_ARRAY[0]} +PRIMARY_CLUSTER=${CLUSTER_ARRAY[1]} echo "" echo "Selected primary cluster: $PRIMARY_CLUSTER" @@ -175,6 +179,8 @@ TEMP_YAML=$(mktemp) # Use sed for safer substitution sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ + -e "s#{{DOCUMENTDB_IMAGE}}#$DOCUMENTDB_IMAGE#g" \ + -e "s#{{GATEWAY_IMAGE}}#$GATEWAY_IMAGE#g" \ "$SCRIPT_DIR/documentdb-cluster.yaml" | \ while IFS= read -r line; do if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then @@ -286,17 +292,21 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then echo "Creating Azure DNS zone for DocumentDB..." echo "=======================================" - parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") - fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" - - # Create Azure DNS zone - if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$RESOURCE_GROUP" &>/dev/null; then - echo "Azure DNS zone already exists, updating..." 
+ if [ -n "$AZURE_DNS_ZONE_FULL_NAME" ]; then + fullName="$AZURE_DNS_ZONE_FULL_NAME" else - az network dns zone create \ - --name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ - --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" + parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") + fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" + + # Create Azure DNS zone + if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$AZURE_DNS_ZONE_RG" &>/dev/null; then + echo "Azure DNS zone already exists, updating..." + else + az network dns zone create \ + --name "$fullName" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ + --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" + fi fi # Wait for DocumentDB services to be ready and create endpoints @@ -334,19 +344,19 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then az network dns record-set a delete \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --yes # Create DNS record az network dns record-set a create \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ttl 5 az network dns record-set a add-record \ --record-set-name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ipv4-address "$EXTERNAL_IP" \ --ttl 5 @@ -358,19 +368,19 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then az network dns record-set cname delete \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --yes # Create DNS record az network dns record-set cname create \ --name "$cluster" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ttl 5 az network dns record-set cname set-record \ --record-set-name "$cluster" \ --zone-name "$fullName" \ - 
--resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --cname "$EXTERNAL_HOSTNAME" \ --ttl 5 @@ -383,19 +393,19 @@ if [ "$ENABLE_AZURE_DNS" = "true" ]; then az network dns record-set srv delete \ --name "_mongodb._tcp" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --yes az network dns record-set srv create \ --name "_mongodb._tcp" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --ttl 5 mongoFQDN=$(az network dns record-set srv add-record \ --record-set-name "_mongodb._tcp" \ --zone-name "$fullName" \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "$AZURE_DNS_ZONE_RG" \ --priority 0 \ --weight 0 \ --port 10260 \ @@ -409,7 +419,7 @@ fi echo "" echo "Connection Information:" -echo " Username: default_user" +echo " Username: docdb" echo " Password: $DOCUMENTDB_PASSWORD" echo "" echo "To monitor the deployment:" diff --git a/documentdb-playground/multi-cloud-deployment/deploy.sh b/documentdb-playground/multi-cloud-deployment/deploy.sh index 04709061..186d5b62 100755 --- a/documentdb-playground/multi-cloud-deployment/deploy.sh +++ b/documentdb-playground/multi-cloud-deployment/deploy.sh @@ -8,7 +8,7 @@ set -euo pipefail # Configuration # ============================================================================ -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" +RESOURCE_GROUP="${RESOURCE_GROUP:-documentdb-aks-fleet-rg}" RG_LOCATION="${RG_LOCATION:-eastus2}" HUB_REGION="${HUB_REGION:-$RG_LOCATION}" TEMPLATE_DIR="$(dirname "$0")" @@ -16,16 +16,16 @@ HUB_VM_SIZE="${HUB_VM_SIZE:-}" VERSION="${VERSION:-200}" VALUES_FILE="${VALUES_FILE:-}" ISTIO_DIR="${ISTIO_DIR:-}" -AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-aks-documentdb-cluster}" +AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-azure-documentdb}" AKS_REGION="${AKS_REGION:-eastus2}" HUB_CONTEXT="${HUB_CONTEXT:-hub}" PROJECT_ID="${PROJECT_ID:-sanguine-office-475117-s6}" 
GCP_USER="${GCP_USER:-alexanderlaye59@gmail.com}" ZONE="${ZONE:-us-central1-a}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" -EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-eks-documentdb-cluster}" +EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-aws-documentdb}" EKS_REGION="${EKS_REGION:-us-west-2}" # ============================================================================ @@ -436,8 +436,9 @@ pushd $temp_dir git clone https://github.com/kubefleet-dev/kubefleet.git git clone https://github.com/Azure/fleet-networking.git pushd $temp_dir/kubefleet +git checkout d3f42486fa78874e33ba8e6e5e34636767f77b8f chmod +x hack/membership/joinMC.sh -hack/membership/joinMC.sh "v0.16.5" "$HUB_CONTEXT" "$GKE_CLUSTER_NAME" "$EKS_CLUSTER_NAME" +hack/membership/joinMC.sh "v0.16.9" "$HUB_CONTEXT" "$GKE_CLUSTER_NAME" "$EKS_CLUSTER_NAME" popd # TODO clean this up a bit @@ -568,7 +569,7 @@ kubectl --context "$EKS_CLUSTER_NAME" -n istio-system annotate service istio-eas # Step 6: Install DocumentDB Operator # ============================================================================ -CHART_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)/operator/documentdb-helm-chart" +CHART_DIR="$(cd "$TEMPLATE_DIR/../../" && pwd)/operator/documentdb-helm-chart" CHART_PKG="$TEMPLATE_DIR/documentdb-operator-0.0.${VERSION}.tgz" # Apply cert-manager CRDs on hub diff --git a/documentdb-playground/multi-cloud-deployment/dns_failover.sh b/documentdb-playground/multi-cloud-deployment/dns_failover.sh index 1aa208f4..2542917a 100755 --- a/documentdb-playground/multi-cloud-deployment/dns_failover.sh +++ b/documentdb-playground/multi-cloud-deployment/dns_failover.sh @@ -4,7 +4,7 @@ RESOURCE_GROUP="${RESOURCE_GROUP:-documentdb-aks-fleet-rg}" DOCUMENTDB_NAME="${DOCUMENTDB_NAME:-documentdb-preview}" DOCUMENTDB_NAMESPACE="${DOCUMENTDB_NAMESPACE:-documentdb-preview-ns}" HUB_CONTEXT="${HUB_CONTEXT:-hub}" -GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gcp-documentdb}" MEMBER_CLUSTERS=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.clusterList[].name") PRIMARY_CLUSTER=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.primary") diff --git a/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml b/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml index ae7c6d7c..519e5c05 100644 --- a/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml +++ b/documentdb-playground/multi-cloud-deployment/documentdb-base.yaml @@ -36,6 +36,14 @@ spec: version: v1 kind: CustomResourceDefinition name: scheduledbackups.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: backups.db.microsoft.com + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: scheduledbackups.db.microsoft.com - group: "apiextensions.k8s.io" version: v1 kind: CustomResourceDefinition diff --git 
a/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml b/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml index aff12c51..0e91d35b 100644 --- a/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml +++ b/documentdb-playground/multi-cloud-deployment/documentdb-cluster.yaml @@ -15,7 +15,7 @@ metadata: namespace: documentdb-preview-ns type: Opaque stringData: - username: default_user + username: docdb password: {{DOCUMENTDB_PASSWORD}} --- @@ -28,8 +28,8 @@ metadata: spec: nodeCount: 1 instancesPerNode: 1 - documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16 - gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16 + documentDBImage: {{DOCUMENTDB_IMAGE}} + gatewayImage: {{GATEWAY_IMAGE}} resource: storage: pvcSize: 10Gi diff --git a/documentdb-playground/multi-cloud-deployment/insert_test.py b/documentdb-playground/multi-cloud-deployment/insert_test.py index 912f1ba2..c4a9f209 100644 --- a/documentdb-playground/multi-cloud-deployment/insert_test.py +++ b/documentdb-playground/multi-cloud-deployment/insert_test.py @@ -19,11 +19,11 @@ print(f"{'Inserted Document':<30} {'Insert Count':<15}") print("-" * 77) start_time = time.time() -end_time = start_time + (10 * 60) # 10 minutes +end_time = start_time + (60 * 60) # 60 minutes count = 0 +first_error_seen = False while time.time() < end_time: - failed = False write_result = "" try: doc = { @@ -36,12 +36,12 @@ print(f"{str(write_result):<30} {count:<15}") except Exception as e: if not first_error_seen: - #print("Promotion in progress") + print("Switching cloud to Azure") first_error_seen = True - time.sleep(1) + time.sleep(.25) -print(f"Completed {count} insert operations in 10 minutes") +print(f"Completed {count} insert operations in 60 minutes") final_read_count = collection.count_documents({}) print(f"Final read count: {final_read_count}") client.close() diff --git a/documentdb-playground/multi-cloud-deployment/main.bicep 
b/documentdb-playground/multi-cloud-deployment/main.bicep index eb54d572..da8934b0 100644 --- a/documentdb-playground/multi-cloud-deployment/main.bicep +++ b/documentdb-playground/multi-cloud-deployment/main.bicep @@ -60,7 +60,7 @@ resource memberCluster 'Microsoft.ContainerService/managedClusters@2023-10-01' = // Member clusters fleet membership resource memberFleetMembers 'Microsoft.ContainerService/fleets/members@2023-10-15' = { - name: 'member-${memberRegion}-${uniqueString(resourceGroup().id, memberRegion)}' + name: memberName parent: fleet properties: { clusterResourceId: memberCluster.id diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/dashboard.json b/documentdb-playground/multi-cloud-deployment/telemetry/dashboard.json new file mode 100644 index 00000000..76076364 --- /dev/null +++ b/documentdb-playground/multi-cloud-deployment/telemetry/dashboard.json @@ -0,0 +1,904 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "rate(documentdb_mongodb_requests_total[2m])", + "legendFormat": "{{operation}}-{{region}}", + "refId": "A" + } + ], + "title": "Request Rate by Operation", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum(rate(documentdb_mongodb_requests_total[2m]))", + "refId": "A" + } + ], + "title": "Total Request Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": 
[ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum(documentdb_mongodb_requests_total)", + "refId": "A" + } + ], + "title": "Total Requests", + "type": "stat" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": 
"Prometheus" + }, + "expr": "histogram_quantile(0.50, sum by(le, operation) (rate(documentdb_mongodb_request_duration_milliseconds_bucket{phase=\"total\"}[2m])))", + "legendFormat": "p50 - {{operation}}", + "refId": "A" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le, operation) (rate(documentdb_mongodb_request_duration_milliseconds_bucket{phase=\"total\"}[2m])))", + "legendFormat": "p95 - {{operation}}", + "refId": "B" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.99, sum by(le, operation) (rate(documentdb_mongodb_request_duration_milliseconds_bucket{phase=\"total\"}[2m])))", + "legendFormat": "p99 - {{operation}}", + "refId": "C" + } + ], + "title": "Request Duration (p50, p95, p99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { 
+ "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le, phase) (rate(documentdb_mongodb_request_duration_milliseconds_bucket[2m])))", + "legendFormat": "{{phase}}", + "refId": "A" + } + ], + "title": "Request Duration by Phase (p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(operation) (documentdb_mongodb_requests_total)", + "legendFormat": "{{operation}}-{{region}}", + "refId": "A" + } + ], + "title": "Requests by Operation", + "type": "piechart" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 7, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + 
"values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(collection) (documentdb_mongodb_requests_total{collection!=\"\"})", + "legendFormat": "{{collection}}", + "refId": "A" + } + ], + "title": "Requests by Collection", + "type": "piechart" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum(rate(documentdb_mongodb_requests_total{is_error=\"true\"}[2m])) / sum(rate(documentdb_mongodb_requests_total[2m]))", + "legendFormat": "Error Rate", + "refId": "A" + } + ], + "title": "Error Rate", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le) (rate(documentdb_mongodb_request_size_bytes_bucket[2m])))", + "legendFormat": "Request Size (p95)", + "refId": "A" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "histogram_quantile(0.95, sum by(le) (rate(documentdb_mongodb_response_size_bytes_bucket[2m])))", + "legendFormat": "Response Size (p95)", + "refId": "B" + } + ], + "title": "Request/Response Size (p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + 
"cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(operation) (documentdb_mongodb_requests_total)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + }, + { + "datasource": { + "type": "Prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(operation) (rate(documentdb_mongodb_requests_total[5m]))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "B" + } + ], + "title": "Operations Summary", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "collection": true, + "environment": true, + "is_error": true, + "job": true, + "username": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "operation": 1 + }, + "renameByName": { + "Value #A": "Total", + "Value #B": "Rate", + "operation": "Operation" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": ["documentdb", "mongodb", "observability"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "DocumentDB Gateway - Overview", + "uid": 
"documentdb-gateway", + "version": 0, + "weekStart": "" +} diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh new file mode 100755 index 00000000..cc9e66a7 --- /dev/null +++ b/documentdb-playground/multi-cloud-deployment/telemetry/deploy-telemetry.sh @@ -0,0 +1,393 @@ +#!/bin/bash + +# Multi-Tenant DocumentDB + Telemetry Deployment Script +# This script deploys complete DocumentDB clusters with isolated monitoring stacks for different teams + +set -e + +# Deployment options +SKIP_WAIT=true + +# Parse command line arguments +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --skip-wait Skip waiting for deployments to be ready" + echo " --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Deploy everything (DocumentDB + Telemetry)" +} + +while [[ $# -gt 0 ]]; do + case $1 in + --skip-wait) + SKIP_WAIT=true + shift + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log() { + echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" +} + +success() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] ✅${NC} $1" +} + +warn() { + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] ⚠️${NC} $1" +} + +error() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ❌${NC} $1" + exit 1 +} + +# Check if OpenTelemetry Operator is installed +check_prerequisites() { + log "Checking prerequisites..." + + if ! helm version > /dev/null 2>&1; then + error "Helm is not installed. Please install Helm first." + fi + + # Add Prometheus Helm repo if not already added + if ! helm repo list | grep -q prometheus-community; then + log "Adding Prometheus Helm repository..." 
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + fi + + # Add Grafana Helm repo if not already added + if ! helm repo list | grep -q grafana; then + log "Adding Grafana Helm repository..." + helm repo add grafana https://grafana.github.io/helm-charts + helm repo update + fi + + success "Prerequisites check completed" +} + +install_opentelemetry_operator() { + + kubectl config use-context hub + + log "Installing OpenTelemetry Operator (infrastructure component)..." + + # Check if already installed + if kubectl get deployment opentelemetry-operator-controller-manager -n opentelemetry-operator-system &> /dev/null; then + warn "OpenTelemetry Operator already installed. Skipping installation." + return 0 + fi + + # Install OpenTelemetry Operator on hub + log "Installing OpenTelemetry Operator from upstream..." + kubectl apply -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml + + # Create ClusterResourcePlacement to deploy operator to all member clusters + log "Creating ClusterResourcePlacement for OpenTelemetry Operator..." + cat < /dev/null && pwd )" + + if [ ! -f "$SCRIPT_DIR/prometheus-values.yaml" ]; then + error "Prometheus values file not found: $SCRIPT_DIR/prometheus-values.yaml" + fi + + helm upgrade --install prometheus prometheus-community/prometheus \ + --namespace $namespace \ + --values "$SCRIPT_DIR/prometheus-values.yaml" \ + --wait --timeout=300s + + success "Prometheus deployed" +} + +# Deploy Grafana for a namespace +deploy_grafana() { + local namespace=$1 + + log "Deploying Grafana in namespace: $namespace" + + # Get the directory where this script is located + SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + + if [ ! 
-f "$SCRIPT_DIR/grafana-values.yaml" ]; then + error "Grafana values file not found: $SCRIPT_DIR/grafana-values.yaml" + fi + + helm upgrade --install grafana grafana/grafana \ + --namespace $namespace \ + --values "$SCRIPT_DIR/grafana-values.yaml" \ + --wait --timeout=300s + + success "Grafana deployed" +} + +# Deploy OpenTelemetry collectors for each member +# TODO figure out how to do this with fleet, currently can't deploy without the operator running (opentelemetry-operator-webhook-service) +deploy_collectors() { + log "Deploying OpenTelemetry collector to each member cluster..." + + # Get the directory where this script is located + SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + + # Get member clusters and primary cluster from documentdb resource + MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") + + # Deploy to each member cluster + for cluster in $MEMBER_CLUSTERS; do + log "Waiting for OpenTelemetry Operator webhook service on cluster: $cluster" + kubectl --context "$cluster" wait --for=jsonpath='{.subsets[*].addresses[*].ip}' endpoints/opentelemetry-operator-webhook-service -n opentelemetry-operator-system --timeout=300s || warn "Webhook service not ready on $cluster, proceeding anyway..." 
+ + log "Deploying OpenTelemetry Collector to cluster: $cluster" + sed "s/{{CLUSTER_NAME}}/$cluster/g" "$SCRIPT_DIR/otel-collector.yaml" | kubectl --context "$cluster" apply -f - + done + success "All collectors deployed" +} + +# Deploy monitoring stack only on primary +deploy_monitoring_stack() { + + primary=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}') + kubectl config use-context "$primary" + + log "Deploying monitoring stack to primary" + + deploy_prometheus documentdb-preview-ns + deploy_grafana documentdb-preview-ns + + success "All monitoring stacks deployed" +} + +# Create placeholder OTEL collector services on primary cluster for non-primary members +create_placeholder_prometheus_services() { + log "Creating placeholder OTEL collector services on primary cluster..." + + # Get primary cluster and all member clusters + #PRIMARY_CLUSTER=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}' 2>/dev/null || echo "") + PRIMARY_CLUSTER="azure-documentdb" + MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") + + if [ -z "$PRIMARY_CLUSTER" ] || [ -z "$MEMBER_CLUSTERS" ]; then + warn "Could not determine primary or member clusters, skipping placeholder services" + return 0 + fi + + # Deploy placeholder services on primary cluster for each non-primary member + for cluster in $MEMBER_CLUSTERS; do + if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then + log "Skipping primary cluster: $cluster" + continue + fi + + log "Creating placeholder OTEL collector service for $cluster on primary cluster" + cat </dev/null +apiVersion: v1 +kind: Service +metadata: + name: ${cluster}-collector + namespace: documentdb-preview-ns + labels: + app: otel-collector + cluster: ${cluster} +spec: + type: ClusterIP + ports: + - name: 
prometheus + port: 8889 + targetPort: 8889 + protocol: TCP + selector: + app: nonexistent-placeholder +EOF + if [ $? -eq 0 ]; then + success "Placeholder service ${cluster}-collector created on primary cluster" + else + warn "Failed to create placeholder service for $cluster on primary cluster" + fi + done + + success "Placeholder OTEL collector services created on primary cluster" +} + +# Create Fleet ServiceExport and MultiClusterService for OTEL collectors +create_service_exports_and_imports() { + log "Creating Fleet ServiceExport and MultiClusterService for OTEL collector endpoints..." + + # Get primary cluster and all member clusters + #PRIMARY_CLUSTER=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.primary}' 2>/dev/null || echo "") + PRIMARY_CLUSTER="azure-documentdb" + MEMBER_CLUSTERS=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o json 2>/dev/null | jq -r '.spec.clusterReplication.clusterList[].name' 2>/dev/null || echo "") + + if [ -z "$PRIMARY_CLUSTER" ] || [ -z "$MEMBER_CLUSTERS" ]; then + warn "Could not determine primary or member clusters, skipping service export/import" + return 0 + fi + + # Create ServiceExport on each non-primary member cluster for their OTEL collector + for cluster in $MEMBER_CLUSTERS; do + if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then + log "Skipping ServiceExport on primary cluster: $cluster" + continue + fi + + log "Creating ServiceExport for documentdb-collector-collector on cluster: $cluster" + cat </dev/null +apiVersion: networking.fleet.azure.com/v1alpha1 +kind: MultiClusterService +metadata: + name: $cluster-collector + namespace: documentdb-preview-ns +spec: + serviceImport: + name: $cluster-collector +EOF + done + + # Create MultiClusterService on primary cluster to import all OTEL collector endpoints + + + success "Fleet ServiceExport and MultiClusterService resources created for OTEL collectors" +} + +# Main execution +main() { + log 
"Starting Multi-Tenant DocumentDB + Telemetry Deployment..." + log "=========================================================" + log "Configuration:" + log " Skip Wait: $SKIP_WAIT" + log "" + + check_prerequisites + + install_opentelemetry_operator + + CROSS_CLOUD_STRATEGY=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.crossCloudNetworkingStrategy}' 2>/dev/null || echo "") + + deploy_collectors + + deploy_monitoring_stack + + # Only create placeholder services if using Istio networking + CROSS_CLOUD_STRATEGY=$(kubectl get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.spec.clusterReplication.crossCloudNetworkingStrategy}' 2>/dev/null || echo "") + if [ "$CROSS_CLOUD_STRATEGY" = "Istio" ]; then + log "Cross-cloud networking strategy is Istio. Creating placeholder services..." + create_placeholder_prometheus_services + elif [ "$CROSS_CLOUD_STRATEGY" = "AzureFleet" ]; then + log "Cross-cloud networking strategy is AzureFleet. Creating placeholder services..." + create_service_exports_and_imports + else + log "Cross-cloud networking strategy is '$CROSS_CLOUD_STRATEGY', not 'Istio'. Skipping placeholder services." 
+ fi + + if [[ "$SKIP_WAIT" == "false" ]]; then + error "Wait not yet implemented" + fi +} + +# Run main function +main "$@" diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml new file mode 100644 index 00000000..276a065e --- /dev/null +++ b/documentdb-playground/multi-cloud-deployment/telemetry/grafana-values.yaml @@ -0,0 +1,22 @@ +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: http://prometheus-server.documentdb-preview-ns.svc.cluster.local + access: proxy + isDefault: true + +adminPassword: admin123 + +service: + type: LoadBalancer + port: 3000 + +ingress: + enabled: false + +persistence: + enabled: true + size: 1Gi diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml new file mode 100644 index 00000000..02e254a0 --- /dev/null +++ b/documentdb-playground/multi-cloud-deployment/telemetry/otel-collector.yaml @@ -0,0 +1,89 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: documentdb-preview-ns +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: +- apiGroups: [""] + resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods"] + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics", "/metrics/cadvisor"] + verbs: ["get"] +- apiGroups: ["apps"] + resources: ["daemonsets", "deployments", "replicasets"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: +- kind: ServiceAccount + name: otel-collector + namespace: documentdb-preview-ns +--- +apiVersion: 
opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: {{CLUSTER_NAME}} + namespace: documentdb-preview-ns +spec: + mode: deployment # Single pod per namespace, not DaemonSet + replicas: 1 + serviceAccount: otel-collector + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + timeout: 10s + send_batch_size: 1024 + + memory_limiter: + check_interval: 1s + limit_mib: 512 + + attributes: + actions: + - key: service.name + action: upsert + value: documentdb-gateway + + exporters: + # Prometheus for metrics + prometheus: + endpoint: "0.0.0.0:8889" + namespace: documentdb + const_labels: + environment: demo + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + + pprof: + endpoint: 0.0.0.0:1777 + + service: + extensions: [health_check, pprof] + + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [prometheus] diff --git a/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml b/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml new file mode 100644 index 00000000..6daad2d1 --- /dev/null +++ b/documentdb-playground/multi-cloud-deployment/telemetry/prometheus-values.yaml @@ -0,0 +1,51 @@ +server: + persistentVolume: + size: 10Gi + retention: 15d + service: + type: LoadBalancer + ingress: + enabled: false + +alertmanager: + enabled: false + +prometheus-node-exporter: + enabled: false + +prometheus-pushgateway: + enabled: false + +kube-state-metrics: + enabled: false + +global: + scrape_interval: 15s + evaluation_interval: 15s + +serverFiles: + prometheus.yml: + scrape_configs: + - job_name: 'otel-collector-azure' + static_configs: + - targets: ['azure-documentdb-collector.documentdb-preview-ns.svc.cluster.local:8889'] + labels: + service: 'documentdb-gateway' + region: 'azure' + - job_name: 'otel-collector-aws' + static_configs: + - targets: 
['aws-documentdb-collector.documentdb-preview-ns.svc.cluster.local:8889'] + labels: + service: 'documentdb-gateway' + region: 'aws' + - job_name: 'otel-collector-westus' + static_configs: + - targets: ['gcp-documentdb-collector.documentdb-preview-ns.svc.cluster.local:8889'] + labels: + service: 'documentdb-gateway' + region: 'gcp' + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go b/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go index 0fbf40c3..b9a21015 100644 --- a/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go +++ b/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go @@ -133,7 +133,7 @@ func (impl Implementation) reconcileMetadata( envVars := []corev1.EnvVar{ { Name: "OTEL_EXPORTER_OTLP_ENDPOINT", - Value: "http://localhost:4412", + Value: "http://" + cluster.Name + "-collector." + cluster.Namespace + ".svc.cluster.local:4317", }, } diff --git a/operator/src/internal/cnpg/cnpg_cluster.go b/operator/src/internal/cnpg/cnpg_cluster.go index 5761160b..59844bdf 100644 --- a/operator/src/internal/cnpg/cnpg_cluster.go +++ b/operator/src/internal/cnpg/cnpg_cluster.go @@ -62,7 +62,10 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu }, InheritedMetadata: getInheritedMetadataLabels(documentdb.Name), Plugins: func() []cnpgv1.PluginConfiguration { - params := map[string]string{"gatewayImage": gatewayImage} + params := map[string]string{ + "gatewayImage": gatewayImage, + "documentDbCredentialSecret": credentialSecretName, + } // If TLS is ready, surface secret name to plugin so it can mount certs. 
if documentdb.Status.TLS != nil && documentdb.Status.TLS.Ready && documentdb.Status.TLS.SecretName != "" { params["gatewayTLSSecret"] = documentdb.Status.TLS.SecretName diff --git a/operator/src/internal/controller/documentdb_controller.go b/operator/src/internal/controller/documentdb_controller.go index c0e9b126..adb93492 100644 --- a/operator/src/internal/controller/documentdb_controller.go +++ b/operator/src/internal/controller/documentdb_controller.go @@ -97,7 +97,7 @@ func (r *DocumentDBReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Check if the DocumentDB Service already exists for this instance foundService, err := util.UpsertService(ctx, r.Client, ddbService) if err != nil { - logger.Info("Failed to create DocumentDB Service; Requeuing.") + logger.Error(err, "Failed to create DocumentDB Service; Requeuing.") return ctrl.Result{RequeueAfter: RequeueAfterShort}, nil }