diff --git a/.gitignore b/.gitignore index 6e9488d..e95861c 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,5 @@ tags # Persistent undo [._]*.un~ -# End of https://www.toptal.com/developers/gitignore/api/osx,vim +# AI +.clauderc diff --git a/coder/README.md b/coder/README.md index 2c6dc3d..bd86fdf 100644 --- a/coder/README.md +++ b/coder/README.md @@ -55,7 +55,38 @@ This follows Nuon best practices for deploying public Helm charts. > This is a development/demo installation of Coder. Do not use in production. -> A CNAME record must be manually created in Route 53 for wildcard subdomains to work. This enables features like web apps e.g., Jupyter and web port fowarding. For example, if your domain is `{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}`, create a CNAME record for `*.{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}` that points to the DNS name of the load balancer created by this Nuon app config. The load balancer DNS name can be found in AWS Console. +> Wildcard DNS for workspace subdomains is automatically configured via external-dns. This enables features like web apps (e.g., Jupyter) and web port forwarding without manual DNS configuration. + +### Observability & Monitoring + +This app includes comprehensive monitoring and Kubernetes event streaming: + +- **Observability Stack**: Prometheus, Grafana, Loki, and Alertmanager deployed in the `coder-observability` namespace for metrics collection, log aggregation, and alerting +- **Kubelogstream**: Streams Kubernetes pod events directly to Coder workspace startup logs for easier troubleshooting + +**Accessing Grafana Dashboards**: + +1. In the Nuon dashboard, navigate to your Coder installation +2. Go to the **Actions** tab +3. Run the `grafana_password` action (manual trigger) +4. 
The action output will display: + - Grafana URL: `https://{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}/grafana` + - Username: `admin` + - Password: (randomly generated, stored in AWS Secrets Manager) +5. Open the Grafana URL in your browser and log in with the credentials + +Grafana is served from the `/grafana` path on the same ALB as Coder, reducing infrastructure cost and complexity. + +**Available Dashboards**: +- Coder Status - Overview of Coder health +- Coder Coderd - Control plane metrics +- Workspaces - Workspace utilization and performance +- Workspace Detail - Individual workspace deep-dive +- Provisioner - Terraform provisioner metrics +- Postgres Database - RDS performance +- Infrastructure - Node metrics + +The admin password is generated once during initial deployment and persisted in AWS Secrets Manager for the lifetime of the installation. ## Coder Resources @@ -63,4 +94,12 @@ This follows Nuon best practices for deploying public Helm charts. [Coder Releases](https://github.com/coder/coder/releases/) +[Coder Monitoring](https://coder.com/docs/admin/monitoring) + +[Coder Kubernetes Logs Integration](https://coder.com/docs/admin/integrations/kubernetes-logs) + +[Coder Logstream Kube GitHub](https://github.com/coder/coder-logstream-kube) + +[Coder Observability GitHub](https://github.com/coder/observability) + [AWS Instance Types](https://aws.amazon.com/ec2/instance-types/) diff --git a/coder/actions/coder_rds_creds.toml b/coder/actions/coder_rds_creds.toml index 165bb3a..87e7da1 100644 --- a/coder/actions/coder_rds_creds.toml +++ b/coder/actions/coder_rds_creds.toml @@ -16,7 +16,7 @@ command = "./import.sh" [steps.public_repo] repo = "nuonco/example-app-configs" directory = "coder/src/actions/rds_secrets" -branch = "main" +branch = "mm/coder-observe-kubelogstream" [steps.env_vars] SECRET_ARN = "{{ .nuon.components.rds_cluster_coder.outputs.db_instance_master_user_secret_arn }}" @@ -26,3 +26,4 @@ TARGET_NAMESPACE = "coder" DB_ADDRESS = 
"{{ .nuon.components.rds_cluster_coder.outputs.address }}" DB_PORT = "{{ .nuon.components.rds_cluster_coder.outputs.db_instance_port }}" DB_NAME = "{{ .nuon.components.rds_cluster_coder.outputs.db_instance_name }}" +INSTALL_ID = "{{ .nuon.install.id }}" diff --git a/coder/actions/default-storage-class.toml b/coder/actions/default-storage-class.toml index ada7984..e32fc8f 100644 --- a/coder/actions/default-storage-class.toml +++ b/coder/actions/default-storage-class.toml @@ -3,10 +3,6 @@ name = "default_storage_class" timeout = "1m" -[[triggers]] -type = "pre-deploy-component" -component_name = "coder" - [[triggers]] type = "manual" diff --git a/coder/actions/grafana-password.toml b/coder/actions/grafana-password.toml new file mode 100644 index 0000000..634291d --- /dev/null +++ b/coder/actions/grafana-password.toml @@ -0,0 +1,18 @@ +name = "grafana_password" +timeout = "1m" + +[[triggers]] +type = "manual" + +[[steps]] +name = "Retrieve Grafana Admin Password" +command = "./get-password.sh" + +[steps.public_repo] +repo = "nuonco/example-app-configs" +directory = "coder/src/actions/grafana-password" +branch = "mm/coder-observe-kubelogstream" + +[steps.env_vars] +REGION = "{{ .nuon.install_stack.outputs.region }}" +INSTALL_ID = "{{ .nuon.install.id }}" diff --git a/coder/actions/grafana-setup.toml b/coder/actions/grafana-setup.toml new file mode 100644 index 0000000..c8b94b3 --- /dev/null +++ b/coder/actions/grafana-setup.toml @@ -0,0 +1,22 @@ +name = "grafana_setup" +timeout = "2m" + +[[triggers]] +type = "pre-deploy-component" +component_name = "observability" + +[[triggers]] +type = "manual" + +[[steps]] +name = "Create Grafana Admin Secret" +command = "./setup.sh" + +[steps.public_repo] +repo = "nuonco/example-app-configs" +directory = "coder/src/actions/grafana-setup" +branch = "mm/coder-observe-kubelogstream" + +[steps.env_vars] +REGION = "{{ .nuon.install_stack.outputs.region }}" +INSTALL_ID = "{{ .nuon.install.id }}" diff --git 
a/coder/actions/remove-gp2-default.toml b/coder/actions/remove-gp2-default.toml deleted file mode 100644 index fffcdb1..0000000 --- a/coder/actions/remove-gp2-default.toml +++ /dev/null @@ -1,16 +0,0 @@ -# action - -name = "remove_gp2_default" -timeout = "1m" - -[[triggers]] -type = "manual" - -[[steps]] -name = "remove_gp2_default_annotation" -inline_contents = """ -#!/usr/bin/env sh -kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' -echo "Removed default annotation from gp2 storage class" -kubectl get storageclass -""" diff --git a/coder/components/1-rds_cluster_coder.toml b/coder/components/1-rds_cluster_coder.toml index 65ec11b..3fa2d35 100644 --- a/coder/components/1-rds_cluster_coder.toml +++ b/coder/components/1-rds_cluster_coder.toml @@ -2,6 +2,7 @@ name = "rds_cluster_coder" type = "terraform_module" terraform_version = "1.13.5" +dependencies = ["rds_subnet"] [public_repo] repo = "nuonco/example-app-configs" diff --git a/coder/components/4-alb.toml b/coder/components/4-alb.toml index 553176a..5720569 100644 --- a/coder/components/4-alb.toml +++ b/coder/components/4-alb.toml @@ -3,12 +3,12 @@ name = "application_load_balancer" type = "helm_chart" chart_name = "application-load-balancer" -dependencies = ["coder"] +dependencies = ["coder", "certificate"] [public_repo] repo = "nuonco/example-app-configs" directory = "coder/src/components/alb" -branch = "main" +branch = "mm/coder-observe-kubelogstream" [values] domain_certificate = "{{.nuon.components.certificate.outputs.public_domain_certificate_arn}}" diff --git a/coder/components/5-kubelogstream.toml b/coder/components/5-kubelogstream.toml new file mode 100644 index 0000000..8df6d7d --- /dev/null +++ b/coder/components/5-kubelogstream.toml @@ -0,0 +1,13 @@ +name = "kubelogstream" +type = "helm_chart" +chart_name = "coder-logstream-kube" +namespace = "coder" +storage_driver = "configmap" +dependencies = ["coder", 
"application_load_balancer"] + +[helm_repo] +repo_url = "https://helm.coder.com/logstream-kube" +chart = "coder-logstream-kube" + +[[values_file]] +contents = "./values/kubelogstream.yaml" diff --git a/coder/components/6-observability.toml b/coder/components/6-observability.toml new file mode 100644 index 0000000..41569b6 --- /dev/null +++ b/coder/components/6-observability.toml @@ -0,0 +1,13 @@ +name = "observability" +type = "helm_chart" +chart_name = "coder-observability" +namespace = "coder-observability" +storage_driver = "configmap" +dependencies = ["coder", "application_load_balancer"] + +[helm_repo] +repo_url = "https://helm.coder.com/observability" +chart = "coder-observability" + +[[values_file]] +contents = "./values/observability.yaml" diff --git a/coder/components/values/coder.yaml b/coder/components/values/coder.yaml index 948bc26..95c9fe5 100644 --- a/coder/components/values/coder.yaml +++ b/coder/components/values/coder.yaml @@ -13,6 +13,10 @@ coder: replicaCount: "{{.nuon.inputs.inputs.replicas}}" + podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "2112" + # since tls is terminated at alb tls: secretNames: [] @@ -84,6 +88,10 @@ coder: value: "{{.nuon.inputs.inputs.telemetry}}" - name: CODER_PROMETHEUS_ENABLE value: "true" + - name: CODER_PROMETHEUS_COLLECT_AGENT_STATS + value: "true" + - name: CODER_LOGGING_HUMAN + value: "/dev/stderr" - name: CODER_UPDATE_CHECK value: "true" diff --git a/coder/components/values/kubelogstream.yaml b/coder/components/values/kubelogstream.yaml new file mode 100644 index 0000000..75ea015 --- /dev/null +++ b/coder/components/values/kubelogstream.yaml @@ -0,0 +1,20 @@ +url: "https://{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}" + +namespaces: ["coder"] + +image: + repo: "ghcr.io/coder/coder-logstream-kube" + pullPolicy: IfNotPresent + +resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + +serviceAccount: + name: coder-logstream-kube + 
annotations: {} + labels: {} diff --git a/coder/components/values/observability.yaml b/coder/components/values/observability.yaml new file mode 100644 index 0000000..8e98089 --- /dev/null +++ b/coder/components/values/observability.yaml @@ -0,0 +1,117 @@ +global: + coder: + controlPlaneNamespace: coder + externalProvisionersNamespace: coder + coderdSelector: 'pod=~`coder.*`, pod!~`.*provisioner.*`' + provisionerdSelector: 'pod=~`coder-provisioner.*`' + workspacesSelector: 'namespace=`coder`' + logFormat: human + + postgres: + hostname: "{{.nuon.components.rds_cluster_coder.outputs.address}}" + port: "{{.nuon.components.rds_cluster_coder.outputs.db_instance_port}}" + database: coder + username: coder + mountSecret: coder-db-password + exporter: + enabled: true + image: "quay.io/prometheuscommunity/postgres-exporter" + + telemetry: + metrics: + scrape_interval: 15s + scrape_timeout: 12s + native_histograms: false + + alerts: + enabled: true + kind: "configmap" + + dashboards: + enabled: true + timerange: 12h + refresh: 30s + queryTimeout: 900 + +grafana-agent: + enabled: true + controller: + type: daemonset + +grafana: + enabled: true + replicas: 1 + env: + GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION: "false" + admin: + existingSecret: grafana-admin + userKey: username + passwordKey: password + persistence: + enabled: true + size: 10Gi + storageClassName: ebs-auto + ingress: + enabled: true + ingressClassName: alb + annotations: + alb.ingress.kubernetes.io/group.name: "{{.nuon.install.id}}" + alb.ingress.kubernetes.io/group.order: "200" + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + hosts: + - "{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}" + path: /grafana + pathType: Prefix + "grafana.ini": + server: + domain: "{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}" + root_url: "https://{{.nuon.install.sandbox.outputs.nuon_dns.public_domain.name}}/grafana" + serve_from_sub_path: true + 
security: + password_policy: false + "auth.anonymous": + enabled: false + analytics: + reporting_enabled: false + users: + allow_sign_up: false + +prometheus: + enabled: true + server: + replicaCount: 1 + persistentVolume: + enabled: true + size: 20Gi + storageClassName: ebs-auto + retentionSize: 18GB + retention: 15d + resources: + limits: + cpu: 1000m + memory: 2Gi + requests: + cpu: 500m + memory: 1Gi + alertmanager: + enabled: true + persistentVolume: + enabled: true + size: 2Gi + storageClassName: ebs-auto + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + +loki: + enabled: true + backend: + replicas: 1 + read: + replicas: 1 + write: + replicas: 1 + minio: + enabled: true diff --git a/coder/runner.toml b/coder/runner.toml index 21bdd6d..e6c02e6 100644 --- a/coder/runner.toml +++ b/coder/runner.toml @@ -4,5 +4,3 @@ runner_type = "aws" helm_driver = "configmap" init_script_url = "https://raw.githubusercontent.com/nuonco/runner/refs/heads/main/scripts/aws/init-mng-v2.sh" -[env_vars] -foo = "bar" diff --git a/coder/sandbox.tfvars b/coder/sandbox.tfvars index 5c3e888..918ec86 100644 --- a/coder/sandbox.tfvars +++ b/coder/sandbox.tfvars @@ -1,7 +1,7 @@ ebs_storage_class = { enabled = true name = "ebs-auto" - is_default_class = false + is_default_class = true provisioner = "ebs.csi.eks.amazonaws.com" volume_binding_mode = "WaitForFirstConsumer" reclaim_policy = "Delete" @@ -13,7 +13,7 @@ ebs_storage_class = { } -additional_namespaces = ["coder"] +additional_namespaces = ["coder", "coder-observability"] enable_irsa = true diff --git a/coder/src/actions/grafana-password/get-password.sh b/coder/src/actions/grafana-password/get-password.sh new file mode 100755 index 0000000..a49aae0 --- /dev/null +++ b/coder/src/actions/grafana-password/get-password.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -e +set -o pipefail +set -u + +region="$REGION" +install_id="$INSTALL_ID" +secret_name="grafana-admin-${install_id}" + +echo 
"==========================================" +echo "Grafana Admin Credentials" +echo "==========================================" +echo "" + +secret=$(aws --region "$region" secretsmanager get-secret-value --secret-id="$secret_name") +username=$(echo "$secret" | jq -r '.SecretString' | jq -r '.username') +password=$(echo "$secret" | jq -r '.SecretString' | jq -r '.password') + +echo "URL: https://{{ .nuon.install.sandbox.outputs.nuon_dns.public_domain.name }}/grafana" +echo "Username: $username" +echo "Password: $password" +echo "" +echo "==========================================" diff --git a/coder/src/actions/grafana-setup/setup.sh b/coder/src/actions/grafana-setup/setup.sh new file mode 100755 index 0000000..da6e569 --- /dev/null +++ b/coder/src/actions/grafana-setup/setup.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -e +set -o pipefail +set -u + +region="$REGION" +install_id="$INSTALL_ID" +grafana_secret_name="grafana-admin-${install_id}" +grafana_username="admin" +observability_namespace="coder-observability" + +echo "[grafana-setup] creating namespace if not exists" +kubectl create namespace "$observability_namespace" --dry-run=client -o yaml | kubectl apply -f - + +echo "[grafana-setup] checking if Grafana admin secret exists in Secrets Manager" +if aws --region "$region" secretsmanager describe-secret --secret-id="$grafana_secret_name" 2>/dev/null; then + echo "[grafana-setup] Grafana admin secret already exists, retrieving" + grafana_secret=$(aws --region "$region" secretsmanager get-secret-value --secret-id="$grafana_secret_name") + grafana_password=$(echo "$grafana_secret" | jq -r '.SecretString' | jq -r '.password') +else + echo "[grafana-setup] generating new Grafana admin password" + grafana_password=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-32) + + echo "[grafana-setup] storing Grafana admin password in Secrets Manager" + aws --region "$region" secretsmanager create-secret \ + --name "$grafana_secret_name" \ + --description "Grafana admin 
credentials for Nuon install ${install_id}" \ + --secret-string "{\"username\":\"${grafana_username}\",\"password\":\"${grafana_password}\"}" \ + --tags Key=nuon-install-id,Value="${install_id}" Key=component,Value=observability +fi + +echo "[grafana-setup] creating Grafana admin secret in Kubernetes" +kubectl create -n "$observability_namespace" secret generic grafana-admin \ + --save-config \ + --dry-run=client \ + --from-literal=username="$grafana_username" \ + --from-literal=password="$grafana_password" \ + -o yaml | kubectl apply -f - + +echo "[grafana-setup] Grafana admin secret created successfully" diff --git a/coder/src/actions/rds_secrets/import.sh b/coder/src/actions/rds_secrets/import.sh index 8711c2b..c8497c8 100755 --- a/coder/src/actions/rds_secrets/import.sh +++ b/coder/src/actions/rds_secrets/import.sh @@ -41,3 +41,20 @@ kubectl create -n "$namespace" secret generic "$name" \ -o yaml | kubectl apply -f - echo "[rds-secrets import] secret created successfully" + +# Also create secret for observability namespace +observability_namespace="coder-observability" +observability_secret_name="coder-db-password" + +echo "[rds-secrets import] creating observability namespace if not exists" +kubectl create namespace "$observability_namespace" --dry-run=client -o yaml | kubectl apply -f - + +echo "[rds-secrets import] creating postgres password secret for observability" +kubectl create -n "$observability_namespace" secret generic "$observability_secret_name" \ + --save-config \ + --dry-run=client \ + --from-literal=PGPASSWORD="$password" \ + -o yaml | kubectl apply -f - + +echo "[rds-secrets import] observability secret created successfully" + diff --git a/coder/src/components/alb/templates/alb.tpl b/coder/src/components/alb/templates/alb.tpl index a287447..e7ab222 100644 --- a/coder/src/components/alb/templates/alb.tpl +++ b/coder/src/components/alb/templates/alb.tpl @@ -17,8 +17,10 @@ metadata: alb.ingress.kubernetes.io/unhealthy-threshold-count: '2' 
alb.ingress.kubernetes.io/healthy-threshold-count: '2' alb.ingress.kubernetes.io/certificate-arn: {{ .Values.domain_certificate }} + alb.ingress.kubernetes.io/group.name: {{ .Values.install_name }} + alb.ingress.kubernetes.io/group.order: "1000" - external-dns.alpha.kubernetes.io/hostname: {{ .Values.domain }} + external-dns.alpha.kubernetes.io/hostname: {{ .Values.domain }},*.{{ .Values.domain }} spec: ingressClassName: alb rules: