diff --git a/README.md b/README.md index 6cf39221..ad92f583 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ The cluster-deployment tools here include helm charts and ansible playbooks to s * Pod security policies * Automatic certificate issuing/renewal with Letsencrypt * PostgreSQL-operator from CrunchyData +* Grafana with prometheus-based alerting ### Resource definitions @@ -70,6 +71,7 @@ The cluster-deployment tools here include helm charts and ansible playbooks to s | duplicati | [![](https://img.shields.io/docker/v/instantlinux/duplicati?sort=date)](https://hub.docker.com/r/instantlinux/duplicati "Version badge") | backups | | ez-ipupdate | [![](https://img.shields.io/docker/v/instantlinux/ez-ipupdate?sort=date)](https://hub.docker.com/r/instantlinux/ez-ipupdate "Version badge") | Dynamic DNS client | | haproxy-keepalived | [![](https://img.shields.io/docker/v/instantlinux/haproxy-keepalived?sort=date)](https://hub.docker.com/r/instantlinux/haproxy-keepalived "Version badge") | load balancer | +| grafana | ** | monitoring dashboard with prometheus-based alerting | | guacamole | ** | authenticated remote-desktop server | | logspout | ** | central logging for Docker | | mysqldump | [![](https://img.shields.io/docker/v/instantlinux/mysqldump?sort=date)](https://hub.docker.com/r/instantlinux/mysqldump "Version badge") | per-database alternative to xtrabackup | diff --git a/ansible/roles/monitoring_agent/defaults/main.yml b/ansible/roles/monitoring_agent/defaults/main.yml index 5a576298..0b85bc15 100644 --- a/ansible/roles/monitoring_agent/defaults/main.yml +++ b/ansible/roles/monitoring_agent/defaults/main.yml @@ -83,6 +83,7 @@ syslog: "{{ syslog_defaults | combine(syslog_override) }}" ubuntu_packages: - bc - nagios-nrpe-server + - prometheus-node-exporter - python3-pip - rsyslog - smartmontools diff --git a/k8s/Makefile b/k8s/Makefile index 46b39c72..f0e83b4d 100644 --- a/k8s/Makefile +++ b/k8s/Makefile @@ -95,14 +95,14 @@ $(STACKS):: .PHONY: envsubst 
imports install namespace_config node_labels \ persistent remote_volumes secrets sops untaint_master -IMPORTS = cert-manager flannel metrics +IMPORTS = cert-manager flannel INSTALL_YAML = $(basename $(wildcard install/*.yaml)) \ $(addprefix imports/, $(IMPORTS)) VOLUMES_YAML = $(basename $(wildcard volumes/*.yaml)) install: install/admin-user cluster_network \ install/local-storage storage_localdefault imports \ - install_imports namespace_config install/gitlab-rbac \ + install_imports namespace_config install/prometheus-rbac \ install/k8s-backup install/logspout remote_volumes \ sops data-sync-ssh persistent secrets install/ingress-nginx \ install/cert-manager @@ -219,10 +219,14 @@ cert-manager-helm: helm_install imports: $(foreach file,$(IMPORTS),imports/$(file).yaml) install_imports: $(foreach file, $(IMPORTS), imports/$(file)) -imports/metrics.yaml: imports/metrics-$(VERSION_METRICS).yaml +imports/kube-state-metrics: imports/kube-state-metrics-$(VERSION_METRICS) ln -s $(notdir $<) $@ -imports/metrics-$(VERSION_METRICS).yaml: - curl -sLo $@ https://github.com/kubernetes-sigs/metrics-server/releases/download/v$(VERSION_METRICS)/components.yaml +imports/kube-state-metrics-$(VERSION_METRICS): + git clone --depth 1 --branch v$(VERSION_METRICS) \ + https://github.com/kubernetes/kube-state-metrics \ + imports/kube-state-metrics-$(VERSION_METRICS) +install_metrics: imports/kube-state-metrics + kubectl apply --context=sudo -k $< + smtp_smarthost: smtp:587 + route: + group_by: [ alertname ] + group_wait: 10s + group_interval: 1m + repeat_interval: 1d + receiver: SMTP + receivers: + - name: SMTP + email_configs: + - require_tls: true + send_resolved: true + to: monitor@example.com + inhibit_rules: + - source_match: + severity: critical + target_match: + severity: warning + equal: [ alertname, dev, instance ] diff --git a/k8s/helm/grafana/subcharts/prometheus/.helmignore b/k8s/helm/grafana/subcharts/prometheus/.helmignore new file mode 100644 index 00000000..839de881 ---
/dev/null +++ b/k8s/helm/grafana/subcharts/prometheus/.helmignore @@ -0,0 +1,2 @@ +*~ +.git diff --git a/k8s/helm/grafana/subcharts/prometheus/Chart.yaml b/k8s/helm/grafana/subcharts/prometheus/Chart.yaml new file mode 100644 index 00000000..58bbd3e6 --- /dev/null +++ b/k8s/helm/grafana/subcharts/prometheus/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +name: prometheus +description: Prometheus metrics and alerting +home: https://github.com/instantlinux/docker-tools +sources: +- https://github.com/instantlinux/docker-tools +- https://hub.docker.com/r/prom/prometheus +type: application +version: 0.1.0 +appVersion: "v3.3.1" +dependencies: +- name: chartlib + version: 0.1.8 + repository: https://instantlinux.github.io/docker-tools diff --git a/k8s/helm/grafana/subcharts/prometheus/templates/NOTES.txt b/k8s/helm/grafana/subcharts/prometheus/templates/NOTES.txt new file mode 100644 index 00000000..62ea3f4b --- /dev/null +++ b/k8s/helm/grafana/subcharts/prometheus/templates/NOTES.txt @@ -0,0 +1,28 @@ +{{- if hasKey .Values "service" }} +{{- if or .Values.service.enabled (not (hasKey .Values.service "enabled")) }} +1. Get the application URL by running these commands: +{{- if hasKey .Values "ingress" }} +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "local.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. 
+ You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "local.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "local.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "local.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/grafana/subcharts/prometheus/templates/app.yaml b/k8s/helm/grafana/subcharts/prometheus/templates/app.yaml new file mode 100644 index 00000000..5a01911b --- /dev/null +++ b/k8s/helm/grafana/subcharts/prometheus/templates/app.yaml @@ -0,0 +1,15 @@ +{{- include "chartlib.configmap" . }} +--- +{{- include "chartlib.deployment" . }} +--- +{{- include "chartlib.hpa" . }} +--- +{{- include "chartlib.ingress" . }} +--- +{{- include "chartlib.ingresstotp" . }} +--- +{{- include "chartlib.service" . }} +--- +{{- include "chartlib.serviceaccount" . }} +--- +{{- include "chartlib.statefulset" . 
}} diff --git a/k8s/helm/grafana/subcharts/prometheus/templates/tests/test-connection.yaml b/k8s/helm/grafana/subcharts/prometheus/templates/tests/test-connection.yaml new file mode 100644 index 00000000..ae159a4f --- /dev/null +++ b/k8s/helm/grafana/subcharts/prometheus/templates/tests/test-connection.yaml @@ -0,0 +1,17 @@ +{{- if hasKey .Values "service" }} +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "local.fullname" . }}-test-connection" + labels: + {{- include "local.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "local.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never +{{- end }} diff --git a/k8s/helm/grafana/subcharts/prometheus/values.yaml b/k8s/helm/grafana/subcharts/prometheus/values.yaml new file mode 100644 index 00000000..e276f45a --- /dev/null +++ b/k8s/helm/grafana/subcharts/prometheus/values.yaml @@ -0,0 +1,138 @@ +# Default values for prometheus. 
+deployment: + command: + - /bin/prometheus + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=90d + - --web.external-url=http://10.101.1.21:9090 + containerPorts: [ containerPort: 9090 ] + nodeSelector: + service.prometheus: allow +volumeMounts: +- mountPath: /etc/prometheus/prometheus.yml + name: config + readOnly: true + subPath: prometheus.yml +- mountPath: /etc/prometheus/alert-rules.yml + name: config + readOnly: true + subPath: alert-rules.yml +- mountPath: /etc/prometheus/targets.json + name: config + readOnly: true + subPath: targets.json +- mountPath: /prometheus + name: data +volumes: +- name: config + configMap: + name: grafana-prometheus +- name: data + hostPath: { path: /var/lib/docker/k8s-volumes/prometheus } +image: + repository: prom/prometheus + pullPolicy: IfNotPresent + # tag: default + +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + enabled: true + create: true +service: + clusterIP: 10.101.1.21 + ports: [ port: 9090 ] + type: ClusterIP +autoscaling: + enabled: false + +configmap: + data: + prometheus.yml: | + global: + scrape_interval: 1m + evaluation_interval: 1m + alerting: + alertmanagers: + - static_configs: + - targets: + - grafana-alertmanager:9093 + rule_files: [ alert-rules.yml ] + scrape_configs: + - job_name: prometheus + static_configs: + - targets: [ localhost:9090 ] + - job_name: hw-nodes + file_sd_configs: + - files: [ targets.json ] + relabel_configs: + - source_labels: [__address__] + target_label: instance + - source_labels: [ __address__ ] + target_label: __address__ + replacement: '${1}:9100' + # Next two directives define label alertSuppress and apply + # it to nodes with a specified hostname prefix + - source_labels: [ instance ] + target_label: alertSuppress + replacement: 'false' + - source_labels: [ instance ] + regex: ^myth.* + target_label: alertSuppress + replacement: 'true' + targets.json: | + # Override the targets with your nodes list,
comma-separated + [ + { + "labels": { + "job": "hw-nodes" + }, + "targets": [ + "localhost" + ] + } + ] + alert-rules.yml: | + groups: + - name: systems + rules: + - alert: InstanceDown + expr: up{alertSuppress="false"} == 0 + for: 15s + labels: + severity: critical + annotations: + summary: "Instance [{{ $labels.instance }}] down" + description: "[{{ $labels.instance }}] of {{ $labels.job }} is down" + + - alert: DiskSpaceLow + # To skip volumes on a monitored node, add to that node's config + # /etc/defaults/prometheus-node-exporter + # ARGS="--collector.filesystem.ignored-mount-points=" + expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) + for: 2m + labels: + severity: critical + annotations: + summary: Low disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (> 90%) Value = {{ $value }}" + + - alert: CPULoadHigh + expr: sum by (instance) (node_load1) > node:cpu_core:count + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load average is high Value = {{ $value }}" + + - alert: NTPClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock is out of sync, ensure NTP is configured correctly on this host. Value = {{ $value }}" diff --git a/k8s/helm/grafana/templates/NOTES.txt b/k8s/helm/grafana/templates/NOTES.txt new file mode 100644 index 00000000..62ea3f4b --- /dev/null +++ b/k8s/helm/grafana/templates/NOTES.txt @@ -0,0 +1,28 @@ +{{- if hasKey .Values "service" }} +{{- if or .Values.service.enabled (not (hasKey .Values.service "enabled")) }} +1. 
Get the application URL by running these commands: +{{- if hasKey .Values "ingress" }} +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "local.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "local.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "local.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "local.name" . 
}},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/k8s/helm/grafana/templates/app.yaml b/k8s/helm/grafana/templates/app.yaml new file mode 100644 index 00000000..5a01911b --- /dev/null +++ b/k8s/helm/grafana/templates/app.yaml @@ -0,0 +1,15 @@ +{{- include "chartlib.configmap" . }} +--- +{{- include "chartlib.deployment" . }} +--- +{{- include "chartlib.hpa" . }} +--- +{{- include "chartlib.ingress" . }} +--- +{{- include "chartlib.ingresstotp" . }} +--- +{{- include "chartlib.service" . }} +--- +{{- include "chartlib.serviceaccount" . }} +--- +{{- include "chartlib.statefulset" . }} diff --git a/k8s/helm/grafana/templates/tests/test-connection.yaml b/k8s/helm/grafana/templates/tests/test-connection.yaml new file mode 100644 index 00000000..ae159a4f --- /dev/null +++ b/k8s/helm/grafana/templates/tests/test-connection.yaml @@ -0,0 +1,17 @@ +{{- if hasKey .Values "service" }} +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "local.fullname" . }}-test-connection" + labels: + {{- include "local.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "local.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never +{{- end }} diff --git a/k8s/helm/grafana/values.yaml b/k8s/helm/grafana/values.yaml new file mode 100644 index 00000000..1ba4cabe --- /dev/null +++ b/k8s/helm/grafana/values.yaml @@ -0,0 +1,97 @@ +# Default values for grafana. 
+tlsHostname: grafana.example.com +deployment: + env: + gf_analytics_reporting_enabled: false + gf_analytics_check_for_updates: false + gf_database_type: mysql + gf_database_host: db00 + gf_database_name: grafana + gf_database_password__file: /run/secrets/grafana-db-password + gf_database_user: grafana + gf_remote_cache_type: redis + gf_remote_cache_connstr: addr=grafana-redis:6379,pool_size=100,db=0,ssl=false + # See note in volumeMounts below + # gf_security_admin_password__file: /run/secrets/admin_password + gf_security_admin_email: admin@example.com + gf_server_domain: grafana.example.com + gf_server_root_url: https://grafana.example.com + gf_smtp_enabled: true + gf_smtp_host: smtp:25 + gf_users_allow_sign_up: false + tz: America/Los_Angeles + uid: 472 + containerPorts: + - containerPort: 3000 + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + resources: + limits: + memory: 256Mi + requests: + cpu: 100m + memory: 64Mi +volumeMounts: +- mountPath: /var/lib/grafana + name: data + subPath: data +- mountPath: /run/secrets/grafana-db-password + name: grafana-db-password + readOnly: true + subPath: grafana-db-password +# Add an admin_password key in the secret and override volumeMounts to +# include this mountPath at first run if you want to set the admin pw +# automatically. This value is not used after initial setup. 
+# +# - mountPath: /run/secrets/admin_password +# name: grafana-db-password +# readOnly: true +# subPath: admin_password +volumes: +- name: data + hostPath: { path: /var/lib/docker/k8s-volumes/share/grafana } +- name: grafana-db-password + secret: + secretName: grafana-db-password + +image: + repository: grafana/grafana-enterprise + pullPolicy: IfNotPresent + # tag: default + +nameOverride: "" +fullnameOverride: "" + +serviceAccount: {} +service: + clusterIP: None + ports: + - { port: 80, targetPort: 3000, name: grafana } + type: ClusterIP +autoscaling: + enabled: false + +authelia: + fqdn: authtotp.example.com + ip: 10.101.1.5 + path: /login +ingress: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/enable-access-log: "false" +ingressTOTP: + enabled: true + +# Subchart parameters +prometheus: + enabled: true +alertmanager: + enabled: true +redis: + enabled: false diff --git a/k8s/install/prometheus-rbac.yaml b/k8s/install/prometheus-rbac.yaml new file mode 100644 index 00000000..c02e37fc --- /dev/null +++ b/k8s/install/prometheus-rbac.yaml @@ -0,0 +1,32 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: $K8S_NAMESPACE:prometheus-scraper-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: $K8S_NAMESPACE:prometheus-scraper +subjects: +- kind: ServiceAccount + name: grafana-prometheus + namespace: $K8S_NAMESPACE +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: $K8S_NAMESPACE:prometheus-scraper +rules: +- apiGroups: [""] + resources: + - endpoints + - nodes + - pods + - services + verbs: [get, list, watch] +- apiGroups: [extensions] + resources: + - ingresses + verbs: [get, list, watch] +- nonResourceURLs: ["/metrics"] + verbs: [get]