diff --git a/docker-compose.yml b/docker-compose.yml
index b047c88..541b002 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -72,6 +72,21 @@ services:
       - "/var/run/docker.sock:/var/run/docker.sock"
       - portainer_data:/data
 
+  scraper:
+    container_name: system_status
+    build:
+      context: .
+      dockerfile: ./status/dockerfile
+    volumes:
+      - ./status/health.html:/app/health.html
+      - ./status/json.json:/app/json.json
+    restart: 'on-failure'
+    command:
+      - --config=json.json
+      - --prometheus-url=http://one.sce/prometheus
+      - -vvv
+
 volumes:
   alertmanager-data:
   grafana-data:
diff --git a/status/dockerfile b/status/dockerfile
new file mode 100644
index 0000000..1a9ea20
--- /dev/null
+++ b/status/dockerfile
@@ -0,0 +1,16 @@
+# Built with the repo root as the build context, so COPY paths are relative to it.
+FROM python:3.9-slim
+
+WORKDIR /app
+
+COPY ./status/requirements.txt .
+
+RUN pip3 install -r requirements.txt
+
+COPY ./status/flags.py .
+
+COPY ./status/scraper.py .
+
+COPY ./status/health.html /app/health.html
+
+ENTRYPOINT ["python3", "scraper.py"]
\ No newline at end of file
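Note that the Dockerfile copies ./status/flags.py, which is not added in this diff, so that file has to already exist in the repo for the image to build (scraper.py defines its own get_args() and only references flags behind a comment). With the compose file at the repo root, which is what `context: .` assumes, the new service can be built and exercised on its own:

    docker compose build scraper
    docker compose up -d scraper
    docker compose logs -f scraper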

diff --git a/status/health.html b/status/health.html
new file mode 100644
index 0000000..ade2ca6
--- /dev/null
+++ b/status/health.html
@@ -0,0 +1,93 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Prometheus Metrics</title>
+</head>
+<body>
+    <h1>Prometheus Metrics</h1>
+    <p>Last updated: {{ timestamp }}</p>
+    <table>
+        <tr>
+            <th>Job</th>
+            <th>Status</th>
+        </tr>
+        {% for item in metrics %}
+        <tr>
+            <td>{{ item.job }}</td>
+            <td>{{ item.status }}</td>
+        </tr>
+        {% endfor %}
+    </table>
+</body>
+</html>
\ No newline at end of file
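The `/` route in scraper.py (later in this diff) renders this template with `metrics` bound to the module-level `metrics_data` list and `timestamp` set to the current US/Pacific time. Each `item` is a plain dict; the values below are illustrative only, and the error-path entries appended by `process_up_query` omit the `job` key, so `{{ item.job }}` renders as an empty cell for those rows:

    {"instance": "prometheus:9090", "job": "prometheus", "status": "Healthy"}
    {"instance": "node-exporter:9100", "job": "node", "status": "Unhealthy as of 2024-05-01 08:15:00 PDT"}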
diff --git a/status/json.json b/status/json.json
new file mode 100644
index 0000000..5c8efee
--- /dev/null
+++ b/status/json.json
@@ -0,0 +1,6 @@
+[
+  {
+    "job-id": "prometheus-aggregation",
+    "query": "up"
+  }
+]
\ No newline at end of file
diff --git a/status/nginx.conf b/status/nginx.conf
new file mode 100644
index 0000000..35277a6
--- /dev/null
+++ b/status/nginx.conf
@@ -0,0 +1,31 @@
+http {
+    # Define cache path and parameters
+    proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=html_cache:10m max_size=100m inactive=60m;
+    proxy_temp_path /var/cache/nginx/temp;
+
+    # for http://localhost/status
+    server {
+        listen 80;
+        server_name _;
+
+        # Enable caching
+        proxy_cache html_cache;
+        proxy_cache_valid 200 302 30s;
+        proxy_cache_valid 404 30s;
+
+        location / {
+            proxy_pass http://scraper:8000;
+
+            # Cache HTML files
+            location ~* \.html$ {
+                proxy_pass http://scraper:8000;
+                proxy_cache html_cache;
+                proxy_cache_min_uses 1;
+                proxy_cache_lock on;
+                add_header X-Cache-Status $upstream_cache_status;
+            }
+        }
+    }
+}
+
+events { }
\ No newline at end of file
diff --git a/status/requirements.txt b/status/requirements.txt
new file mode 100644
index 0000000..112ac36
--- /dev/null
+++ b/status/requirements.txt
@@ -0,0 +1,6 @@
+fastapi==0.84.0
+uvicorn==0.18.3
+Jinja2==3.0.2
+py-grpc-prometheus==0.7.0
+prometheus_api_client==0.4.0
+
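nginx.conf proxies to http://scraper:8000 and caches rendered pages for 30s, but this diff does not add an nginx service to docker-compose.yml, so the config presumably gets mounted into a reverse proxy that already exists elsewhere. If a dedicated container were wanted instead, a sketch along these lines would work on the same compose network (the service name and published port are assumptions, not part of this change):

    status_proxy:                                   # hypothetical service name
      image: nginx:1.25-alpine
      volumes:
        - ./status/nginx.conf:/etc/nginx/nginx.conf:ro
      ports:
        - "8080:80"
      depends_on:
        - scraper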
diff --git a/status/scraper.py b/status/scraper.py
new file mode 100644
index 0000000..7d6ae89
--- /dev/null
+++ b/status/scraper.py
@@ -0,0 +1,246 @@
+from dataclasses import dataclass
+from fastapi.responses import HTMLResponse
+from prometheus_api_client import PrometheusConnect
+from fastapi import FastAPI, Request
+from fastapi.templating import Jinja2Templates
+from fastapi.staticfiles import StaticFiles
+import uvicorn
+
+# from flags import get_args
+import json
+import time
+import threading
+import os
+from datetime import datetime, timedelta
+import pytz
+import requests
+import argparse
+import logging
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--query-interval-seconds",
+        "-int",
+        type=int,
+        default=15,
+        help="interval for how often queries should be done",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="port for the server to be hosted on, defaults to 8000",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="path to a json file that specifies which services to query",
+    )
+    parser.add_argument(
+        "--prometheus-url",
+        type=str,
+        default="http://one.sce/prometheus",
+        help="the url of the running prometheus container that has to be scraped",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="count",
+        default=0,
+        help="increase output verbosity",
+    )
+
+    return parser.parse_args()
+
+
+app = FastAPI()
+
+pacific_tz = pytz.timezone("US/Pacific")
+
+templates = Jinja2Templates(directory=".")
+
+args = get_args()
+
+prom = PrometheusConnect(
+    url=args.prometheus_url, disable_ssl=True
+)  # queries f"{args.prometheus_url}/api/v1/query?query=..."
+
+metrics_data = []
+up_hours = 24
+
+logging.Formatter.converter = time.gmtime
+
+logging.basicConfig(
+    # in mondo we trust
+    format="%(asctime)s.%(msecs)03dZ %(levelname)s:%(name)s:%(message)s",
+    datefmt="%Y-%m-%dT%H:%M:%S",
+    level=logging.ERROR - (args.verbose * 10),
+)
+
+
+@dataclass
+class metrics:
+    job_name: str
+    timestamp: float
+    value: float
+
+
+def check_status(query):
+    params = {"query": query}
+    try:
+        response = requests.get(f"{args.prometheus_url}/api/v1/query", params=params)
+        response.raise_for_status()  # Raise an error for HTTP issues
+        json_response = response.json()
+        if json_response.get("status") != "success":
+            logging.warning(
+                f"json response did not include success in status key, {json_response}"
+            )
+            return False
+        return True
+    except Exception as e:
+        logging.exception(f"Error querying Prometheus: {e}")
+        return None
+
+
+def polling_loop(interval, config):
+    global metrics_data
+    while True:
+        metrics_data = []
+        for hosts in config:
+            service_name = hosts.get("job-id", "prometheus-aggregation")
+            prom_query = hosts.get("query", "up")
+            process_up_query(prom_query, service_name)
+        time.sleep(interval)
+
+
+service_data = {}
+
+
+def process_up_query(query, service_name):
+    global metrics_data, service_data
+    process_time_query("time() - process_start_time_seconds", service_name)
+    if not check_status(query="up"):
+        logging.warning("status is not success, please look into it!!")
+    else:
+        logging.info("status is success in the query!!")
+    try:
+        result = prom.custom_query(query=query)
+        if not result:
+            logging.info(f"No results for query: {query}")
+            last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")
+            metrics_data.append(
+                {"instance": service_name, "status": "Error in querying"}
+            )
+            return
+
+        for metric in result:
+            instance = metric.get("metric", {}).get("instance", "unknown")
+            job_name = metric.get("metric", {}).get(
+                "job", "unknown"
+            )  # for later use in dataclass
+            value = metric.get("value", [None, None])[1]
+            # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")
+            if float(value) > 0:
+                status = "Healthy"
+            else:
+                status = "Unhealthy"
+            if status == "Unhealthy":
+                current = get_first_match_time(
+                    prom=prom, prom_query="up", match_value=0, hours=up_hours
+                )
+                metrics_data.append(
+                    {"instance": instance, "job": job_name, "status": current}
+                )
+                continue
+            metrics_data.append(
+                {"instance": instance, "job": job_name, "status": "Healthy"}
+            )
+    except Exception as e:
+        logging.exception(f"Error processing query '{query}': {e}")
+        metrics_data.append(
+            {"instance": service_name, "status": "Unhealthy due to error!"}
+        )
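+
+
+# Note: process_up_query() calls process_time_query() first so that the
+# module-level up_hours reflects how long the monitored process has been
+# running; get_first_match_time() then only scans that window when looking
+# for the most recent 1 -> 0 transition of the "up" series.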
+def process_time_query(query, service_name):
+    global metrics_data, up_hours
+    try:
+        result = prom.custom_query(query=query)
+        if result and len(result) > 0:
+            for metric in result:
+                instance = metric.get("metric", {}).get("instance", "unknown")
+                job_name = metric.get("metric", {}).get("job", "unknown")
+                uptime_seconds = float(metric.get("value", [None, None])[1] or 0)
+                up_hours = uptime_seconds // 3600
+                if up_hours == 0:
+                    up_hours = 1
+    except Exception as e:
+        logging.exception(f"Error processing time query '{query}': {e}")
+
+
+def get_first_match_time(prom, prom_query, match_value=0, hours=24):
+    global metrics_data
+    prom_query = "up"
+    start_time = datetime.now() - timedelta(hours=hours)
+    end_time = datetime.now()
+
+    try:
+        result = prom.get_metric_range_data(
+            metric_name=prom_query,
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+        for series in result:
+            saw_up = False
+            # Walk the samples newest-first; the first 0 seen after a 1 marks
+            # the most recent healthy -> unhealthy transition.
+            for timestamp, value in reversed(series.get("values", [])):
+                v = float(value)
+                if v == 1:
+                    saw_up = True
+                elif v == 0 and saw_up:
+                    utc_time = datetime.fromtimestamp(float(timestamp), tz=pytz.utc)
+                    pacific_time = utc_time.astimezone(pacific_tz)
+                    readable_time = pacific_time.strftime("%Y-%m-%d %H:%M:%S %Z")
+                    status = f"Unhealthy as of {readable_time}"
+                    return status
+    except Exception as e:
+        logging.exception(f"Error in get_first_match_time: {e}")
+        return "Error checking status history"
+    # No transition found inside the window: report plain Unhealthy instead of None.
+    return "Unhealthy"
+
+
+@app.get("/", response_class=HTMLResponse)
+async def get_metrics(request: Request):
+    return templates.TemplateResponse(
+        "health.html",
+        {
+            "request": request,
+            "metrics": metrics_data,
+            "timestamp": datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z"),
+        },
+    )
+
+
+def main():
+    try:
+        with open(args.config, "r") as file:
+            config = json.load(file)
+        polling_thread = threading.Thread(
+            target=polling_loop,
+            args=(args.query_interval_seconds, config),
+            daemon=True,
+        )  # daemon=True ensures the thread exits when the main program exits.
+        polling_thread.start()
+
+        uvicorn.run(app, host="0.0.0.0", port=args.port)
+    except FileNotFoundError:
+        logging.critical(f"Configuration file '{args.config}' not found!")
+        exit(1)
+    except Exception as e:
+        logging.exception("Unexpected error occurred!")
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
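For a quick run outside Docker the scraper can point at any reachable Prometheus; run it from the status/ directory so that Jinja2Templates(directory=".") can find health.html (the URL below is only an example):

    cd status
    pip3 install -r requirements.txt
    python3 scraper.py --config json.json --prometheus-url http://localhost:9090 -vv

The instant-query payload that check_status() inspects (and that prom.custom_query() unwraps into its "result" list) follows the standard Prometheus HTTP API envelope, roughly:

    {"status": "success",
     "data": {"resultType": "vector",
              "result": [{"metric": {"job": "prometheus", "instance": "localhost:9090"},
                          "value": [1700000000.0, "1"]}]}}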