diff --git a/docker-compose.yml b/docker-compose.yml
index b047c88..541b002 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -72,6 +72,21 @@ services:
- "/var/run/docker.sock:/var/run/docker.sock"
- portainer_data:/data
+ scraper:
+ container_name: system_status
+ build:
+ context: .
+ dockerfile: ./status/dockerfile
+ volumes:
+ - ./status/health.html:/app/health.html
+ - ./status/json.json:/app/json.json
+ restart: 'on-failure'
+ command:
+      - --config=json.json
+      - --prometheus-url=http://one.sce/prometheus
+ - -vvv
+
+
volumes:
alertmanager-data:
grafana-data:
diff --git a/status/dockerfile b/status/dockerfile
new file mode 100644
index 0000000..1a9ea20
--- /dev/null
+++ b/status/dockerfile
@@ -0,0 +1,16 @@
+# filepath: /Users/vineet/Projects/sce_system_status/prom/Dockerfile.scraper
+FROM python:3.9-slim
+
+WORKDIR /app
+
+COPY ./status/requirements.txt .
+
+RUN pip3 install -r requirements.txt
+
+COPY ./status/flags.py .
+
+COPY ./status/scraper.py .
+
+COPY ./status/health.html /app/health.html
+
+ENTRYPOINT ["python3", "scraper.py"]
\ No newline at end of file
diff --git a/status/health.html b/status/health.html
new file mode 100644
index 0000000..ade2ca6
--- /dev/null
+++ b/status/health.html
@@ -0,0 +1,93 @@
+
+
+
+ Prometheus Metrics
+
+
+
+ Prometheus Metrics
+ Last updated: {{ timestamp }}
+
+
+ | Job |
+ Status |
+
+ {% for item in metrics %}
+
+ | {{ item.job }} |
+
+ {{ item.status }}
+ |
+
+ {% endfor %}
+
+
+
\ No newline at end of file
diff --git a/status/json.json b/status/json.json
new file mode 100644
index 0000000..5c8efee
--- /dev/null
+++ b/status/json.json
@@ -0,0 +1,6 @@
+[
+ {
+ "job-id": "prometheus-aggregation",
+ "query": "up"
+ }
+]
\ No newline at end of file
diff --git a/status/nginx.conf b/status/nginx.conf
new file mode 100644
index 0000000..35277a6
--- /dev/null
+++ b/status/nginx.conf
@@ -0,0 +1,31 @@
+http {
+ # Define cache path and parameters
+ proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=html_cache:10m max_size=100m inactive=60m;
+ proxy_temp_path /var/cache/nginx/temp;
+
+ #for http://localhost/status
+ server{
+ listen 80;
+ server_name _;
+
+ # Enable caching
+ proxy_cache html_cache;
+ proxy_cache_valid 200 302 30s;
+ proxy_cache_valid 404 30s;
+
+ location /{
+ proxy_pass http://scraper:8000;
+
+ # Cache HTML files
+ location ~* \.html$ {
+ proxy_pass http://scraper:8000;
+ proxy_cache html_cache;
+ proxy_cache_min_uses 1;
+ proxy_cache_lock on;
+ add_header X-Cache-Status $upstream_cache_status;
+ }
+ }
+ }
+}
+
+events{ }
\ No newline at end of file
diff --git a/status/requirements.txt b/status/requirements.txt
new file mode 100644
index 0000000..112ac36
--- /dev/null
+++ b/status/requirements.txt
@@ -0,0 +1,6 @@
+fastapi==0.84.0
+uvicorn==0.18.3
+Jinja2==3.0.2
+py-grpc-prometheus==0.7.0
+prometheus_api_client==0.4.0
+pytz
diff --git a/status/scraper.py b/status/scraper.py
new file mode 100644
index 0000000..7d6ae89
--- /dev/null
+++ b/status/scraper.py
@@ -0,0 +1,246 @@
+from dataclasses import dataclass
+from fastapi.responses import HTMLResponse
+from prometheus_api_client import PrometheusConnect
+from fastapi import FastAPI, Request
+from fastapi.templating import Jinja2Templates
+from fastapi.staticfiles import StaticFiles
+import uvicorn
+
+# from flags import get_args
+import json
+import time
+import threading
+import os
+from datetime import datetime, timedelta
+import pytz
+import requests
+import argparse
+import logging
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--query-interval-seconds",
+ "-int",
+ type=int,
+ default=15,
+ help="interval for how often queries should be done",
+ )
+ parser.add_argument(
+ "--port",
+ type=int,
+ default=8000,
+ help="port for server to be hosted on, defaults to 8000",
+ )
+ parser.add_argument(
+ "--config",
+ type=str,
+ required=True,
+ help="argument to a json file, where the json file specifies what services we need to query",
+ )
+ parser.add_argument(
+ "--prometheus-url",
+ type=str,
+ default="http://one.sce/prometheus",
+        help="the URL of the running Prometheus instance that has to be scraped",
+ )
+ parser.add_argument(
+ "--verbose",
+ "-v",
+ action="count",
+ default=0,
+        help="increase output verbosity",
+ )
+
+ return parser.parse_args()
+
+
+app = FastAPI()
+
+pacific_tz = pytz.timezone("US/Pacific")
+
+templates = Jinja2Templates(directory=".")
+
+args = get_args()
+
+prom = PrometheusConnect(
+ url=args.prometheus_url, disable_ssl=True
+) # this will query "http://prometheus:9090/api/v1/query?query=up"
+
+metrics_data = []
+up_hours = 24
+
+logging.Formatter.converter = time.gmtime
+
+logging.basicConfig(
+ # in mondo we trust
+ format="%(asctime)s.%(msecs)03dZ %(levelname)s:%(name)s:%(message)s",
+ datefmt="%Y-%m-%dT%H:%M:%S",
+    level=max(logging.DEBUG, logging.ERROR - (args.verbose * 10)),
+)
+
+
+@dataclass
+class metrics:
+ job_name: str
+ timestamp: float
+ value: float
+
+
+def check_status(query):
+ params = {"query": query}
+ try:
+ response = requests.get(f"{args.prometheus_url}/api/v1/query", params=params)
+ response.raise_for_status() # Raise an error for HTTP issues
+ json_response = response.json()
+        if json_response.get("status") != "success":
+            logging.warning(f"json response did not include success in status key, {json_response}")
+            return False
+        return True
+ except Exception as e:
+ logging.exception(f"Error querying Prometheus: {e}")
+ return None
+
+
+def polling_loop(interval, config):
+ global metrics_data
+ while True:
+ metrics_data = []
+ for hosts in config:
+ service_name = hosts.get("job-id", "prometheus-aggregation")
+ prom_query = hosts.get("query", "up")
+ process_up_query(prom_query, service_name)
+ time.sleep(interval)
+
+
+service_data = {}
+
+
+def process_up_query(query, service_name):
+ global metrics_data, service_data
+ process_time_query("time() - process_start_time_seconds", service_name)
+    if not check_status(query=query):
+ logging.warning("status is not success, please look into it!!")
+ else:
+ logging.info("status is success in the query!!")
+ try:
+ result = prom.custom_query(query=query)
+ if not result:
+ logging.info(f"No results for query: {query}")
+ last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")
+ metrics_data.append(
+ {"instance": service_name, "status": "Error in querying"}
+ )
+ return
+
+ for metric in result:
+ instance = metric.get("metric", {}).get("instance", "unknown")
+ job_name = metric.get("metric", {}).get(
+ "job", "unknown"
+ ) # for later use in dataclass
+ value = metric.get("value", [None, None])[1]
+ # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")
+ if float(value) > 0:
+ status = "Healthy"
+ else:
+ status = "Unhealthy"
+ if status == "Unhealthy":
+ current = get_first_match_time(
+ prom=prom, prom_query="up", match_value=0, hours=up_hours
+ )
+ metrics_data.append(
+ {"instance": instance, "job": job_name, "status": current}
+ )
+ continue
+ metrics_data.append(
+ {"instance": instance, "job": job_name, "status": "Healthy"}
+ )
+ except Exception as e:
+ logging.exception(f"Error processing query '{query}': {e}")
+ metrics_data.append(
+ {"instance": service_name, "status": "Unhealthy due to error!"}
+ )
+
+
+def process_time_query(query, service_name):
+ global metrics_data, up_hours
+ try:
+ result = prom.custom_query(query=query)
+ if result and len(result) > 0:
+ for metric in result:
+ instance = metric.get("metric", {}).get("instance", "unknown")
+ job_name = metric.get("metric", {}).get("job", "unknown")
+ uptime_seconds = float(metric.get("value", [None,None])[1] or 0)
+ up_hours = uptime_seconds // 3600
+ if up_hours == 0:
+ up_hours = 1
+ except Exception as e:
+ logging.exception(f"Error processing time query '{query}': {e}")
+
+
+def get_first_match_time(prom, prom_query, match_value=0, hours=24):
+ global metrics_data
+    # honor the caller-supplied prom_query over the trailing `hours` window
+ start_time = datetime.now() - timedelta(hours=hours)
+ end_time = datetime.now()
+
+ try:
+ result = prom.get_metric_range_data(
+ metric_name=prom_query,
+ start_time=start_time,
+ end_time=end_time,
+ )
+
+ for series in result:
+ saw_up = False
+ for timestamp, value in reversed(series.get("values", [])):
+ v = float(value)
+ if v == 1:
+ saw_up = True
+ elif v == 0 and saw_up:
+                    utc_time = datetime.fromtimestamp(float(timestamp), tz=pytz.utc)
+                    pacific_time = utc_time.astimezone(pacific_tz)
+ readable_time = pacific_time.strftime("%Y-%m-%d %H:%M:%S %Z")
+ status = f"Unhealthy as of {readable_time}"
+ return status
+ except Exception as e:
+ logging.exception(f"Error in get_first_match_time: {e}")
+ return "Error checking status history"
+
+
+@app.get("/", response_class=HTMLResponse)
+async def get_metrics(request: Request):
+ return templates.TemplateResponse(
+ "health.html",
+ {
+ "request": request,
+ "metrics": metrics_data,
+ "timestamp": datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z"),
+ },
+ )
+
+
+def main():
+ try:
+ with open(args.config, "r") as file:
+ config = json.load(file)
+ polling_thread = threading.Thread(
+ target=polling_loop, args=(args.query_interval_seconds, config), daemon=True
+ ) # The daemon=True ensures the thread exits when the main program exits.
+ polling_thread.start()
+
+ uvicorn.run(app, host="0.0.0.0", port=args.port)
+ except FileNotFoundError:
+ logging.critical(f"Configuration file '{args.config}' not found!")
+ exit(1)
+ except Exception as e:
+ logging.exception("Unexpected error occurred!")
+ exit(1)
+
+
+
+
+if __name__ == "__main__":
+ main()
+