From 8cb8591dd69008f0896fe8499edf5f182fb8f6d3 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Wed, 25 Jun 2025 12:32:18 -0700 Subject: [PATCH 01/15] everything for system status --- status/prom/docker-compose.yml | 41 +++++++++ status/prom/dockerfile | 14 +++ status/prom/dockerfile.scraper | 14 +++ status/prom/flags.py | 25 ++++++ status/prom/json.json | 13 +++ status/prom/nginx.conf | 31 +++++++ status/prom/output/metrics.html | 22 +++++ status/prom/prometheus.yml | 8 ++ status/prom/queries.txt | 21 +++++ status/prom/requirements.txt | 6 ++ status/prom/scraper.py | 140 ++++++++++++++++++++++++++++++ status/prom/server.py | 71 +++++++++++++++ status/prom/templates/health.html | 55 ++++++++++++ 13 files changed, 461 insertions(+) create mode 100644 status/prom/docker-compose.yml create mode 100644 status/prom/dockerfile create mode 100644 status/prom/dockerfile.scraper create mode 100644 status/prom/flags.py create mode 100644 status/prom/json.json create mode 100644 status/prom/nginx.conf create mode 100644 status/prom/output/metrics.html create mode 100644 status/prom/prometheus.yml create mode 100644 status/prom/queries.txt create mode 100644 status/prom/requirements.txt create mode 100644 status/prom/scraper.py create mode 100644 status/prom/server.py create mode 100644 status/prom/templates/health.html diff --git a/status/prom/docker-compose.yml b/status/prom/docker-compose.yml new file mode 100644 index 0000000..4f46dfe --- /dev/null +++ b/status/prom/docker-compose.yml @@ -0,0 +1,41 @@ +version: '2' + +services: + coin-api: + container_name: coin-api + build: + context: . + dockerfile: ./dockerfile + restart: 'on-failure' + ports: + - "5000:5000" + prometheus: + image: prom/prometheus:latest + restart: always + ports: + - 9090:9090 + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - --config.file=/etc/prometheus/prometheus.yml + + scraper: + container_name: scraper + build: + context: . 
+ dockerfile: ./dockerfile.scraper + volumes: + - ./output:/app/output + - ./templates:/app/templates + - ./json.json:/app/json.json + ports: + - "8000:8000" + restart: 'on-failure' + command: python3 scraper.py --json json.json + + nginx: + image: nginx:1.25.3 + ports: + - 80:80 + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf diff --git a/status/prom/dockerfile b/status/prom/dockerfile new file mode 100644 index 0000000..b13d47b --- /dev/null +++ b/status/prom/dockerfile @@ -0,0 +1,14 @@ +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip3 install -r requirements.txt + +COPY server.py . + + +EXPOSE 5000 + +CMD ["python3", "server.py"] diff --git a/status/prom/dockerfile.scraper b/status/prom/dockerfile.scraper new file mode 100644 index 0000000..420ce1f --- /dev/null +++ b/status/prom/dockerfile.scraper @@ -0,0 +1,14 @@ +# filepath: /Users/vineet/Projects/sce_system_status/prom/Dockerfile.scraper +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip3 install -r requirements.txt + +COPY flags.py . +COPY scraper.py . 
+COPY templates/ ./templates/ + +CMD ["python3", "scraper.py", "--json", "json.json"] \ No newline at end of file diff --git a/status/prom/flags.py b/status/prom/flags.py new file mode 100644 index 0000000..69d6c0c --- /dev/null +++ b/status/prom/flags.py @@ -0,0 +1,25 @@ +import argparse + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--interval", + "-int", + type= int, + default = 15, + help = "interval for how often queries should be done" + ) + parser.add_argument( + "--port", + type = int, + default = 8000, + help = "port for server to be hosted on, defaults to 8000" + ) + parser.add_argument( + "--json", + type = str, + required = True, + help = "argument to a json file, where the json file specifies what services we need to query" + ) + + return parser.parse_args() \ No newline at end of file diff --git a/status/prom/json.json b/status/prom/json.json new file mode 100644 index 0000000..a4411c2 --- /dev/null +++ b/status/prom/json.json @@ -0,0 +1,13 @@ +[ + { + "hostname": "localhost:3000/metrics", + "queries": [ + + { + "serviceName": "Coin-API", + "query": "up" + } + + ] + } +] \ No newline at end of file diff --git a/status/prom/nginx.conf b/status/prom/nginx.conf new file mode 100644 index 0000000..35277a6 --- /dev/null +++ b/status/prom/nginx.conf @@ -0,0 +1,31 @@ +http { + # Define cache path and parameters + proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=html_cache:10m max_size=100m inactive=60m; + proxy_temp_path /var/cache/nginx/temp; + + #for http://localhost/status + server{ + listen 80; + server_name _; + + # Enable caching + proxy_cache html_cache; + proxy_cache_valid 200 302 30s; + proxy_cache_valid 404 30s; + + location /{ + proxy_pass http://scraper:8000; + + # Cache HTML files + location ~* \.html$ { + proxy_pass http://scraper:8000; + proxy_cache html_cache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + add_header X-Cache-Status $upstream_cache_status; + } + } + } +} + +events{ } \ No newline at 
end of file diff --git a/status/prom/output/metrics.html b/status/prom/output/metrics.html new file mode 100644 index 0000000..4f97d2b --- /dev/null +++ b/status/prom/output/metrics.html @@ -0,0 +1,22 @@ + + + + + Prometheus Metrics + + +

Prometheus Metrics

+ + + + + + + + + + + +
InstanceValue
coin-api:50001
+ + diff --git a/status/prom/prometheus.yml b/status/prom/prometheus.yml new file mode 100644 index 0000000..a50693f --- /dev/null +++ b/status/prom/prometheus.yml @@ -0,0 +1,8 @@ +global: + scrape_interval: 10s + + +scrape_configs: + - job_name: 'coin-api' + static_configs: + - targets: ['coin-api:5000'] diff --git a/status/prom/queries.txt b/status/prom/queries.txt new file mode 100644 index 0000000..5bc99ee --- /dev/null +++ b/status/prom/queries.txt @@ -0,0 +1,21 @@ +{ + "hostname": "localhost:9090/metrics", + "queries": [ + { + "serviceName": "My FastAPI API - CPU Usage", + "query": "rate(node_cpu_seconds_total[5m])" + }, + { + "serviceName": "Prometheus - Memory Usage", + "query": "node_memory_MemAvailable_bytes" + }, + { + "serviceName": "My FastAPI API - Uptime", + "query": "time() - process_start_time_seconds" + }, + { + "serviceName": "Last Down::", + "query": "down" + } + ] + } \ No newline at end of file diff --git a/status/prom/requirements.txt b/status/prom/requirements.txt new file mode 100644 index 0000000..6b8b5b2 --- /dev/null +++ b/status/prom/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.84.0 +uvicorn==0.18.3 +Jinja2==3.0.2 +py-grpc-prometheus==0.7.0 +prometheus_api_client + diff --git a/status/prom/scraper.py b/status/prom/scraper.py new file mode 100644 index 0000000..40b3654 --- /dev/null +++ b/status/prom/scraper.py @@ -0,0 +1,140 @@ +from fastapi.responses import HTMLResponse +from prometheus_api_client import PrometheusConnect +from fastapi import FastAPI, Request +from fastapi.templating import Jinja2Templates +from fastapi.staticfiles import StaticFiles +import uvicorn +from flags import get_args +import json +import time +import threading +import os +from datetime import datetime, timedelta +import pytz + +prom = PrometheusConnect(url = "http://prometheus:9090", disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" + +app = FastAPI() + +pacific_tz = pytz.timezone('US/Pacific') + +templates = 
Jinja2Templates(directory="templates") + +args = get_args() +metrics_data = [] +up_hours = 24 + +def polling_loop(interval, config): + while True: + global metrics_data + metrics_data = [] + for hosts in config: + for query in hosts["queries"]: + service_name = query["serviceName"] + prom_query = query["query"] + if prom_query == "up": + process_up_query(prom_query, service_name) + time.sleep(interval) + +service_data = {} + +def process_up_query(query, service_name): + global metrics_data, service_data + process_time_query("time() - process_start_time_seconds", service_name) + + try: + result = prom.custom_query(query=query) + if not result: + print(f"No results for query: {query}") + last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") + metrics_data.append({ + "instance": service_name, + "status": "Error in querying" + }) + return + + for metric in result: + instance = metric['metric'].get('instance', 'unknown') + value = metric['value'][1] + last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") + status = "Healthy" if float(value) > 0 else "Unhealthy" + if status == "Unhealthy": + current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) + metrics_data.append({ + "instance": service_name, + "status": current + }) + else: + metrics_data.append({ + "instance": service_name, + "status": "Healthy" + }) + except Exception as e: + print(f"Error processing query '{query}': {e}") + metrics_data.append({ + "instance": service_name, + "status": "Unhealthy due to error!" 
+ }) + + +def process_time_query(query, service_name): + global metrics_data, up_hours + try: + result = prom.custom_query(query=query) + if result and len(result) > 0: + first_result = result[0] + uptime_seconds = float(first_result["value"][1]) + up_hours = int(uptime_seconds/3600) + if up_hours == 0: + up_hours = 1 + except Exception as e: + print(f"Error processing time query '{query}': {e}") + +def get_first_match_time(prom, prom_query, match_value=0, hours=24): + global metrics_data + prom_query = "up" + start_time = datetime.now() - timedelta(hours=hours) + end_time = datetime.now() + + try: + result = prom.get_metric_range_data( + metric_name=prom_query, + start_time=start_time, + end_time=end_time, + ) + + for series in result: + saw_up = False + for timestamp, value in reversed(series["values"]): + v = float(value) + if v == 1: + saw_up = True + elif v == 0 and saw_up: + utc_time = datetime.utcfromtimestamp(float(timestamp)) + pacific_time = utc_time.astimezone(pacific_tz) + readable_time = pacific_time.strftime("%Y-%m-%d %H:%M:%S %Z") + status = f"Unhealthy as of {readable_time}" + return status + except Exception as e: + print(f"Error in get_first_match_time: {e}") + return "Error checking status history" + + +@app.get("/", response_class=HTMLResponse) +async def get_metrics(request: Request): + return templates.TemplateResponse( + "health.html", + {"request": request, "metrics": metrics_data, "timestamp": datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")} + ) + +def main(): + with open(args.json, "r") as file: + config = json.load(file) + + polling_thread = threading.Thread(target = polling_loop, args = (args.interval,config), daemon=True)#The daemon=True ensures the thread exits when the main program exits. 
+ polling_thread.start() + + uvicorn.run(app, host="0.0.0.0", port=args.port) + +if __name__ == "__main__": + main() diff --git a/status/prom/server.py b/status/prom/server.py new file mode 100644 index 0000000..736aaa6 --- /dev/null +++ b/status/prom/server.py @@ -0,0 +1,71 @@ +from fastapi import FastAPI, HTTPException, Response +import uvicorn +import random +import prometheus_client as pc +from fastapi.middleware.cors import CORSMiddleware + +app = FastAPI() + +app.add_middleware( + CORSMiddleware, + allow_origins = ["*"], + allow_methods = ["*"], + allow_headers = ["*"], +) +heads_count = pc.Counter( + "heads_count",#metrics name + "number of heads",#help text +) +tails_count = pc.Counter( + "tails_count",#metrics name + "number of tails",#help text +) +flip_count = pc.Counter( + "flip_count",#metrics name + "number of flips",#help text +) + +# Counter for coin flips + +@app.get("/") +def root(): + return({ "message": "hello world" }) + +@app.get("/flip") +def flip_coin(times = None): + flip_counts = { + "heads": 0, + "tails": 0 + } + + if times is None: + # Single flip when no parameter is provided + # result = "heads" if random.random() > 0.5 else "tails" + # flip_counts[result] += 1 + return flip_counts + else: + try: + times_as_int = int(times) + for i in range(times_as_int): + result = "heads" if random.random() > 0.5 else "tails" + flip_counts[result] += 1 + heads_count.inc(flip_counts["heads"]) + tails_count.inc(flip_counts["tails"]) + flip_count.inc(times_as_int) + return flip_counts + except ValueError: + return {"error": "Parameter 'times' must be a valid integer"} + + +@app.get("/metrics") +def get_metrics(): + return Response( + media_type="text/plain", + content = pc.generate_latest(), + ) + + + +if __name__ == "__main__": + uvicorn.run(app , host= "0.0.0.0", port=5000) + diff --git a/status/prom/templates/health.html b/status/prom/templates/health.html new file mode 100644 index 0000000..3e97373 --- /dev/null +++ 
b/status/prom/templates/health.html @@ -0,0 +1,55 @@ + + + + Prometheus Metrics + + + +

Prometheus Metrics

+

Last updated: {{ timestamp }}

+ + + + + + {% for item in metrics %} + + + + + {% endfor %} +
InstanceValue
{{ item.instance }} + {{ item.status }} +
+ + \ No newline at end of file From 6b633999ceb17c1d6a122e89e19c09b8674acdca Mon Sep 17 00:00:00 2001 From: vineeshah Date: Thu, 3 Jul 2025 10:20:51 -0700 Subject: [PATCH 02/15] without coin-api --- status/{prom => }/docker-compose.yml | 11 +--- status/{prom => }/dockerfile.scraper | 0 status/{prom => }/flags.py | 6 +++ status/json.json | 6 +++ status/{prom => }/nginx.conf | 0 status/prom/dockerfile | 14 ----- status/prom/json.json | 13 ----- status/prom/output/metrics.html | 22 -------- status/prom/queries.txt | 21 -------- status/prom/server.py | 71 ------------------------- status/{prom => }/prometheus.yml | 0 status/{prom => }/requirements.txt | 0 status/{prom => }/scraper.py | 50 +++++++++++++---- status/{prom => }/templates/health.html | 0 14 files changed, 54 insertions(+), 160 deletions(-) rename status/{prom => }/docker-compose.yml (75%) rename status/{prom => }/dockerfile.scraper (100%) rename status/{prom => }/flags.py (75%) create mode 100644 status/json.json rename status/{prom => }/nginx.conf (100%) delete mode 100644 status/prom/dockerfile delete mode 100644 status/prom/json.json delete mode 100644 status/prom/output/metrics.html delete mode 100644 status/prom/queries.txt delete mode 100644 status/prom/server.py rename status/{prom => }/prometheus.yml (100%) rename status/{prom => }/requirements.txt (100%) rename status/{prom => }/scraper.py (74%) rename status/{prom => }/templates/health.html (100%) diff --git a/status/prom/docker-compose.yml b/status/docker-compose.yml similarity index 75% rename from status/prom/docker-compose.yml rename to status/docker-compose.yml index 4f46dfe..f2ff942 100644 --- a/status/prom/docker-compose.yml +++ b/status/docker-compose.yml @@ -1,14 +1,6 @@ version: '2' services: - coin-api: - container_name: coin-api - build: - context: . 
- dockerfile: ./dockerfile - restart: 'on-failure' - ports: - - "5000:5000" prometheus: image: prom/prometheus:latest restart: always @@ -31,7 +23,8 @@ services: ports: - "8000:8000" restart: 'on-failure' - command: python3 scraper.py --json json.json + command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus/metrics" + nginx: image: nginx:1.25.3 diff --git a/status/prom/dockerfile.scraper b/status/dockerfile.scraper similarity index 100% rename from status/prom/dockerfile.scraper rename to status/dockerfile.scraper diff --git a/status/prom/flags.py b/status/flags.py similarity index 75% rename from status/prom/flags.py rename to status/flags.py index 69d6c0c..7194488 100644 --- a/status/prom/flags.py +++ b/status/flags.py @@ -21,5 +21,11 @@ def get_args(): required = True, help = "argument to a json file, where the json file specifies what services we need to query" ) + parser.add_argument( + "--promurl", + type = str, + default= "http://prometheus:9090", + help = "the url for the promtheus container thats running that has to be scraped" + ) return parser.parse_args() \ No newline at end of file diff --git a/status/json.json b/status/json.json new file mode 100644 index 0000000..5c8efee --- /dev/null +++ b/status/json.json @@ -0,0 +1,6 @@ +[ + { + "job-id": "prometheus-aggregation", + "query": "up" + } +] \ No newline at end of file diff --git a/status/prom/nginx.conf b/status/nginx.conf similarity index 100% rename from status/prom/nginx.conf rename to status/nginx.conf diff --git a/status/prom/dockerfile b/status/prom/dockerfile deleted file mode 100644 index b13d47b..0000000 --- a/status/prom/dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM python:3.9-slim - -WORKDIR /app - -COPY requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY server.py . 
- - -EXPOSE 5000 - -CMD ["python3", "server.py"] diff --git a/status/prom/json.json b/status/prom/json.json deleted file mode 100644 index a4411c2..0000000 --- a/status/prom/json.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "hostname": "localhost:3000/metrics", - "queries": [ - - { - "serviceName": "Coin-API", - "query": "up" - } - - ] - } -] \ No newline at end of file diff --git a/status/prom/output/metrics.html b/status/prom/output/metrics.html deleted file mode 100644 index 4f97d2b..0000000 --- a/status/prom/output/metrics.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - - Prometheus Metrics - - -

Prometheus Metrics

- - - - - - - - - - - -
InstanceValue
coin-api:50001
- - diff --git a/status/prom/queries.txt b/status/prom/queries.txt deleted file mode 100644 index 5bc99ee..0000000 --- a/status/prom/queries.txt +++ /dev/null @@ -1,21 +0,0 @@ -{ - "hostname": "localhost:9090/metrics", - "queries": [ - { - "serviceName": "My FastAPI API - CPU Usage", - "query": "rate(node_cpu_seconds_total[5m])" - }, - { - "serviceName": "Prometheus - Memory Usage", - "query": "node_memory_MemAvailable_bytes" - }, - { - "serviceName": "My FastAPI API - Uptime", - "query": "time() - process_start_time_seconds" - }, - { - "serviceName": "Last Down::", - "query": "down" - } - ] - } \ No newline at end of file diff --git a/status/prom/server.py b/status/prom/server.py deleted file mode 100644 index 736aaa6..0000000 --- a/status/prom/server.py +++ /dev/null @@ -1,71 +0,0 @@ -from fastapi import FastAPI, HTTPException, Response -import uvicorn -import random -import prometheus_client as pc -from fastapi.middleware.cors import CORSMiddleware - -app = FastAPI() - -app.add_middleware( - CORSMiddleware, - allow_origins = ["*"], - allow_methods = ["*"], - allow_headers = ["*"], -) -heads_count = pc.Counter( - "heads_count",#metrics name - "number of heads",#help text -) -tails_count = pc.Counter( - "tails_count",#metrics name - "number of tails",#help text -) -flip_count = pc.Counter( - "flip_count",#metrics name - "number of flips",#help text -) - -# Counter for coin flips - -@app.get("/") -def root(): - return({ "message": "hello world" }) - -@app.get("/flip") -def flip_coin(times = None): - flip_counts = { - "heads": 0, - "tails": 0 - } - - if times is None: - # Single flip when no parameter is provided - # result = "heads" if random.random() > 0.5 else "tails" - # flip_counts[result] += 1 - return flip_counts - else: - try: - times_as_int = int(times) - for i in range(times_as_int): - result = "heads" if random.random() > 0.5 else "tails" - flip_counts[result] += 1 - heads_count.inc(flip_counts["heads"]) - tails_count.inc(flip_counts["tails"]) - 
flip_count.inc(times_as_int) - return flip_counts - except ValueError: - return {"error": "Parameter 'times' must be a valid integer"} - - -@app.get("/metrics") -def get_metrics(): - return Response( - media_type="text/plain", - content = pc.generate_latest(), - ) - - - -if __name__ == "__main__": - uvicorn.run(app , host= "0.0.0.0", port=5000) - diff --git a/status/prom/prometheus.yml b/status/prometheus.yml similarity index 100% rename from status/prom/prometheus.yml rename to status/prometheus.yml diff --git a/status/prom/requirements.txt b/status/requirements.txt similarity index 100% rename from status/prom/requirements.txt rename to status/requirements.txt diff --git a/status/prom/scraper.py b/status/scraper.py similarity index 74% rename from status/prom/scraper.py rename to status/scraper.py index 40b3654..2570709 100644 --- a/status/prom/scraper.py +++ b/status/scraper.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from fastapi.responses import HTMLResponse from prometheus_api_client import PrometheusConnect from fastapi import FastAPI, Request @@ -11,8 +12,9 @@ import os from datetime import datetime, timedelta import pytz +import requests + -prom = PrometheusConnect(url = "http://prometheus:9090", disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" app = FastAPI() @@ -21,19 +23,45 @@ templates = Jinja2Templates(directory="templates") args = get_args() + +prom = PrometheusConnect(url = args.promurl, disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" + metrics_data = [] up_hours = 24 +@dataclass +class metrics: + job_name: str + timestamp: float + value: float + +def check_status(query): + params = {"query" : query} + try: + response = requests.get("http://prometheus:9090/api/v1/query", params = params) + response.raise_for_status()# Raise an error for HTTP issues + json_response = response.json() + if json_response["status"]=="success": + return True + elif json_response["status"]==None: + 
print("the status key does not exist!") + return False + else: + return False + + except Exception as e: + print(f"Error querying Prometheus: {e}") + return None + def polling_loop(interval, config): while True: global metrics_data metrics_data = [] for hosts in config: - for query in hosts["queries"]: - service_name = query["serviceName"] - prom_query = query["query"] - if prom_query == "up": - process_up_query(prom_query, service_name) + service_name = hosts["job-id"] + prom_query = hosts["query"] + if prom_query == "up": + process_up_query(prom_query, service_name) time.sleep(interval) service_data = {} @@ -41,7 +69,8 @@ def polling_loop(interval, config): def process_up_query(query, service_name): global metrics_data, service_data process_time_query("time() - process_start_time_seconds", service_name) - + if not check_status(query="up"): + print("status is not success, please look into it!!") try: result = prom.custom_query(query=query) if not result: @@ -54,9 +83,10 @@ def process_up_query(query, service_name): return for metric in result: - instance = metric['metric'].get('instance', 'unknown') - value = metric['value'][1] - last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") + job_name = metric.get('metric',{}).get('job', "")#for later use in dataclass + time_stamp = metric.get('value', [])[0]#for later use in dataclass + value = metric.get('value', [])[1] + # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" if status == "Unhealthy": current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) diff --git a/status/prom/templates/health.html b/status/templates/health.html similarity index 100% rename from status/prom/templates/health.html rename to status/templates/health.html From 68b44ddfc146c41dc2b06626daacbb25e541b3ac Mon Sep 17 00:00:00 2001 From: vineeshah Date: Thu, 3 Jul 2025 12:35:58 -0700 Subject: [PATCH 03/15] refined code 
for required promurl --- status/docker-compose.yml | 2 +- status/scraper.py | 24 +++++++------ status/templates/health.html | 68 ++++++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 26 deletions(-) diff --git a/status/docker-compose.yml b/status/docker-compose.yml index f2ff942..c63ca4f 100644 --- a/status/docker-compose.yml +++ b/status/docker-compose.yml @@ -23,7 +23,7 @@ services: ports: - "8000:8000" restart: 'on-failure' - command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus/metrics" + command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus" nginx: diff --git a/status/scraper.py b/status/scraper.py index 2570709..0150614 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -54,8 +54,8 @@ def check_status(query): return None def polling_loop(interval, config): + global metrics_data while True: - global metrics_data metrics_data = [] for hosts in config: service_name = hosts["job-id"] @@ -83,20 +83,22 @@ def process_up_query(query, service_name): return for metric in result: - job_name = metric.get('metric',{}).get('job', "")#for later use in dataclass - time_stamp = metric.get('value', [])[0]#for later use in dataclass + instance = metric["metric"].get("instance", "unknown") + job_name = metric["metric"].get("job", "unknown")#for later use in dataclass value = metric.get('value', [])[1] # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" if status == "Unhealthy": current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) metrics_data.append({ - "instance": service_name, + "instance": instance, + "job":job_name, "status": current }) else: metrics_data.append({ - "instance": service_name, + "instance": instance, + "job": job_name, "status": "Healthy" }) except Exception as e: @@ -112,11 +114,13 @@ def process_time_query(query, service_name): try: result = 
prom.custom_query(query=query) if result and len(result) > 0: - first_result = result[0] - uptime_seconds = float(first_result["value"][1]) - up_hours = int(uptime_seconds/3600) - if up_hours == 0: - up_hours = 1 + for metric in result: + instance = metric["metric"].get("instance", "unknown") + job_name = metric["metric"].get("job", "unknown") + uptime_seconds = float(metric["value"][1]) + up_hours = int(uptime_seconds / 3600) + if up_hours == 0: + up_hours = 1 except Exception as e: print(f"Error processing time query '{query}': {e}") diff --git a/status/templates/health.html b/status/templates/health.html index 3e97373..ade2ca6 100644 --- a/status/templates/health.html +++ b/status/templates/health.html @@ -4,33 +4,71 @@ Prometheus Metrics @@ -39,12 +77,12 @@

Prometheus Metrics

Last updated: {{ timestamp }}

- - + + {% for item in metrics %} - + From da578d972d66ae1805b5b3969dff698664718137 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Wed, 25 Jun 2025 12:32:18 -0700 Subject: [PATCH 04/15] everything for system status --- status/prom/docker-compose.yml | 41 +++++++++ status/prom/dockerfile | 14 +++ status/prom/dockerfile.scraper | 14 +++ status/prom/flags.py | 25 ++++++ status/prom/json.json | 13 +++ status/prom/nginx.conf | 31 +++++++ status/prom/output/metrics.html | 22 +++++ status/prom/prometheus.yml | 8 ++ status/prom/queries.txt | 21 +++++ status/prom/requirements.txt | 6 ++ status/prom/scraper.py | 140 ++++++++++++++++++++++++++++++ status/prom/server.py | 71 +++++++++++++++ status/prom/templates/health.html | 55 ++++++++++++ 13 files changed, 461 insertions(+) create mode 100644 status/prom/docker-compose.yml create mode 100644 status/prom/dockerfile create mode 100644 status/prom/dockerfile.scraper create mode 100644 status/prom/flags.py create mode 100644 status/prom/json.json create mode 100644 status/prom/nginx.conf create mode 100644 status/prom/output/metrics.html create mode 100644 status/prom/prometheus.yml create mode 100644 status/prom/queries.txt create mode 100644 status/prom/requirements.txt create mode 100644 status/prom/scraper.py create mode 100644 status/prom/server.py create mode 100644 status/prom/templates/health.html diff --git a/status/prom/docker-compose.yml b/status/prom/docker-compose.yml new file mode 100644 index 0000000..4f46dfe --- /dev/null +++ b/status/prom/docker-compose.yml @@ -0,0 +1,41 @@ +version: '2' + +services: + coin-api: + container_name: coin-api + build: + context: . + dockerfile: ./dockerfile + restart: 'on-failure' + ports: + - "5000:5000" + prometheus: + image: prom/prometheus:latest + restart: always + ports: + - 9090:9090 + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - --config.file=/etc/prometheus/prometheus.yml + + scraper: + container_name: scraper + build: + context: . 
+ dockerfile: ./dockerfile.scraper + volumes: + - ./output:/app/output + - ./templates:/app/templates + - ./json.json:/app/json.json + ports: + - "8000:8000" + restart: 'on-failure' + command: python3 scraper.py --json json.json + + nginx: + image: nginx:1.25.3 + ports: + - 80:80 + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf diff --git a/status/prom/dockerfile b/status/prom/dockerfile new file mode 100644 index 0000000..b13d47b --- /dev/null +++ b/status/prom/dockerfile @@ -0,0 +1,14 @@ +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip3 install -r requirements.txt + +COPY server.py . + + +EXPOSE 5000 + +CMD ["python3", "server.py"] diff --git a/status/prom/dockerfile.scraper b/status/prom/dockerfile.scraper new file mode 100644 index 0000000..420ce1f --- /dev/null +++ b/status/prom/dockerfile.scraper @@ -0,0 +1,14 @@ +# filepath: /Users/vineet/Projects/sce_system_status/prom/Dockerfile.scraper +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . + +RUN pip3 install -r requirements.txt + +COPY flags.py . +COPY scraper.py . 
+COPY templates/ ./templates/ + +CMD ["python3", "scraper.py", "--json", "json.json"] \ No newline at end of file diff --git a/status/prom/flags.py b/status/prom/flags.py new file mode 100644 index 0000000..69d6c0c --- /dev/null +++ b/status/prom/flags.py @@ -0,0 +1,25 @@ +import argparse + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--interval", + "-int", + type= int, + default = 15, + help = "interval for how often queries should be done" + ) + parser.add_argument( + "--port", + type = int, + default = 8000, + help = "port for server to be hosted on, defaults to 8000" + ) + parser.add_argument( + "--json", + type = str, + required = True, + help = "argument to a json file, where the json file specifies what services we need to query" + ) + + return parser.parse_args() \ No newline at end of file diff --git a/status/prom/json.json b/status/prom/json.json new file mode 100644 index 0000000..a4411c2 --- /dev/null +++ b/status/prom/json.json @@ -0,0 +1,13 @@ +[ + { + "hostname": "localhost:3000/metrics", + "queries": [ + + { + "serviceName": "Coin-API", + "query": "up" + } + + ] + } +] \ No newline at end of file diff --git a/status/prom/nginx.conf b/status/prom/nginx.conf new file mode 100644 index 0000000..35277a6 --- /dev/null +++ b/status/prom/nginx.conf @@ -0,0 +1,31 @@ +http { + # Define cache path and parameters + proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=html_cache:10m max_size=100m inactive=60m; + proxy_temp_path /var/cache/nginx/temp; + + #for http://localhost/status + server{ + listen 80; + server_name _; + + # Enable caching + proxy_cache html_cache; + proxy_cache_valid 200 302 30s; + proxy_cache_valid 404 30s; + + location /{ + proxy_pass http://scraper:8000; + + # Cache HTML files + location ~* \.html$ { + proxy_pass http://scraper:8000; + proxy_cache html_cache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + add_header X-Cache-Status $upstream_cache_status; + } + } + } +} + +events{ } \ No newline at 
end of file diff --git a/status/prom/output/metrics.html b/status/prom/output/metrics.html new file mode 100644 index 0000000..4f97d2b --- /dev/null +++ b/status/prom/output/metrics.html @@ -0,0 +1,22 @@ + + + + + Prometheus Metrics + + +

Prometheus Metrics

+
InstanceValueJobStatus
{{ item.instance }}{{ item.job }} {{ item.status }}
+ + + + + + + + + + +
InstanceValue
coin-api:50001
+ + diff --git a/status/prom/prometheus.yml b/status/prom/prometheus.yml new file mode 100644 index 0000000..a50693f --- /dev/null +++ b/status/prom/prometheus.yml @@ -0,0 +1,8 @@ +global: + scrape_interval: 10s + + +scrape_configs: + - job_name: 'coin-api' + static_configs: + - targets: ['coin-api:5000'] diff --git a/status/prom/queries.txt b/status/prom/queries.txt new file mode 100644 index 0000000..5bc99ee --- /dev/null +++ b/status/prom/queries.txt @@ -0,0 +1,21 @@ +{ + "hostname": "localhost:9090/metrics", + "queries": [ + { + "serviceName": "My FastAPI API - CPU Usage", + "query": "rate(node_cpu_seconds_total[5m])" + }, + { + "serviceName": "Prometheus - Memory Usage", + "query": "node_memory_MemAvailable_bytes" + }, + { + "serviceName": "My FastAPI API - Uptime", + "query": "time() - process_start_time_seconds" + }, + { + "serviceName": "Last Down::", + "query": "down" + } + ] + } \ No newline at end of file diff --git a/status/prom/requirements.txt b/status/prom/requirements.txt new file mode 100644 index 0000000..6b8b5b2 --- /dev/null +++ b/status/prom/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.84.0 +uvicorn==0.18.3 +Jinja2==3.0.2 +py-grpc-prometheus==0.7.0 +prometheus_api_client + diff --git a/status/prom/scraper.py b/status/prom/scraper.py new file mode 100644 index 0000000..40b3654 --- /dev/null +++ b/status/prom/scraper.py @@ -0,0 +1,140 @@ +from fastapi.responses import HTMLResponse +from prometheus_api_client import PrometheusConnect +from fastapi import FastAPI, Request +from fastapi.templating import Jinja2Templates +from fastapi.staticfiles import StaticFiles +import uvicorn +from flags import get_args +import json +import time +import threading +import os +from datetime import datetime, timedelta +import pytz + +prom = PrometheusConnect(url = "http://prometheus:9090", disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" + +app = FastAPI() + +pacific_tz = pytz.timezone('US/Pacific') + +templates = 
Jinja2Templates(directory="templates") + +args = get_args() +metrics_data = [] +up_hours = 24 + +def polling_loop(interval, config): + while True: + global metrics_data + metrics_data = [] + for hosts in config: + for query in hosts["queries"]: + service_name = query["serviceName"] + prom_query = query["query"] + if prom_query == "up": + process_up_query(prom_query, service_name) + time.sleep(interval) + +service_data = {} + +def process_up_query(query, service_name): + global metrics_data, service_data + process_time_query("time() - process_start_time_seconds", service_name) + + try: + result = prom.custom_query(query=query) + if not result: + print(f"No results for query: {query}") + last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") + metrics_data.append({ + "instance": service_name, + "status": "Error in querying" + }) + return + + for metric in result: + instance = metric['metric'].get('instance', 'unknown') + value = metric['value'][1] + last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") + status = "Healthy" if float(value) > 0 else "Unhealthy" + if status == "Unhealthy": + current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) + metrics_data.append({ + "instance": service_name, + "status": current + }) + else: + metrics_data.append({ + "instance": service_name, + "status": "Healthy" + }) + except Exception as e: + print(f"Error processing query '{query}': {e}") + metrics_data.append({ + "instance": service_name, + "status": "Unhealthy due to error!" 
+ }) + + +def process_time_query(query, service_name): + global metrics_data, up_hours + try: + result = prom.custom_query(query=query) + if result and len(result) > 0: + first_result = result[0] + uptime_seconds = float(first_result["value"][1]) + up_hours = int(uptime_seconds/3600) + if up_hours == 0: + up_hours = 1 + except Exception as e: + print(f"Error processing time query '{query}': {e}") + +def get_first_match_time(prom, prom_query, match_value=0, hours=24): + global metrics_data + prom_query = "up" + start_time = datetime.now() - timedelta(hours=hours) + end_time = datetime.now() + + try: + result = prom.get_metric_range_data( + metric_name=prom_query, + start_time=start_time, + end_time=end_time, + ) + + for series in result: + saw_up = False + for timestamp, value in reversed(series["values"]): + v = float(value) + if v == 1: + saw_up = True + elif v == 0 and saw_up: + utc_time = datetime.utcfromtimestamp(float(timestamp)) + pacific_time = utc_time.astimezone(pacific_tz) + readable_time = pacific_time.strftime("%Y-%m-%d %H:%M:%S %Z") + status = f"Unhealthy as of {readable_time}" + return status + except Exception as e: + print(f"Error in get_first_match_time: {e}") + return "Error checking status history" + + +@app.get("/", response_class=HTMLResponse) +async def get_metrics(request: Request): + return templates.TemplateResponse( + "health.html", + {"request": request, "metrics": metrics_data, "timestamp": datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")} + ) + +def main(): + with open(args.json, "r") as file: + config = json.load(file) + + polling_thread = threading.Thread(target = polling_loop, args = (args.interval,config), daemon=True)#The daemon=True ensures the thread exits when the main program exits. 
+ polling_thread.start() + + uvicorn.run(app, host="0.0.0.0", port=args.port) + +if __name__ == "__main__": + main() diff --git a/status/prom/server.py b/status/prom/server.py new file mode 100644 index 0000000..736aaa6 --- /dev/null +++ b/status/prom/server.py @@ -0,0 +1,71 @@ +from fastapi import FastAPI, HTTPException, Response +import uvicorn +import random +import prometheus_client as pc +from fastapi.middleware.cors import CORSMiddleware + +app = FastAPI() + +app.add_middleware( + CORSMiddleware, + allow_origins = ["*"], + allow_methods = ["*"], + allow_headers = ["*"], +) +heads_count = pc.Counter( + "heads_count",#metrics name + "number of heads",#help text +) +tails_count = pc.Counter( + "tails_count",#metrics name + "number of tails",#help text +) +flip_count = pc.Counter( + "flip_count",#metrics name + "number of flips",#help text +) + +# Counter for coin flips + +@app.get("/") +def root(): + return({ "message": "hello world" }) + +@app.get("/flip") +def flip_coin(times = None): + flip_counts = { + "heads": 0, + "tails": 0 + } + + if times is None: + # Single flip when no parameter is provided + # result = "heads" if random.random() > 0.5 else "tails" + # flip_counts[result] += 1 + return flip_counts + else: + try: + times_as_int = int(times) + for i in range(times_as_int): + result = "heads" if random.random() > 0.5 else "tails" + flip_counts[result] += 1 + heads_count.inc(flip_counts["heads"]) + tails_count.inc(flip_counts["tails"]) + flip_count.inc(times_as_int) + return flip_counts + except ValueError: + return {"error": "Parameter 'times' must be a valid integer"} + + +@app.get("/metrics") +def get_metrics(): + return Response( + media_type="text/plain", + content = pc.generate_latest(), + ) + + + +if __name__ == "__main__": + uvicorn.run(app , host= "0.0.0.0", port=5000) + diff --git a/status/prom/templates/health.html b/status/prom/templates/health.html new file mode 100644 index 0000000..3e97373 --- /dev/null +++ 
b/status/prom/templates/health.html @@ -0,0 +1,55 @@ + + + + Prometheus Metrics + + + +

Prometheus Metrics

+

Last updated: {{ timestamp }}

+ + + + + + {% for item in metrics %} + + + + + {% endfor %} +
Instance | Value
{{ item.instance }} + {{ item.status }} +
+ + \ No newline at end of file From 315cf9ef90ad064cdad2756cbc7e8247e081845f Mon Sep 17 00:00:00 2001 From: vineeshah Date: Thu, 3 Jul 2025 10:20:51 -0700 Subject: [PATCH 05/15] without coin-api --- status/{prom => }/docker-compose.yml | 11 +--- status/{prom => }/dockerfile.scraper | 0 status/{prom => }/flags.py | 6 +++ status/json.json | 6 +++ status/{prom => }/nginx.conf | 0 status/prom/dockerfile | 14 ----- status/prom/json.json | 13 ----- status/prom/output/metrics.html | 22 -------- status/prom/queries.txt | 21 -------- status/prom/server.py | 71 ------------------------- status/{prom => }/prometheus.yml | 0 status/{prom => }/requirements.txt | 0 status/{prom => }/scraper.py | 50 +++++++++++++---- status/{prom => }/templates/health.html | 0 14 files changed, 54 insertions(+), 160 deletions(-) rename status/{prom => }/docker-compose.yml (75%) rename status/{prom => }/dockerfile.scraper (100%) rename status/{prom => }/flags.py (75%) create mode 100644 status/json.json rename status/{prom => }/nginx.conf (100%) delete mode 100644 status/prom/dockerfile delete mode 100644 status/prom/json.json delete mode 100644 status/prom/output/metrics.html delete mode 100644 status/prom/queries.txt delete mode 100644 status/prom/server.py rename status/{prom => }/prometheus.yml (100%) rename status/{prom => }/requirements.txt (100%) rename status/{prom => }/scraper.py (74%) rename status/{prom => }/templates/health.html (100%) diff --git a/status/prom/docker-compose.yml b/status/docker-compose.yml similarity index 75% rename from status/prom/docker-compose.yml rename to status/docker-compose.yml index 4f46dfe..f2ff942 100644 --- a/status/prom/docker-compose.yml +++ b/status/docker-compose.yml @@ -1,14 +1,6 @@ version: '2' services: - coin-api: - container_name: coin-api - build: - context: . 
- dockerfile: ./dockerfile - restart: 'on-failure' - ports: - - "5000:5000" prometheus: image: prom/prometheus:latest restart: always @@ -31,7 +23,8 @@ services: ports: - "8000:8000" restart: 'on-failure' - command: python3 scraper.py --json json.json + command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus/metrics" + nginx: image: nginx:1.25.3 diff --git a/status/prom/dockerfile.scraper b/status/dockerfile.scraper similarity index 100% rename from status/prom/dockerfile.scraper rename to status/dockerfile.scraper diff --git a/status/prom/flags.py b/status/flags.py similarity index 75% rename from status/prom/flags.py rename to status/flags.py index 69d6c0c..7194488 100644 --- a/status/prom/flags.py +++ b/status/flags.py @@ -21,5 +21,11 @@ def get_args(): required = True, help = "argument to a json file, where the json file specifies what services we need to query" ) + parser.add_argument( + "--promurl", + type = str, + default= "http://prometheus:9090", + help = "the url for the promtheus container thats running that has to be scraped" + ) return parser.parse_args() \ No newline at end of file diff --git a/status/json.json b/status/json.json new file mode 100644 index 0000000..5c8efee --- /dev/null +++ b/status/json.json @@ -0,0 +1,6 @@ +[ + { + "job-id": "prometheus-aggregation", + "query": "up" + } +] \ No newline at end of file diff --git a/status/prom/nginx.conf b/status/nginx.conf similarity index 100% rename from status/prom/nginx.conf rename to status/nginx.conf diff --git a/status/prom/dockerfile b/status/prom/dockerfile deleted file mode 100644 index b13d47b..0000000 --- a/status/prom/dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM python:3.9-slim - -WORKDIR /app - -COPY requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY server.py . 
- - -EXPOSE 5000 - -CMD ["python3", "server.py"] diff --git a/status/prom/json.json b/status/prom/json.json deleted file mode 100644 index a4411c2..0000000 --- a/status/prom/json.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "hostname": "localhost:3000/metrics", - "queries": [ - - { - "serviceName": "Coin-API", - "query": "up" - } - - ] - } -] \ No newline at end of file diff --git a/status/prom/output/metrics.html b/status/prom/output/metrics.html deleted file mode 100644 index 4f97d2b..0000000 --- a/status/prom/output/metrics.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - - Prometheus Metrics - - -

Prometheus Metrics

- - - - - - - - - - - -
Instance | Value
coin-api:5000 | 1
- - diff --git a/status/prom/queries.txt b/status/prom/queries.txt deleted file mode 100644 index 5bc99ee..0000000 --- a/status/prom/queries.txt +++ /dev/null @@ -1,21 +0,0 @@ -{ - "hostname": "localhost:9090/metrics", - "queries": [ - { - "serviceName": "My FastAPI API - CPU Usage", - "query": "rate(node_cpu_seconds_total[5m])" - }, - { - "serviceName": "Prometheus - Memory Usage", - "query": "node_memory_MemAvailable_bytes" - }, - { - "serviceName": "My FastAPI API - Uptime", - "query": "time() - process_start_time_seconds" - }, - { - "serviceName": "Last Down::", - "query": "down" - } - ] - } \ No newline at end of file diff --git a/status/prom/server.py b/status/prom/server.py deleted file mode 100644 index 736aaa6..0000000 --- a/status/prom/server.py +++ /dev/null @@ -1,71 +0,0 @@ -from fastapi import FastAPI, HTTPException, Response -import uvicorn -import random -import prometheus_client as pc -from fastapi.middleware.cors import CORSMiddleware - -app = FastAPI() - -app.add_middleware( - CORSMiddleware, - allow_origins = ["*"], - allow_methods = ["*"], - allow_headers = ["*"], -) -heads_count = pc.Counter( - "heads_count",#metrics name - "number of heads",#help text -) -tails_count = pc.Counter( - "tails_count",#metrics name - "number of tails",#help text -) -flip_count = pc.Counter( - "flip_count",#metrics name - "number of flips",#help text -) - -# Counter for coin flips - -@app.get("/") -def root(): - return({ "message": "hello world" }) - -@app.get("/flip") -def flip_coin(times = None): - flip_counts = { - "heads": 0, - "tails": 0 - } - - if times is None: - # Single flip when no parameter is provided - # result = "heads" if random.random() > 0.5 else "tails" - # flip_counts[result] += 1 - return flip_counts - else: - try: - times_as_int = int(times) - for i in range(times_as_int): - result = "heads" if random.random() > 0.5 else "tails" - flip_counts[result] += 1 - heads_count.inc(flip_counts["heads"]) - tails_count.inc(flip_counts["tails"]) - 
flip_count.inc(times_as_int) - return flip_counts - except ValueError: - return {"error": "Parameter 'times' must be a valid integer"} - - -@app.get("/metrics") -def get_metrics(): - return Response( - media_type="text/plain", - content = pc.generate_latest(), - ) - - - -if __name__ == "__main__": - uvicorn.run(app , host= "0.0.0.0", port=5000) - diff --git a/status/prom/prometheus.yml b/status/prometheus.yml similarity index 100% rename from status/prom/prometheus.yml rename to status/prometheus.yml diff --git a/status/prom/requirements.txt b/status/requirements.txt similarity index 100% rename from status/prom/requirements.txt rename to status/requirements.txt diff --git a/status/prom/scraper.py b/status/scraper.py similarity index 74% rename from status/prom/scraper.py rename to status/scraper.py index 40b3654..2570709 100644 --- a/status/prom/scraper.py +++ b/status/scraper.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from fastapi.responses import HTMLResponse from prometheus_api_client import PrometheusConnect from fastapi import FastAPI, Request @@ -11,8 +12,9 @@ import os from datetime import datetime, timedelta import pytz +import requests + -prom = PrometheusConnect(url = "http://prometheus:9090", disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" app = FastAPI() @@ -21,19 +23,45 @@ templates = Jinja2Templates(directory="templates") args = get_args() + +prom = PrometheusConnect(url = args.promurl, disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" + metrics_data = [] up_hours = 24 +@dataclass +class metrics: + job_name: str + timestamp: float + value: float + +def check_status(query): + params = {"query" : query} + try: + response = requests.get("http://prometheus:9090/api/v1/query", params = params) + response.raise_for_status()# Raise an error for HTTP issues + json_response = response.json() + if json_response["status"]=="success": + return True + elif json_response["status"]==None: + 
print("the status key does not exist!") + return False + else: + return False + + except Exception as e: + print(f"Error querying Prometheus: {e}") + return None + def polling_loop(interval, config): while True: global metrics_data metrics_data = [] for hosts in config: - for query in hosts["queries"]: - service_name = query["serviceName"] - prom_query = query["query"] - if prom_query == "up": - process_up_query(prom_query, service_name) + service_name = hosts["job-id"] + prom_query = hosts["query"] + if prom_query == "up": + process_up_query(prom_query, service_name) time.sleep(interval) service_data = {} @@ -41,7 +69,8 @@ def polling_loop(interval, config): def process_up_query(query, service_name): global metrics_data, service_data process_time_query("time() - process_start_time_seconds", service_name) - + if not check_status(query="up"): + print("status is not success, please look into it!!") try: result = prom.custom_query(query=query) if not result: @@ -54,9 +83,10 @@ def process_up_query(query, service_name): return for metric in result: - instance = metric['metric'].get('instance', 'unknown') - value = metric['value'][1] - last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") + job_name = metric.get('metric',{}).get('job', "")#for later use in dataclass + time_stamp = metric.get('value', [])[0]#for later use in dataclass + value = metric.get('value', [])[1] + # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" if status == "Unhealthy": current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) diff --git a/status/prom/templates/health.html b/status/templates/health.html similarity index 100% rename from status/prom/templates/health.html rename to status/templates/health.html From a0acfc5a1a6cd2d44c9604df62f83638dd2e3d28 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Thu, 3 Jul 2025 12:35:58 -0700 Subject: [PATCH 06/15] refined code 
for required promurl --- status/docker-compose.yml | 2 +- status/scraper.py | 24 +++++++------ status/templates/health.html | 68 ++++++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 26 deletions(-) diff --git a/status/docker-compose.yml b/status/docker-compose.yml index f2ff942..c63ca4f 100644 --- a/status/docker-compose.yml +++ b/status/docker-compose.yml @@ -23,7 +23,7 @@ services: ports: - "8000:8000" restart: 'on-failure' - command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus/metrics" + command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus" nginx: diff --git a/status/scraper.py b/status/scraper.py index 2570709..0150614 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -54,8 +54,8 @@ def check_status(query): return None def polling_loop(interval, config): + global metrics_data while True: - global metrics_data metrics_data = [] for hosts in config: service_name = hosts["job-id"] @@ -83,20 +83,22 @@ def process_up_query(query, service_name): return for metric in result: - job_name = metric.get('metric',{}).get('job', "")#for later use in dataclass - time_stamp = metric.get('value', [])[0]#for later use in dataclass + instance = metric["metric"].get("instance", "unknown") + job_name = metric["metric"].get("job", "unknown")#for later use in dataclass value = metric.get('value', [])[1] # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" if status == "Unhealthy": current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) metrics_data.append({ - "instance": service_name, + "instance": instance, + "job":job_name, "status": current }) else: metrics_data.append({ - "instance": service_name, + "instance": instance, + "job": job_name, "status": "Healthy" }) except Exception as e: @@ -112,11 +114,13 @@ def process_time_query(query, service_name): try: result = 
prom.custom_query(query=query) if result and len(result) > 0: - first_result = result[0] - uptime_seconds = float(first_result["value"][1]) - up_hours = int(uptime_seconds/3600) - if up_hours == 0: - up_hours = 1 + for metric in result: + instance = metric["metric"].get("instance", "unknown") + job_name = metric["metric"].get("job", "unknown") + uptime_seconds = float(metric["value"][1]) + up_hours = int(uptime_seconds / 3600) + if up_hours == 0: + up_hours = 1 except Exception as e: print(f"Error processing time query '{query}': {e}") diff --git a/status/templates/health.html b/status/templates/health.html index 3e97373..ade2ca6 100644 --- a/status/templates/health.html +++ b/status/templates/health.html @@ -4,33 +4,71 @@ Prometheus Metrics @@ -39,12 +77,12 @@

Prometheus Metrics

Last updated: {{ timestamp }}

- - + + {% for item in metrics %} - + From b69aa26d6ee23b8f4412a5e0cc7eaa9ed5b9b611 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Mon, 7 Jul 2025 17:39:15 -0700 Subject: [PATCH 07/15] combined docker-compose --- docker-compose.yml | 22 +++++++++++++++++++ ...{docker-compose.yml => docker-compose.txt} | 10 --------- status/prometheus.yml | 8 ------- status/scraper.py | 2 ++ 4 files changed, 24 insertions(+), 18 deletions(-) rename status/{docker-compose.yml => docker-compose.txt} (68%) delete mode 100644 status/prometheus.yml diff --git a/docker-compose.yml b/docker-compose.yml index b047c88..5c0da9f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,28 @@ services: - "/var/run/docker.sock:/var/run/docker.sock" - portainer_data:/data + scraper: + container_name: scraper + build: + context: . + dockerfile: ./status/dockerfile.scraper + volumes: + - ./status/output:/app/output + - ./status/templates:/app/templates + - ./status/json.json:/app/json.json + ports: + - "8000:8000" + restart: 'on-failure' + command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus" + + + nginx: + image: nginx:1.25.3 + ports: + - 80:80 + volumes: + - ./status/nginx.conf:/etc/nginx/nginx.conf + volumes: alertmanager-data: grafana-data: diff --git a/status/docker-compose.yml b/status/docker-compose.txt similarity index 68% rename from status/docker-compose.yml rename to status/docker-compose.txt index c63ca4f..37b26d9 100644 --- a/status/docker-compose.yml +++ b/status/docker-compose.txt @@ -1,16 +1,6 @@ version: '2' services: - prometheus: - image: prom/prometheus:latest - restart: always - ports: - - 9090:9090 - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - command: - - --config.file=/etc/prometheus/prometheus.yml - scraper: container_name: scraper build: diff --git a/status/prometheus.yml b/status/prometheus.yml deleted file mode 100644 index a50693f..0000000 --- a/status/prometheus.yml +++ /dev/null @@ -1,8 +0,0 @@ -global: - 
scrape_interval: 10s - - -scrape_configs: - - job_name: 'coin-api' - static_configs: - - targets: ['coin-api:5000'] diff --git a/status/scraper.py b/status/scraper.py index 0150614..4cf0209 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -71,6 +71,8 @@ def process_up_query(query, service_name): process_time_query("time() - process_start_time_seconds", service_name) if not check_status(query="up"): print("status is not success, please look into it!!") + else: + print("status is success in the query!!") try: result = prom.custom_query(query=query) if not result: From 70e068925adb5d06edfbd9f12de6d81a4092cf5f Mon Sep 17 00:00:00 2001 From: vineeshah Date: Tue, 8 Jul 2025 07:53:29 -0700 Subject: [PATCH 08/15] prom container clean-up and common docker file --- docker-compose.yml | 3 +-- status/docker-compose.txt | 24 ------------------------ status/dockerfile.scraper | 8 ++++---- status/{templates => }/health.html | 0 status/scraper.py | 2 +- 5 files changed, 6 insertions(+), 31 deletions(-) delete mode 100644 status/docker-compose.txt rename status/{templates => }/health.html (100%) diff --git a/docker-compose.yml b/docker-compose.yml index 5c0da9f..9c92650 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,8 +78,7 @@ services: context: . dockerfile: ./status/dockerfile.scraper volumes: - - ./status/output:/app/output - - ./status/templates:/app/templates + - ./status/health.html:/app/health.html - ./status/json.json:/app/json.json ports: - "8000:8000" diff --git a/status/docker-compose.txt b/status/docker-compose.txt deleted file mode 100644 index 37b26d9..0000000 --- a/status/docker-compose.txt +++ /dev/null @@ -1,24 +0,0 @@ -version: '2' - -services: - scraper: - container_name: scraper - build: - context: . 
- dockerfile: ./dockerfile.scraper - volumes: - - ./output:/app/output - - ./templates:/app/templates - - ./json.json:/app/json.json - ports: - - "8000:8000" - restart: 'on-failure' - command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus" - - - nginx: - image: nginx:1.25.3 - ports: - - 80:80 - volumes: - - ./nginx.conf:/etc/nginx/nginx.conf diff --git a/status/dockerfile.scraper b/status/dockerfile.scraper index 420ce1f..1fd210c 100644 --- a/status/dockerfile.scraper +++ b/status/dockerfile.scraper @@ -3,12 +3,12 @@ FROM python:3.9-slim WORKDIR /app -COPY requirements.txt . +COPY ./status/requirements.txt . RUN pip3 install -r requirements.txt -COPY flags.py . -COPY scraper.py . -COPY templates/ ./templates/ +COPY ./status/flags.py . +COPY ./status/scraper.py . +COPY ./status/health.html /app/health.html CMD ["python3", "scraper.py", "--json", "json.json"] \ No newline at end of file diff --git a/status/templates/health.html b/status/health.html similarity index 100% rename from status/templates/health.html rename to status/health.html diff --git a/status/scraper.py b/status/scraper.py index 4cf0209..149bf94 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -20,7 +20,7 @@ pacific_tz = pytz.timezone('US/Pacific') -templates = Jinja2Templates(directory="templates") +templates = Jinja2Templates(directory=".") args = get_args() From a8eadba3ef4e60b182481bc4c522f28c61e76ce7 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Tue, 8 Jul 2025 12:32:14 -0700 Subject: [PATCH 09/15] cleaning up and renaming --- docker-compose.yml | 17 +++----- status/{dockerfile.scraper => dockerfile} | 2 +- status/flags.py | 31 -------------- status/requirements.txt | 2 +- status/scraper.py | 50 ++++++++++++++++++++--- 5 files changed, 51 insertions(+), 51 deletions(-) rename status/{dockerfile.scraper => dockerfile} (84%) delete mode 100644 status/flags.py diff --git a/docker-compose.yml b/docker-compose.yml index 9c92650..6f09ec1 100644 --- 
a/docker-compose.yml +++ b/docker-compose.yml @@ -73,26 +73,19 @@ services: - portainer_data:/data scraper: - container_name: scraper + container_name: system_status build: context: . - dockerfile: ./status/dockerfile.scraper + dockerfile: ./status/dockerfile volumes: - ./status/health.html:/app/health.html - ./status/json.json:/app/json.json - ports: - - "8000:8000" restart: 'on-failure' - command: python3 scraper.py --json json.json --promurl "http://one.sce/prometheus" + command: python3 scraper.py + --json json.json + --promurl "http://one.sce/prometheus" - nginx: - image: nginx:1.25.3 - ports: - - 80:80 - volumes: - - ./status/nginx.conf:/etc/nginx/nginx.conf - volumes: alertmanager-data: grafana-data: diff --git a/status/dockerfile.scraper b/status/dockerfile similarity index 84% rename from status/dockerfile.scraper rename to status/dockerfile index 1fd210c..a1bcf37 100644 --- a/status/dockerfile.scraper +++ b/status/dockerfile @@ -11,4 +11,4 @@ COPY ./status/flags.py . COPY ./status/scraper.py . 
COPY ./status/health.html /app/health.html -CMD ["python3", "scraper.py", "--json", "json.json"] \ No newline at end of file +ENTRYPOINT ["python3", "scraper.py"] \ No newline at end of file diff --git a/status/flags.py b/status/flags.py deleted file mode 100644 index 7194488..0000000 --- a/status/flags.py +++ /dev/null @@ -1,31 +0,0 @@ -import argparse - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--interval", - "-int", - type= int, - default = 15, - help = "interval for how often queries should be done" - ) - parser.add_argument( - "--port", - type = int, - default = 8000, - help = "port for server to be hosted on, defaults to 8000" - ) - parser.add_argument( - "--json", - type = str, - required = True, - help = "argument to a json file, where the json file specifies what services we need to query" - ) - parser.add_argument( - "--promurl", - type = str, - default= "http://prometheus:9090", - help = "the url for the promtheus container thats running that has to be scraped" - ) - - return parser.parse_args() \ No newline at end of file diff --git a/status/requirements.txt b/status/requirements.txt index 6b8b5b2..112ac36 100644 --- a/status/requirements.txt +++ b/status/requirements.txt @@ -2,5 +2,5 @@ fastapi==0.84.0 uvicorn==0.18.3 Jinja2==3.0.2 py-grpc-prometheus==0.7.0 -prometheus_api_client +prometheus_api_client==0.4.0 diff --git a/status/scraper.py b/status/scraper.py index 149bf94..90d7d85 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -5,7 +5,7 @@ from fastapi.templating import Jinja2Templates from fastapi.staticfiles import StaticFiles import uvicorn -from flags import get_args +# from flags import get_args import json import time import threading @@ -13,8 +13,39 @@ from datetime import datetime, timedelta import pytz import requests +import argparse +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--interval", + "-int", + type= int, + default = 15, + help = "interval for how 
often queries should be done" + ) + parser.add_argument( + "--port", + type = int, + default = 8000, + help = "port for server to be hosted on, defaults to 8000" + ) + parser.add_argument( + "--json", + type = str, + required = True, + help = "argument to a json file, where the json file specifies what services we need to query" + ) + parser.add_argument( + "--promurl", + type = str, + default= "http://prometheus:9090", + help = "the url for the promtheus container thats running that has to be scraped" + ) + + return parser.parse_args() + app = FastAPI() @@ -41,7 +72,7 @@ def check_status(query): response = requests.get("http://prometheus:9090/api/v1/query", params = params) response.raise_for_status()# Raise an error for HTTP issues json_response = response.json() - if json_response["status"]=="success": + if json_response.get("status")=="success": return True elif json_response["status"]==None: print("the status key does not exist!") @@ -85,8 +116,8 @@ def process_up_query(query, service_name): return for metric in result: - instance = metric["metric"].get("instance", "unknown") - job_name = metric["metric"].get("job", "unknown")#for later use in dataclass + instance = metric.get("metric",{}).get("instance", "unknown") + job_name = metric.get("metric",{}).get("job", "unknown")#for later use in dataclass value = metric.get('value', [])[1] # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" @@ -117,8 +148,8 @@ def process_time_query(query, service_name): result = prom.custom_query(query=query) if result and len(result) > 0: for metric in result: - instance = metric["metric"].get("instance", "unknown") - job_name = metric["metric"].get("job", "unknown") + instance = metric.get("metric",{}).get("instance", "unknown") + job_name = metric.get("metric",{}).get("job", "unknown") uptime_seconds = float(metric["value"][1]) up_hours = int(uptime_seconds / 3600) if up_hours == 0: @@ -174,3 +205,10 
@@ def main(): if __name__ == "__main__": main() + +# nginx: +# image: nginx:1.25.3 +# ports: +# - 80:80 +# volumes: +# - ./status/nginx.conf:/etc/nginx/nginx.conf From 6597cf7338da959c31a3239ac41f180601959449 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Tue, 8 Jul 2025 16:33:33 -0700 Subject: [PATCH 10/15] python formatter applied --- status/scraper.py | 135 +++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 60 deletions(-) diff --git a/status/scraper.py b/status/scraper.py index 90d7d85..698ea4b 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -4,7 +4,8 @@ from fastapi import FastAPI, Request from fastapi.templating import Jinja2Templates from fastapi.staticfiles import StaticFiles -import uvicorn +import uvicorn + # from flags import get_args import json import time @@ -21,27 +22,27 @@ def get_args(): parser.add_argument( "--interval", "-int", - type= int, - default = 15, - help = "interval for how often queries should be done" + type=int, + default=15, + help="interval for how often queries should be done", ) parser.add_argument( "--port", - type = int, - default = 8000, - help = "port for server to be hosted on, defaults to 8000" + type=int, + default=8000, + help="port for server to be hosted on, defaults to 8000", ) parser.add_argument( "--json", - type = str, - required = True, - help = "argument to a json file, where the json file specifies what services we need to query" + type=str, + required=True, + help="argument to a json file, where the json file specifies what services we need to query", ) parser.add_argument( "--promurl", - type = str, - default= "http://prometheus:9090", - help = "the url for the promtheus container thats running that has to be scraped" + type=str, + default="http://prometheus:9090", + help="the url for the promtheus container thats running that has to be scraped", ) return parser.parse_args() @@ -49,16 +50,19 @@ def get_args(): app = FastAPI() -pacific_tz = pytz.timezone('US/Pacific') 
+pacific_tz = pytz.timezone("US/Pacific") templates = Jinja2Templates(directory=".") args = get_args() -prom = PrometheusConnect(url = args.promurl, disable_ssl=True)#this will query "http://prometheus:9090/api/v1/query?query=up" +prom = PrometheusConnect( + url=args.promurl, disable_ssl=True +) # this will query "http://prometheus:9090/api/v1/query?query=up" metrics_data = [] -up_hours = 24 +up_hours = 24 + @dataclass class metrics: @@ -66,15 +70,16 @@ class metrics: timestamp: float value: float + def check_status(query): - params = {"query" : query} + params = {"query": query} try: - response = requests.get("http://prometheus:9090/api/v1/query", params = params) - response.raise_for_status()# Raise an error for HTTP issues + response = requests.get("http://prometheus:9090/api/v1/query", params=params) + response.raise_for_status() # Raise an error for HTTP issues json_response = response.json() - if json_response.get("status")=="success": + if json_response.get("status") == "success": return True - elif json_response["status"]==None: + elif json_response["status"] == None: print("the status key does not exist!") return False else: @@ -84,19 +89,22 @@ def check_status(query): print(f"Error querying Prometheus: {e}") return None + def polling_loop(interval, config): - global metrics_data - while True: - metrics_data = [] - for hosts in config: - service_name = hosts["job-id"] - prom_query = hosts["query"] - if prom_query == "up": - process_up_query(prom_query, service_name) - time.sleep(interval) + global metrics_data + while True: + metrics_data = [] + for hosts in config: + service_name = hosts["job-id"] + prom_query = hosts["query"] + if prom_query == "up": + process_up_query(prom_query, service_name) + time.sleep(interval) + service_data = {} + def process_up_query(query, service_name): global metrics_data, service_data process_time_query("time() - process_start_time_seconds", service_name) @@ -109,37 +117,35 @@ def process_up_query(query, service_name): if 
not result: print(f"No results for query: {query}") last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") - metrics_data.append({ - "instance": service_name, - "status": "Error in querying" - }) + metrics_data.append( + {"instance": service_name, "status": "Error in querying"} + ) return - + for metric in result: - instance = metric.get("metric",{}).get("instance", "unknown") - job_name = metric.get("metric",{}).get("job", "unknown")#for later use in dataclass - value = metric.get('value', [])[1] + instance = metric.get("metric", {}).get("instance", "unknown") + job_name = metric.get("metric", {}).get( + "job", "unknown" + ) # for later use in dataclass + value = metric.get("value", [])[1] # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" if status == "Unhealthy": - current = get_first_match_time(prom=prom, prom_query="up", match_value=0, hours=up_hours) - metrics_data.append({ - "instance": instance, - "job":job_name, - "status": current - }) + current = get_first_match_time( + prom=prom, prom_query="up", match_value=0, hours=up_hours + ) + metrics_data.append( + {"instance": instance, "job": job_name, "status": current} + ) else: - metrics_data.append({ - "instance": instance, - "job": job_name, - "status": "Healthy" - }) + metrics_data.append( + {"instance": instance, "job": job_name, "status": "Healthy"} + ) except Exception as e: print(f"Error processing query '{query}': {e}") - metrics_data.append({ - "instance": service_name, - "status": "Unhealthy due to error!" 
- }) + metrics_data.append( + {"instance": service_name, "status": "Unhealthy due to error!"} + ) def process_time_query(query, service_name): @@ -147,9 +153,9 @@ def process_time_query(query, service_name): try: result = prom.custom_query(query=query) if result and len(result) > 0: - for metric in result: - instance = metric.get("metric",{}).get("instance", "unknown") - job_name = metric.get("metric",{}).get("job", "unknown") + for metric in result: + instance = metric.get("metric", {}).get("instance", "unknown") + job_name = metric.get("metric", {}).get("job", "unknown") uptime_seconds = float(metric["value"][1]) up_hours = int(uptime_seconds / 3600) if up_hours == 0: @@ -157,11 +163,12 @@ def process_time_query(query, service_name): except Exception as e: print(f"Error processing time query '{query}': {e}") + def get_first_match_time(prom, prom_query, match_value=0, hours=24): global metrics_data prom_query = "up" start_time = datetime.now() - timedelta(hours=hours) - end_time = datetime.now() + end_time = datetime.now() try: result = prom.get_metric_range_data( @@ -190,19 +197,27 @@ def get_first_match_time(prom, prom_query, match_value=0, hours=24): @app.get("/", response_class=HTMLResponse) async def get_metrics(request: Request): return templates.TemplateResponse( - "health.html", - {"request": request, "metrics": metrics_data, "timestamp": datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z")} + "health.html", + { + "request": request, + "metrics": metrics_data, + "timestamp": datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z"), + }, ) + def main(): with open(args.json, "r") as file: config = json.load(file) - polling_thread = threading.Thread(target = polling_loop, args = (args.interval,config), daemon=True)#The daemon=True ensures the thread exits when the main program exits. 
+ polling_thread = threading.Thread( + target=polling_loop, args=(args.interval, config), daemon=True + ) # The daemon=True ensures the thread exits when the main program exits. polling_thread.start() uvicorn.run(app, host="0.0.0.0", port=args.port) + if __name__ == "__main__": main() From 37b12c11d2fa80fb6684ebf3f9fbd0ddc4d7dee5 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Wed, 9 Jul 2025 18:57:58 -0700 Subject: [PATCH 11/15] used proper logging and other improvements per suggestions --- docker-compose.yml | 7 ++-- status/dockerfile | 2 ++ status/scraper.py | 80 ++++++++++++++++++++++++++++------------------ 3 files changed, 55 insertions(+), 34 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 6f09ec1..541b002 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -81,9 +81,10 @@ services: - ./status/health.html:/app/health.html - ./status/json.json:/app/json.json restart: 'on-failure' - command: python3 scraper.py - --json json.json - --promurl "http://one.sce/prometheus" + command: + - --config json.json + - --prometheus_url "http://one.sce/prometheus" + - -vvv volumes: diff --git a/status/dockerfile b/status/dockerfile index a1bcf37..1a9ea20 100644 --- a/status/dockerfile +++ b/status/dockerfile @@ -8,7 +8,9 @@ COPY ./status/requirements.txt . RUN pip3 install -r requirements.txt COPY ./status/flags.py . + COPY ./status/scraper.py . 
+ COPY ./status/health.html /app/health.html ENTRYPOINT ["python3", "scraper.py"] \ No newline at end of file diff --git a/status/scraper.py b/status/scraper.py index 698ea4b..1cf9897 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -15,12 +15,13 @@ import pytz import requests import argparse +import logging def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--interval", + "--query-interval-seconds", "-int", type=int, default=15, @@ -33,17 +34,24 @@ def get_args(): help="port for server to be hosted on, defaults to 8000", ) parser.add_argument( - "--json", + "--config", type=str, required=True, help="argument to a json file, where the json file specifies what services we need to query", ) parser.add_argument( - "--promurl", + "--prometheus-url", type=str, - default="http://prometheus:9090", + default="http://one.sce/prometheus", help="the url for the promtheus container thats running that has to be scraped", ) + parser.add_argument( + "--verbose", + "-v", + action="count", + default=0, + help="increase output verbosity)", + ) return parser.parse_args() @@ -57,12 +65,20 @@ def get_args(): args = get_args() prom = PrometheusConnect( - url=args.promurl, disable_ssl=True + url=args.prometheus_url, disable_ssl=True ) # this will query "http://prometheus:9090/api/v1/query?query=up" metrics_data = [] up_hours = 24 +logging.Formatter.converter = time.gmtime + +logging.basicConfig( + format="%(asctime)s.%(msecs)03dZ %(levelname)s:%(name)s:%(message)s", + datefmt="%Y-%m-%dT%H:%M:%S", + level=logging.ERROR - (args.verbose * 10), +) + @dataclass class metrics: @@ -74,19 +90,19 @@ class metrics: def check_status(query): params = {"query": query} try: - response = requests.get("http://prometheus:9090/api/v1/query", params=params) + response = requests.get(f"{args.prometheus_url}/api/v1/query", params=params) response.raise_for_status() # Raise an error for HTTP issues json_response = response.json() if json_response.get("status") == "success": 
return True elif json_response["status"] == None: - print("the status key does not exist!") + logging.info("the status key does not exist!") return False else: return False except Exception as e: - print(f"Error querying Prometheus: {e}") + logging.exception(f"Error querying Prometheus: {e}") return None @@ -95,8 +111,8 @@ def polling_loop(interval, config): while True: metrics_data = [] for hosts in config: - service_name = hosts["job-id"] - prom_query = hosts["query"] + service_name = hosts.get("job-id", "prometheus-aggregation") + prom_query = hosts.get("query", "up") if prom_query == "up": process_up_query(prom_query, service_name) time.sleep(interval) @@ -109,13 +125,13 @@ def process_up_query(query, service_name): global metrics_data, service_data process_time_query("time() - process_start_time_seconds", service_name) if not check_status(query="up"): - print("status is not success, please look into it!!") + logging.warning("status is not success, please look into it!!") else: - print("status is success in the query!!") + logging.info("status is success in the query!!") try: result = prom.custom_query(query=query) if not result: - print(f"No results for query: {query}") + logging.info(f"No results for query: {query}") last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") metrics_data.append( {"instance": service_name, "status": "Error in querying"} @@ -142,7 +158,7 @@ def process_up_query(query, service_name): {"instance": instance, "job": job_name, "status": "Healthy"} ) except Exception as e: - print(f"Error processing query '{query}': {e}") + logging.exception(f"Error processing query '{query}': {e}") metrics_data.append( {"instance": service_name, "status": "Unhealthy due to error!"} ) @@ -161,7 +177,7 @@ def process_time_query(query, service_name): if up_hours == 0: up_hours = 1 except Exception as e: - print(f"Error processing time query '{query}': {e}") + logging.exception(f"Error processing time query '{query}': {e}") def 
get_first_match_time(prom, prom_query, match_value=0, hours=24): @@ -190,7 +206,7 @@ def get_first_match_time(prom, prom_query, match_value=0, hours=24): status = f"Unhealthy as of {readable_time}" return status except Exception as e: - print(f"Error in get_first_match_time: {e}") + logging.exception(f"Error in get_first_match_time: {e}") return "Error checking status history" @@ -207,23 +223,25 @@ async def get_metrics(request: Request): def main(): - with open(args.json, "r") as file: - config = json.load(file) - - polling_thread = threading.Thread( - target=polling_loop, args=(args.interval, config), daemon=True - ) # The daemon=True ensures the thread exits when the main program exits. - polling_thread.start() - - uvicorn.run(app, host="0.0.0.0", port=args.port) + try: + with open(args.config, "r") as file: + config = json.load(file) + polling_thread = threading.Thread( + target=polling_loop, args=(args.query_interval_seconds, config), daemon=True + ) # The daemon=True ensures the thread exits when the main program exits. 
+ polling_thread.start() + + uvicorn.run(app, host="0.0.0.0", port=args.port) + except FileNotFoundError: + logging.critical(f"Configuration file '{args.config}' not found!") + exit(1) + except Exception as e: + logging.exception("Unexpected error occurred!") + exit(1) + + if __name__ == "__main__": main() -# nginx: -# image: nginx:1.25.3 -# ports: -# - 80:80 -# volumes: -# - ./status/nginx.conf:/etc/nginx/nginx.conf From 94a8d787a3888738c5a9115152709ca73a1bb1a4 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Wed, 9 Jul 2025 19:45:32 -0700 Subject: [PATCH 12/15] in mondo we trust --- status/scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/status/scraper.py b/status/scraper.py index 1cf9897..a8457b8 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -74,6 +74,7 @@ def get_args(): logging.Formatter.converter = time.gmtime logging.basicConfig( + # in mondo we trust format="%(asctime)s.%(msecs)03dZ %(levelname)s:%(name)s:%(message)s", datefmt="%Y-%m-%dT%H:%M:%S", level=logging.ERROR - (args.verbose * 10), From d24ab57a1bbaa1d8c8c1502b304cfb8a51eeb478 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Sun, 13 Jul 2025 21:53:44 -0700 Subject: [PATCH 13/15] updated status("success") checking --- status/scraper.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/status/scraper.py b/status/scraper.py index a8457b8..d316f59 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -94,12 +94,8 @@ def check_status(query): response = requests.get(f"{args.prometheus_url}/api/v1/query", params=params) response.raise_for_status() # Raise an error for HTTP issues json_response = response.json() - if json_response.get("status") == "success": - return True - elif json_response["status"] == None: - logging.info("the status key does not exist!") - return False - else: + if json_response.get("status") != "success": + print(f"json response did not include success in status key, {json_response}") return False except Exception as e: From 
324ebae2a26eae07457fc99ed11645def45433c4 Mon Sep 17 00:00:00 2001 From: vineeshah Date: Sun, 13 Jul 2025 22:01:34 -0700 Subject: [PATCH 14/15] using more .get() to make it safer --- status/scraper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/status/scraper.py b/status/scraper.py index d316f59..d46a612 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -140,7 +140,7 @@ def process_up_query(query, service_name): job_name = metric.get("metric", {}).get( "job", "unknown" ) # for later use in dataclass - value = metric.get("value", [])[1] + value = metric.get("value", [None, None])[1] # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") status = "Healthy" if float(value) > 0 else "Unhealthy" if status == "Unhealthy": @@ -169,7 +169,7 @@ def process_time_query(query, service_name): for metric in result: instance = metric.get("metric", {}).get("instance", "unknown") job_name = metric.get("metric", {}).get("job", "unknown") - uptime_seconds = float(metric["value"][1]) + uptime_seconds = float(metric.get("value", [None,None])[1] or 0) up_hours = int(uptime_seconds / 3600) if up_hours == 0: up_hours = 1 @@ -192,7 +192,7 @@ def get_first_match_time(prom, prom_query, match_value=0, hours=24): for series in result: saw_up = False - for timestamp, value in reversed(series["values"]): + for timestamp, value in reversed(series.get("values", [])): v = float(value) if v == 1: saw_up = True From 80f85331d93110b62cef5f0428c0d264170f751d Mon Sep 17 00:00:00 2001 From: vineeshah Date: Mon, 28 Jul 2025 15:13:31 -0400 Subject: [PATCH 15/15] python fixes --- status/scraper.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/status/scraper.py b/status/scraper.py index d46a612..7d6ae89 100644 --- a/status/scraper.py +++ b/status/scraper.py @@ -110,8 +110,7 @@ def polling_loop(interval, config): for hosts in config: service_name = hosts.get("job-id", "prometheus-aggregation") prom_query = 
hosts.get("query", "up") - if prom_query == "up": - process_up_query(prom_query, service_name) + process_up_query(prom_query, service_name) time.sleep(interval) @@ -142,7 +141,10 @@ def process_up_query(query, service_name): ) # for later use in dataclass value = metric.get("value", [None, None])[1] # last_active = datetime.now(pacific_tz).strftime("%Y-%m-%d %H:%M:%S %Z") - status = "Healthy" if float(value) > 0 else "Unhealthy" + if float(value) > 0: + status = "Healthy" + else: + status = "Unhealthy" if status == "Unhealthy": current = get_first_match_time( prom=prom, prom_query="up", match_value=0, hours=up_hours @@ -150,10 +152,10 @@ def process_up_query(query, service_name): metrics_data.append( {"instance": instance, "job": job_name, "status": current} ) - else: - metrics_data.append( - {"instance": instance, "job": job_name, "status": "Healthy"} - ) + continue + metrics_data.append( + {"instance": instance, "job": job_name, "status": "Healthy"} + ) except Exception as e: logging.exception(f"Error processing query '{query}': {e}") metrics_data.append( @@ -170,7 +172,7 @@ def process_time_query(query, service_name): instance = metric.get("metric", {}).get("instance", "unknown") job_name = metric.get("metric", {}).get("job", "unknown") uptime_seconds = float(metric.get("value", [None,None])[1] or 0) - up_hours = int(uptime_seconds / 3600) + up_hours = uptime_seconds // 3600 if up_hours == 0: up_hours = 1 except Exception as e:
InstanceValueJobStatus
{{ item.instance }}{{ item.job }} {{ item.status }}