#!/usr/bin/env python3
# Copyright European Organization for Nuclear Research (CERN) since 2012
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Probe to check the queues of the transfer service.

Counts the rows of the requests table grouped by (state, activity,
external_host), prints one "<state>.<activity>.<host> <count>" line per group
and records the count on a gauge obtained from PrometheusPusher.

Exit status is OK (0) on success and UNKNOWN (3) if anything fails.
"""

import sys
import traceback
from urllib.parse import urlparse

# Exit statuses
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

# Label used when a request has no usable transfer-tool host.
NO_FTS_HOST = 'no_fts_host'


def host_label(external_host):
    """
    Turn a request's external_host URL into a metric label.

    :param external_host: URL of the transfer host (e.g.
                          "https://fts.example.org:8446") or None.
    :returns: the URL's hostname with "." replaced by "_", or 'no_fts_host'
              when the value is None, empty, has no network location
              (e.g. a scheme-less "host" string) or cannot be parsed.
    """
    if not external_host:
        return NO_FTS_HOST
    try:
        hostname = urlparse(external_host).hostname
    except ValueError:
        # urlparse rejects malformed network locations (e.g. unbalanced
        # IPv6 brackets); one bad row must not abort the whole probe.
        return NO_FTS_HOST
    if hostname is None:
        return NO_FTS_HOST
    return hostname.replace(".", "_")


def main():
    """
    Run the probe once.

    :returns: OK when all groups were processed, UNKNOWN when any exception
              (including a failed import of the probe's dependencies) occurred.
    """
    try:
        # Imported inside the guarded region on purpose: an uncaught
        # ImportError/configuration error at module level would terminate the
        # interpreter with status 1, which this probe defines as WARNING.
        from sqlalchemy import func, select

        from rucio.db.sqla import models
        from rucio.db.sqla.session import get_session

        from utils.common import PrometheusPusher

        session = get_session()
        statement = select(
            models.Request.state,
            models.Request.activity,
            models.Request.external_host,
            func.count()
        ).group_by(
            models.Request.state,
            models.Request.activity,
            models.Request.external_host
        )

        with PrometheusPusher() as manager:
            # Unpack positionally: a result Row is tuple-like and exposes the
            # tuple method count(), so `row.count` would yield that bound
            # method instead of the value of the unlabeled COUNT(*) column.
            for req_state, req_activity, req_host, num_requests in session.execute(statement):
                state = f"queues.requests.{req_state.name.lower()}"
                activity = req_activity.replace(" ", "_")
                external_host = host_label(req_host)

                print(f"{state}.{activity}.{external_host} {num_requests}")
                # NOTE(review): the metric name is deliberately NOT an
                # f-string - presumably PrometheusPusher.gauge() derives the
                # label names from the {placeholders}; confirm in utils.common.
                # NOTE(review): `state` already carries the "queues.requests."
                # prefix that the metric name repeats - confirm this is the
                # intended label value.
                (manager.gauge(
                    "conveyor.queues.requests.{state}.{activity}.{external_host}",
                    documentation="Status of requests by state, activity, external host.")
                    .labels(activity=activity, state=state, external_host=external_host)
                    .set(num_requests))
    except Exception:
        print(traceback.format_exc())
        return UNKNOWN
    return OK


if __name__ == "__main__":
    sys.exit(main())