diff --git a/common/check_transfer_queues_status b/common/check_transfer_queues_status index 15f94b9..6d9c9a4 100755 --- a/common/check_transfer_queues_status +++ b/common/check_transfer_queues_status @@ -1,15 +1,17 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 +#!/usr/bin/env python3 +# Copyright European Organization for Nuclear Research (CERN) since 2012 # # Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Authors: -# - Mario Lassnig, , 2013-2021 -# - Cedric Serfon, , 2014 -# - Wen Guan, , 2015 -# - Thomas Beermann, , 2019 +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Probe to check the queues of the transfer service @@ -17,46 +19,20 @@ Probe to check the queues of the transfer service from __future__ import print_function import sys +from urllib.parse import urlparse from prometheus_client import CollectorRegistry, Gauge, push_to_gateway +from sqlalchemy import func, select + from rucio.common.config import config_get -from rucio.db.sqla.session import BASE, get_session +from rucio.db.sqla import models +from rucio.db.sqla.session import get_session from utils.common import probe_metrics # Exit statuses OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -if BASE.metadata.schema: - schema = BASE.metadata.schema + '.' -else: - schema = '' - -active_queue = """SELECT -CASE - WHEN state = 'S' THEN 'queues.requests.submitted.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'Q' THEN 'queues.requests.queued.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'F' THEN 'queues.requests.failed.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'D' THEN 'queues.requests.done.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'L' THEN 'queues.requests.lost.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'W' THEN 'queues.requests.waiting.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'M' THEN 'queues.requests.mismatchscheme.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'G' THEN 'queues.requests.submitting.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'N' THEN 'queues.requests.nosources.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'O' THEN 'queues.requests.onlytapesources.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'A' THEN 'queues.requests.submissionfailed.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'U' THEN 'queues.requests.suspend.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'P' THEN 'queues.requests.preparing.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - ELSE state -END state_desc, -num_rows -FROM -( -select state, count(*) num_rows, activity, external_host -FROM {schema}requests -GROUP BY state, activity, external_host -)""".format(schema=schema) - PROM_SERVERS = config_get('monitor', 'prometheus_servers', raise_exception=False, default='') if PROM_SERVERS != '': PROM_SERVERS = PROM_SERVERS.split(',') @@ -66,14 +42,31 @@ if __name__ == "__main__": registry = CollectorRegistry() g = Gauge('conveyor_queues_requests', '', labelnames=('state', 'activity', 'external_host'), registry=registry) session = get_session() - for k in session.execute(active_queue).fetchall(): - print(k[0], k[1], end=" ") - probe_metrics.gauge(name=k[0].replace('-', '_')).set(k[1]) - items = k[0].split('.') - state = items[2] - activity = items[3] - external_host = items[4].replace('-', '_') - g.labels(**{'activity': activity, 'state': state, 'external_host': external_host}).set(k[1]) + + statement = select( + models.Request.state, + models.Request.activity, + models.Request.external_host, + func.count() + ).group_by( + models.Request.state, + models.Request.activity, + models.Request.external_host + ) + + for k in session.execute(statement).fetchall(): + count = k.count + state = k.state.name.lower() + activity = k.activity.replace(" ", "-") + if k.external_host is not None: + external_host = urlparse(k.external_host).hostname.replace(".", "_") + else: + external_host = 'no_fts_host' + + print(state, count, end=" ") + probe_metrics.gauge(name=state).set(count) + g.labels(**{'activity': activity, 'state': state, 'external_host': external_host}).set(count) + if len(PROM_SERVERS): for server in PROM_SERVERS: try: