diff --git a/metrics-influxdb-arc b/metrics-influxdb-arc
index bc61359..d548ebe 100755
--- a/metrics-influxdb-arc
+++ b/metrics-influxdb-arc
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#!/usr/bin/env python3

 import os
 import time
@@ -17,75 +16,75 @@ arex_job_states = [

 # Get numbers of jobs in each state
 def getJobsStatesInfo():
-    control_subdirs = ['accepting', 'finished', 'processing', 'restarting']
-
-    data = ''
-    states = {}
-
-    for control_subdir in control_subdirs:
-        subdir = os.path.join('/var/spool/arc/jobstatus', control_subdir)
-
-        if not os.path.isdir(subdir):
-            return
-
-        try:
-            for status_file in os.listdir(subdir):
-                try:
-                    f = open(os.path.join(subdir, status_file))
-                except IOError, e:
-                    print 'Could not open status file %s: %s' % status_file, str(e)
-                    continue
-
-                status = f.readline().strip()
-                if status in states:
-                    states[status] += 1
-                else:
-                    states[status] = 1
-                f.close()
-        except OSError, e:
-            print 'Could not list status files in %s: %s' % subdir, str(e)
-
-    for state in arex_job_states:
-        if state in states:
-            value = states[state]
-        else:
-            value = 0
-        data += 'jobs,state=' + state + ' value=' + str(value) + '\n'
-
-    return data
+    control_subdirs = ['accepting', 'finished', 'processing', 'restarting']
+
+    data = ''
+    states = {}
+
+    for control_subdir in control_subdirs:
+        subdir = os.path.join('/var/spool/arc/jobstatus', control_subdir)
+
+        if not os.path.isdir(subdir):
+            return
+
+        try:
+            for status_file in os.listdir(subdir):
+                try:
+                    f = open(os.path.join(subdir, status_file))
+                except IOError as e:
+                    print ('Could not open status file %s: %s' % (status_file, str(e)))
+                    continue
+
+                status = f.readline().strip()
+                if status in states:
+                    states[status] += 1
+                else:
+                    states[status] = 1
+                f.close()
+        except OSError as e:
+            print ('Could not list status files in %s: %s' % (subdir, str(e)))
+
+    for state in arex_job_states:
+        if state in states:
+            value = states[state]
+        else:
+            value = 0
+        data += 'jobs,state=' + state + ' value=' + str(value) + '\n'
+
+    return data


 # Get number of jobs in the processing subdirectory
 def getProcessingJobs():
-    processing = 0
+    processing = 0

-    processing_dir = '/var/spool/arc/jobstatus/processing'
+    processing_dir = '/var/spool/arc/jobstatus/processing'

-    try:
-        entries = os.listdir(processing_dir)
-    except OSError, e:
-        print "Error listing dir %s: %s" % processing_dir, str(e)
-        return
-    processing += len(entries)
+    try:
+        entries = os.listdir(processing_dir)
+    except OSError as e:
+        print ("Error listing dir %s: %s" % (processing_dir, str(e)))
+        return
+    processing += len(entries)

-    data = 'jobs,state=PROCESSING value=' + str(processing) + '\n'
-    return data
+    data = 'jobs,state=PROCESSING value=' + str(processing) + '\n'
+    return data


 # Get the time since the modification timestamp of the gm-heartbeat file
 def getHeartBeatInfo():
-    heartbeat = '/var/spool/arc/jobstatus/gm-heartbeat'
-    try:
-        statinfo = os.stat(heartbeat)
-    except OSError, e:
-        print "Error with heartbeat file: %s" %str(e)
-        return
+    heartbeat = '/var/spool/arc/jobstatus/gm-heartbeat'
+    try:
+        statinfo = os.stat(heartbeat)
+    except OSError as e:
+        print ("Error with heartbeat file: %s" % str(e))
+        return

-    mtime = statinfo.st_mtime
-    now = time.time()
-    heartbeat_time = now - mtime
+    mtime = statinfo.st_mtime
+    now = time.time()
+    heartbeat_time = now - mtime

-    data = 'arex_heartbeat_lastseen value=' + str(heartbeat_time) + '\n'
-    return data
+    data = 'arex_heartbeat_lastseen value=' + str(heartbeat_time) + '\n'
+    return data

 # Generate metrics
 data = ''
@@ -93,4 +92,4 @@
data += getJobsStatesInfo() data += getProcessingJobs() data += getHeartBeatInfo() -print data +print (data) diff --git a/metrics-influxdb-condor-capacity b/metrics-influxdb-condor-capacity index c26a515..f0271c6 100755 --- a/metrics-influxdb-condor-capacity +++ b/metrics-influxdb-condor-capacity @@ -1,33 +1,32 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import htcondor -import classad coll = htcondor.Collector() startds = coll.query(htcondor.AdTypes.Startd, "PartitionableSlot =?=True", ["Machine","TotalMemory","Memory","TotalCpus","Cpus","RalScaling","ScalingFactor","RalCluster","StartJobs"]) data = '' for startd in startds: - # Determine the scaling factor - scalingFactor = 0 - if "RalScaling" in startd: - scalingFactor = startd["RalScaling"] - elif "ScalingFactor" in startd: - scalingFactor = startd["ScalingFactor"] + # Determine the scaling factor + scalingFactor = 0 + if "RalScaling" in startd: + scalingFactor = startd["RalScaling"] + elif "ScalingFactor" in startd: + scalingFactor = startd["ScalingFactor"] - capacityTotal = int(4.0*scalingFactor*startd["TotalCpus"]) - capacityIdle = int(4.0*scalingFactor*startd["Cpus"]) - capacityUsed = capacityTotal - capacityIdle + capacityTotal = int(4.0*scalingFactor*startd["TotalCpus"]) + capacityIdle = int(4.0*scalingFactor*startd["Cpus"]) + capacityUsed = capacityTotal - capacityIdle - memoryTotal = startd["TotalMemory"] - memoryIdle = startd["Memory"] - memoryUsed = memoryTotal - memoryIdle + memoryTotal = startd["TotalMemory"] + memoryIdle = startd["Memory"] + memoryUsed = memoryTotal - memoryIdle - if "StartJobs" in startd: - if startd["StartJobs"]: - data = data + 'capacity,type=idle,resource=cpu,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(capacityIdle)+'\n' - data = data + 'capacity,type=idle,resource=memory,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(memoryIdle)+'\n' + if "StartJobs" in startd: + if startd["StartJobs"]: + data = data + 'capacity,type=idle,resource=cpu,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(capacityIdle)+'\n' + data = data + 'capacity,type=idle,resource=memory,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(memoryIdle)+'\n' - data = data + 'capacity,type=used,resource=cpu,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(capacityUsed)+'\n' - data = data + 'capacity,type=used,resource=memory,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(memoryUsed)+'\n' + data = data + 'capacity,type=used,resource=cpu,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(capacityUsed)+'\n' + data = data + 'capacity,type=used,resource=memory,host='+startd["Machine"]+',tranche='+startd["RalCluster"]+' value='+str(memoryUsed)+'\n' -print data +print (data) diff --git a/metrics-influxdb-condor-cpu-unused b/metrics-influxdb-condor-cpu-unused index dfdc702..e6b459f 100755 --- a/metrics-influxdb-condor-cpu-unused +++ b/metrics-influxdb-condor-cpu-unused @@ -1,6 +1,4 @@ -#!/usr/bin/python - -import classad +#!/usr/bin/env python3 import htcondor # initialization @@ -17,32 +15,31 @@ minimumMemory = 1000000 coll = htcondor.Collector() results = coll.query(htcondor.AdTypes.Schedd, "true", ["Name"]) for result in results: - host = result["Name"] - if 'arc-ce' in host: - scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) - schedd = htcondor.Schedd(scheddAd) - jobs = schedd.query('JobStatus == 1', ["RequestMemory"]) - for job in jobs: - if job["RequestMemory"] < 
minimumMemory and job["RequestMemory"] > 0: - minimumMemory = job["RequestMemory"] + host = result["Name"] + if 'arc-ce' in host: + scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) + schedd = htcondor.Schedd(scheddAd) + jobs = schedd.query('JobStatus == 1', ["RequestMemory"]) + for job in jobs: + if job["RequestMemory"] < minimumMemory and job["RequestMemory"] > 0: + minimumMemory = job["RequestMemory"] coll = htcondor.Collector() results = coll.query(htcondor.AdTypes.Startd, "PartitionableSlot =?= True", ["Name", "TotalSlotCpus", "Cpus", "TotalSlotMemory", "Memory", "State", "PREEMPTABLE_ONLY", "StartJobs", "NODE_IS_HEALTHY"]) for result in results: - if result["Cpus"] > 0: - if str(result["State"]) == "Drained": - coresDraining += result["Cpus"] - elif 'PREEMPTABLE_ONLY' in result and result['PREEMPTABLE_ONLY'] == True: - coresPreemptable += result["Cpus"] - elif 'NODE_IS_HEALTHY' in result and result["NODE_IS_HEALTHY"] != True: - coresUnhealthy += result["Cpus"] - elif result["StartJobs"] == False: - coresStartJobs += result["Cpus"] - elif result["Memory"] < minimumMemory: - coresMemory += result["Cpus"] - else: - coresOther += result["Cpus"] + if result["Cpus"] > 0: + if str(result["State"]) == "Drained": + coresDraining += result["Cpus"] + elif 'PREEMPTABLE_ONLY' in result and result['PREEMPTABLE_ONLY'] == True: + coresPreemptable += result["Cpus"] + elif 'NODE_IS_HEALTHY' in result and result["NODE_IS_HEALTHY"] != True: + coresUnhealthy += result["Cpus"] + elif result["StartJobs"] == False: + coresStartJobs += result["Cpus"] + elif result["Memory"] < minimumMemory: + coresMemory += result["Cpus"] + else: + coresOther += result["Cpus"] data = 'CoresNotUsed draining=%d,preemptable=%d,unhealthy=%d,disabled=%d,memory=%d,other=%d' % (coresDraining, coresPreemptable, coresUnhealthy, coresStartJobs, coresMemory, coresOther) -print data - +print (data) diff --git a/metrics-influxdb-condor-general b/metrics-influxdb-condor-general index 3d58007..57adb78 100755 --- a/metrics-influxdb-condor-general +++ b/metrics-influxdb-condor-general @@ -1,6 +1,4 @@ -#!/usr/bin/python - -import classad +#!/usr/bin/env python3 import htcondor coll = htcondor.Collector() @@ -16,8 +14,8 @@ numNegotiators = len(results) data += 'NumNegotiators value=' + str(numNegotiators) + '\n' for result in results: - data += 'NegotiatorRecentDaemonCoreDutyCycle,host=' + result["Machine"] + ' value=' + str(result["RecentDaemonCoreDutyCycle"]) + '\n' - data += 'LastNegotiationCycleDuration,host=' + result["Machine"] + ' value=' + str(result["LastNegotiationCycleDuration0"]) + '\n' + data += 'NegotiatorRecentDaemonCoreDutyCycle,host=' + result["Machine"] + ' value=' + str(result["RecentDaemonCoreDutyCycle"]) + '\n' + data += 'LastNegotiationCycleDuration,host=' + result["Machine"] + ' value=' + str(result["LastNegotiationCycleDuration0"]) + '\n' # Schedds results = coll.query(htcondor.AdTypes.Schedd, "true", ["Name", "RecentDaemonCoreDutyCycle", "Autoclusters", "RecentJobsStarted", "RecentJobsExited", "RecentJobsSubmitted", "RecentJobsKilled", "RecentStatsLifetime"]) @@ -25,21 +23,21 @@ numSchedds = len(results) data += 'NumSchedds value=' + str(numSchedds) + '\n' for result in results: - data += 'ScheddRecentDaemonCoreDutyCycle,host=' + result["Name"] + ' value=' + str(result["RecentDaemonCoreDutyCycle"]) + '\n' - data += 'AutoClusters,host=' + result["Name"] + ' value=' + str(result["Autoclusters"]) + '\n' - - lifetime = result["RecentStatsLifetime"] - recentJobsStarted = 0 - recentJobsExited = 0 - 
recentJobsSubmitted = 0 - recentJobsKilled = 0 - - if lifetime > 0: - recentJobsStarted = result["RecentJobsStarted"]*60.0/lifetime - recentJobsExited = result["RecentJobsExited"]*60.0/lifetime - recentJobsSubmitted = result["RecentJobsSubmitted"]*60.0/lifetime - recentJobsKilled = result["RecentJobsKilled"]*60.0/lifetime - data += 'RecentJobs,host=' + result["Name"] + ' Started=' + str(recentJobsStarted) + ',Exited=' + str(recentJobsExited) + ',Submitted=' + str(recentJobsSubmitted) + ',Killed=' + str(recentJobsKilled) + '\n' + data += 'ScheddRecentDaemonCoreDutyCycle,host=' + result["Name"] + ' value=' + str(result["RecentDaemonCoreDutyCycle"]) + '\n' + data += 'AutoClusters,host=' + result["Name"] + ' value=' + str(result["Autoclusters"]) + '\n' + + lifetime = result["RecentStatsLifetime"] + recentJobsStarted = 0 + recentJobsExited = 0 + recentJobsSubmitted = 0 + recentJobsKilled = 0 + + if lifetime > 0: + recentJobsStarted = result["RecentJobsStarted"]*60.0/lifetime + recentJobsExited = result["RecentJobsExited"]*60.0/lifetime + recentJobsSubmitted = result["RecentJobsSubmitted"]*60.0/lifetime + recentJobsKilled = result["RecentJobsKilled"]*60.0/lifetime + data += 'RecentJobs,host=' + result["Name"] + ' Started=' + str(recentJobsStarted) + ',Exited=' + str(recentJobsExited) + ',Submitted=' + str(recentJobsSubmitted) + ',Killed=' + str(recentJobsKilled) + '\n' # Startds numStartds = 0 @@ -70,63 +68,63 @@ numCpusMesos = 0 results = coll.query(htcondor.AdTypes.Startd, "PartitionableSlot=?=True", ["NODE_IS_HEALTHY", "StartJobs", "TotalCpus", "Cpus", "TotalMemory", "Memory","Machine","PREEMPTABLE_ONLY", "MESOS_TASK_ID", "CONTAINER_IMAGE_NAME", "RecentJobStarts", "TotalLoadAvg", "TotalDisk"]) for result in results: - cpuUsed += result["TotalCpus"] - result["Cpus"] - cpuFree += result["Cpus"] - cpusUseable += result["TotalCpus"] - result["Cpus"] - memoryUsed += result["TotalMemory"] - result["Memory"] - memoryFree += result["Memory"] - TotalLoadAvg += result["TotalLoadAvg"] - numStartds += 1 - if "NODE_IS_HEALTHY" in result and "StartJobs" in result: - if result["NODE_IS_HEALTHY"] == True and result["StartJobs"] == True: - numGoodStartds += 1 - cpusUseable += result["Cpus"] - MemoryUseableTotal += result["TotalMemory"] - TotalUseableDisk += result["TotalDisk"] - - if "Machine" in result: - if "RecentJobStarts" in result and result["Machine"] != "": - data += 'RecentJobStarts,host=%s value=%d\n' % (result["Machine"], result["RecentJobStarts"]) - - if "nubes" in result["Machine"]: - cpuUsedCloud += result["TotalCpus"] - result["Cpus"] - cpuFreeCloud += result["Cpus"] - if "MESOS_TASK_ID" in result: - cpuUsedMesos += result["TotalCpus"] - result["Cpus"] - cpuFreeMesos += result["Cpus"] - if "CONTAINER_IMAGE_NAME" in result: - if not cpuUsedMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): - cpuUsedMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - if not cpuFreeMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): - cpuFreeMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - if not memUsedMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): - memUsedMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - if not memFreeMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): - memFreeMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - if not numStartdsMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): - numStartdsMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - if not numStartdsGoodMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): - numStartdsGoodMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - if not numCpusUseableMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): 
- numCpusUseableMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 - - cpuFreeMesosI[result["CONTAINER_IMAGE_NAME"]] += result["Cpus"] - cpuUsedMesosI[result["CONTAINER_IMAGE_NAME"]] += result["TotalCpus"] - result["Cpus"] - memFreeMesosI[result["CONTAINER_IMAGE_NAME"]] += result["Memory"] - memUsedMesosI[result["CONTAINER_IMAGE_NAME"]] += result["TotalMemory"] - result["Memory"] - numStartdsMesosI[result["CONTAINER_IMAGE_NAME"]] += 1 - - numCpusMesos += result["TotalCpus"] - - if "NODE_IS_HEALTHY" in result and "StartJobs" in result: - if result["NODE_IS_HEALTHY"] == True and result["StartJobs"] == True: - numStartdsGoodMesosI[result["CONTAINER_IMAGE_NAME"]] += 1 - numCpusUseableMesosI[result["CONTAINER_IMAGE_NAME"]] += result["Cpus"] - numCpusUseableMesos += result["Cpus"] - - if "PREEMPTABLE_ONLY" in result: - if result["PREEMPTABLE_ONLY"]: - numStartdsPreemptableOnly += 1 + cpuUsed += result["TotalCpus"] - result["Cpus"] + cpuFree += result["Cpus"] + cpusUseable += result["TotalCpus"] - result["Cpus"] + memoryUsed += result["TotalMemory"] - result["Memory"] + memoryFree += result["Memory"] + TotalLoadAvg += result["TotalLoadAvg"] + numStartds += 1 + if "NODE_IS_HEALTHY" in result and "StartJobs" in result: + if result["NODE_IS_HEALTHY"] == True and result["StartJobs"] == True: + numGoodStartds += 1 + cpusUseable += result["Cpus"] + MemoryUseableTotal += result["TotalMemory"] + TotalUseableDisk += result["TotalDisk"] + + if "Machine" in result: + if "RecentJobStarts" in result and result["Machine"] != "": + data += 'RecentJobStarts,host=%s value=%d\n' % (result["Machine"], result["RecentJobStarts"]) + + if "nubes" in result["Machine"]: + cpuUsedCloud += result["TotalCpus"] - result["Cpus"] + cpuFreeCloud += result["Cpus"] + if "MESOS_TASK_ID" in result: + cpuUsedMesos += result["TotalCpus"] - result["Cpus"] + cpuFreeMesos += result["Cpus"] + if "CONTAINER_IMAGE_NAME" in result: + if not cpuUsedMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + cpuUsedMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + if not cpuFreeMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + cpuFreeMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + if not memUsedMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + memUsedMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + if not memFreeMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + memFreeMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + if not numStartdsMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + numStartdsMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + if not numStartdsGoodMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + numStartdsGoodMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + if not numCpusUseableMesosI.has_key(result["CONTAINER_IMAGE_NAME"]): + numCpusUseableMesosI[result["CONTAINER_IMAGE_NAME"]] = 0 + + cpuFreeMesosI[result["CONTAINER_IMAGE_NAME"]] += result["Cpus"] + cpuUsedMesosI[result["CONTAINER_IMAGE_NAME"]] += result["TotalCpus"] - result["Cpus"] + memFreeMesosI[result["CONTAINER_IMAGE_NAME"]] += result["Memory"] + memUsedMesosI[result["CONTAINER_IMAGE_NAME"]] += result["TotalMemory"] - result["Memory"] + numStartdsMesosI[result["CONTAINER_IMAGE_NAME"]] += 1 + + numCpusMesos += result["TotalCpus"] + + if "NODE_IS_HEALTHY" in result and "StartJobs" in result: + if result["NODE_IS_HEALTHY"] == True and result["StartJobs"] == True: + numStartdsGoodMesosI[result["CONTAINER_IMAGE_NAME"]] += 1 + numCpusUseableMesosI[result["CONTAINER_IMAGE_NAME"]] += result["Cpus"] + numCpusUseableMesos += result["Cpus"] + + if "PREEMPTABLE_ONLY" in result: + if result["PREEMPTABLE_ONLY"]: + 
numStartdsPreemptableOnly += 1 data += 'NumCpusUsed value=' + str(cpuUsed) + '\n' data += 'NumCpusFree value=' + str(cpuFree) + '\n' @@ -145,33 +143,33 @@ data += 'TotalLoadAvg value=' + str(TotalLoadAvg) + '\n' data += 'TotalUseableDisk value=' + str(TotalUseableDisk) + '\n' if numCpusMesos > 0: - cpusUsedPercentMesos = 100.0*float(cpuUsedMesos)/numCpusMesos - cpusUseablePercentMesos = 100.0*float(numCpusUseableMesos)/numCpusMesos + cpusUsedPercentMesos = 100.0*float(cpuUsedMesos)/numCpusMesos + cpusUseablePercentMesos = 100.0*float(numCpusUseableMesos)/numCpusMesos else: - cpusUsedPercentMesos = 0 - cpusUseablePercentMesos = 0 + cpusUsedPercentMesos = 0 + cpusUseablePercentMesos = 0 if numCpusUseableMesos > 0: - cpusUseableUsedPercentMesos = 100.0*float(cpuUsedMesos)/numCpusUseableMesos + cpusUseableUsedPercentMesos = 100.0*float(cpuUsedMesos)/numCpusUseableMesos else: - cpusUseableUsedPercentMesos = 0 + cpusUseableUsedPercentMesos = 0 data += 'mesoscpus percentUsedOfUseable=' + str(cpusUseableUsedPercentMesos) + ',percentUseable=' + str(cpusUseablePercentMesos) + ',percentUsed=' + str(cpusUsedPercentMesos) + '\n' for image in cpuUsedMesosI: - data += 'mesosbatch,image='+image+' freeCpu='+str(cpuFreeMesosI[image])+',usedCpu='+str(cpuUsedMesosI[image])+',freeMemory='+str(memFreeMesosI[image])+',usedMemory='+str(memUsedMesosI[image])+',numStartds='+str(numStartdsMesosI[image])+',numGoodStartds='+str(numStartdsGoodMesosI[image])+'\n' + data += 'mesosbatch,image='+image+' freeCpu='+str(cpuFreeMesosI[image])+',usedCpu='+str(cpuUsedMesosI[image])+',freeMemory='+str(memFreeMesosI[image])+',usedMemory='+str(memUsedMesosI[image])+',numStartds='+str(numStartdsMesosI[image])+',numGoodStartds='+str(numStartdsGoodMesosI[image])+'\n' if cpuUsed + cpuFree > 0: - cpuUsedPercent = float(cpuUsed)*100/(cpuUsed + cpuFree) - cpuUseablePercent = float(cpusUseable)*100/(cpuUsed + cpuFree) + cpuUsedPercent = float(cpuUsed)*100/(cpuUsed + cpuFree) + cpuUseablePercent = float(cpusUseable)*100/(cpuUsed + cpuFree) else: - cpuUsedPercent = 0 - cpuUseablePercent = 0 + cpuUsedPercent = 0 + cpuUseablePercent = 0 if cpusUseable > 0: - cpuUsedPercentGood = float(cpuUsed)*100/cpusUseable + cpuUsedPercentGood = float(cpuUsed)*100/cpusUseable else: - cpuUsedPercentGood = 0 + cpuUsedPercentGood = 0 data += 'cpus numUsed=' + str(int(cpuUsed)) + ',' + \ 'numFree=' + str(cpuFree) + ',' + \ @@ -181,4 +179,4 @@ data += 'cpus numUsed=' + str(int(cpuUsed)) + ',' + \ 'percentUseable=' + str(cpuUseablePercent) + ',' + \ 'percentUsedOfUseable=' + str(cpuUsedPercentGood) + '\n' -print data +print (data) diff --git a/metrics-influxdb-condor-jobs b/metrics-influxdb-condor-jobs index 8176c2e..1739d62 100755 --- a/metrics-influxdb-condor-jobs +++ b/metrics-influxdb-condor-jobs @@ -1,11 +1,10 @@ -#!/usr/bin/python -import classad -import htcondor +#!/usr/bin/env python3 import time import re +import htcondor # Ignoreable schedds - any failures when quering these schedds will be ignored -ignoreSchedFailures = ['arc-ce05.gridpp.rl.ac.uk', 'lcgvm21.gridpp.rl.ac.uk', 'arc-ce-test01.gridpp.rl.ac.uk'] +ignoreSchedFailures = ['arc-ce-test01.gridpp.rl.ac.uk', 'arc-ce-test02.gridpp.rl.ac.uk'] vosa = [] jobsW = {} @@ -18,195 +17,203 @@ data = '' coll = htcondor.Collector() results = coll.query(htcondor.AdTypes.Schedd, "true", ["Name"]) for result in results: - host = result["Name"] - scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) - schedd = htcondor.Schedd(scheddAd) - try: - ajobs = schedd.query('',["x509UserProxyVOName", "RequestCpus", 
"RequestMemory", "JobStatus", "JobCurrentStartDate", "RemoteSysCpu", "RemoteUserCpu", "isPreemptable", "DockerImage", "RemoteHost", "x509UserProxyFirstFQAN", "ResidentSetSize_RAW"]) - except: - print '# Unable to query schedd:',host,'so will try once more' - if host not in ignoreSchedFailures: - time.sleep(5) - try: - ajobs = schedd.query('',["x509UserProxyVOName", "RequestCpus", "RequestMemory", "JobStatus", "JobCurrentStartDate", "RemoteSysCpu", "RemoteUserCpu", "isPreemptable", "DockerImage", "RemoteHost", "x509UserProxyFirstFQAN", "ResidentSetSize_RAW"]) - except: - print '# Unable to query schedd:',host,'again' - exit(1) - else: - print '# - ignoring failure' - continue - - vos = [] - roles = [] - cpus = [] - - jobsR = {} - jobsRd = {} - jobsRc = {} - jobsI = {} - jobsH = {} - - jobsRp = {} - jobsIp = {} - jobsHp = {} - - jobsRr = {} - - for ajob in ajobs: - - role = 'UNDEFINED' - if "x509UserProxyFirstFQAN" in ajob: - m = re.search('\/Role\=(.*)\/', ajob["x509UserProxyFirstFQAN"]) - if m: - role = m.group(1) - if role not in roles: - roles.append(role) - - preemptable = 0 - if "isPreemptable" in ajob: - if ajob["isPreemptable"]: - preemptable = 1 - - if "x509UserProxyVOName" not in ajob: - ajob["x509UserProxyVOName"] = "undefined" - - if ajob["x509UserProxyVOName"] not in vos: - vos.append(ajob["x509UserProxyVOName"]) - - if ajob["x509UserProxyVOName"] not in vosa: - vosa.append(ajob["x509UserProxyVOName"]) - - if ajob["RequestCpus"] not in cpus: - cpus.append(ajob["RequestCpus"]) - - if ajob["JobStatus"] == 2: - if "ResidentSetSize_RAW" in ajob: - ResidentSetSize_RAW += ajob["ResidentSetSize_RAW"] - - if "DockerImage" in ajob: - if ajob["DockerImage"] != "undefined": - if ajob["DockerImage"] not in imagesR: - imagesR[ajob["DockerImage"]] = 0 - imagesR[ajob["DockerImage"]] += 1 - - if not jobsR.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsR[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsR[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsR[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if not jobsRr.has_key((ajob["x509UserProxyVOName"], role)): - jobsRr[(ajob["x509UserProxyVOName"], role)] = 0 - jobsRr[(ajob["x509UserProxyVOName"], role)] = jobsRr[(ajob["x509UserProxyVOName"], role)] + 1 - - if preemptable == 1: - if not jobsRp.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsRp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsRp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsRp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if "RemoteHost" in ajob: - if "nubes" in ajob["RemoteHost"]: - if not jobsRc.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsRc[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsRc[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsRc[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if "JobCurrentStartDate" in ajob and "RemoteSysCpu" in ajob and "RemoteUserCpu" in ajob: - if not jobsW.has_key(ajob["x509UserProxyVOName"]): - jobsW[ajob["x509UserProxyVOName"]] = 0 - jobsW[ajob["x509UserProxyVOName"]] += ajob["RequestCpus"]*(time.time() - ajob["JobCurrentStartDate"]) + host = result["Name"] + scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) + schedd = htcondor.Schedd(scheddAd) + try: + ajobs = schedd.query('',["x509UserProxyVOName", "RequestCpus", "RequestMemory", "JobStatus", "JobCurrentStartDate", "RemoteSysCpu", "RemoteUserCpu", "isPreemptable", "DockerImage", "RemoteHost", 
"x509UserProxyFirstFQAN", "ResidentSetSize_RAW"]) + except: + print ('# Unable to query schedd:',host,'so will try once more') + if host not in ignoreSchedFailures: + time.sleep(5) + try: + ajobs = schedd.query('',["x509UserProxyVOName", "RequestCpus", "RequestMemory", "JobStatus", "JobCurrentStartDate", "RemoteSysCpu", "RemoteUserCpu", "isPreemptable", "DockerImage", "RemoteHost", "x509UserProxyFirstFQAN", "ResidentSetSize_RAW"]) + except: + print ('# Unable to query schedd:',host,'again') + exit(1) + else: + print ('# - ignoring failure') + continue + + vos = [] + roles = [] + cpus = [] + + # Initialize dictionary for running jobs + jobsR = {} + # Initialize dictionary for running jobs in Docker environment + jobsRd = {} + # Initialize dictionary for running jobs in Cloud environment + jobsRc = {} + # Initialize dictionary for idle jobs + jobsI = {} + # Initialize dictionary for on hold jobs + jobsH = {} + + # Initialize dictionary for running jobs and preemptable is true + jobsRp = {} + # Initialize dictionary for idle jobs and preemptable is true + jobsIp = {} + # Initialize dictionary for on hold jobs and preemptable is true + jobsHp = {} + + jobsRr = {} + + for ajob in ajobs: + + role = 'UNDEFINED' + if "x509UserProxyFirstFQAN" in ajob: + m = re.search('\/Role\=(.*)\/', ajob["x509UserProxyFirstFQAN"]) + if m: + role = m.group(1) + if role not in roles: + roles.append(role) + + preemptable = 0 + if "isPreemptable" in ajob: + if ajob["isPreemptable"]: + preemptable = 1 + + if "x509UserProxyVOName" not in ajob: + ajob["x509UserProxyVOName"] = "undefined" + + if ajob["x509UserProxyVOName"] not in vos: + vos.append(ajob["x509UserProxyVOName"]) + + if ajob["x509UserProxyVOName"] not in vosa: + vosa.append(ajob["x509UserProxyVOName"]) + + if ajob["RequestCpus"] not in cpus: + cpus.append(ajob["RequestCpus"]) + + if ajob["JobStatus"] == 2: + if "ResidentSetSize_RAW" in ajob: + ResidentSetSize_RAW += ajob["ResidentSetSize_RAW"] if "DockerImage" in ajob: - if ajob["DockerImage"] != "undefined": - if not jobsRd.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsRd[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsRd[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsRd[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if not jobsC.has_key(ajob["x509UserProxyVOName"]): - jobsC[ajob["x509UserProxyVOName"]] = 0 - jobsC[ajob["x509UserProxyVOName"]] += ajob["RemoteSysCpu"] + ajob["RemoteUserCpu"] - - if ajob["JobStatus"] == 1: - if not jobsI.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsI[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsI[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsI[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if preemptable == 1: - if not jobsIp.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsIp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsIp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsIp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if ajob["JobStatus"] == 5: - if not jobsH.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsH[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - jobsH[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsH[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - if preemptable == 1: - if not jobsHp.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): - jobsHp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 - 
jobsHp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsHp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 - - hostS = host.split('.')[0] - for vo in vos: - for role in roles: - if jobsRr.has_key((vo, role)): - data += 'jobs_by_role,vo='+vo+',status=running,schedd='+hostS+',role='+role+' value='+str(jobsRr[(vo,role)])+'\n' - - coresR = 0 - coresRd = 0 - coresRc = 0 - coresI = 0 - coresH = 0 - for cpu in cpus: - if jobsR.has_key((vo, cpu)): - data += 'jobs,vo='+vo+',status=running,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsR[(vo,cpu)])+'\n' - coresR += jobsR[(vo,cpu)] * cpu - - if jobsRd.has_key((vo, cpu)): - coresRd += jobsRd[(vo,cpu)] * cpu - - if jobsI.has_key((vo, cpu)): - data += 'jobs,vo='+vo+',status=idle,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsI[(vo,cpu)])+'\n' - coresI += jobsI[(vo,cpu)] * cpu - - if jobsH.has_key((vo, cpu)): - data += 'jobs,vo='+vo+',status=held,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsH[(vo,cpu)])+'\n' - coresH += jobsH[(vo,cpu)] * cpu - - if jobsRp.has_key((vo, cpu)): - data += 'pjobs,vo='+vo+',status=running,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsRp[(vo,cpu)])+'\n' - - if jobsRc.has_key((vo, cpu)): - coresRc += jobsRc[(vo,cpu)] * cpu - - if jobsIp.has_key((vo, cpu)): - data += 'pjobs,vo='+vo+',status=idle,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsIp[(vo,cpu)])+'\n' - - if jobsHp.has_key((vo, cpu)): - data += 'pjobs,vo='+vo+',status=held,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsHp[(vo,cpu)])+'\n' - - data += 'cores,vo='+vo+',status=running,schedd='+hostS+' value='+str(coresR)+'\n' - data += 'cores_docker,vo='+vo+',status=running,schedd='+hostS+' value='+str(coresRd)+'\n' - data += 'cores_cloud,vo='+vo+',status=running,schedd='+hostS+' value='+str(coresRc)+'\n' - data += 'cores,vo='+vo+',status=idle,schedd='+hostS+' value='+str(coresI)+'\n' - data += 'cores,vo='+vo+',status=held,schedd='+hostS+' value='+str(coresH)+'\n' + if ajob["DockerImage"] != "undefined": + if ajob["DockerImage"] not in imagesR: + imagesR[ajob["DockerImage"]] = 0 + imagesR[ajob["DockerImage"]] += 1 + + if not jobsR.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsR[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsR[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsR[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if not jobsRr.has_key((ajob["x509UserProxyVOName"], role)): + jobsRr[(ajob["x509UserProxyVOName"], role)] = 0 + jobsRr[(ajob["x509UserProxyVOName"], role)] = jobsRr[(ajob["x509UserProxyVOName"], role)] + 1 + + if preemptable == 1: + if not jobsRp.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsRp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsRp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsRp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if "RemoteHost" in ajob: + if "nubes" in ajob["RemoteHost"]: + if not jobsRc.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsRc[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsRc[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsRc[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if "JobCurrentStartDate" in ajob and "RemoteSysCpu" in ajob and "RemoteUserCpu" in ajob: + if not jobsW.has_key(ajob["x509UserProxyVOName"]): + jobsW[ajob["x509UserProxyVOName"]] = 0 + jobsW[ajob["x509UserProxyVOName"]] += ajob["RequestCpus"]*(time.time() - ajob["JobCurrentStartDate"]) + + if "DockerImage" in ajob: + if 
ajob["DockerImage"] != "undefined": + if not jobsRd.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsRd[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsRd[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsRd[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if not jobsC.has_key(ajob["x509UserProxyVOName"]): + jobsC[ajob["x509UserProxyVOName"]] = 0 + jobsC[ajob["x509UserProxyVOName"]] += ajob["RemoteSysCpu"] + ajob["RemoteUserCpu"] + + if ajob["JobStatus"] == 1: + if not jobsI.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsI[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsI[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsI[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if preemptable == 1: + if not jobsIp.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsIp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsIp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsIp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if ajob["JobStatus"] == 5: + if not jobsH.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsH[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsH[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsH[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + if preemptable == 1: + if not jobsHp.has_key((ajob["x509UserProxyVOName"], ajob["RequestCpus"])): + jobsHp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = 0 + jobsHp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] = jobsHp[(ajob["x509UserProxyVOName"], ajob["RequestCpus"])] + 1 + + hostS = host.split('.')[0] + for vo in vos: + for role in roles: + if jobsRr.has_key((vo, role)): + data += 'jobs_by_role,vo='+vo+',status=running,schedd='+hostS+',role='+role+' value='+str(jobsRr[(vo,role)])+'\n' + + coresR = 0 + coresRd = 0 + coresRc = 0 + coresI = 0 + coresH = 0 + for cpu in cpus: + if jobsR.has_key((vo, cpu)): + data += 'jobs,vo='+vo+',status=running,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsR[(vo,cpu)])+'\n' + coresR += jobsR[(vo,cpu)] * cpu + + if jobsRd.has_key((vo, cpu)): + coresRd += jobsRd[(vo,cpu)] * cpu + + if jobsI.has_key((vo, cpu)): + data += 'jobs,vo='+vo+',status=idle,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsI[(vo,cpu)])+'\n' + coresI += jobsI[(vo,cpu)] * cpu + + if jobsH.has_key((vo, cpu)): + data += 'jobs,vo='+vo+',status=held,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsH[(vo,cpu)])+'\n' + coresH += jobsH[(vo,cpu)] * cpu + + if jobsRp.has_key((vo, cpu)): + data += 'pjobs,vo='+vo+',status=running,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsRp[(vo,cpu)])+'\n' + + if jobsRc.has_key((vo, cpu)): + coresRc += jobsRc[(vo,cpu)] * cpu + + if jobsIp.has_key((vo, cpu)): + data += 'pjobs,vo='+vo+',status=idle,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsIp[(vo,cpu)])+'\n' + + if jobsHp.has_key((vo, cpu)): + data += 'pjobs,vo='+vo+',status=held,schedd='+hostS+',cpus='+str(cpu)+' value='+str(jobsHp[(vo,cpu)])+'\n' + + data += 'cores,vo='+vo+',status=running,schedd='+hostS+' value='+str(coresR)+'\n' + data += 'cores_docker,vo='+vo+',status=running,schedd='+hostS+' value='+str(coresRd)+'\n' + data += 'cores_cloud,vo='+vo+',status=running,schedd='+hostS+' value='+str(coresRc)+'\n' + data += 'cores,vo='+vo+',status=idle,schedd='+hostS+' value='+str(coresI)+'\n' + data += 'cores,vo='+vo+',status=held,schedd='+hostS+' value='+str(coresH)+'\n' wallTotal = 0 cpuTotal = 0 for vo in vosa: - if vo in jobsC: 
- wallTotal += jobsW[vo] - cpuTotal += jobsC[vo] - eff = -1.0 - if jobsW[vo] > 0: - eff = jobsC[vo]/jobsW[vo]*100.0 - data += 'cpu-efficiency,vo='+vo+' value='+str(eff)+'\n' + if vo in jobsC: + wallTotal += jobsW[vo] + cpuTotal += jobsC[vo] + eff = -1.0 + if jobsW[vo] > 0: + eff = jobsC[vo]/jobsW[vo]*100.0 + data += 'cpu-efficiency,vo='+vo+' value='+str(eff)+'\n' if wallTotal > 0: - eff = cpuTotal*100.0/wallTotal - data += 'cpu-efficiency-total value='+str(eff)+'\n' + eff = cpuTotal*100.0/wallTotal + data += 'cpu-efficiency-total value='+str(eff)+'\n' for key in imagesR: - data += 'image,status=running,image='+key+' value='+str(imagesR[key])+'\n' + data += 'image,status=running,image='+key+' value='+str(imagesR[key])+'\n' data += 'TotalResidentSetSize value='+str(ResidentSetSize_RAW)+'\n' -print data +print (data) diff --git a/metrics-influxdb-condor-jobs-too-long b/metrics-influxdb-condor-jobs-too-long index 47ef124..4b9d492 100755 --- a/metrics-influxdb-condor-jobs-too-long +++ b/metrics-influxdb-condor-jobs-too-long @@ -1,20 +1,19 @@ -#!/usr/bin/python -import classad +#!/usr/bin/env python3 import htcondor coll = htcondor.Collector() results = coll.query(htcondor.AdTypes.Schedd, "true", ["Name"]) for result in results: - host = result["Name"] - scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) - schedd = htcondor.Schedd(scheddAd) - try: - ajobs = schedd.query('JobStatus == 2 && CurrentTime - EnteredCurrentStatus > 345600', ["JobStatus"]) - except: - print '# Unable to query schedd:',host,'but continuing anyway' + host = result["Name"] + scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) + schedd = htcondor.Schedd(scheddAd) + try: + ajobs = schedd.query('JobStatus == 2 && CurrentTime - EnteredCurrentStatus > 345600', ["JobStatus"]) + except: + print ('# Unable to query schedd:',host,'but continuing anyway') - num = 0 - for ajob in ajobs: - num += 1 + num = 0 + for ajob in ajobs: + num += 1 - print 'JobsTooLong,schedd=%s value=%d' % (host, num) + print ('JobsTooLong,schedd=%s value=%d' % (host, num)) diff --git a/metrics-influxdb-condor-multijobstarts b/metrics-influxdb-condor-multijobstarts index 82f1dd9..209eeac 100755 --- a/metrics-influxdb-condor-multijobstarts +++ b/metrics-influxdb-condor-multijobstarts @@ -1,5 +1,4 @@ -#!/usr/bin/python -import classad +#!/usr/bin/env python3 import htcondor data = '' @@ -7,18 +6,18 @@ data = '' coll = htcondor.Collector() results = coll.query(htcondor.AdTypes.Schedd, "true", ["Name"]) for result in results: - host = result["Name"] - scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) - schedd = htcondor.Schedd(scheddAd) - try: - ajobs = schedd.query('NumJobStarts > 1',["JobStatus"]) - except: - print '# Unable to query schedd:',host,'but continuing anyway' + host = result["Name"] + scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, host) + schedd = htcondor.Schedd(scheddAd) + try: + ajobs = schedd.query('NumJobStarts > 1',["JobStatus"]) + except: + print ('# Unable to query schedd:',host,'but continuing anyway') - num = 0 - for ajob in ajobs: - num += 1 + num = 0 + for ajob in ajobs: + num += 1 - data += 'multijobstarts,schedd='+host.split('.')[0]+' value='+str(num)+'\n' + data += 'multijobstarts,schedd='+host.split('.')[0]+' value='+str(num)+'\n' -print data +print (data) diff --git a/metrics-influxdb-condor-wn-echo b/metrics-influxdb-condor-wn-echo index e2d2f1a..98750a1 100755 --- a/metrics-influxdb-condor-wn-echo +++ b/metrics-influxdb-condor-wn-echo @@ -1,5 +1,4 @@ -#!/usr/bin/python -import classad +#!/usr/bin/env 
python3 import htcondor coll = htcondor.Collector() @@ -30,5 +29,5 @@ for result in results: elif result['ECHO_XROOTD_PROXY_STATUS'] == 'None': echoXrootdProxyNone += 1 -print 'wn_gateways,type=xrootd_gateway healthy=%d,unhealthy=%d,notapplicable=%d' % (echoXrootdGatewayGood, echoXrootdGatewayBad, echoXrootdGatewayNone) -print 'wn_gateways,type=xrootd_proxy healthy=%d,unhealthy=%d,notapplicable=%d' % (echoXrootdProxyGood, echoXrootdProxyBad, echoXrootdProxyNone) +print ('wn_gateways,type=xrootd_gateway healthy=%d,unhealthy=%d,notapplicable=%d' % (echoXrootdGatewayGood, echoXrootdGatewayBad, echoXrootdGatewayNone)) +print ('wn_gateways,type=xrootd_proxy healthy=%d,unhealthy=%d,notapplicable=%d' % (echoXrootdProxyGood, echoXrootdProxyBad, echoXrootdProxyNone)) diff --git a/metrics-influxdb-condor-wn-problems b/metrics-influxdb-condor-wn-problems index a904f3b..32b6141 100755 --- a/metrics-influxdb-condor-wn-problems +++ b/metrics-influxdb-condor-wn-problems @@ -1,5 +1,4 @@ -#!/usr/bin/python -import classad +#!/usr/bin/env python3 import htcondor coll = htcondor.Collector() @@ -18,28 +17,28 @@ countScratch = 0 countClock = 0 for result in results: - if 'alice' in result["NODE_STATUS"]: - countCvmfsALICE += 1 - if 'atlas' in result["NODE_STATUS"]: - countCvmfsATLAS += 1 - if 'cms' in result["NODE_STATUS"]: - countCvmfsCMS += 1 - if 'lhcb' in result["NODE_STATUS"]: - countCvmfsLHCb += 1 - if 'grid' in result["NODE_STATUS"]: - countCvmfsGRID += 1 - if 'containerd' in result["NODE_STATUS"]: - countContainerd += 1 - if 'Docker' in result["NODE_STATUS"]: - countDocker += 1 - if 'Swap' in result["NODE_STATUS"]: - countSwap += 1 - if 'scratch' in result["NODE_STATUS"] or 'pool' in result["NODE_STATUS"]: - countScratch += 1 - if 'clock' in result["NODE_STATUS"]: - countClock += 1 - if 'Cannot create containers' in result["NODE_STATUS"]: - countContainers += 1 + if 'alice' in result["NODE_STATUS"]: + countCvmfsALICE += 1 + if 'atlas' in result["NODE_STATUS"]: + countCvmfsATLAS += 1 + if 'cms' in result["NODE_STATUS"]: + countCvmfsCMS += 1 + if 'lhcb' in result["NODE_STATUS"]: + countCvmfsLHCb += 1 + if 'grid' in result["NODE_STATUS"]: + countCvmfsGRID += 1 + if 'containerd' in result["NODE_STATUS"]: + countContainerd += 1 + if 'Docker' in result["NODE_STATUS"]: + countDocker += 1 + if 'Swap' in result["NODE_STATUS"]: + countSwap += 1 + if 'scratch' in result["NODE_STATUS"] or 'pool' in result["NODE_STATUS"]: + countScratch += 1 + if 'clock' in result["NODE_STATUS"]: + countClock += 1 + if 'Cannot create containers' in result["NODE_STATUS"]: + countContainers += 1 data = 'wn-problems,type=cvmfs-alice value='+str(countCvmfsALICE)+'\n' data = data + 'wn-problems,type=cvmfs-atlas value='+str(countCvmfsATLAS)+'\n' @@ -53,4 +52,4 @@ data = data + 'wn-problems,type=scratch value='+str(countScratch)+'\n' data = data + 'wn-problems,type=clock value='+str(countClock)+'\n' data = data + 'wn-problems,type=containers value='+str(countContainers)+'\n' -print data +print (data) diff --git a/metrics-influxdb-htcondor-groups b/metrics-influxdb-htcondor-groups index b1bf026..f8c4c6b 100755 --- a/metrics-influxdb-htcondor-groups +++ b/metrics-influxdb-htcondor-groups @@ -1,12 +1,11 @@ -#!/usr/bin/python -import sys +#!/usr/bin/env python3 import subprocess p = subprocess.Popen(["condor_config_val", "-negotiator", "GROUP_NAMES"], stdout=subprocess.PIPE) output, err = p.communicate() -output = output.replace(" ", "") -groups = output.split(',') +output = output.replace(b" ", b"") +groups = output.split(b',') p = 
subprocess.Popen(["condor_userprio", "-grouprollup", "-allusers"], stdout=subprocess.PIPE)
 output, err = p.communicate()
@@ -17,33 +16,33 @@
 quotas = {}
 usage = {}

 for line in lines:
-    if 'ByQuota' in line:
-        pieces = line.split()
-        group = pieces[0]
-        quota = pieces[1]
-        inuse = pieces[3]
-        quotas[group] = quota
-        usage[group] = inuse
-    if 'Number of users' in line:
-        pieces = line.split()
-        total = pieces[5]
+    if b'ByQuota' in line:
+        pieces = line.split()
+        group = pieces[0]
+        quota = pieces[1]
+        inuse = pieces[3]
+        quotas[group] = quota
+        usage[group] = inuse
+    if b'Number of users' in line:
+        pieces = line.split()
+        total = pieces[5]

 data = ''
 for group in groups:
-    if group in quotas and group in usage:
-        pieces = group.split('.')
-        percent = 1.0
-        name = ''
-        for piece in pieces:
-            if len(name) > 0:
-                name = name + '.' + piece
-            else:
-                name = piece
-            percent = percent*float(quotas[name])
-        cpus = percent*float(total)
-        fairness = 0
-        if cpus > 0:
-            fairness = 100.0*float(usage[group])/cpus
-        data += "groups,name=%s allocation=%d,usage=%d,fairness=%d\n" % (group, cpus, int(usage[group]), fairness)
-
-print data
+    if group in quotas and group in usage:
+        pieces = group.split(b'.')
+        percent = 1.0
+        name = ''
+        for piece in pieces:
+            if len(name) > 0:
+                name = name + b'.' + piece
+            else:
+                name = piece
+            percent = percent*float(quotas[name])
+        cpus = percent*float(total)
+        fairness = 0
+        if cpus > 0:
+            fairness = 100.0*float(usage[group])/cpus
+        data += "groups,name=%s allocation=%d,usage=%d,fairness=%d\n" % (group.decode(), cpus, int(float(usage[group])), fairness)
+
+print (data)
diff --git a/package_influxdb_condor.sh b/package_influxdb_condor.sh
new file mode 100644
index 0000000..403a09c
--- /dev/null
+++ b/package_influxdb_condor.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: package_influxdb_condor.sh VERSION"
+    exit 1
+fi
+
+fpm \
+    --input-type dir \
+    --output-type rpm \
+    --name tier1-condor-telegraf-metrics \
+    --version $1 \
+    --iteration 1 \
+    --architecture noarch \
+    --prefix '/usr/bin/' \
+    --vendor 'Science and Technology Facilities Council' \
+    --url 'https://github.com/stfc/telegraf-scripts' \
+    --description 'Scripts used to monitor HTCondor at RAL' \
+    --depends 'python3' \
+    metrics-influxdb-condor* metrics-influxdb-htcondor-groups
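A quick usage sketch for the packaging script added above. The version number and the resulting filename are illustrative, and it assumes fpm (plus rpmbuild, which fpm's rpm output type relies on) is installed and that the command is run from a checkout of this repository:

    ./package_influxdb_condor.sh 1.0.0    # hypothetical version number
    # fpm should write the package into the current directory, named along the lines of
    #   tier1-condor-telegraf-metrics-1.0.0-1.noarch.rpm
    # with the metrics scripts installed under /usr/bin/ as set by --prefix.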