Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 59 additions & 60 deletions metrics-influxdb-arc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python

#!/usr/bin/env python3
import os
import time

Expand All @@ -17,80 +16,80 @@ arex_job_states = [

# Get numbers of jobs in each state
def getJobsStatesInfo():
    """Count A-REX jobs per state and return InfluxDB line-protocol text.

    Scans the accepting/finished/processing/restarting control
    subdirectories under /var/spool/arc/jobstatus; the first line of each
    status file is the job's state.  Returns one 'jobs' measurement line
    per state in arex_job_states (value 0 for states not seen), or an
    empty string when a control subdirectory is missing.
    """
    control_subdirs = ['accepting', 'finished', 'processing', 'restarting']

    data = ''
    states = {}

    for control_subdir in control_subdirs:
        subdir = os.path.join('/var/spool/arc/jobstatus', control_subdir)

        if not os.path.isdir(subdir):
            # Return '' (not None) so the caller's string concatenation
            # cannot raise TypeError.
            return data

        try:
            for status_file in os.listdir(subdir):
                try:
                    # 'with' guarantees the handle is closed even if the
                    # read fails (the old code leaked it on read errors).
                    with open(os.path.join(subdir, status_file)) as f:
                        status = f.readline().strip()
                except IOError as e:
                    # BUG FIX: the format arguments must be a tuple; the
                    # old '% status_file, str(e)' applied % to one value
                    # and raised TypeError instead of printing.
                    print('Could not open status file %s: %s' % (status_file, str(e)))
                    continue

                states[status] = states.get(status, 0) + 1
        except OSError as e:
            # BUG FIX: tuple for the two format arguments (see above).
            print('Could not list status files in %s: %s' % (subdir, str(e)))

    for state in arex_job_states:
        # States never observed are reported explicitly with value 0.
        value = states.get(state, 0)
        data += 'jobs,state=' + state + ' value=' + str(value) + '\n'

    return data

# Get number of jobs in the processing subdirectory
def getProcessingJobs():
    """Return InfluxDB line-protocol text with the number of entries in
    the A-REX 'processing' control directory, or '' when the directory
    cannot be listed."""
    processing_dir = '/var/spool/arc/jobstatus/processing'

    try:
        entries = os.listdir(processing_dir)
    except OSError as e:
        # BUG FIX: the format arguments must be a tuple; the old
        # '% processing_dir, str(e)' raised TypeError instead of
        # printing.  Return '' (not None) so the caller's '+=' is safe.
        print("Error listing dir %s: %s" % (processing_dir, str(e)))
        return ''

    processing = len(entries)

    data = 'jobs,state=PROCESSING value=' + str(processing) + '\n'
    return data

# Get the time since the modification timestamp of the gm-heartbeat file
def getHeartBeatInfo():
    """Return InfluxDB line-protocol text with the age in seconds of the
    A-REX gm-heartbeat file, or '' when the file cannot be stat'ed."""
    heartbeat = '/var/spool/arc/jobstatus/gm-heartbeat'
    try:
        statinfo = os.stat(heartbeat)
    except OSError as e:
        print("Error with heartbeat file: %s" % str(e))
        # Return '' (not None) so the caller's string concatenation
        # cannot raise TypeError.
        return ''

    # Age of the heartbeat = now minus the file's mtime, in seconds.
    heartbeat_time = time.time() - statinfo.st_mtime

    data = 'arex_heartbeat_lastseen value=' + str(heartbeat_time) + '\n'
    return data

# Generate metrics and emit them on stdout in InfluxDB line protocol.
# 'or ""' guards the concatenation in case a collector returns None on
# an error path instead of a string.
data = ''
data += getJobsStatesInfo() or ''
data += getProcessingJobs() or ''
data += getHeartBeatInfo() or ''

print(data)
41 changes: 20 additions & 21 deletions metrics-influxdb-condor-capacity
Original file line number Diff line number Diff line change
@@ -1,33 +1,32 @@
#!/usr/bin/python
#!/usr/bin/env python3
import htcondor
import classad

# Query all partitionable slots known to the collector, projecting only
# the attributes needed to compute CPU and memory capacity figures.
coll = htcondor.Collector()
startds = coll.query(htcondor.AdTypes.Startd, "PartitionableSlot =?=True",
                     ["Machine", "TotalMemory", "Memory", "TotalCpus", "Cpus",
                      "RalScaling", "ScalingFactor", "RalCluster", "StartJobs"])


def _capacity_line(kind, resource, machine, tranche, value):
    # One InfluxDB line-protocol record for the 'capacity' measurement.
    return ('capacity,type=' + kind + ',resource=' + resource +
            ',host=' + machine + ',tranche=' + tranche +
            ' value=' + str(value) + '\n')


records = []
for ad in startds:
    # Determine the scaling factor; RalScaling takes precedence over
    # ScalingFactor, and slots advertising neither scale to zero.
    if "RalScaling" in ad:
        scale = ad["RalScaling"]
    elif "ScalingFactor" in ad:
        scale = ad["ScalingFactor"]
    else:
        scale = 0

    # Scaled CPU capacity: total, currently idle, and the difference.
    cpu_total = int(4.0 * scale * ad["TotalCpus"])
    cpu_idle = int(4.0 * scale * ad["Cpus"])
    cpu_used = cpu_total - cpu_idle

    # Memory capacity in the slot's own units (unscaled).
    mem_idle = ad["Memory"]
    mem_used = ad["TotalMemory"] - mem_idle

    machine = ad["Machine"]
    tranche = ad["RalCluster"]

    # Idle capacity only counts for slots that are accepting jobs.
    if "StartJobs" in ad and ad["StartJobs"]:
        records.append(_capacity_line('idle', 'cpu', machine, tranche, cpu_idle))
        records.append(_capacity_line('idle', 'memory', machine, tranche, mem_idle))

    records.append(_capacity_line('used', 'cpu', machine, tranche, cpu_used))
    records.append(_capacity_line('used', 'memory', machine, tranche, mem_used))

print(''.join(records))
49 changes: 23 additions & 26 deletions metrics-influxdb-condor-cpu-unused
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#!/usr/bin/python

import classad
#!/usr/bin/env python3
import htcondor

# initialization
Expand All @@ -17,32 +15,31 @@ minimumMemory = 1000000
# Find the smallest positive RequestMemory among idle jobs queued on the
# ARC CE schedds; it becomes the threshold below which a slot's free
# memory is considered too small to run anything.
coll = htcondor.Collector()
for schedd_ad in coll.query(htcondor.AdTypes.Schedd, "true", ["Name"]):
    host = schedd_ad["Name"]
    if 'arc-ce' not in host:
        continue
    schedd = htcondor.Schedd(coll.locate(htcondor.DaemonTypes.Schedd, host))
    for job in schedd.query('JobStatus == 1', ["RequestMemory"]):
        request = job["RequestMemory"]
        # Track the smallest positive request seen so far.
        if 0 < request < minimumMemory:
            minimumMemory = request

# Classify the idle cores of every partitionable slot by the reason they
# are not being used.  The first matching reason wins.
coll = htcondor.Collector()
slots = coll.query(htcondor.AdTypes.Startd, "PartitionableSlot =?= True",
                   ["Name", "TotalSlotCpus", "Cpus", "TotalSlotMemory",
                    "Memory", "State", "PREEMPTABLE_ONLY", "StartJobs",
                    "NODE_IS_HEALTHY"])
for slot in slots:
    idle_cpus = slot["Cpus"]
    if idle_cpus <= 0:
        continue
    # NOTE(review): the explicit '== True' / '!= True' comparisons are
    # kept deliberately — ClassAd attributes can be undefined, which is
    # neither True nor False, and truthiness would change the outcome.
    if str(slot["State"]) == "Drained":
        coresDraining += idle_cpus
    elif 'PREEMPTABLE_ONLY' in slot and slot['PREEMPTABLE_ONLY'] == True:
        coresPreemptable += idle_cpus
    elif 'NODE_IS_HEALTHY' in slot and slot["NODE_IS_HEALTHY"] != True:
        coresUnhealthy += idle_cpus
    elif slot["StartJobs"] == False:
        coresStartJobs += idle_cpus
    elif slot["Memory"] < minimumMemory:
        coresMemory += idle_cpus
    else:
        coresOther += idle_cpus

# Emit a single InfluxDB line-protocol record with all six counters.
data = 'CoresNotUsed draining=%d,preemptable=%d,unhealthy=%d,disabled=%d,memory=%d,other=%d' % (coresDraining, coresPreemptable, coresUnhealthy, coresStartJobs, coresMemory, coresOther)

print(data)
Loading