diff options
author | Killian Murphy <killian.murphy@york.ac.uk> | 2020-11-03 18:11:39 +0000 |
---|---|---|
committer | Killian Murphy <killian.murphy@york.ac.uk> | 2020-11-03 18:11:39 +0000 |
commit | 86d4e1e5eef938837336a55afc087a02c1773e61 (patch) | |
tree | 62ba5bdef3acba0af67eb47b39c42e0743a1c1d5 /pending_on_qos.rb | |
parent | e9aa40ca482919267dc4edc3e4f3efbe8a0f81bb (diff) |
Add QoS pending raider definition
Add a raider that gauges how many jobs on each partition have been PENDING
for QoS reasons for longer than a threshold number of seconds.
Diffstat (limited to 'pending_on_qos.rb')
-rw-r--r-- | pending_on_qos.rb | 34 |
1 files changed, 18 insertions, 16 deletions
diff --git a/pending_on_qos.rb b/pending_on_qos.rb index 7d99c8c..8e8bfac 100644 --- a/pending_on_qos.rb +++ b/pending_on_qos.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true -# For each Viking partition, report the number of jobs pending due to QoS -# reasons. +require 'date' + +# For each Viking partition, report the number of jobs pending for a long time +# due to QoS reasons. class PendingOnQos def initialize(collector, config) @collector = collector @@ -20,8 +22,6 @@ class PendingOnQos def raid @partition_thresholds.each do |partition, threshold| - start_time = (Time.now - threshold).strftime('%Y-%m-%d') - squeue_cmd = [ 'squeue', '--format="%A,%R,%V"', @@ -29,22 +29,24 @@ class PendingOnQos "--partition=#{partition}", '--state=PENDING' ].join(' ') - - output = `#{squeue_cmd}`.split('\n') - puts output - end + data = `#{squeue_cmd}`.split("\n").grep(/QOS/).map do |row| + row.split(',') + end - @collector.report!( - 'pending_on_qos', - 255, - { - help: 'Number of jobs pending for QoS reasons', + count = data.count do |columns| + (Time.now.to_i - DateTime.parse(columns[2]).to_time.to_i) > threshold + end + + @collector.report!( + 'pending_on_qos', + count, + help: 'Number of jobs pending beyond a threshold for QoS reasons', type: 'gauge', labels: { - partition: 'nodes' + partition: partition.to_s } - } - ) + ) + end end end |