diff options
author | Nat Lasseter <66247271+nl987@users.noreply.github.com> | 2020-11-04 12:12:56 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-04 12:12:56 +0000 |
commit | c045926de0021e0be5426b983a9f9820619bcb62 (patch) | |
tree | c4ed717670787f9184f9daac829aa3c1d0084f17 /pending_on_qos.rb | |
parent | b2ec20bcf89ea65d5be0e94b26b9579185a8d6f0 (diff) | |
parent | f99eeaaf3d589fbc2d915383fbbe9cb4f5f68acd (diff) |
Merge pull request #1 from university-of-york/feature/pending_on_qos
Feature/pending on qos
Diffstat (limited to 'pending_on_qos.rb')
-rw-r--r-- | pending_on_qos.rb | 52 |
1 files changed, 52 insertions, 0 deletions
# frozen_string_literal: true

require 'date'
require 'time'

# For each Viking partition, report the number of jobs pending for a long time
# due to QoS reasons.
#
# "A long time" is a per-partition threshold in seconds. One gauge sample named
# 'pending_on_qos' is reported per partition, labelled with the partition name.
class PendingOnQos
  # @param collector [#report!] object that receives one sample per partition
  # @param config [Object] not read by this class; accepted so the constructor
  #   signature stays unchanged for callers
  def initialize(collector, config)
    @collector = collector
    # Partition name => pending time, in seconds, beyond which a job is
    # counted (604_800 s = 7 days, 900 s = 15 minutes).
    @partition_thresholds = {
      nodes: 604_800,
      week: 1_209_600,
      month: 2_419_200,
      himem: 604_800,
      himem_week: 1_209_600,
      gpu: 604_800,
      interactive: 900,
      test: 900,
      preempt: 2_419_200
    }
  end

  # Query squeue once per configured partition and report how many jobs have
  # been pending for a QoS reason for longer than that partition's threshold.
  #
  # @return [Hash] the partition thresholds hash (the result of iterating it)
  def raid
    @partition_thresholds.each do |partition, threshold|
      submit_times = qos_pending_submit_times(partition)
      # Sample the clock after squeue returns so the age is measured against
      # the moment the data was obtained.
      now = Time.now.to_i

      count = submit_times.count do |submitted_at|
        (now - submitted_at.to_i) > threshold
      end

      @collector.report!(
        'pending_on_qos',
        count,
        help: 'Number of jobs pending beyond a threshold for QoS reasons',
        type: 'gauge',
        labels: {
          partition: partition.to_s
        }
      )
    end
  end

  private

  # Submit times of the jobs pending in +partition+ whose squeue row mentions
  # QOS.
  #
  # @param partition [Symbol, String] partition name passed to squeue
  # @return [Array<Time>] one submit time per matching job
  def qos_pending_submit_times(partition)
    squeue_cmd = [
      'squeue',
      '--format="%R,%V"', # per the squeue man page: %R = reason, %V = submit time
      '--noheader',
      "--partition=#{partition}",
      '--state=PENDING'
    ].join(' ')

    `#{squeue_cmd}`.split("\n").grep(/QOS/).map do |row|
      # The timestamp is the text after the LAST comma, so a comma inside the
      # reason text cannot shift it. squeue prints it without a UTC offset;
      # Time.parse reads such a value as local time, matching Time.now,
      # whereas DateTime.parse would read it as UTC and skew the job's age by
      # the host's UTC offset.
      Time.parse(row.rpartition(',').last)
    end
  end
end