aboutsummaryrefslogtreecommitdiff
path: root/slurm_pending_on_qos.rb
diff options
context:
space:
mode:
authorKillian Murphy <killian.murphy@york.ac.uk>2020-11-11 14:57:38 +0000
committerKillian Murphy <killian.murphy@york.ac.uk>2020-11-11 14:57:38 +0000
commit72937de200cb22b300970c7080a8a5040737010f (patch)
tree8d841f1e1dc3e634d1b80e4133850352a37c0828 /slurm_pending_on_qos.rb
parent37451d9cb9b5061326827a7858b2a7c71ff64e52 (diff)
Actually rename the QoS pending raider
Diffstat (limited to 'slurm_pending_on_qos.rb')
-rw-r--r--slurm_pending_on_qos.rb52
1 files changed, 52 insertions, 0 deletions
diff --git a/slurm_pending_on_qos.rb b/slurm_pending_on_qos.rb
new file mode 100644
index 0000000..288719a
--- /dev/null
+++ b/slurm_pending_on_qos.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require 'date'
+
# For each Viking partition, report how many jobs have been pending for longer
# than that partition's threshold while their squeue reason mentions QOS.
class SlurmPendingOnQos
  # Layout of the submit timestamp this class expects from squeue's %V field,
  # e.g. "2020-11-11T14:57:38". There is no UTC offset in that layout, so the
  # value is interpreted in the local time zone of the host running this code.
  SUBMIT_TIME = /\A(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\z/.freeze

  # @param collector [#report!] receives one 'pending_on_qos' gauge per partition
  # @param _config [Object] accepted so the constructor signature matches what
  #   callers pass; it is not read by this class
  def initialize(collector, _config)
    @collector = collector
    # Partition name => longest acceptable pending time, in seconds.
    @partition_thresholds = {
      nodes: 604_800,
      week: 1_209_600,
      month: 2_419_200,
      himem: 604_800,
      himem_week: 1_209_600,
      gpu: 604_800,
      interactive: 900,
      test: 900,
      preempt: 2_419_200
    }.freeze
  end

  # Query squeue once per partition and report the number of QOS-pending jobs
  # whose time since submission exceeds the partition's threshold.
  #
  # @return [Hash] the partition threshold table (the receiver of +each+)
  def raid
    # Sample the clock once so every job in this run is measured against the
    # same instant.
    now = Time.now.to_i

    @partition_thresholds.each do |partition, threshold|
      overdue = qos_submit_times(partition).count do |submitted|
        now - submitted.to_i > threshold
      end

      @collector.report!(
        'pending_on_qos',
        overdue,
        help: 'Number of jobs pending beyond a threshold for QoS reasons',
        type: 'gauge',
        labels: {
          partition: partition.to_s
        }
      )
    end
  end

  private

  # Run squeue for the pending jobs of one partition.
  #
  # @param partition [Symbol, String] partition name passed to --partition
  # @return [Array<String>] one "reason,submit_time" row per job
  def squeue_rows(partition)
    squeue_cmd = [
      'squeue',
      '--format="%R,%V"',
      '--noheader',
      "--partition=#{partition}",
      '--state=PENDING'
    ].join(' ')

    `#{squeue_cmd}`.split("\n")
  end

  # Submit times of the jobs in +partition+ whose reason mentions QOS.
  #
  # @param partition [Symbol, String]
  # @return [Array<Time>]
  def qos_submit_times(partition)
    times = squeue_rows(partition).map do |row|
      # Split on the LAST comma: the timestamp is the final field, so a comma
      # inside the reason text cannot shift it.
      reason, _comma, stamp = row.rpartition(',')
      next unless reason.include?('QOS')

      parse_submit_time(stamp)
    end

    times.compact
  end

  # Turn a submit timestamp into a local Time.
  #
  # DateTime.parse would read an offset-less timestamp as UTC, skewing every
  # pending duration by the host's UTC offset, hence Time.local here.
  #
  # @param stamp [String]
  # @return [Time, nil] nil (after a warning) when the text is not a timestamp,
  #   so one odd row does not abort reporting for every partition
  def parse_submit_time(stamp)
    fields = SUBMIT_TIME.match(stamp.strip)
    unless fields
      warn "slurm_pending_on_qos: ignoring unparseable submit time #{stamp.inspect}"
      return
    end

    Time.local(*fields.captures.map(&:to_i))
  end
end