aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNat Lasseter <66247271+nl987@users.noreply.github.com>2020-11-04 12:12:56 +0000
committerGitHub <noreply@github.com>2020-11-04 12:12:56 +0000
commitc045926de0021e0be5426b983a9f9820619bcb62 (patch)
treec4ed717670787f9184f9daac829aa3c1d0084f17
parentb2ec20bcf89ea65d5be0e94b26b9579185a8d6f0 (diff)
parentf99eeaaf3d589fbc2d915383fbbe9cb4f5f68acd (diff)
Merge pull request #1 from university-of-york/feature/pending_on_qos
Feature/pending on qos
-rw-r--r--.rubocop.yml5
-rw-r--r--pending_on_qos.rb52
2 files changed, 57 insertions, 0 deletions
diff --git a/.rubocop.yml b/.rubocop.yml
new file mode 100644
index 0000000..48211dc
--- /dev/null
+++ b/.rubocop.yml
@@ -0,0 +1,5 @@
+Layout/LineLength:
+ Max: 80
+
+AllCops:
+ NewCops: enable
diff --git a/pending_on_qos.rb b/pending_on_qos.rb
new file mode 100644
index 0000000..330011c
--- /dev/null
+++ b/pending_on_qos.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
require 'date'
require 'time'
+
# For each Viking partition, report the number of jobs that have been pending
# for longer than a per-partition threshold because of a QoS limit.
#
# Pending jobs are read from `squeue`. One gauge sample named
# `pending_on_qos` is reported per partition, labelled with the partition
# name.
class PendingOnQos
  # Pending time, in seconds, beyond which a job is counted, per partition.
  PARTITION_THRESHOLDS = {
    nodes: 604_800,
    week: 1_209_600,
    month: 2_419_200,
    himem: 604_800,
    himem_week: 1_209_600,
    gpu: 604_800,
    interactive: 900,
    test: 900,
    preempt: 2_419_200
  }.freeze

  # @param collector [#report!] receives one gauge sample per partition
  # @param _config [Object] accepted so the constructor signature is
  #   unchanged; this class does not read it
  def initialize(collector, _config)
    @collector = collector
    @partition_thresholds = PARTITION_THRESHOLDS
  end

  # Query every configured partition and report how many of its pending
  # jobs are held for a QoS reason and older than that partition's
  # threshold.
  #
  # @return [Hash] the partition => threshold table that was iterated
  def raid
    # One reference instant for the whole run, so every row is compared
    # against the same "now".
    now = Time.now

    @partition_thresholds.each do |partition, threshold|
      count = pending_rows(partition).count do |row|
        overdue?(row, threshold, now)
      end

      @collector.report!(
        'pending_on_qos',
        count,
        help: 'Number of jobs pending beyond a threshold for QoS reasons',
        type: 'gauge',
        labels: {
          partition: partition.to_s
        }
      )
    end
  end

  private

  # Run `squeue` for one partition and return its output lines, each of the
  # form "<reason>,<submit time>".
  #
  # The command is passed as an argv array so no shell is involved; the
  # format therefore needs no quoting.
  #
  # @param partition [Symbol, String] partition name
  # @return [Array<String>] one chomped line per pending job
  def pending_rows(partition)
    command = [
      'squeue',
      '--format=%R,%V',
      '--noheader',
      "--partition=#{partition}",
      '--state=PENDING'
    ]

    IO.popen(command, &:read).lines.map(&:chomp)
  end

  # Decide whether one squeue row is a QoS-held job older than +threshold+.
  #
  # The row is split on its LAST comma because the reason field may itself
  # contain commas while the timestamp never does. The timestamp carries no
  # UTC offset, so it is parsed with Time.parse, which interprets it in the
  # local time zone (DateTime.parse would assume UTC).
  #
  # @param row [String] "<reason>,<submit time>"
  # @param threshold [Integer] maximum acceptable age in seconds
  # @param now [Time] reference instant
  # @return [Boolean] false for non-QoS rows and for rows whose timestamp
  #   cannot be parsed
  def overdue?(row, threshold, now)
    reason, _, submitted = row.rpartition(',')
    return false unless reason.include?('QOS')

    (now.to_i - Time.parse(submitted).to_i) > threshold
  rescue ArgumentError
    # Unparsable submit time: skip the row instead of aborting the run.
    false
  end
end