From 7b35c62b49f0a50a5fbe5aff0759c78326130785 Mon Sep 17 00:00:00 2001 From: Killian Murphy Date: Fri, 30 Oct 2020 16:13:04 +0000 Subject: Add beginning of QoS pending prototype raider --- .rubocop.yml | 5 +++++ pending_on_qos.rb | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 .rubocop.yml create mode 100644 pending_on_qos.rb diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..48211dc --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,5 @@ +Layout/LineLength: + Max: 80 + +AllCops: + NewCops: enable diff --git a/pending_on_qos.rb b/pending_on_qos.rb new file mode 100644 index 0000000..7f7b3df --- /dev/null +++ b/pending_on_qos.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +# For each Viking partition, report the number of jobs pending due to QoS +# reasons. +class PendingOnQos + def initialize(collector, config) + @collector = collector + @partition_thresholds = { + nodes: 604_800, + week: 1_209_600, + month: 2_419_200, + himem: 604_800, + himem_week: 1_209_600, + gpu: 604_800, + interactive: 900, + test: 900, + preempt: 2_419_200 + } + end + + def raid + @partition_thresholds.each do |partition, threshold| + start_time = (Time.now - @partition_thresholds[:partition]) + .strftime('%Y-%m-:%d') + + squeue_cmd = [ + 'squeue', + '--format="%A,%R,%V"', + '--noheader', + '--parsable2', + "--partition=#{partition}", + '--state=PENDING' + ].join(' ') + end + + @collector.report!( + name: 'pending_on_qos', + value: 255, + help: 'Number of jobs pending for QoS reasons', + type: 'gauge', + labels: { partition: 'nodes' } + ) + end +end -- cgit v1.2.1 From e9aa40ca482919267dc4edc3e4f3efbe8a0f81bb Mon Sep 17 00:00:00 2001 From: Killian Murphy Date: Fri, 30 Oct 2020 18:18:36 +0000 Subject: Checkpointing --- pending_on_qos.rb | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pending_on_qos.rb b/pending_on_qos.rb index 7f7b3df..7d99c8c 100644 --- a/pending_on_qos.rb +++ b/pending_on_qos.rb @@ -20,25 +20,31 @@ class PendingOnQos def raid @partition_thresholds.each do |partition, threshold| - start_time = (Time.now - @partition_thresholds[:partition]) - .strftime('%Y-%m-:%d') + start_time = (Time.now - threshold).strftime('%Y-%m-%d') squeue_cmd = [ 'squeue', '--format="%A,%R,%V"', '--noheader', - '--parsable2', "--partition=#{partition}", '--state=PENDING' ].join(' ') + + output = `#{squeue_cmd}`.split('\n') + + puts output end @collector.report!( - name: 'pending_on_qos', - value: 255, - help: 'Number of jobs pending for QoS reasons', - type: 'gauge', - labels: { partition: 'nodes' } + 'pending_on_qos', + 255, + { + help: 'Number of jobs pending for QoS reasons', + type: 'gauge', + labels: { + partition: 'nodes' + } + } ) end end -- cgit v1.2.1 From 86d4e1e5eef938837336a55afc087a02c1773e61 Mon Sep 17 00:00:00 2001 From: Killian Murphy Date: Tue, 3 Nov 2020 18:11:39 +0000 Subject: Add QoS pending raider definition Add a raider for gauging how many jobs are PENDING on each partition due to QoS reasons, for longer than a threshold number of seconds --- pending_on_qos.rb | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pending_on_qos.rb b/pending_on_qos.rb index 7d99c8c..8e8bfac 100644 --- a/pending_on_qos.rb +++ b/pending_on_qos.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true -# For each Viking partition, report the number of jobs pending due to QoS -# reasons. +require 'date' + +# For each Viking partition, report the number of jobs pending for a long time +# due to QoS reasons. class PendingOnQos def initialize(collector, config) @collector = collector @@ -20,8 +22,6 @@ class PendingOnQos def raid @partition_thresholds.each do |partition, threshold| - start_time = (Time.now - threshold).strftime('%Y-%m-%d') - squeue_cmd = [ 'squeue', '--format="%A,%R,%V"', @@ -29,22 +29,24 @@ class PendingOnQos "--partition=#{partition}", '--state=PENDING' ].join(' ') - - output = `#{squeue_cmd}`.split('\n') - puts output - end + data = `#{squeue_cmd}`.split("\n").grep(/QOS/).map do |row| + row.split(',') + end - @collector.report!( - 'pending_on_qos', - 255, - { - help: 'Number of jobs pending for QoS reasons', + count = data.count do |columns| + (Time.now.to_i - DateTime.parse(columns[2]).to_time.to_i) > threshold + end + + @collector.report!( + 'pending_on_qos', + count, + help: 'Number of jobs pending beyond a threshold for QoS reasons', type: 'gauge', labels: { - partition: 'nodes' + partition: partition.to_s } - } - ) + ) + end end end -- cgit v1.2.1 From f99eeaaf3d589fbc2d915383fbbe9cb4f5f68acd Mon Sep 17 00:00:00 2001 From: Killian Murphy Date: Tue, 3 Nov 2020 18:14:47 +0000 Subject: Don't need job ID for counting --- pending_on_qos.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pending_on_qos.rb b/pending_on_qos.rb index 8e8bfac..330011c 100644 --- a/pending_on_qos.rb +++ b/pending_on_qos.rb @@ -24,7 +24,7 @@ class PendingOnQos @partition_thresholds.each do |partition, threshold| squeue_cmd = [ 'squeue', - '--format="%A,%R,%V"', + '--format="%R,%V"', '--noheader', "--partition=#{partition}", '--state=PENDING' @@ -35,7 +35,7 @@ class PendingOnQos end count = data.count do |columns| - (Time.now.to_i - DateTime.parse(columns[2]).to_time.to_i) > threshold + (Time.now.to_i - DateTime.parse(columns[1]).to_time.to_i) > threshold end @collector.report!( -- cgit v1.2.1