blob: 2881d68c340b5f29286fecbc64e6cdce85dfb73c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
# Tallies Slurm jobs by state and partition using `sacct` and publishes the
# counts through the collector as the "slurm_job_states" gauge.
class SlurmJobStates
  # Layout of the timestamp handed to `sacct -S`. The date is included so a
  # look-back window that reaches past midnight is not read by sacct as a
  # time-of-day later today.
  START_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S"

  # collector - object responding to `redact!(name)` and
  #             `report!(name, value, help:, type:, labels:)`.
  # config    - Hash-like object; `config[:raid_every]` is subtracted from
  #             `Time.now`, i.e. it is the look-back window in seconds.
  def initialize(collector, config)
    @collector = collector
    @interval = config[:raid_every]
  end

  # Queries sacct for every job since the start of the look-back window,
  # clears the previously reported "slurm_job_states" series and reports one
  # gauge value per [state, partition] combination.
  def raid
    start_time = (Time.now - @interval).strftime(START_TIME_FORMAT)
    output = `sacct -a -P -o State,Partition -S #{start_time}`
    tally = parse(output).tally

    # Clean up any previously reported metrics to prevent stale labelsets
    @collector.redact!("slurm_job_states")

    tally.each do |(state, partition), number|
      @collector.report!(
        "slurm_job_states",
        number,
        help: "Number of jobs in each state",
        type: "gauge",
        labels: {state: state, partition: partition}
      )
    end
  end

  private

  # Turns pipe-separated sacct output into [state, partition] pairs.
  # The first line is the header and is dropped; blank lines are ignored.
  # `drop(1)` (unlike `[1..-1]`) yields [] rather than nil when sacct
  # printed nothing at all.
  def parse(output)
    rows = output.lines.map(&:strip).drop(1).reject(&:empty?)
    rows.map do |row|
      state, partition = row.split("|")
      # Keep only the first word of the state so that
      # "CANCELLED by xxxxxx" is counted as "CANCELLED".
      [state.to_s.split.first, partition]
    end
  end
end
|