summaryrefslogtreecommitdiff
path: root/lib/custodian/alerts/mauve.rb
blob: bf4d08fd1d2c48e5917dd11d04e57cc8261e6ef3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
require 'custodian/util/bytemark'
require 'custodian/util/dns'
require 'custodian/util/prefix'

require 'digest/sha1'
require 'resolv'
require 'timeout'


#
#  This class encapsulates the raising and clearing of alerts via Mauve.
#
#  There is a helper method to update any alerts with details of whether the
# affected host is inside/outside the Bytemark network.
#
#  This is almost Bytemark-specific, although the server it talks to is
# indeed Open Source:
#
#    https://projects.bytemark.co.uk/projects/mauvealert
#
#
module Custodian

  module Alerter

    class AlertMauve < AlertFactory


      #
      # The test object handed to the constructor.  The alert's subject,
      # summary and detail are all built from it.
      #
      attr_reader :test

      #
      # Set by the constructor: true when the mauve libraries were required
      # successfully, false when the constructor rescued a failure.  Both
      # raise and clear return without sending anything unless this is set.
      #
      attr_reader :loaded




      #
      # Constructor
      #
      def initialize(obj)

        #
        # Keep hold of the test; the alert text is built from it later.
        #
        @test = obj

        #
        # The mauve client libraries are loaded lazily here, so that their
        # absence only disables this alerter instead of breaking the file.
        #
        # A missing library raises LoadError.  LoadError is a ScriptError,
        # not a StandardError, so a bare `rescue` would let it escape and the
        # fallback below would never run - it has to be named explicitly.
        # StandardError is still rescued so that a library which blows up
        # while being loaded is treated the same way.
        #
        begin
          require 'mauve/sender'
          require 'mauve/proto'
          @loaded = true
        rescue LoadError, StandardError
          puts 'ERROR Loading mauve libraries!'
          @loaded = false
        end
      end




      #
      # Generate an alert-message which will be raised via mauve.
      #
      def raise

        #
        # NOTE: because this method is named `raise`, a bare `raise ...`
        # written inside this class resolves to this method rather than to
        # Kernel#raise.  Use `Kernel.raise` if an exception must be thrown.
        #

        #
        # Nothing can be sent if the mauve libraries did not load.
        #
        return unless @loaded

        #
        # Get ready to send to mauve.
        #
        # NOTE(review): @settings and @target are not assigned anywhere in
        # this class - presumably the AlertFactory parent supplies them.
        #
        update         = Mauve::Proto::AlertUpdate.new
        update.alert   = []
        update.source  = @settings.alert_source
        update.replace = false

        #
        # Construct a new alert structure, including the failure details.
        #
        alert = _get_alert(true)

        #
        # Take a single timestamp, so that the raise-time, the working-hours
        # decision and the suppression deadline all agree with each other.
        #
        now = Time.now

        #
        #  We're raising this alert.
        #
        alert.raise_time = now.to_i

        #
        # The suppression period varies depending on the time of day.
        #
        # Lookup the bounds of the working day (hours), and the suppression
        # periods (minutes) to use inside and outside of it.
        #
        day_start        = _integer_setting('day_start', 10)
        day_end          = _integer_setting('day_end', 18)
        working_suppress = _integer_setting('working_suppress', 4)
        oncall_suppress  = _integer_setting('oncall_suppress', 10)

        #
        # If we're Monday-Friday (wday 1..5), between the start & end time,
        # then we're in the working day.  The end hour is exclusive.
        #
        working = (1..5).cover?(now.wday) &&
                  now.hour >= day_start && now.hour < day_end

        #
        # The suppression period can now be determined.
        #
        period = working ? working_suppress : oncall_suppress

        #
        # And logged.
        #
        puts "Suppression period is #{period}m"

        #
        # We're going to suppress this alert now; the period is in minutes,
        # the deadline is in seconds since the epoch.
        #
        alert.suppress_until = now.to_i + (period * 60)

        #
        #  Update it and send it
        #
        update.alert << alert
        Mauve::Sender.new(@target).send(update)

      end


      #
      # Read a setting as an integer, falling back to the given default when
      # the setting is missing or blank.
      #
      # The default has to be applied BEFORE the conversion: `to_i` always
      # returns an Integer (nil.to_i is 0), so `value.to_i || default` can
      # never yield the default.
      #
      def _integer_setting(name, default)
        value = @settings.key(name)
        return default if value.nil? || value.to_s.strip.empty?

        value.to_i
      end



      #
      # Generate an alert-message which will be cleared via mauve.
      #
      def clear

        #
        # Without the mauve libraries there is nothing we can send.
        #
        return unless @loaded

        #
        # The envelope which carries the alert to the mauve server.
        #
        envelope = Mauve::Proto::AlertUpdate.new.tap do |message|
          message.alert   = []
          message.source  = @settings.alert_source
          message.replace = false
        end

        #
        # Build the alert without any failure details, and stamp it with
        # the time at which it was cleared.
        #
        cleared = _get_alert(false)
        cleared.clear_time = Time.now.to_i

        #
        # Attach the alert and hand the envelope to the sender.
        #
        envelope.alert << cleared
        Mauve::Sender.new(@target).send(envelope)

      end






      #
      # Using the test object, which was set in the constructor,
      # generate a useful alert that can be fired off to mauve.
      #
      # Most of the mess of this method is ensuring there is some
      # "helpful" data in the detail-field of the alert.
      #
      def _get_alert(failure)

        #
        # The subject of an alert MUST be one of:
        #
        #   1.  Hostname.
        #   2.  IP address
        #   3.  A URL.
        #
        # We attempt to resolve the alert to the hostname, as that is more
        # readable, if we have been given an IP address.
        #
        # An IPv6 address must contain at least one colon; without that
        # check a hostname made only of hex digits (e.g. "cafe") would be
        # mistaken for an address.
        #
        subject = test.target

        is_ipv4 = subject =~ /\A[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+\z/
        is_ipv6 = subject =~ /\A[0-9a-f:]+\z/i && subject.include?(':')

        if is_ipv4 || is_ipv6
          # Keep the address as the subject if the reverse lookup fails.
          subject = Custodian::Util::DNS.ip_to_hostname(subject) || subject
        end


        #
        #  The test type + test target
        #
        test_host = test.target
        test_type = test.get_type

        alert = Mauve::Proto::Alert.new

        #
        # Mauve only lets us use IDs which are <= 255 characters in length
        # hash the line from the parser to ensure it is short enough.
        # (IDs must be unique, per-source)
        #
        # Because there might be N-classes which implemented the test
        # we need to make sure these are distinct too.
        #
        alert.id = Digest::SHA1.hexdigest(test.to_s + test.class.to_s)

        #
        # The subject carries an optional prefix; the summary is the same
        # text whether the alert is being raised or cleared.
        #
        alert.subject = Custodian::Util::Prefix.text + subject
        alert.summary = "The #{test_type} test failed against #{test_host}"

        #
        #  If we're raising then add the error
        #
        if failure

          detail = "<p>The #{test_type} test failed against #{test_host}.</p>"

          #
          #  The text from the job-definition, if the user supplied any.
          #
          user_text = test.get_notification_text
          detail = "#{detail}<p>#{user_text}</p>" unless user_text.nil?

          #
          # Add the test-failure message
          #
          detail = "#{detail}<p>#{test.error}</p>"

          #
          #  Determine if this is inside/outside the bytemark network.
          #
          #  An empty location must be skipped explicitly: in Ruby 0 is
          #  truthy, so testing `location.length` would accept an empty
          #  string and leave a stray newline on the end of the detail.
          #
          location = expand_inside_bytemark(test_host)
          detail = "#{detail}\n#{location}" unless location.nil? || location.empty?

          alert.detail = detail
        end

        #
        # Return the alert to the caller.
        #
        alert
      end


      #
      # Build an HTML message stating, for each address a host resolves to,
      # whether that address is inside the Bytemark network or not.
      #
      #
      def expand_inside_bytemark(host)

        #
        #  Nothing to describe if we were given nothing.
        #
        return '' if host.nil? || host.to_s.empty?

        #
        #  If the host is a URL then we need to work with the hostname component alone.
        #
        #  We'll also make the host a link that can be clicked in the alert we raise.
        #
        target = host
        if (url = %r{\A[a-z]+://([^/]+)}.match(host))

          #
          #  Reduce the authority ("user@host:port") to the bare host,
          #  otherwise neither the address test nor the DNS lookup below
          #  can succeed for a URL which names a port.
          #
          authority = url[1].sub(/\A.*@/, '')

          target =
            if (literal = /\A\[([^\]]+)\]/.match(authority))
              # Bracketed IPv6 literal: "[2001:db8::1]:8443"
              literal[1]
            elsif authority.count(':') == 1
              # Exactly one colon can only be "host:port".
              authority.sub(/:[0-9]*\z/, '')
            else
              authority.dup
            end

          host = "<a href=\"#{host}\">#{host}</a>"
        end


        #
        #  IP addresses we found for the host
        #
        ips = []


        #
        #  Resolve the target to an IP, unless it is already an address.
        #
        #  An IPv6 address must contain a colon; a hostname made only of
        #  hex digits (e.g. "cafe") is a name that needs to be looked up.
        #
        is_ipv4 = target =~ /\A[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+\z/
        is_ipv6 = target =~ /\A[0-9a-f:]+\z/i && target.include?(':')

        if is_ipv4 || is_ipv6
          ips << target
        else

          #
          # OK if it didn't look like an IP address then attempt to
          # look it up, as both IPv4 and IPv6.
          #
          # Timeout.timeout is called explicitly: the bare `timeout`
          # helper on Object is deprecated and absent from current
          # Rubies, where calling it raises NoMethodError instead.
          #
          begin
            Timeout.timeout(30) do
              Resolv::DNS.open do |dns|
                [Resolv::DNS::Resource::IN::A,
                 Resolv::DNS::Resource::IN::AAAA].each do |record|
                  dns.getresources(target, record).each do |answer|
                    ips << answer.address.to_s
                  end
                end
              end
            end
          rescue Timeout::Error
            # Give up entirely, even if some answers had already arrived.
            return ''
          end
        end


        #
        # Did we fail to lookup any IPs?
        #
        return '' if ips.empty?


        #
        #  Return the formatted message: one paragraph per address.
        #
        ips.map do |ipaddr|
          where = Custodian::Util::Bytemark.inside?(ipaddr.to_s) ? 'inside' : 'OUTSIDE'
          "<p>#{host} resolves to #{ipaddr} which is #{where} the Bytemark network.</p>"
        end.join
      end


      # Register this class under the alert-type name 'mauve'.
      # NOTE(review): register_alert_type is not defined in this file -
      # presumably it is provided by the AlertFactory parent; confirm.
      register_alert_type 'mauve'




    end
  end
end