From b5d3d793deedd60ebd5dda9056d012ae3a9411a8 Mon Sep 17 00:00:00 2001 From: Patrick J Cherry Date: Thu, 29 Nov 2012 10:18:57 +0000 Subject: Added alert suppression to the server code + protobuf. --- lib/mauve/alert.rb | 64 ++++++++++++++++- lib/mauve/alert_changed.rb | 14 +++- lib/mauve/alert_group.rb | 30 ++++++++ lib/mauve/proto.rb | 4 ++ mauve.proto | 3 + test/tc_mauve_alert.rb | 162 ++++++++++++++++++++++++++++++++++++++++++- test/tc_mauve_alert_group.rb | 122 +++++++++++++++++++++++++++++++- 7 files changed, 391 insertions(+), 8 deletions(-) diff --git a/lib/mauve/alert.rb b/lib/mauve/alert.rb index ab13736..1e94bde 100644 --- a/lib/mauve/alert.rb +++ b/lib/mauve/alert.rb @@ -101,6 +101,7 @@ module Mauve property :raised_at, EpochTime property :cleared_at, EpochTime property :updated_at, EpochTime + property :suppress_until, EpochTime property :acknowledged_at, EpochTime property :acknowledged_by, String, :lazy => false property :update_type, String, :lazy => false @@ -458,7 +459,8 @@ module Mauve self.will_raise_at = nil self.cleared_at = nil # Don't clear will_clear_at - self.update_type = "raised" if self.update_type.nil? or self.update_type != "changed" or self.original_attributes[Alert.properties[:update_type]] == "cleared" + self.update_type = "raised" if self.update_type.nil? or !self.acknowledged? + self.suppress_until = nil unless self.suppressed? or self.was_raised? end self.save @@ -490,13 +492,14 @@ module Mauve self.cleared_at = at if self.cleared_at.nil? self.will_clear_at = nil self.update_type = "cleared" + self.suppress_until = nil unless self.suppressed? or self.was_cleared? end if self.save # # Clear all reminders. # - self.changes.all(:remind_at.not => nil, :at.lte => at, :update_type => "raised").each do |ac| + self.changes.all(:remind_at.not => nil).each do |ac| ac.remind_at = nil ac.save end @@ -513,7 +516,7 @@ module Mauve false end end - + # The next time this alert should be polled, either to raise, clear, or # unacknowledge, or nil if nothing is due. # @@ -545,6 +548,25 @@ module Mauve !raised_at.nil? and (cleared_at.nil? or raised_at > cleared_at) end + # Was the alert raised before changes were made? + # + # @return [Boolean] + def was_raised? + was_raised_at = if original_attributes.has_key?(Alert.properties[:raised_at]) + original_attributes[Alert.properties[:raised_at]] + else + self.raised_at + end + + was_cleared_at = if original_attributes.has_key?(Alert.properties[:cleared_at]) + original_attributes[Alert.properties[:cleared_at]] + else + self.cleared_at + end + + !was_raised_at.nil? and (was_cleared_at.nil? or was_raised_at > was_cleared_at) + end + # Is the alert acknowledged? # # @return [Boolean] @@ -552,6 +574,17 @@ module Mauve !acknowledged_at.nil? end + # Was the alert acknowledged before the current changes? + # + # @return [Boolean] + def was_acknowledged? + if original_attributes.has_key?(Alert.properties[:acknowledged_at]) + !original_attributes[Alert.properties[:acknowledged_at]].nil? + else + self.acknowledged? + end + end + # Is the alert cleared? Cleared is just the opposite of raised. # # @return [Boolean] @@ -559,6 +592,20 @@ module Mauve !raised? end + # Was the alert cleared before the current changes? + # + # @return [Boolean] + def was_cleared? + !was_raised? + end + + # Is the alert suppressed? + # + # @return [Boolean] + def suppressed? + self.suppress_until.is_a?(Time) and self.suppress_until > Time.now + end + # Work out an array of extra people to notify. # # @return [Array] array of persons @@ -807,6 +854,17 @@ module Mauve alert_db.updated_at = reception_time + # + # The alert suppression time can be set by an update if + # + # * the alert is not already suppressed + # * the alert has changed state. + # + if !alert_db.suppressed? and + ((alert_db.was_raised? and !alert_db.raised?) or (alert_db.was_cleared? and !alert_db.cleared?)) + alert_db.suppress_until = Time.at(alert.suppress_until + time_offset) + end + if alert_db.raised? # # If we're acknowledged, just save. diff --git a/lib/mauve/alert_changed.rb b/lib/mauve/alert_changed.rb index 6f1a717..64aee7b 100644 --- a/lib/mauve/alert_changed.rb +++ b/lib/mauve/alert_changed.rb @@ -99,9 +99,19 @@ module Mauve end # - # Push this notifitcation onto the queue. + # Bail out if notifications for this alert have been suppressed. # - Server.notification_push([alert, Time.now]) + if alert.suppressed? + # + # Skip reminders if the alert is suppressed. + # + logger.info("Notifications suppressed until #{alert.suppressed_until} for #{alert.inspect}") + else + # + # Push this notifitcation onto the queue. + # + Server.notification_push([alert, Time.now]) + end # # Need to make sure this reminder is cleared. diff --git a/lib/mauve/alert_group.rb b/lib/mauve/alert_group.rb index 76c2e98..c6d278c 100644 --- a/lib/mauve/alert_group.rb +++ b/lib/mauve/alert_group.rb @@ -154,6 +154,36 @@ module Mauve end during_runners = [] + + # + # Bail out if notifications for this alert have been suppressed. + # + if alert.suppressed? + logger.info("Notifications suppressed until #{alert.suppress_until} for #{alert.inspect}") + + this_reminder = AlertChanged.first_or_new( + :alert_id => alert.id, + :person => self.name, + :remind_at.not => nil, + :was_relevant => false + ) + + this_reminder.level = level.to_s + this_reminder.at = at + + if this_reminder.update_type.nil? or + (alert.raised? and this_reminder.update_type == "CLEARED") or + (alert.cleared? and %w(RAISED ACKNOWLEDGED).include?(this_reminder.update_type)) + + this_reminder.update_type = alert.update_type + this_reminder.remind_at = alert.suppress_until + else + this_reminder.remind_at = nil + end + + return this_reminder.save + end + # # This is where we set the reminder -- i.e. on a per-alert-group basis. diff --git a/lib/mauve/proto.rb b/lib/mauve/proto.rb index 6c22609..5c00572 100644 --- a/lib/mauve/proto.rb +++ b/lib/mauve/proto.rb @@ -46,6 +46,9 @@ # // # optional uint32 importance = 7; # +# // Suppress any notifications about this alert until this UNIX time. +# // +# optional uint64 suppress_until = 8; # } # # // The AlertUpdate is the unit of communication from an alerting source; @@ -102,6 +105,7 @@ module Mauve optional :string, :summary, 5 optional :string, :detail, 6 optional :uint32, :importance, 7 + optional :uint64, :suppress_until, 8 end class AlertUpdate < ::Protobuf::Message defined_in __FILE__ diff --git a/mauve.proto b/mauve.proto index 7bd25e4..cf030dd 100644 --- a/mauve.proto +++ b/mauve.proto @@ -44,6 +44,9 @@ message Alert { // optional uint32 importance = 7; + // Suppress any notifications about this alert until this UNIX time. + // + optional uint64 suppress_until = 8; } // The AlertUpdate is the unit of communication from an alerting source; diff --git a/test/tc_mauve_alert.rb b/test/tc_mauve_alert.rb index 269bf4c..61bd117 100644 --- a/test/tc_mauve_alert.rb +++ b/test/tc_mauve_alert.rb @@ -173,6 +173,123 @@ EOF end + def test_recieve_update_with_suppression + Server.instance.setup + + update = Proto::AlertUpdate.new + update.source = "test-host" + message = Proto::Alert.new + update.alert << message + message.id = "test_recieve_update" + + # + # If we send the same message every 2 minutes, with the suppress time + # moving forward each time, it should get set once, and then stay the same + # after that. + # + suppress_until = nil + + 5.times do |i| + message.raise_time = Time.now.to_i + message.suppress_until = Time.now.to_i + 10.minutes + suppress_until = Time.now + 10.minutes if suppress_until.nil? + + Alert.receive_update(update, Time.now, "127.0.0.1") + a = Alert.first(:alert_id => 'test_recieve_update') + + assert(a.raised?, "Alert not raised after a raised message has been sent") + assert(a.suppressed?, "Alert has not been suppressed when the suppress_until value has been set") + assert_equal(suppress_until, a.suppress_until, "The suppress until time has been set incorrectly when being raised repeatedly") + + Timecop.freeze(Time.now + 2.minute) + end + + # + # Ten minutes have passed == the suppression should have lapsed. + # + assert_equal(0, (Time.now - suppress_until).to_i, "Ten minutes have not elapsed!") + a = Alert.first(:alert_id => 'test_recieve_update') + assert(!a.suppressed?, "The alert is still suppressed past its suppress_until time") + + # + # Try again -- the suppression has expired, but the alert is still raised. + # This should not re-set the suppress_until value. + # + 5.times do + message.suppress_until = Time.now.to_i + 10.minutes + Alert.receive_update(update, Time.now, "127.0.0.1") + a = Alert.first(:alert_id => 'test_recieve_update') + assert(a.raised?) + assert_equal(suppress_until, a.suppress_until) + assert(!a.suppressed?) + + Timecop.freeze(Time.now + 2.minute) + end + + # + # Now on clear, we should be able to set the suppression time again. + # + suppress_until = Time.now + 10.minutes + message.raise_time = nil + message.clear_time = Time.now.to_i + message.suppress_until = suppress_until.to_i + + Alert.receive_update(update, Time.now, "127.0.0.1") + a = Alert.first(:alert_id => 'test_recieve_update') + assert(a.cleared?) + assert_equal(suppress_until, a.suppress_until) + + # + # Now move on two minutes, and raise the alert. The suppression time should not move forward. + # + Timecop.freeze(Time.now + 2.minutes) + + 4.times do + message.raise_time = Time.now.to_i + message.suppress_until = Time.now.to_i + 10.minutes + Alert.receive_update(update, Time.now, "127.0.0.1") + a = Alert.first(:alert_id => 'test_recieve_update') + + assert(a.raised?) + assert(a.suppressed?) + assert_equal(suppress_until, a.suppress_until) + + Timecop.freeze(Time.now + 2.minute) + end + + # + # 10 minutes have now passed, The alert should no longer be suppressed. + # + assert_equal(0, (Time.now - suppress_until).to_i, "Ten minutes have not elapsed!") + a = Alert.first(:alert_id => 'test_recieve_update') + assert(!a.suppressed?, "The alert is still suppressed past its suppress_until time") + + # + # Try again -- the suppression has expired, but the alert is still raised. + # In this case the suppress_until time should not be moved on, until the + # alert has cleared. + # + message.suppress_until = Time.now.to_i + 10.minutes + Alert.receive_update(update, Time.now, "127.0.0.1") + a = Alert.first(:alert_id => 'test_recieve_update') + assert(a.raised?) + assert_equal(suppress_until, a.suppress_until) + assert(!a.suppressed?) + + # + # Now clear, and the suppress time should be set back to nil. + # + message.raise_time = nil + message.clear_time = Time.now.to_i + message.suppress_until = nil + Alert.receive_update(update, Time.now, "127.0.0.1") + a = Alert.first(:alert_id => 'test_recieve_update') + assert(a.cleared?) + assert_equal(nil, a.suppress_until) + assert(!a.suppressed?) + + end + def test_notify_if_needed Configuration.current = ConfigurationBuilder.parse(@test_config) Server.instance.setup @@ -293,7 +410,7 @@ EOF Server.instance.setup alert = Alert.new( - :alert_id => "test_no_notification_for_old_alerts", + :alert_id => "test_destroy_history_on_destroy", :source => "test", :subject => "test" ) @@ -315,4 +432,47 @@ EOF end + def test_alert_suppression + # + # This is a bit of a duff test + # + Configuration.current = ConfigurationBuilder.parse(@test_config) + Server.instance.setup + + alert = Alert.new( + :alert_id => "test_alert_suppression1", + :source => "test", + :subject => "test", + :suppress_until => Time.now + 5.minutes + ) + alert.save + alert.reload + + assert_equal(Time.now + 5.minutes, alert.suppress_until) + assert(alert.suppressed?) + + alert = Alert.new( + :alert_id => "test_alert_suppression2", + :source => "test", + :subject => "test", + :suppress_until => Time.now - 5.minutes + ) + alert.save + alert.reload + + assert_equal(Time.now - 5.minutes, alert.suppress_until) + assert(!alert.suppressed?,"Alert marked as suppressed when the suppression period has expired") + + alert = Alert.new( + :alert_id => "test_alert_suppressio3", + :source => "test", + :subject => "test" + ) + alert.save + alert.reload + + assert_equal(nil, alert.suppress_until) + assert(!alert.suppressed?,"Alert marked as suppressed when the suppression period was never set") + end + end diff --git a/test/tc_mauve_alert_group.rb b/test/tc_mauve_alert_group.rb index 92a9d61..fc31702 100644 --- a/test/tc_mauve_alert_group.rb +++ b/test/tc_mauve_alert_group.rb @@ -106,11 +106,14 @@ EOF a = Alert.new( :alert_id => "test", :source => "test", - :subject => "test" + :subject => "test", + :suppress_until => Time.now + 5.minutes ) a.raise! - assert_equal("test1@example.com", notification_buffer.pop[2]) + # + # Should be suppressed. + # assert(notification_buffer.empty?) Timecop.freeze(Time.now + 5.minutes) @@ -161,6 +164,121 @@ EOF assert(notification_buffer.empty?) end + def test_alert_suppression + config=< "test", + :source => "test", + :subject => "test", + :suppress_until => Time.now + 5.minutes + ) + + # + # No notifications should be sent for 5 minutes + # + a.raise! + + 5.times do + assert(0,notification_buffer.length) + Timecop.freeze(Time.now + 1.minutes) + a.poll + AlertChanged.all.each{|ac| ac.poll} + end + + # + # After 5 minutes a notification should be sent, and a reminder set for 15 minutes afterwards. + # + assert(1,notification_buffer.length) + ac = a.changes.all(:remind_at.not => nil) + assert_equal(1,ac.length, "Only one reminder should be in place at end of suppression period") + assert_equal(Time.now+15.minutes, ac.first.remind_at, "Reminder not set for the correct time after suppression") + + # + # Clear the alert. + # + a.clear! + assert(1, notification_buffer.length) + ac = a.changes.all(:remind_at.not => nil) + assert_equal(0,ac.length, "No reminders should be in place after a clear") + + + ##### + # + # This covers a planned maintenance scenario, when an alert is suppressed + # whilst cleared. Flapping should not cause any notifications. + # + + # + # No notifications should be sent for 15 minutes + # + a.suppress_until = Time.now + 15.minutes + a.clear! + + Timecop.freeze(Time.now + 3.minutes) + + 2.times do + 5.times do + # + # Raise. This should not cause any notifications for 10 minutes. + # + a.raise! + assert(0,notification_buffer.length) + Timecop.freeze(Time.now + 1.minutes) + a.poll + AlertChanged.all.each{|ac| ac.poll} + end + + # + # This should not raise any alerts, and all reminders should be cleared. + # + a.clear! + assert(0,notification_buffer.length) + + Timecop.freeze(Time.now + 1.minutes) + a.poll + AlertChanged.all.each{|ac| ac.poll} + + ac = a.changes.all(:remind_at.not => nil) + assert_equal(0, ac.length, "Reminder in place at when the raised alert was cleared.") + end + + # Now re-raise. + a.raise! + assert(1,notification_buffer.length) + + ac = a.changes.all(:remind_at.not => nil) + assert_equal(1,ac.length, "Only one reminder should be in place at end of suppression period") + assert_equal(Time.now+15.minutes, ac.first.remind_at, "Reminder not set for the correct time after suppression") + end + end -- cgit v1.2.1