aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick J Cherry <patrick@bytemark.co.uk>2012-11-29 10:18:57 +0000
committerPatrick J Cherry <patrick@bytemark.co.uk>2012-11-29 10:18:57 +0000
commitb5d3d793deedd60ebd5dda9056d012ae3a9411a8 (patch)
tree7863c34f6dda4de768cb718d8cb31ec25f53f5a7
parent63ebee3b7ce56b49f6c545a4bc095c2f7499b7b8 (diff)
Added alert suppression to the server code + protobuf.
-rw-r--r--lib/mauve/alert.rb64
-rw-r--r--lib/mauve/alert_changed.rb14
-rw-r--r--lib/mauve/alert_group.rb30
-rw-r--r--lib/mauve/proto.rb4
-rw-r--r--mauve.proto3
-rw-r--r--test/tc_mauve_alert.rb162
-rw-r--r--test/tc_mauve_alert_group.rb122
7 files changed, 391 insertions, 8 deletions
diff --git a/lib/mauve/alert.rb b/lib/mauve/alert.rb
index ab13736..1e94bde 100644
--- a/lib/mauve/alert.rb
+++ b/lib/mauve/alert.rb
@@ -101,6 +101,7 @@ module Mauve
property :raised_at, EpochTime
property :cleared_at, EpochTime
property :updated_at, EpochTime
+ property :suppress_until, EpochTime
property :acknowledged_at, EpochTime
property :acknowledged_by, String, :lazy => false
property :update_type, String, :lazy => false
@@ -458,7 +459,8 @@ module Mauve
self.will_raise_at = nil
self.cleared_at = nil
# Don't clear will_clear_at
- self.update_type = "raised" if self.update_type.nil? or self.update_type != "changed" or self.original_attributes[Alert.properties[:update_type]] == "cleared"
+ self.update_type = "raised" if self.update_type.nil? or !self.acknowledged?
+ self.suppress_until = nil unless self.suppressed? or self.was_raised?
end
self.save
@@ -490,13 +492,14 @@ module Mauve
self.cleared_at = at if self.cleared_at.nil?
self.will_clear_at = nil
self.update_type = "cleared"
+ self.suppress_until = nil unless self.suppressed? or self.was_cleared?
end
if self.save
#
# Clear all reminders.
#
- self.changes.all(:remind_at.not => nil, :at.lte => at, :update_type => "raised").each do |ac|
+ self.changes.all(:remind_at.not => nil).each do |ac|
ac.remind_at = nil
ac.save
end
@@ -513,7 +516,7 @@ module Mauve
false
end
end
-
+
# The next time this alert should be polled, either to raise, clear, or
# unacknowledge, or nil if nothing is due.
#
@@ -545,6 +548,25 @@ module Mauve
!raised_at.nil? and (cleared_at.nil? or raised_at > cleared_at)
end
+ # Was the alert raised before changes were made?
+ #
+ # @return [Boolean]
+ def was_raised?
+ was_raised_at = if original_attributes.has_key?(Alert.properties[:raised_at])
+ original_attributes[Alert.properties[:raised_at]]
+ else
+ self.raised_at
+ end
+
+ was_cleared_at = if original_attributes.has_key?(Alert.properties[:cleared_at])
+ original_attributes[Alert.properties[:cleared_at]]
+ else
+ self.cleared_at
+ end
+
+ !was_raised_at.nil? and (was_cleared_at.nil? or was_raised_at > was_cleared_at)
+ end
+
# Is the alert acknowledged?
#
# @return [Boolean]
@@ -552,6 +574,17 @@ module Mauve
!acknowledged_at.nil?
end
+ # Was the alert acknowledged before the current changes?
+ #
+ # @return [Boolean]
+ def was_acknowledged?
+ if original_attributes.has_key?(Alert.properties[:acknowledged_at])
+ !original_attributes[Alert.properties[:acknowledged_at]].nil?
+ else
+ self.acknowledged?
+ end
+ end
+
# Is the alert cleared? Cleared is just the opposite of raised.
#
# @return [Boolean]
@@ -559,6 +592,20 @@ module Mauve
!raised?
end
+ # Was the alert cleared before the current changes?
+ #
+ # @return [Boolean]
+ def was_cleared?
+ !was_raised?
+ end
+
+ # Is the alert suppressed?
+ #
+ # @return [Boolean]
+ def suppressed?
+ self.suppress_until.is_a?(Time) and self.suppress_until > Time.now
+ end
+
# Work out an array of extra people to notify.
#
# @return [Array] array of persons
@@ -807,6 +854,17 @@ module Mauve
alert_db.updated_at = reception_time
+ #
+ # The alert suppression time can be set by an update if
+ #
+ # * the alert is not already suppressed
+ # * the alert has changed state.
+ #
+ if !alert_db.suppressed? and
+ ((alert_db.was_raised? and !alert_db.raised?) or (alert_db.was_cleared? and !alert_db.cleared?))
+ alert_db.suppress_until = Time.at(alert.suppress_until + time_offset)
+ end
+
if alert_db.raised?
#
# If we're acknowledged, just save.
diff --git a/lib/mauve/alert_changed.rb b/lib/mauve/alert_changed.rb
index 6f1a717..64aee7b 100644
--- a/lib/mauve/alert_changed.rb
+++ b/lib/mauve/alert_changed.rb
@@ -99,9 +99,19 @@ module Mauve
end
#
- # Push this notifitcation onto the queue.
+ # Bail out if notifications for this alert have been suppressed.
#
- Server.notification_push([alert, Time.now])
+ if alert.suppressed?
+ #
+ # Skip reminders if the alert is suppressed.
+ #
+ logger.info("Notifications suppressed until #{alert.suppressed_until} for #{alert.inspect}")
+ else
+ #
+ # Push this notifitcation onto the queue.
+ #
+ Server.notification_push([alert, Time.now])
+ end
#
# Need to make sure this reminder is cleared.
diff --git a/lib/mauve/alert_group.rb b/lib/mauve/alert_group.rb
index 76c2e98..c6d278c 100644
--- a/lib/mauve/alert_group.rb
+++ b/lib/mauve/alert_group.rb
@@ -154,6 +154,36 @@ module Mauve
end
during_runners = []
+
+ #
+ # Bail out if notifications for this alert have been suppressed.
+ #
+ if alert.suppressed?
+ logger.info("Notifications suppressed until #{alert.suppress_until} for #{alert.inspect}")
+
+ this_reminder = AlertChanged.first_or_new(
+ :alert_id => alert.id,
+ :person => self.name,
+ :remind_at.not => nil,
+ :was_relevant => false
+ )
+
+ this_reminder.level = level.to_s
+ this_reminder.at = at
+
+ if this_reminder.update_type.nil? or
+ (alert.raised? and this_reminder.update_type == "CLEARED") or
+ (alert.cleared? and %w(RAISED ACKNOWLEDGED).include?(this_reminder.update_type))
+
+ this_reminder.update_type = alert.update_type
+ this_reminder.remind_at = alert.suppress_until
+ else
+ this_reminder.remind_at = nil
+ end
+
+ return this_reminder.save
+ end
+
#
# This is where we set the reminder -- i.e. on a per-alert-group basis.
diff --git a/lib/mauve/proto.rb b/lib/mauve/proto.rb
index 6c22609..5c00572 100644
--- a/lib/mauve/proto.rb
+++ b/lib/mauve/proto.rb
@@ -46,6 +46,9 @@
# //
# optional uint32 importance = 7;
#
+# // Suppress any notifications about this alert until this UNIX time.
+# //
+# optional uint64 suppress_until = 8;
# }
#
# // The AlertUpdate is the unit of communication from an alerting source;
@@ -102,6 +105,7 @@ module Mauve
optional :string, :summary, 5
optional :string, :detail, 6
optional :uint32, :importance, 7
+ optional :uint64, :suppress_until, 8
end
class AlertUpdate < ::Protobuf::Message
defined_in __FILE__
diff --git a/mauve.proto b/mauve.proto
index 7bd25e4..cf030dd 100644
--- a/mauve.proto
+++ b/mauve.proto
@@ -44,6 +44,9 @@ message Alert {
//
optional uint32 importance = 7;
+ // Suppress any notifications about this alert until this UNIX time.
+ //
+ optional uint64 suppress_until = 8;
}
// The AlertUpdate is the unit of communication from an alerting source;
diff --git a/test/tc_mauve_alert.rb b/test/tc_mauve_alert.rb
index 269bf4c..61bd117 100644
--- a/test/tc_mauve_alert.rb
+++ b/test/tc_mauve_alert.rb
@@ -173,6 +173,123 @@ EOF
end
+ def test_recieve_update_with_suppression
+ Server.instance.setup
+
+ update = Proto::AlertUpdate.new
+ update.source = "test-host"
+ message = Proto::Alert.new
+ update.alert << message
+ message.id = "test_recieve_update"
+
+ #
+ # If we send the same message every 2 minutes, with the suppress time
+ # moving forward each time, it should get set once, and then stay the same
+ # after that.
+ #
+ suppress_until = nil
+
+ 5.times do |i|
+ message.raise_time = Time.now.to_i
+ message.suppress_until = Time.now.to_i + 10.minutes
+ suppress_until = Time.now + 10.minutes if suppress_until.nil?
+
+ Alert.receive_update(update, Time.now, "127.0.0.1")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+
+ assert(a.raised?, "Alert not raised after a raised message has been sent")
+ assert(a.suppressed?, "Alert has not been suppressed when the suppress_until value has been set")
+ assert_equal(suppress_until, a.suppress_until, "The suppress until time has been set incorrectly when being raised repeatedly")
+
+ Timecop.freeze(Time.now + 2.minute)
+ end
+
+ #
+ # Ten minutes have passed == the suppression should have lapsed.
+ #
+ assert_equal(0, (Time.now - suppress_until).to_i, "Ten minutes have not elapsed!")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+ assert(!a.suppressed?, "The alert is still suppressed past its suppress_until time")
+
+ #
+ # Try again -- the suppression has expired, but the alert is still raised.
+ # This should not re-set the suppress_until value.
+ #
+ 5.times do
+ message.suppress_until = Time.now.to_i + 10.minutes
+ Alert.receive_update(update, Time.now, "127.0.0.1")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+ assert(a.raised?)
+ assert_equal(suppress_until, a.suppress_until)
+ assert(!a.suppressed?)
+
+ Timecop.freeze(Time.now + 2.minute)
+ end
+
+ #
+ # Now on clear, we should be able to set the suppression time again.
+ #
+ suppress_until = Time.now + 10.minutes
+ message.raise_time = nil
+ message.clear_time = Time.now.to_i
+ message.suppress_until = suppress_until.to_i
+
+ Alert.receive_update(update, Time.now, "127.0.0.1")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+ assert(a.cleared?)
+ assert_equal(suppress_until, a.suppress_until)
+
+ #
+ # Now move on two minutes, and raise the alert. The suppression time should not move forward.
+ #
+ Timecop.freeze(Time.now + 2.minutes)
+
+ 4.times do
+ message.raise_time = Time.now.to_i
+ message.suppress_until = Time.now.to_i + 10.minutes
+ Alert.receive_update(update, Time.now, "127.0.0.1")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+
+ assert(a.raised?)
+ assert(a.suppressed?)
+ assert_equal(suppress_until, a.suppress_until)
+
+ Timecop.freeze(Time.now + 2.minute)
+ end
+
+ #
+ # 10 minutes have now passed, The alert should no longer be suppressed.
+ #
+ assert_equal(0, (Time.now - suppress_until).to_i, "Ten minutes have not elapsed!")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+ assert(!a.suppressed?, "The alert is still suppressed past its suppress_until time")
+
+ #
+ # Try again -- the suppression has expired, but the alert is still raised.
+ # In this case the suppress_until time should not be moved on, until the
+ # alert has cleared.
+ #
+ message.suppress_until = Time.now.to_i + 10.minutes
+ Alert.receive_update(update, Time.now, "127.0.0.1")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+ assert(a.raised?)
+ assert_equal(suppress_until, a.suppress_until)
+ assert(!a.suppressed?)
+
+ #
+ # Now clear, and the suppress time should be set back to nil.
+ #
+ message.raise_time = nil
+ message.clear_time = Time.now.to_i
+ message.suppress_until = nil
+ Alert.receive_update(update, Time.now, "127.0.0.1")
+ a = Alert.first(:alert_id => 'test_recieve_update')
+ assert(a.cleared?)
+ assert_equal(nil, a.suppress_until)
+ assert(!a.suppressed?)
+
+ end
+
def test_notify_if_needed
Configuration.current = ConfigurationBuilder.parse(@test_config)
Server.instance.setup
@@ -293,7 +410,7 @@ EOF
Server.instance.setup
alert = Alert.new(
- :alert_id => "test_no_notification_for_old_alerts",
+ :alert_id => "test_destroy_history_on_destroy",
:source => "test",
:subject => "test"
)
@@ -315,4 +432,47 @@ EOF
end
+ def test_alert_suppression
+ #
+ # This is a bit of a duff test
+ #
+ Configuration.current = ConfigurationBuilder.parse(@test_config)
+ Server.instance.setup
+
+ alert = Alert.new(
+ :alert_id => "test_alert_suppression1",
+ :source => "test",
+ :subject => "test",
+ :suppress_until => Time.now + 5.minutes
+ )
+ alert.save
+ alert.reload
+
+ assert_equal(Time.now + 5.minutes, alert.suppress_until)
+ assert(alert.suppressed?)
+
+ alert = Alert.new(
+ :alert_id => "test_alert_suppression2",
+ :source => "test",
+ :subject => "test",
+ :suppress_until => Time.now - 5.minutes
+ )
+ alert.save
+ alert.reload
+
+ assert_equal(Time.now - 5.minutes, alert.suppress_until)
+ assert(!alert.suppressed?,"Alert marked as suppressed when the suppression period has expired")
+
+ alert = Alert.new(
+ :alert_id => "test_alert_suppressio3",
+ :source => "test",
+ :subject => "test"
+ )
+ alert.save
+ alert.reload
+
+ assert_equal(nil, alert.suppress_until)
+ assert(!alert.suppressed?,"Alert marked as suppressed when the suppression period was never set")
+ end
+
end
diff --git a/test/tc_mauve_alert_group.rb b/test/tc_mauve_alert_group.rb
index 92a9d61..fc31702 100644
--- a/test/tc_mauve_alert_group.rb
+++ b/test/tc_mauve_alert_group.rb
@@ -106,11 +106,14 @@ EOF
a = Alert.new(
:alert_id => "test",
:source => "test",
- :subject => "test"
+ :subject => "test",
+ :suppress_until => Time.now + 5.minutes
)
a.raise!
- assert_equal("test1@example.com", notification_buffer.pop[2])
+ #
+ # Should be suppressed.
+ #
assert(notification_buffer.empty?)
Timecop.freeze(Time.now + 5.minutes)
@@ -161,6 +164,121 @@ EOF
assert(notification_buffer.empty?)
end
+ def test_alert_suppression
+ config=<<EOF
+server {
+ database "sqlite3::memory:"
+ use_notification_buffer false
+}
+
+notification_method("email") {
+ debug!
+ deliver_to_queue []
+ disable_normal_delivery!
+}
+
+person ("test1") {
+ email "test1@example.com"
+ all { email }
+}
+
+alert_group("default") {
+ includes{ true }
+ notify("test1") {
+ during{ true }
+ every 15.minutes
+ }
+}
+EOF
+ Configuration.current = ConfigurationBuilder.parse(config)
+ notification_buffer = Configuration.current.notification_methods["email"].deliver_to_queue
+ Server.instance.setup
+
+ a = Alert.new(
+ :alert_id => "test",
+ :source => "test",
+ :subject => "test",
+ :suppress_until => Time.now + 5.minutes
+ )
+
+ #
+ # No notifications should be sent for 5 minutes
+ #
+ a.raise!
+
+ 5.times do
+ assert(0,notification_buffer.length)
+ Timecop.freeze(Time.now + 1.minutes)
+ a.poll
+ AlertChanged.all.each{|ac| ac.poll}
+ end
+
+ #
+ # After 5 minutes a notification should be sent, and a reminder set for 15 minutes afterwards.
+ #
+ assert(1,notification_buffer.length)
+ ac = a.changes.all(:remind_at.not => nil)
+ assert_equal(1,ac.length, "Only one reminder should be in place at end of suppression period")
+ assert_equal(Time.now+15.minutes, ac.first.remind_at, "Reminder not set for the correct time after suppression")
+
+ #
+ # Clear the alert.
+ #
+ a.clear!
+ assert(1, notification_buffer.length)
+ ac = a.changes.all(:remind_at.not => nil)
+ assert_equal(0,ac.length, "No reminders should be in place after a clear")
+
+
+ #####
+ #
+ # This covers a planned maintenance scenario, when an alert is suppressed
+ # whilst cleared. Flapping should not cause any notifications.
+ #
+
+ #
+ # No notifications should be sent for 15 minutes
+ #
+ a.suppress_until = Time.now + 15.minutes
+ a.clear!
+
+ Timecop.freeze(Time.now + 3.minutes)
+
+ 2.times do
+ 5.times do
+ #
+ # Raise. This should not cause any notifications for 10 minutes.
+ #
+ a.raise!
+ assert(0,notification_buffer.length)
+ Timecop.freeze(Time.now + 1.minutes)
+ a.poll
+ AlertChanged.all.each{|ac| ac.poll}
+ end
+
+ #
+ # This should not raise any alerts, and all reminders should be cleared.
+ #
+ a.clear!
+ assert(0,notification_buffer.length)
+
+ Timecop.freeze(Time.now + 1.minutes)
+ a.poll
+ AlertChanged.all.each{|ac| ac.poll}
+
+ ac = a.changes.all(:remind_at.not => nil)
+ assert_equal(0, ac.length, "Reminder in place at when the raised alert was cleared.")
+ end
+
+ # Now re-raise.
+ a.raise!
+ assert(1,notification_buffer.length)
+
+ ac = a.changes.all(:remind_at.not => nil)
+ assert_equal(1,ac.length, "Only one reminder should be in place at end of suppression period")
+ assert_equal(Time.now+15.minutes, ac.first.remind_at, "Reminder not set for the correct time after suppression")
+ end
+
end