summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Saku Ytti <saku@ytti.fi> 2014-07-11 21:06:37 +0300
committer: Saku Ytti <saku@ytti.fi> 2014-07-11 21:06:37 +0300
commit: e3aa71fc17eba8586d2e0117b6b363942f577488 (patch)
tree: 4e9a91fa86cc6787c872b9273cf843ebf9aedb67
parent: ee1bcd1f4d8d9b487d9c37b8ad97c06f24bdb09a (diff)
add support for retrying failed attempts
Looks like this in syslog:

Jul 11 21:05:53 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:53 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:54 ytti oxidized[9820]: 10.10.10.10 status no_connection, retry attempt 1
Jul 11 21:05:54 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:54 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:55 ytti oxidized[9820]: 10.10.10.10 status no_connection, retry attempt 2
Jul 11 21:05:55 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:55 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:56 ytti oxidized[9820]: 10.10.10.10 status no_connection, retry attempt 3
Jul 11 21:05:56 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:56 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:57 ytti oxidized[9820]: 10.10.10.10 status no_connection, retries exhausted, giving up
-rw-r--r--  lib/oxidized/config.rb       |  3
-rw-r--r--  lib/oxidized/input/input.rb  |  1
-rw-r--r--  lib/oxidized/node.rb         |  3
-rw-r--r--  lib/oxidized/worker.rb       | 15
4 files changed, 17 insertions, 5 deletions
diff --git a/lib/oxidized/config.rb b/lib/oxidized/config.rb
index 02698a6..b4d1c64 100644
--- a/lib/oxidized/config.rb
+++ b/lib/oxidized/config.rb
@@ -22,7 +22,8 @@ module Oxidized
CFGS.default.log = File.join Config::Root, 'log'
CFGS.default.debug = false
CFGS.default.threads = 30
- CFGS.default.timeout = 30
+ CFGS.default.timeout = 20
+ CFGS.default.retries = 3
CFGS.default.prompt = /^([\w.@-]+[#>]\s?)$/
CFGS.default.rest = '127.0.0.1:8888' # or false to disable
CFGS.default.vars = {} # could be 'enable'=>'enablePW'
diff --git a/lib/oxidized/input/input.rb b/lib/oxidized/input/input.rb
index 1184a0b..049c99a 100644
--- a/lib/oxidized/input/input.rb
+++ b/lib/oxidized/input/input.rb
@@ -11,6 +11,7 @@ module Oxidized
Timeout::Error,
Errno::ECONNRESET,
Errno::EHOSTUNREACH,
+ Errno::ENETUNREACH,
Errno::EPIPE,
],
}
diff --git a/lib/oxidized/node.rb b/lib/oxidized/node.rb
index c39bee1..72c84bd 100644
--- a/lib/oxidized/node.rb
+++ b/lib/oxidized/node.rb
@@ -6,7 +6,7 @@ module Oxidized
class ModelNotFound < OxidizedError; end
class Node
attr_reader :name, :ip, :model, :input, :output, :group, :auth, :prompt, :vars, :last
- attr_accessor :running, :user, :msg, :from, :stats
+ attr_accessor :running, :user, :msg, :from, :stats, :retry
alias :running? :running
def initialize opt
@name = opt[:name]
@@ -19,6 +19,7 @@ module Oxidized
@prompt = resolve_prompt opt
@vars = opt[:vars]
@stats = Stats.new
+ @retry = 0
# model instance needs to access node instance
@model.node = self
diff --git a/lib/oxidized/worker.rb b/lib/oxidized/worker.rb
index 58bf659..0e96212 100644
--- a/lib/oxidized/worker.rb
+++ b/lib/oxidized/worker.rb
@@ -29,17 +29,26 @@ module Oxidized
node.last = job
node.stats.add job
@jobs.duration job.time
+ node.running = false
if job.status == :success
msg = "update #{node.name}"
msg += " from #{node.from}" if node.from
msg += " with message '#{node.msg}'" if node.msg
node.output.new.store node.name, job.config,
:msg => msg, :user => node.user, :group => node.group
- node.reset
else
- Log.warn "#{node.name} status #{job.status}"
+ msg = "#{node.name} status #{job.status}"
+ if node.retry < CFG.retries
+ node.retry += 1
+ msg += ", retry attempt #{node.retry}"
+ @nodes.next node.name
+ else
+ msg += ", retries exhausted, giving up"
+ node.retry = 0
+ end
+ Log.warn msg
end
- node.running = false
+ node.reset
end
end
end