diff options
author | Saku Ytti <saku@ytti.fi> | 2014-07-11 21:06:37 +0300 |
---|---|---|
committer | Saku Ytti <saku@ytti.fi> | 2014-07-11 21:06:37 +0300 |
commit | e3aa71fc17eba8586d2e0117b6b363942f577488 (patch) | |
tree | 4e9a91fa86cc6787c872b9273cf843ebf9aedb67 | |
parent | ee1bcd1f4d8d9b487d9c37b8ad97c06f24bdb09a (diff) |
add support for retrying failed attempts
With the default of 3 retries, a node that keeps failing looks like this in syslog:
Jul 11 21:05:53 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:53 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:54 ytti oxidized[9820]: 10.10.10.10 status no_connection, retry attempt 1
Jul 11 21:05:54 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:54 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:55 ytti oxidized[9820]: 10.10.10.10 status no_connection, retry attempt 2
Jul 11 21:05:55 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:55 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:56 ytti oxidized[9820]: 10.10.10.10 status no_connection, retry attempt 3
Jul 11 21:05:56 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 22"
Jul 11 21:05:56 ytti oxidized[9820]: 10.10.10.10 raised Errno::ENETUNREACH with msg "Network is unreachable - connect(2) for "10.10.10.10" port 23"
Jul 11 21:05:57 ytti oxidized[9820]: 10.10.10.10 status no_connection, retries exhausted, giving up
-rw-r--r-- | lib/oxidized/config.rb | 3 | ||||
-rw-r--r-- | lib/oxidized/input/input.rb | 1 | ||||
-rw-r--r-- | lib/oxidized/node.rb | 3 | ||||
-rw-r--r-- | lib/oxidized/worker.rb | 15 |
4 files changed, 17 insertions, 5 deletions
```diff
diff --git a/lib/oxidized/config.rb b/lib/oxidized/config.rb
index 02698a6..b4d1c64 100644
--- a/lib/oxidized/config.rb
+++ b/lib/oxidized/config.rb
@@ -22,7 +22,8 @@ module Oxidized
     CFGS.default.log = File.join Config::Root, 'log'
     CFGS.default.debug = false
     CFGS.default.threads = 30
-    CFGS.default.timeout = 30
+    CFGS.default.timeout = 20
+    CFGS.default.retries = 3
     CFGS.default.prompt = /^([\w.@-]+[#>]\s?)$/
     CFGS.default.rest = '127.0.0.1:8888' # or false to disable
     CFGS.default.vars = {} # could be 'enable'=>'enablePW'
diff --git a/lib/oxidized/input/input.rb b/lib/oxidized/input/input.rb
index 1184a0b..049c99a 100644
--- a/lib/oxidized/input/input.rb
+++ b/lib/oxidized/input/input.rb
@@ -11,6 +11,7 @@ module Oxidized
         Timeout::Error,
         Errno::ECONNRESET,
         Errno::EHOSTUNREACH,
+        Errno::ENETUNREACH,
         Errno::EPIPE,
       ],
     }
diff --git a/lib/oxidized/node.rb b/lib/oxidized/node.rb
index c39bee1..72c84bd 100644
--- a/lib/oxidized/node.rb
+++ b/lib/oxidized/node.rb
@@ -6,7 +6,7 @@ module Oxidized
   class ModelNotFound < OxidizedError; end
   class Node
     attr_reader :name, :ip, :model, :input, :output, :group, :auth, :prompt, :vars, :last
-    attr_accessor :running, :user, :msg, :from, :stats
+    attr_accessor :running, :user, :msg, :from, :stats, :retry
     alias :running? :running
     def initialize opt
       @name = opt[:name]
@@ -19,6 +19,7 @@ module Oxidized
       @prompt = resolve_prompt opt
       @vars = opt[:vars]
       @stats = Stats.new
+      @retry = 0
 
       # model instance needs to access node instance
       @model.node = self
diff --git a/lib/oxidized/worker.rb b/lib/oxidized/worker.rb
index 58bf659..0e96212 100644
--- a/lib/oxidized/worker.rb
+++ b/lib/oxidized/worker.rb
@@ -29,17 +29,26 @@ module Oxidized
       node.last = job
       node.stats.add job
       @jobs.duration job.time
+      node.running = false
       if job.status == :success
         msg = "update #{node.name}"
         msg += " from #{node.from}" if node.from
         msg += " with message '#{node.msg}'" if node.msg
         node.output.new.store node.name, job.config, :msg => msg, :user => node.user, :group => node.group
-        node.reset
       else
-        Log.warn "#{node.name} status #{job.status}"
+        msg = "#{node.name} status #{job.status}"
+        if node.retry < CFG.retries
+          node.retry += 1
+          msg += ", retry attempt #{node.retry}"
+          @nodes.next node.name
+        else
+          msg += ", retries exhausted, giving up"
+          node.retry = 0
+        end
+        Log.warn msg
       end
-      node.running = false
+      node.reset
     end
 
   end
 end
```