From 53810efe8a7f92c47b5b6bab9de4afb52ce380b2 Mon Sep 17 00:00:00 2001 From: Steve Kemp Date: Tue, 8 Aug 2017 08:43:38 +0300 Subject: Ignore bogus DNS results. We've had a problem for the past few weeks (?) where we see false DNS errors when making http/https requests with `curb`/`libcurl`. To resolve these issues properly we're going to have to rewrite the code to avoid the current gem. However that is considerable work because of the hole we've back ourself into - wanting to test both IPv4 and IPv6 "properly". We'll have to duplicate that work if we use `net/http`, or even mroe so if we use `open3` and exec `curl -4|-6 ..` For the moment this commit changes how things are handled to deal with the issue we see - which doesn't solve the problem but will mask it. When custodian runs a test it will return a status-code: * Custodian::TestResult::TEST_FAILED * The test failed, such that an alert should be raised. * Custodian::TestResult::TEST_PASSED * The test succeeded, such that any previous alert should be cleared. * Custodian::TestResult::TEST_SKIPPED * Nothing should be done. As the failure we see is very very specific - an exception is thrown of the type `Curl::Err::HostResolutionError` - we can catch that and return `TEST_SKIPPED`. That means that there will be no (urgent) alert. Obviously the potential risk of swallowing all DNS-failures is that a domain might expire and we'd never know. So we'll do a little better than merely skipping the test if there are DNS failures: * If we see a DNS failure. * Then we try to lookup the host as an A & AAAA record. * If that succeeds we decide the issue was bogus. * If that fails then the host legitimately doesn't resolve so we raise an alert. To recap: * If a host fails normally - bogus status-code, or missing text - we behave as we did in the past. * Only in the case of a DNS-error from curb/curl do we go down this horrid path. * Where we try to confirm the error, and swallow it if false. This closes #13. --- lib/custodian/protocoltest/http.rb | 75 +++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/lib/custodian/protocoltest/http.rb b/lib/custodian/protocoltest/http.rb index a3f34cc..71388c8 100644 --- a/lib/custodian/protocoltest/http.rb +++ b/lib/custodian/protocoltest/http.rb @@ -337,8 +337,80 @@ module Custodian rescue Curl::Err::TooManyRedirectsError errors << "#{protocol_msg}: More than 10 redirections." rescue Curl::Err::HostResolutionError => x - # Log the DNS error-message. + + # + # We've been beset by a series of false-alerts in the recent + # past which have all occurred at this point: + # + # * We get bogus errors in resolving DNS from curb/libcurl. + # + # * These errors go away on retry. + # + # * But the retry isn't fast enough to outrace the + # supression-time of our alerts. + # + # For the moment we're going to _temporarily_ ignore these + # errors. + # + # * If a host has Connection-Refused, the wrong status-cde + # or similar failure it will be handled as normal. + # + # * If the host has genuinely lost DNS then we're going to + # raise an alert, but if it is this false-error then we + # will silently disable this test-run. + # + # + + # IP addresses we found for the host + ips = [] + + # Get the hostname we're connecting to. + u = URI.parse(test_url) + target = u.host + + # + # Resolve the target to an IP, unless it is already an address. + # + if (target =~ /^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$/) || + (target =~ /^([0-9a-f:]+)$/) + ips.push(target) + else + + # + # OK if it didn't look like an IP address then attempt to + # look it up, as both IPv4 and IPv6. + # + begin + timeout(30) do + + Resolv::DNS.open do |dns| + + ress = dns.getresources(target, Resolv::DNS::Resource::IN::A) + ress.map { |r| ips.push(r.address.to_s) } + + ress = dns.getresources(target, Resolv::DNS::Resource::IN::AAAA) + ress.map { |r| ips.push(r.address.to_s) } + end + end + rescue Timeout::Error => _e + # NOP + end + end + + # + # At this point we either have: + # + # "ips" containing entries - because the hostname resolved + # + # "ips" being empty because the DNS failure was genuine + # + return Custodian::TestResult::TEST_SKIPPED unless ips.empty? + + # + # Log the DNS error-message, as this is genuine. + # resolution_errors << "#{protocol_msg}: #{x.class}: #{x.message}\n #{x.backtrace.join("\n ")}." + rescue => x errors << "#{protocol_msg}: #{x.class}: #{x.message}\n #{x.backtrace.join("\n ")}." end @@ -361,6 +433,7 @@ module Custodian # uh-oh! Resolution failed on both protocols! if resolution_errors.length > 1 errors << "DNS Error when resolving host - #{resolution_errors.join(',')}" + end if !errors.empty? -- cgit v1.2.1 From e77603a491d3980c5d1fe8ae0af4c7414dde6eb4 Mon Sep 17 00:00:00 2001 From: Steve Kemp Date: Tue, 8 Aug 2017 08:49:42 +0300 Subject: Added changelog entry for this abomination. --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debian/changelog b/debian/changelog index 157a4c8..3827334 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +custodian (0.36) stable; urgency=high + + * If we receive a DNS-error from the curb-gem we ignore it, unless + failing to resolve the appropriate hostname ourselves. + + -- Steve Kemp Tue, 08 Aug 2017 08:55:08 +0200 + custodian (0.35) stable; urgency=low * Alert in more detail on DNS failures. -- cgit v1.2.1 From 3ddad643888312cfb32b27aee3a057106e1895e1 Mon Sep 17 00:00:00 2001 From: Steve Kemp Date: Tue, 8 Aug 2017 13:28:23 +0300 Subject: Updated to move ignore-dns-failure code into routine. That is then tested when resolve-errors are handled. --- lib/custodian/protocoltest/http.rb | 148 +++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/lib/custodian/protocoltest/http.rb b/lib/custodian/protocoltest/http.rb index 71388c8..88ac568 100644 --- a/lib/custodian/protocoltest/http.rb +++ b/lib/custodian/protocoltest/http.rb @@ -37,6 +37,78 @@ module Custodian # attr_reader :expected_status, :expected_content + # + # Should we ignore a (temporary) DNS error in this test? + # + # We've been beset by a series of false-alerts in the recent + # past which have all occurred at this point: + # + # * We get bogus errors in resolving DNS from curb/libcurl. + # + # * These errors go away on retry. + # + # * But the retry isn't fast enough to outrace the + # supression-time of our alerts. + # + # For the moment we're going to _temporarily_ ignore these errors. + # + # * If a host has Connection-Refused, the wrong status-cde + # or similar failure it will be handled as normal. + # + # * If the host has genuinely lost DNS then we're going to + # raise an alert, but if it is this false-error then we + # will silently disable this test-run. + # + def ignore_failure? + + # IP addresses we found for the host + ips = [] + + # Get the hostname we're connecting to. + u = URI.parse(@url) + target = u.host + + # + # Resolve the target to an IP, unless it is already an address. + # + if (target =~ /^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$/) || + (target =~ /^([0-9a-f:]+)$/) + ips.push(target) + else + + # + # OK if it didn't look like an IP address then attempt to + # look it up, as both IPv4 and IPv6. + # + begin + timeout(30) do + + Resolv::DNS.open do |dns| + + ress = dns.getresources(target, Resolv::DNS::Resource::IN::A) + ress.map { |r| ips.push(r.address.to_s) } + + ress = dns.getresources(target, Resolv::DNS::Resource::IN::AAAA) + ress.map { |r| ips.push(r.address.to_s) } + end + end + rescue Timeout::Error => _e + # NOP + end + end + + # + # At this point we either have: + # + # "ips" containing entries - because the hostname resolved + # + # "ips" being empty because the DNS failure was genuine + # + return ( ! ips.empty? ) + end + + + # # Constructor # @@ -337,78 +409,6 @@ module Custodian rescue Curl::Err::TooManyRedirectsError errors << "#{protocol_msg}: More than 10 redirections." rescue Curl::Err::HostResolutionError => x - - # - # We've been beset by a series of false-alerts in the recent - # past which have all occurred at this point: - # - # * We get bogus errors in resolving DNS from curb/libcurl. - # - # * These errors go away on retry. - # - # * But the retry isn't fast enough to outrace the - # supression-time of our alerts. - # - # For the moment we're going to _temporarily_ ignore these - # errors. - # - # * If a host has Connection-Refused, the wrong status-cde - # or similar failure it will be handled as normal. - # - # * If the host has genuinely lost DNS then we're going to - # raise an alert, but if it is this false-error then we - # will silently disable this test-run. - # - # - - # IP addresses we found for the host - ips = [] - - # Get the hostname we're connecting to. - u = URI.parse(test_url) - target = u.host - - # - # Resolve the target to an IP, unless it is already an address. - # - if (target =~ /^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$/) || - (target =~ /^([0-9a-f:]+)$/) - ips.push(target) - else - - # - # OK if it didn't look like an IP address then attempt to - # look it up, as both IPv4 and IPv6. - # - begin - timeout(30) do - - Resolv::DNS.open do |dns| - - ress = dns.getresources(target, Resolv::DNS::Resource::IN::A) - ress.map { |r| ips.push(r.address.to_s) } - - ress = dns.getresources(target, Resolv::DNS::Resource::IN::AAAA) - ress.map { |r| ips.push(r.address.to_s) } - end - end - rescue Timeout::Error => _e - # NOP - end - end - - # - # At this point we either have: - # - # "ips" containing entries - because the hostname resolved - # - # "ips" being empty because the DNS failure was genuine - # - return Custodian::TestResult::TEST_SKIPPED unless ips.empty? - - # - # Log the DNS error-message, as this is genuine. - # resolution_errors << "#{protocol_msg}: #{x.class}: #{x.message}\n #{x.backtrace.join("\n ")}." rescue => x @@ -432,8 +432,10 @@ module Custodian # uh-oh! Resolution failed on both protocols! if resolution_errors.length > 1 - errors << "DNS Error when resolving host - #{resolution_errors.join(',')}" + return Custodian::TestResult::TEST_SKIPPED if ignore_failure? + + errors << "DNS Error when resolving host - #{resolution_errors.join(',')}" end if !errors.empty? -- cgit v1.2.1 From 70b13e92297905631629a05e827d70e98c3f422d Mon Sep 17 00:00:00 2001 From: Steve Kemp Date: Tue, 8 Aug 2017 15:37:36 +0300 Subject: Sanity-check DNS on a per-protocol basis. When a failure occurs in looking up IPv4 addresses we confirm that, similarly when/if IPv6 lookups fail we confirm that before raising the alert. --- lib/custodian/protocoltest/http.rb | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/custodian/protocoltest/http.rb b/lib/custodian/protocoltest/http.rb index 88ac568..32073f8 100644 --- a/lib/custodian/protocoltest/http.rb +++ b/lib/custodian/protocoltest/http.rb @@ -59,7 +59,7 @@ module Custodian # raise an alert, but if it is this false-error then we # will silently disable this test-run. # - def ignore_failure? + def ignore_failure?( protocol ) # IP addresses we found for the host ips = [] @@ -85,11 +85,16 @@ module Custodian Resolv::DNS.open do |dns| - ress = dns.getresources(target, Resolv::DNS::Resource::IN::A) - ress.map { |r| ips.push(r.address.to_s) } + if ( protocol == :ipv4 ) + ress = dns.getresources(target, Resolv::DNS::Resource::IN::A) + ress.map { |r| ips.push(r.address.to_s) } + elsif ( protocol == :ipv6 ) - ress = dns.getresources(target, Resolv::DNS::Resource::IN::AAAA) - ress.map { |r| ips.push(r.address.to_s) } + ress = dns.getresources(target, Resolv::DNS::Resource::IN::AAAA) + ress.map { |r| ips.push(r.address.to_s) } + else + raise ArgumentError, "Sanity-checking DNS-failure of unknown type" + end end end rescue Timeout::Error => _e @@ -409,7 +414,7 @@ module Custodian rescue Curl::Err::TooManyRedirectsError errors << "#{protocol_msg}: More than 10 redirections." rescue Curl::Err::HostResolutionError => x - resolution_errors << "#{protocol_msg}: #{x.class}: #{x.message}\n #{x.backtrace.join("\n ")}." + resolution_errors << "#{protocol_msg}: #{x.class}: #{x.message}\n #{x.backtrace.join("\n ")}." unless ignore_failure?( resolve_mode) rescue => x errors << "#{protocol_msg}: #{x.class}: #{x.message}\n #{x.backtrace.join("\n ")}." @@ -432,9 +437,6 @@ module Custodian # uh-oh! Resolution failed on both protocols! if resolution_errors.length > 1 - - return Custodian::TestResult::TEST_SKIPPED if ignore_failure? - errors << "DNS Error when resolving host - #{resolution_errors.join(',')}" end -- cgit v1.2.1 From 3be19cc5ba17e084c10c8f5d0eb114b01e53f733 Mon Sep 17 00:00:00 2001 From: Steve Kemp Date: Tue, 8 Aug 2017 15:59:53 +0300 Subject: Use a case-statement for both kinds of IP-matching. --- lib/custodian/protocoltest/http.rb | 64 +++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/lib/custodian/protocoltest/http.rb b/lib/custodian/protocoltest/http.rb index 32073f8..9ff734c 100644 --- a/lib/custodian/protocoltest/http.rb +++ b/lib/custodian/protocoltest/http.rb @@ -61,44 +61,52 @@ module Custodian # def ignore_failure?( protocol ) - # IP addresses we found for the host - ips = [] - # Get the hostname we're connecting to. u = URI.parse(@url) target = u.host - # - # Resolve the target to an IP, unless it is already an address. - # - if (target =~ /^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$/) || - (target =~ /^([0-9a-f:]+)$/) - ips.push(target) + # IPs for the target + ips = [] + + case protocol + when :ipv4 + if (target =~ /^([0-9]+)\.([0-9]+)\.([0-9]+)\.([0-9]+)$/) + ips << target + end + when :ipv6 + if (target =~ /^([0-9a-f:]+)$/) + ips << target + end else + raise ArgumentError, "Sanity-checking DNS-failure of unknown type: #{protocol}" + end - # - # OK if it didn't look like an IP address then attempt to - # look it up, as both IPv4 and IPv6. - # - begin - timeout(30) do + # Early termination? + return true unless ips.empty? + # + # OK if it didn't look like an IP address then attempt to + # look it up, as both IPv4 and IPv6. + # + begin + timeout(30) do + + type = case protocol + when :ipv4 + Resolv::DNS::Resource::IN::A + when :ipv6 + Resolv::DNS::Resource::IN::AAAA + else + raise ArgumentError, "Sanity-checking DNS-failure of unknown type: #{protocol}" + end + + begin Resolv::DNS.open do |dns| - - if ( protocol == :ipv4 ) - ress = dns.getresources(target, Resolv::DNS::Resource::IN::A) - ress.map { |r| ips.push(r.address.to_s) } - elsif ( protocol == :ipv6 ) - - ress = dns.getresources(target, Resolv::DNS::Resource::IN::AAAA) - ress.map { |r| ips.push(r.address.to_s) } - else - raise ArgumentError, "Sanity-checking DNS-failure of unknown type" - end + ips = dns.getresources(target, type) end + rescue Timeout::Error => _e + # NOP end - rescue Timeout::Error => _e - # NOP end end -- cgit v1.2.1 From 45f772cfe7626ec7c250d89c7291c23decf13558 Mon Sep 17 00:00:00 2001 From: Steve Kemp Date: Tue, 8 Aug 2017 16:38:10 +0300 Subject: Moved case statement outside timeout block. Also removed a redudant `begin`. --- lib/custodian/protocoltest/http.rb | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/lib/custodian/protocoltest/http.rb b/lib/custodian/protocoltest/http.rb index 9ff734c..0bf1a68 100644 --- a/lib/custodian/protocoltest/http.rb +++ b/lib/custodian/protocoltest/http.rb @@ -89,27 +89,26 @@ module Custodian # look it up, as both IPv4 and IPv6. # begin - timeout(30) do - type = case protocol - when :ipv4 - Resolv::DNS::Resource::IN::A - when :ipv6 - Resolv::DNS::Resource::IN::AAAA - else - raise ArgumentError, "Sanity-checking DNS-failure of unknown type: #{protocol}" - end - - begin - Resolv::DNS.open do |dns| - ips = dns.getresources(target, type) - end - rescue Timeout::Error => _e - # NOP + type = case protocol + when :ipv4 + Resolv::DNS::Resource::IN::A + when :ipv6 + Resolv::DNS::Resource::IN::AAAA + else + raise ArgumentError, "Sanity-checking DNS-failure of unknown type: #{protocol}" + end + + timeout(30) do + Resolv::DNS.open do |dns| + ips = dns.getresources(target, type) end end + rescue Timeout::Error => _e + # NOP end + # # At this point we either have: # -- cgit v1.2.1