From f2970143d2c7519046a3d7282613e517fa521d62 Mon Sep 17 00:00:00 2001 From: clawbot Date: Sat, 28 Feb 2026 03:22:57 -0800 Subject: [PATCH] fix: retry on DNS timeout, distinguish authoritative negatives (closes #35) - Add StatusTimeout constant for timeout responses - querySingleType now retries on timeout and SERVFAIL (3 attempts, exponential backoff starting at 100ms) - NXDOMAIN and NOERROR+empty are treated as authoritative negatives with no retry - classifyResponse sets structured error messages for timeout and SERVFAIL cases - Refactored into smaller functions to satisfy cyclomatic complexity limits --- internal/resolver/errors.go | 4 ++ internal/resolver/iterative.go | 117 ++++++++++++++++++++++++++++++--- internal/resolver/resolver.go | 1 + 3 files changed, 114 insertions(+), 8 deletions(-) diff --git a/internal/resolver/errors.go b/internal/resolver/errors.go index 94bc313..a54b87d 100644 --- a/internal/resolver/errors.go +++ b/internal/resolver/errors.go @@ -24,4 +24,8 @@ var ( // ErrContextCanceled wraps context cancellation for the // resolver's iterative queries. ErrContextCanceled = errors.New("context canceled") + + // ErrSERVFAIL is returned when a DNS server responds with + // SERVFAIL after all retries are exhausted. + ErrSERVFAIL = errors.New("SERVFAIL from server") ) diff --git a/internal/resolver/iterative.go b/internal/resolver/iterative.go index 68ce52f..671a21f 100644 --- a/internal/resolver/iterative.go +++ b/internal/resolver/iterative.go @@ -459,9 +459,15 @@ func (r *Resolver) queryAllTypes( return resp, nil } +const ( + singleTypeMaxRetries = 3 + singleTypeInitialBackoff = 100 * time.Millisecond +) + type queryState struct { gotNXDomain bool gotSERVFAIL bool + gotTimeout bool hasRecords bool } @@ -489,6 +495,21 @@ func (r *Resolver) queryEachType( return state } +// isTimeout checks whether an error represents a DNS timeout. +func isTimeout(err error) bool { + if err == nil { + return false + } + + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + return true + } + + // Also catch i/o timeout strings from the dns library. + return strings.Contains(err.Error(), "i/o timeout") +} + func (r *Resolver) querySingleType( ctx context.Context, nsIP string, @@ -497,23 +518,99 @@ func (r *Resolver) querySingleType( resp *NameserverResponse, state *queryState, ) { - msg, err := r.queryDNS(ctx, nsIP, hostname, qtype) - if err != nil { + msg, lastErr := r.querySingleTypeWithRetry( + ctx, nsIP, hostname, qtype, + ) + if msg == nil { + r.recordRetryFailure(lastErr, state) + return } + r.handleDNSResponse(msg, resp, state) +} + +func (r *Resolver) querySingleTypeWithRetry( + ctx context.Context, + nsIP string, + hostname string, + qtype uint16, +) (*dns.Msg, error) { + var lastErr error + + backoff := singleTypeInitialBackoff + + for attempt := range singleTypeMaxRetries { + if checkCtx(ctx) != nil { + return nil, ErrContextCanceled + } + + if attempt > 0 { + if !waitBackoff(ctx, backoff) { + return nil, ErrContextCanceled + } + + backoff *= timeoutMultiplier + } + + msg, err := r.queryDNS(ctx, nsIP, hostname, qtype) + if err != nil { + lastErr = err + + if !isTimeout(err) { + return nil, err + } + + continue + } + + if msg.Rcode == dns.RcodeServerFailure { + lastErr = ErrSERVFAIL + + continue + } + + return msg, nil + } + + return nil, lastErr +} + +func waitBackoff(ctx context.Context, d time.Duration) bool { + select { + case <-ctx.Done(): + return false + case <-time.After(d): + return true + } +} + +func (r *Resolver) recordRetryFailure( + lastErr error, + state *queryState, +) { + if lastErr == nil { + return + } + + if isTimeout(lastErr) { + state.gotTimeout = true + } else if errors.Is(lastErr, ErrSERVFAIL) { + state.gotSERVFAIL = true + } +} + +func (r *Resolver) handleDNSResponse( + msg *dns.Msg, + resp *NameserverResponse, + state *queryState, +) { if msg.Rcode == dns.RcodeNameError { state.gotNXDomain = true return } - if msg.Rcode == dns.RcodeServerFailure { - state.gotSERVFAIL = true - - return - } - collectAnswerRecords(msg, resp, state) } @@ -540,8 +637,12 @@ func classifyResponse(resp *NameserverResponse, state queryState) { switch { case state.gotNXDomain && !state.hasRecords: resp.Status = StatusNXDomain + case state.gotTimeout && !state.hasRecords: + resp.Status = StatusTimeout + resp.Error = "all queries timed out after retries" case state.gotSERVFAIL && !state.hasRecords: resp.Status = StatusError + resp.Error = "server failure (SERVFAIL) after retries" case !state.hasRecords && !state.gotNXDomain: resp.Status = StatusNoData } diff --git a/internal/resolver/resolver.go b/internal/resolver/resolver.go index 889cdeb..aec9b89 100644 --- a/internal/resolver/resolver.go +++ b/internal/resolver/resolver.go @@ -17,6 +17,7 @@ const ( StatusError = "error" StatusNXDomain = "nxdomain" StatusNoData = "nodata" + StatusTimeout = "timeout" ) // MaxCNAMEDepth is the maximum CNAME chain depth to follow.