fix: retry on DNS timeout, distinguish authoritative negatives (closes #35)
Some checks failed
Check / check (pull_request) Failing after 6m4s

- Add StatusTimeout constant for timeout responses
- querySingleType now retries on timeout and SERVFAIL (3 attempts,
  exponential backoff starting at 100ms)
- NXDOMAIN and NOERROR+empty are treated as authoritative negatives
  with no retry
- classifyResponse sets structured error messages for timeout and
  SERVFAIL cases
- Refactored into smaller functions to satisfy cyclomatic complexity
  limits
This commit is contained in:
2026-02-28 03:22:57 -08:00
parent 0eb57fc15b
commit f2970143d2
3 changed files with 114 additions and 8 deletions

View File

@@ -24,4 +24,8 @@ var (
// ErrContextCanceled wraps context cancellation for the // ErrContextCanceled wraps context cancellation for the
// resolver's iterative queries. // resolver's iterative queries.
ErrContextCanceled = errors.New("context canceled") ErrContextCanceled = errors.New("context canceled")
// ErrSERVFAIL is returned when a DNS server responds with
// SERVFAIL after all retries are exhausted.
ErrSERVFAIL = errors.New("SERVFAIL from server")
) )

View File

@@ -459,9 +459,15 @@ func (r *Resolver) queryAllTypes(
return resp, nil return resp, nil
} }
const (
singleTypeMaxRetries = 3
singleTypeInitialBackoff = 100 * time.Millisecond
)
type queryState struct { type queryState struct {
gotNXDomain bool gotNXDomain bool
gotSERVFAIL bool gotSERVFAIL bool
gotTimeout bool
hasRecords bool hasRecords bool
} }
@@ -489,6 +495,21 @@ func (r *Resolver) queryEachType(
return state return state
} }
// isTimeout checks whether an error represents a DNS timeout.
func isTimeout(err error) bool {
if err == nil {
return false
}
var netErr net.Error
if errors.As(err, &netErr) && netErr.Timeout() {
return true
}
// Also catch i/o timeout strings from the dns library.
return strings.Contains(err.Error(), "i/o timeout")
}
func (r *Resolver) querySingleType( func (r *Resolver) querySingleType(
ctx context.Context, ctx context.Context,
nsIP string, nsIP string,
@@ -497,19 +518,95 @@ func (r *Resolver) querySingleType(
resp *NameserverResponse, resp *NameserverResponse,
state *queryState, state *queryState,
) { ) {
msg, err := r.queryDNS(ctx, nsIP, hostname, qtype) msg, lastErr := r.querySingleTypeWithRetry(
if err != nil { ctx, nsIP, hostname, qtype,
)
if msg == nil {
r.recordRetryFailure(lastErr, state)
return return
} }
if msg.Rcode == dns.RcodeNameError { r.handleDNSResponse(msg, resp, state)
state.gotNXDomain = true }
return func (r *Resolver) querySingleTypeWithRetry(
ctx context.Context,
nsIP string,
hostname string,
qtype uint16,
) (*dns.Msg, error) {
var lastErr error
backoff := singleTypeInitialBackoff
for attempt := range singleTypeMaxRetries {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
if attempt > 0 {
if !waitBackoff(ctx, backoff) {
return nil, ErrContextCanceled
}
backoff *= timeoutMultiplier
}
msg, err := r.queryDNS(ctx, nsIP, hostname, qtype)
if err != nil {
lastErr = err
if !isTimeout(err) {
return nil, err
}
continue
} }
if msg.Rcode == dns.RcodeServerFailure { if msg.Rcode == dns.RcodeServerFailure {
lastErr = ErrSERVFAIL
continue
}
return msg, nil
}
return nil, lastErr
}
func waitBackoff(ctx context.Context, d time.Duration) bool {
select {
case <-ctx.Done():
return false
case <-time.After(d):
return true
}
}
func (r *Resolver) recordRetryFailure(
lastErr error,
state *queryState,
) {
if lastErr == nil {
return
}
if isTimeout(lastErr) {
state.gotTimeout = true
} else if errors.Is(lastErr, ErrSERVFAIL) {
state.gotSERVFAIL = true state.gotSERVFAIL = true
}
}
func (r *Resolver) handleDNSResponse(
msg *dns.Msg,
resp *NameserverResponse,
state *queryState,
) {
if msg.Rcode == dns.RcodeNameError {
state.gotNXDomain = true
return return
} }
@@ -540,8 +637,12 @@ func classifyResponse(resp *NameserverResponse, state queryState) {
switch { switch {
case state.gotNXDomain && !state.hasRecords: case state.gotNXDomain && !state.hasRecords:
resp.Status = StatusNXDomain resp.Status = StatusNXDomain
case state.gotTimeout && !state.hasRecords:
resp.Status = StatusTimeout
resp.Error = "all queries timed out after retries"
case state.gotSERVFAIL && !state.hasRecords: case state.gotSERVFAIL && !state.hasRecords:
resp.Status = StatusError resp.Status = StatusError
resp.Error = "server failure (SERVFAIL) after retries"
case !state.hasRecords && !state.gotNXDomain: case !state.hasRecords && !state.gotNXDomain:
resp.Status = StatusNoData resp.Status = StatusNoData
} }

View File

@@ -17,6 +17,7 @@ const (
StatusError = "error" StatusError = "error"
StatusNXDomain = "nxdomain" StatusNXDomain = "nxdomain"
StatusNoData = "nodata" StatusNoData = "nodata"
StatusTimeout = "timeout"
) )
// MaxCNAMEDepth is the maximum CNAME chain depth to follow. // MaxCNAMEDepth is the maximum CNAME chain depth to follow.