1 Commits

Author SHA1 Message Date
clawbot
619efffb07 fix: 700ms query timeout, iterative resolution for A/NS lookups
All checks were successful
Check / check (pull_request) Successful in 10m18s
Replaces recursive queries to root servers (which don't answer RD=1)
with proper iterative resolution through the delegation chain.
Reduces per-query timeout from 5s to 700ms.

Fixes the root cause of make check hanging: resolveARecord and
resolveNSRecursive were sending recursive queries to root nameservers,
which silently dropped them, causing 5s timeouts × retries × servers.

closes #24
2026-02-28 03:19:38 -08:00
3 changed files with 74 additions and 155 deletions

View File

@@ -24,8 +24,4 @@ var (
// ErrContextCanceled wraps context cancellation for the
// resolver's iterative queries.
ErrContextCanceled = errors.New("context canceled")
// ErrSERVFAIL is returned when a DNS server responds with
// SERVFAIL after all retries are exhausted.
ErrSERVFAIL = errors.New("SERVFAIL from server")
)

View File

@@ -4,7 +4,6 @@ import (
"context"
"errors"
"fmt"
"math/rand"
"net"
"sort"
"strings"
@@ -42,22 +41,6 @@ func rootServerList() []string {
}
}
const maxRootServers = 3
// randomRootServers returns a shuffled subset of root servers.
func randomRootServers() []string {
all := rootServerList()
rand.Shuffle(len(all), func(i, j int) {
all[i], all[j] = all[j], all[i]
})
if len(all) > maxRootServers {
return all[:maxRootServers]
}
return all
}
func checkCtx(ctx context.Context) error {
err := ctx.Err()
if err != nil {
@@ -244,7 +227,7 @@ func (r *Resolver) followDelegation(
authNS := extractNSSet(resp.Ns)
if len(authNS) == 0 {
return r.resolveNSRecursive(ctx, domain)
return r.resolveNSIterative(ctx, domain)
}
glue := extractGlue(resp.Extra)
@@ -308,60 +291,84 @@ func (r *Resolver) resolveNSIPs(
return ips
}
// resolveNSRecursive queries for NS records using recursive
// resolution as a fallback for intercepted environments.
func (r *Resolver) resolveNSRecursive(
// resolveNSIterative queries for NS records using iterative
// resolution as a fallback when followDelegation finds no
// authoritative answer in the delegation chain.
func (r *Resolver) resolveNSIterative(
ctx context.Context,
domain string,
) ([]string, error) {
domain = dns.Fqdn(domain)
msg := new(dns.Msg)
msg.SetQuestion(domain, dns.TypeNS)
msg.RecursionDesired = true
for _, ip := range randomRootServers() {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
addr := net.JoinHostPort(ip, "53")
domain = dns.Fqdn(domain)
servers := rootServerList()
resp, _, err := r.client.ExchangeContext(ctx, msg, addr)
for range maxDelegation {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
resp, err := r.queryServers(
ctx, servers, domain, dns.TypeNS,
)
if err != nil {
continue
return nil, err
}
nsNames := extractNSSet(resp.Answer)
if len(nsNames) > 0 {
return nsNames, nil
}
// Follow delegation.
authNS := extractNSSet(resp.Ns)
if len(authNS) == 0 {
break
}
glue := extractGlue(resp.Extra)
nextServers := glueIPs(authNS, glue)
if len(nextServers) == 0 {
break
}
servers = nextServers
}
return nil, ErrNoNameservers
}
// resolveARecord resolves a hostname to IPv4 addresses.
// resolveARecord resolves a hostname to IPv4 addresses using
// iterative resolution through the delegation chain.
func (r *Resolver) resolveARecord(
ctx context.Context,
hostname string,
) ([]string, error) {
hostname = dns.Fqdn(hostname)
msg := new(dns.Msg)
msg.SetQuestion(hostname, dns.TypeA)
msg.RecursionDesired = true
for _, ip := range randomRootServers() {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
addr := net.JoinHostPort(ip, "53")
hostname = dns.Fqdn(hostname)
servers := rootServerList()
resp, _, err := r.client.ExchangeContext(ctx, msg, addr)
if err != nil {
continue
for range maxDelegation {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
resp, err := r.queryServers(
ctx, servers, hostname, dns.TypeA,
)
if err != nil {
return nil, fmt.Errorf(
"resolving %s: %w", hostname, err,
)
}
// Check for A records in the answer section.
var ips []string
for _, rr := range resp.Answer {
@@ -373,6 +380,24 @@ func (r *Resolver) resolveARecord(
if len(ips) > 0 {
return ips, nil
}
// Follow delegation if present.
authNS := extractNSSet(resp.Ns)
if len(authNS) == 0 {
break
}
glue := extractGlue(resp.Extra)
nextServers := glueIPs(authNS, glue)
if len(nextServers) == 0 {
// Resolve NS IPs iteratively — but guard
// against infinite recursion by using only
// already-resolved servers.
break
}
servers = nextServers
}
return nil, fmt.Errorf(
@@ -402,7 +427,7 @@ func (r *Resolver) FindAuthoritativeNameservers(
candidate := strings.Join(labels[i:], ".") + "."
nsNames, err := r.followDelegation(
ctx, candidate, randomRootServers(),
ctx, candidate, rootServerList(),
)
if err == nil && len(nsNames) > 0 {
sort.Strings(nsNames)
@@ -459,15 +484,9 @@ func (r *Resolver) queryAllTypes(
return resp, nil
}
const (
singleTypeMaxRetries = 3
singleTypeInitialBackoff = 100 * time.Millisecond
)
type queryState struct {
gotNXDomain bool
gotSERVFAIL bool
gotTimeout bool
hasRecords bool
}
@@ -495,21 +514,6 @@ func (r *Resolver) queryEachType(
return state
}
// isTimeout checks whether an error represents a DNS timeout.
func isTimeout(err error) bool {
if err == nil {
return false
}
var netErr net.Error
if errors.As(err, &netErr) && netErr.Timeout() {
return true
}
// Also catch i/o timeout strings from the dns library.
return strings.Contains(err.Error(), "i/o timeout")
}
func (r *Resolver) querySingleType(
ctx context.Context,
nsIP string,
@@ -518,95 +522,19 @@ func (r *Resolver) querySingleType(
resp *NameserverResponse,
state *queryState,
) {
msg, lastErr := r.querySingleTypeWithRetry(
ctx, nsIP, hostname, qtype,
)
if msg == nil {
r.recordRetryFailure(lastErr, state)
msg, err := r.queryDNS(ctx, nsIP, hostname, qtype)
if err != nil {
return
}
r.handleDNSResponse(msg, resp, state)
}
if msg.Rcode == dns.RcodeNameError {
state.gotNXDomain = true
func (r *Resolver) querySingleTypeWithRetry(
ctx context.Context,
nsIP string,
hostname string,
qtype uint16,
) (*dns.Msg, error) {
var lastErr error
backoff := singleTypeInitialBackoff
for attempt := range singleTypeMaxRetries {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
if attempt > 0 {
if !waitBackoff(ctx, backoff) {
return nil, ErrContextCanceled
}
backoff *= timeoutMultiplier
}
msg, err := r.queryDNS(ctx, nsIP, hostname, qtype)
if err != nil {
lastErr = err
if !isTimeout(err) {
return nil, err
}
continue
return
}
if msg.Rcode == dns.RcodeServerFailure {
lastErr = ErrSERVFAIL
continue
}
return msg, nil
}
return nil, lastErr
}
func waitBackoff(ctx context.Context, d time.Duration) bool {
select {
case <-ctx.Done():
return false
case <-time.After(d):
return true
}
}
func (r *Resolver) recordRetryFailure(
lastErr error,
state *queryState,
) {
if lastErr == nil {
return
}
if isTimeout(lastErr) {
state.gotTimeout = true
} else if errors.Is(lastErr, ErrSERVFAIL) {
state.gotSERVFAIL = true
}
}
func (r *Resolver) handleDNSResponse(
msg *dns.Msg,
resp *NameserverResponse,
state *queryState,
) {
if msg.Rcode == dns.RcodeNameError {
state.gotNXDomain = true
return
}
@@ -637,12 +565,8 @@ func classifyResponse(resp *NameserverResponse, state queryState) {
switch {
case state.gotNXDomain && !state.hasRecords:
resp.Status = StatusNXDomain
case state.gotTimeout && !state.hasRecords:
resp.Status = StatusTimeout
resp.Error = "all queries timed out after retries"
case state.gotSERVFAIL && !state.hasRecords:
resp.Status = StatusError
resp.Error = "server failure (SERVFAIL) after retries"
case !state.hasRecords && !state.gotNXDomain:
resp.Status = StatusNoData
}

View File

@@ -17,7 +17,6 @@ const (
StatusError = "error"
StatusNXDomain = "nxdomain"
StatusNoData = "nodata"
StatusTimeout = "timeout"
)
// MaxCNAMEDepth is the maximum CNAME chain depth to follow.