Compare commits
12 Commits
fix/dns-ti
...
2e1a4b2dbd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2e1a4b2dbd | ||
| 0eb57fc15b | |||
| 5739108dc7 | |||
| 54272c2be5 | |||
| b18d29d586 | |||
| e63241cc3c | |||
| 5ab217bfd2 | |||
| 518a2cc42e | |||
|
|
4cb81aac24 | ||
|
|
203b581704 | ||
|
|
82fd68a41b | ||
|
|
f8d0dc4166 |
34
TESTING.md
Normal file
34
TESTING.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# Testing Policy
|
||||
|
||||
## DNS Resolution Tests
|
||||
|
||||
All resolver tests **MUST** use live queries against real DNS servers.
|
||||
No mocking of the DNS client layer is permitted.
|
||||
|
||||
### Rationale
|
||||
|
||||
The resolver performs iterative resolution from root nameservers through
|
||||
the full delegation chain. Mocked responses cannot faithfully represent
|
||||
the variety of real-world DNS behavior (truncation, referrals, glue
|
||||
records, DNSSEC, varied response times, EDNS, etc.). Testing against
|
||||
real servers ensures the resolver works correctly in production.
|
||||
|
||||
### Constraints
|
||||
|
||||
- Tests hit real DNS infrastructure and require network access
|
||||
- Test duration depends on network conditions; timeout tuning keeps
|
||||
the suite within the 30-second target
|
||||
- Query timeout is calibrated to 3× the maximum antipodal RTT (~300ms per round trip)
|
||||
plus processing margin
|
||||
- Root server fan-out is limited to reduce parallel query load
|
||||
- Occasional failures from transient network issues are tolerated, but each
|
||||
should be investigated as a potential resolver bug, not papered over
|
||||
with mocks or skip flags
|
||||
|
||||
### What NOT to do
|
||||
|
||||
- **Do not mock `DNSClient`** for resolver tests (the mock constructor
|
||||
exists for unit-testing other packages that consume the resolver)
|
||||
- **Do not add `-short` flags** to skip slow tests
|
||||
- **Do not increase `-timeout`** to hide hanging queries
|
||||
- **Do not modify linter configuration** to suppress findings
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand/v2"
|
||||
"net"
|
||||
"sort"
|
||||
"strings"
|
||||
@@ -14,13 +13,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
// queryTimeoutDuration is the per-exchange DNS timeout.
|
||||
//
|
||||
// Rationale: maximum RTT to antipodal root/TLD servers is
|
||||
// ~300ms. We use 3× max RTT + 10ms processing ≈ 910ms,
|
||||
// rounded to 1s. Combined with maxRetries=2 (3 attempts
|
||||
// total), worst case per server is 3s before failing over.
|
||||
queryTimeoutDuration = 1 * time.Second
|
||||
queryTimeoutDuration = 700 * time.Millisecond
|
||||
maxRetries = 2
|
||||
maxDelegation = 20
|
||||
timeoutMultiplier = 2
|
||||
@@ -30,7 +23,7 @@ const (
|
||||
// ErrRefused is returned when a DNS server refuses a query.
|
||||
var ErrRefused = errors.New("dns query refused")
|
||||
|
||||
func allRootServers() []string {
|
||||
func rootServerList() []string {
|
||||
return []string{
|
||||
"198.41.0.4", // a.root-servers.net
|
||||
"170.247.170.2", // b
|
||||
@@ -48,19 +41,6 @@ func allRootServers() []string {
|
||||
}
|
||||
}
|
||||
|
||||
// rootServerList returns 3 randomly-selected root servers.
|
||||
// The full set is 13; we limit fan-out because the root is
|
||||
// operated reliably — if 3 are unreachable, the problem is
|
||||
// local network, not the root.
|
||||
func rootServerList() []string {
|
||||
shuffled := allRootServers()
|
||||
rand.Shuffle(len(shuffled), func(i, j int) {
|
||||
shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
|
||||
})
|
||||
|
||||
return shuffled[:3]
|
||||
}
|
||||
|
||||
func checkCtx(ctx context.Context) error {
|
||||
err := ctx.Err()
|
||||
if err != nil {
|
||||
@@ -247,7 +227,7 @@ func (r *Resolver) followDelegation(
|
||||
|
||||
authNS := extractNSSet(resp.Ns)
|
||||
if len(authNS) == 0 {
|
||||
return r.resolveNSRecursive(ctx, domain)
|
||||
return r.resolveNSIterative(ctx, domain)
|
||||
}
|
||||
|
||||
glue := extractGlue(resp.Extra)
|
||||
@@ -311,60 +291,84 @@ func (r *Resolver) resolveNSIPs(
|
||||
return ips
|
||||
}
|
||||
|
||||
// resolveNSRecursive queries for NS records using recursive
|
||||
// resolution as a fallback for intercepted environments.
|
||||
func (r *Resolver) resolveNSRecursive(
|
||||
// resolveNSIterative queries for NS records using iterative
|
||||
// resolution as a fallback when followDelegation finds no
|
||||
// authoritative answer in the delegation chain.
|
||||
func (r *Resolver) resolveNSIterative(
|
||||
ctx context.Context,
|
||||
domain string,
|
||||
) ([]string, error) {
|
||||
domain = dns.Fqdn(domain)
|
||||
msg := new(dns.Msg)
|
||||
msg.SetQuestion(domain, dns.TypeNS)
|
||||
msg.RecursionDesired = true
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
|
||||
for _, ip := range rootServerList() {
|
||||
domain = dns.Fqdn(domain)
|
||||
servers := rootServerList()
|
||||
|
||||
for range maxDelegation {
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
|
||||
addr := net.JoinHostPort(ip, "53")
|
||||
|
||||
resp, _, err := r.client.ExchangeContext(ctx, msg, addr)
|
||||
resp, err := r.queryServers(
|
||||
ctx, servers, domain, dns.TypeNS,
|
||||
)
|
||||
if err != nil {
|
||||
continue
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nsNames := extractNSSet(resp.Answer)
|
||||
if len(nsNames) > 0 {
|
||||
return nsNames, nil
|
||||
}
|
||||
|
||||
// Follow delegation.
|
||||
authNS := extractNSSet(resp.Ns)
|
||||
if len(authNS) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
glue := extractGlue(resp.Extra)
|
||||
nextServers := glueIPs(authNS, glue)
|
||||
|
||||
if len(nextServers) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
servers = nextServers
|
||||
}
|
||||
|
||||
return nil, ErrNoNameservers
|
||||
}
|
||||
|
||||
// resolveARecord resolves a hostname to IPv4 addresses.
|
||||
// resolveARecord resolves a hostname to IPv4 addresses using
|
||||
// iterative resolution through the delegation chain.
|
||||
func (r *Resolver) resolveARecord(
|
||||
ctx context.Context,
|
||||
hostname string,
|
||||
) ([]string, error) {
|
||||
hostname = dns.Fqdn(hostname)
|
||||
msg := new(dns.Msg)
|
||||
msg.SetQuestion(hostname, dns.TypeA)
|
||||
msg.RecursionDesired = true
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
|
||||
for _, ip := range rootServerList() {
|
||||
hostname = dns.Fqdn(hostname)
|
||||
servers := rootServerList()
|
||||
|
||||
for range maxDelegation {
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
|
||||
addr := net.JoinHostPort(ip, "53")
|
||||
|
||||
resp, _, err := r.client.ExchangeContext(ctx, msg, addr)
|
||||
resp, err := r.queryServers(
|
||||
ctx, servers, hostname, dns.TypeA,
|
||||
)
|
||||
if err != nil {
|
||||
continue
|
||||
return nil, fmt.Errorf(
|
||||
"resolving %s: %w", hostname, err,
|
||||
)
|
||||
}
|
||||
|
||||
// Check for A records in the answer section.
|
||||
var ips []string
|
||||
|
||||
for _, rr := range resp.Answer {
|
||||
@@ -376,6 +380,24 @@ func (r *Resolver) resolveARecord(
|
||||
if len(ips) > 0 {
|
||||
return ips, nil
|
||||
}
|
||||
|
||||
// Follow delegation if present.
|
||||
authNS := extractNSSet(resp.Ns)
|
||||
if len(authNS) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
glue := extractGlue(resp.Extra)
|
||||
nextServers := glueIPs(authNS, glue)
|
||||
|
||||
if len(nextServers) == 0 {
|
||||
// No glue IPs for the delegated nameservers.
|
||||
// Resolving their names here could recurse without
|
||||
// bound, so give up instead of looking them up.
|
||||
break
|
||||
}
|
||||
|
||||
servers = nextServers
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf(
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"log/slog"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"go.uber.org/fx"
|
||||
@@ -40,15 +41,17 @@ type Params struct {
|
||||
|
||||
// Watcher orchestrates all monitoring checks on a schedule.
|
||||
type Watcher struct {
|
||||
log *slog.Logger
|
||||
config *config.Config
|
||||
state *state.State
|
||||
resolver DNSResolver
|
||||
portCheck PortChecker
|
||||
tlsCheck TLSChecker
|
||||
notify Notifier
|
||||
cancel context.CancelFunc
|
||||
firstRun bool
|
||||
log *slog.Logger
|
||||
config *config.Config
|
||||
state *state.State
|
||||
resolver DNSResolver
|
||||
portCheck PortChecker
|
||||
tlsCheck TLSChecker
|
||||
notify Notifier
|
||||
cancel context.CancelFunc
|
||||
firstRun bool
|
||||
expiryNotifiedMu sync.Mutex
|
||||
expiryNotified map[string]time.Time
|
||||
}
|
||||
|
||||
// New creates a new Watcher instance wired into the fx lifecycle.
|
||||
@@ -57,14 +60,15 @@ func New(
|
||||
params Params,
|
||||
) (*Watcher, error) {
|
||||
w := &Watcher{
|
||||
log: params.Logger.Get(),
|
||||
config: params.Config,
|
||||
state: params.State,
|
||||
resolver: params.Resolver,
|
||||
portCheck: params.PortCheck,
|
||||
tlsCheck: params.TLSCheck,
|
||||
notify: params.Notify,
|
||||
firstRun: true,
|
||||
log: params.Logger.Get(),
|
||||
config: params.Config,
|
||||
state: params.State,
|
||||
resolver: params.Resolver,
|
||||
portCheck: params.PortCheck,
|
||||
tlsCheck: params.TLSCheck,
|
||||
notify: params.Notify,
|
||||
firstRun: true,
|
||||
expiryNotified: make(map[string]time.Time),
|
||||
}
|
||||
|
||||
lifecycle.Append(fx.Hook{
|
||||
@@ -100,14 +104,15 @@ func NewForTest(
|
||||
n Notifier,
|
||||
) *Watcher {
|
||||
return &Watcher{
|
||||
log: slog.Default(),
|
||||
config: cfg,
|
||||
state: st,
|
||||
resolver: res,
|
||||
portCheck: pc,
|
||||
tlsCheck: tc,
|
||||
notify: n,
|
||||
firstRun: true,
|
||||
log: slog.Default(),
|
||||
config: cfg,
|
||||
state: st,
|
||||
resolver: res,
|
||||
portCheck: pc,
|
||||
tlsCheck: tc,
|
||||
notify: n,
|
||||
firstRun: true,
|
||||
expiryNotified: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,6 +211,28 @@ func (w *Watcher) checkDomain(
|
||||
Nameservers: nameservers,
|
||||
LastChecked: now,
|
||||
})
|
||||
|
||||
// Also look up A/AAAA records for the apex domain so that
|
||||
// port and TLS checks (which read HostnameState) can find
|
||||
// the domain's IP addresses.
|
||||
records, err := w.resolver.LookupAllRecords(ctx, domain)
|
||||
if err != nil {
|
||||
w.log.Error(
|
||||
"failed to lookup records for domain",
|
||||
"domain", domain,
|
||||
"error", err,
|
||||
)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
prevHS, hasPrevHS := w.state.GetHostnameState(domain)
|
||||
if hasPrevHS && !w.firstRun {
|
||||
w.detectHostnameChanges(ctx, domain, prevHS, records)
|
||||
}
|
||||
|
||||
newState := buildHostnameState(records, now)
|
||||
w.state.SetHostnameState(domain, newState)
|
||||
}
|
||||
|
||||
func (w *Watcher) detectNSChanges(
|
||||
@@ -691,6 +718,22 @@ func (w *Watcher) checkTLSExpiry(
|
||||
return
|
||||
}
|
||||
|
||||
// Deduplicate expiry warnings: don't re-notify for the same
|
||||
// hostname/IP pair within the TLS check interval.
|
||||
dedupKey := fmt.Sprintf("expiry:%s:%s", hostname, ip)
|
||||
|
||||
w.expiryNotifiedMu.Lock()
|
||||
|
||||
lastNotified, seen := w.expiryNotified[dedupKey]
|
||||
if seen && time.Since(lastNotified) < w.config.TLSInterval {
|
||||
w.expiryNotifiedMu.Unlock()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
w.expiryNotified[dedupKey] = time.Now()
|
||||
w.expiryNotifiedMu.Unlock()
|
||||
|
||||
msg := fmt.Sprintf(
|
||||
"Host: %s\nIP: %s\nCN: %s\n"+
|
||||
"Expires: %s (%.0f days)",
|
||||
|
||||
@@ -273,6 +273,10 @@ func setupBaselineMocks(deps *testDeps) {
|
||||
"ns1.example.com.",
|
||||
"ns2.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"93.184.216.34"}},
|
||||
"ns2.example.com.": {"A": {"93.184.216.34"}},
|
||||
}
|
||||
deps.resolver.allRecords["www.example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"93.184.216.34"}},
|
||||
"ns2.example.com.": {"A": {"93.184.216.34"}},
|
||||
@@ -290,6 +294,14 @@ func setupBaselineMocks(deps *testDeps) {
|
||||
"www.example.com",
|
||||
},
|
||||
}
|
||||
deps.tlsChecker.certs["93.184.216.34:example.com"] = &tlscheck.CertificateInfo{
|
||||
CommonName: "example.com",
|
||||
Issuer: "DigiCert",
|
||||
NotAfter: time.Now().Add(90 * 24 * time.Hour),
|
||||
SubjectAlternativeNames: []string{
|
||||
"example.com",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func assertNoNotifications(
|
||||
@@ -322,14 +334,74 @@ func assertStatePopulated(
|
||||
)
|
||||
}
|
||||
|
||||
if len(snap.Hostnames) != 1 {
|
||||
// Hostnames includes both explicit hostnames and domains
|
||||
// (domains now also get hostname state for port/TLS checks).
|
||||
if len(snap.Hostnames) < 1 {
|
||||
t.Errorf(
|
||||
"expected 1 hostname in state, got %d",
|
||||
"expected at least 1 hostname in state, got %d",
|
||||
len(snap.Hostnames),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDomainPortAndTLSChecks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := defaultTestConfig(t)
|
||||
cfg.Domains = []string{"example.com"}
|
||||
|
||||
w, deps := newTestWatcher(t, cfg)
|
||||
|
||||
deps.resolver.nsRecords["example.com"] = []string{
|
||||
"ns1.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"93.184.216.34"}},
|
||||
}
|
||||
deps.portChecker.results["93.184.216.34:80"] = true
|
||||
deps.portChecker.results["93.184.216.34:443"] = true
|
||||
deps.tlsChecker.certs["93.184.216.34:example.com"] = &tlscheck.CertificateInfo{
|
||||
CommonName: "example.com",
|
||||
Issuer: "DigiCert",
|
||||
NotAfter: time.Now().Add(90 * 24 * time.Hour),
|
||||
SubjectAlternativeNames: []string{
|
||||
"example.com",
|
||||
},
|
||||
}
|
||||
|
||||
w.RunOnce(t.Context())
|
||||
|
||||
snap := deps.state.GetSnapshot()
|
||||
|
||||
// Domain should have port state populated
|
||||
if len(snap.Ports) == 0 {
|
||||
t.Error("expected port state for domain, got none")
|
||||
}
|
||||
|
||||
// Domain should have certificate state populated
|
||||
if len(snap.Certificates) == 0 {
|
||||
t.Error("expected certificate state for domain, got none")
|
||||
}
|
||||
|
||||
// Verify port checker was actually called
|
||||
deps.portChecker.mu.Lock()
|
||||
calls := deps.portChecker.calls
|
||||
deps.portChecker.mu.Unlock()
|
||||
|
||||
if calls == 0 {
|
||||
t.Error("expected port checker to be called for domain")
|
||||
}
|
||||
|
||||
// Verify TLS checker was actually called
|
||||
deps.tlsChecker.mu.Lock()
|
||||
tlsCalls := deps.tlsChecker.calls
|
||||
deps.tlsChecker.mu.Unlock()
|
||||
|
||||
if tlsCalls == 0 {
|
||||
t.Error("expected TLS checker to be called for domain")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNSChangeDetection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -342,6 +414,12 @@ func TestNSChangeDetection(t *testing.T) {
|
||||
"ns1.example.com.",
|
||||
"ns2.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
"ns2.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.portChecker.results["1.2.3.4:80"] = false
|
||||
deps.portChecker.results["1.2.3.4:443"] = false
|
||||
|
||||
ctx := t.Context()
|
||||
w.RunOnce(ctx)
|
||||
@@ -351,6 +429,10 @@ func TestNSChangeDetection(t *testing.T) {
|
||||
"ns1.example.com.",
|
||||
"ns3.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
"ns3.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.resolver.mu.Unlock()
|
||||
|
||||
w.RunOnce(ctx)
|
||||
@@ -506,6 +588,61 @@ func TestTLSExpiryWarning(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTLSExpiryWarningDedup(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := defaultTestConfig(t)
|
||||
cfg.Hostnames = []string{"www.example.com"}
|
||||
cfg.TLSInterval = 24 * time.Hour
|
||||
|
||||
w, deps := newTestWatcher(t, cfg)
|
||||
|
||||
deps.resolver.allRecords["www.example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.resolver.ipAddresses["www.example.com"] = []string{
|
||||
"1.2.3.4",
|
||||
}
|
||||
deps.portChecker.results["1.2.3.4:80"] = true
|
||||
deps.portChecker.results["1.2.3.4:443"] = true
|
||||
deps.tlsChecker.certs["1.2.3.4:www.example.com"] = &tlscheck.CertificateInfo{
|
||||
CommonName: "www.example.com",
|
||||
Issuer: "DigiCert",
|
||||
NotAfter: time.Now().Add(3 * 24 * time.Hour),
|
||||
SubjectAlternativeNames: []string{
|
||||
"www.example.com",
|
||||
},
|
||||
}
|
||||
|
||||
ctx := t.Context()
|
||||
|
||||
// First run = baseline, no notifications
|
||||
w.RunOnce(ctx)
|
||||
|
||||
// Second run should fire one expiry warning
|
||||
w.RunOnce(ctx)
|
||||
|
||||
// Third run should NOT fire another warning (dedup)
|
||||
w.RunOnce(ctx)
|
||||
|
||||
notifications := deps.notifier.getNotifications()
|
||||
|
||||
expiryCount := 0
|
||||
|
||||
for _, n := range notifications {
|
||||
if n.Title == "TLS Expiry Warning: www.example.com" {
|
||||
expiryCount++
|
||||
}
|
||||
}
|
||||
|
||||
if expiryCount != 1 {
|
||||
t.Errorf(
|
||||
"expected exactly 1 expiry warning (dedup), got %d",
|
||||
expiryCount,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGracefulShutdown(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -519,6 +656,11 @@ func TestGracefulShutdown(t *testing.T) {
|
||||
deps.resolver.nsRecords["example.com"] = []string{
|
||||
"ns1.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.portChecker.results["1.2.3.4:80"] = false
|
||||
deps.portChecker.results["1.2.3.4:443"] = false
|
||||
|
||||
ctx, cancel := context.WithCancel(t.Context())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user