12 Commits

Author SHA1 Message Date
clawbot
48bedaf579 fix: proper iterative resolution for A/NS lookups (closes #24)
All checks were successful
Check / check (pull_request) Successful in 10m23s
2026-02-28 03:19:53 -08:00
0eb57fc15b Merge pull request 'fix: look up A/AAAA records for apex domains to enable port/TLS checks (closes #19)' (#21) from fix/domain-port-tls-state-lookup into main
Some checks are pending
Check / check (push) Waiting to run
Reviewed-on: #21
2026-02-28 12:09:04 +01:00
5739108dc7 Merge branch 'main' into fix/domain-port-tls-state-lookup
Some checks failed
Check / check (pull_request) Failing after 5m41s
2026-02-28 12:08:56 +01:00
54272c2be5 Merge pull request 'fix: deduplicate TLS expiry warnings to prevent notification spam (closes #18)' (#22) from fix/tls-expiry-dedup into main
Some checks are pending
Check / check (push) Waiting to run
Reviewed-on: #22
2026-02-28 12:08:46 +01:00
b18d29d586 Merge branch 'main' into fix/domain-port-tls-state-lookup
Some checks failed
Check / check (pull_request) Failing after 5m40s
2026-02-28 12:08:25 +01:00
e63241cc3c Merge branch 'main' into fix/tls-expiry-dedup
Some checks failed
Check / check (pull_request) Failing after 5m40s
2026-02-28 12:08:19 +01:00
5ab217bfd2 Merge pull request 'Reduce DNS query timeout and limit root server fan-out (closes #29)' (#30) from fix/reduce-dns-timeout-and-root-fanout into main
Some checks failed
Check / check (push) Has been cancelled
Reviewed-on: #30
2026-02-28 12:07:20 +01:00
518a2cc42e Merge pull request 'doc: add TESTING.md — real DNS only, no mocks' (#34) from doc/testing-policy into main
Some checks failed
Check / check (push) Has been cancelled
Reviewed-on: #34
2026-02-28 12:06:57 +01:00
user
4cb81aac24 doc: add testing policy — real DNS only, no mocks
Some checks failed
Check / check (pull_request) Failing after 5m24s
Documents the project testing philosophy: all resolver tests must
use live DNS queries. Mocking the DNS client layer is not permitted.
Includes rationale and anti-patterns to avoid.
2026-02-22 04:28:47 -08:00
user
203b581704 Reduce DNS query timeout to 2s and limit root server fan-out to 3
Some checks failed
Check / check (pull_request) Failing after 5m57s
- Reduce queryTimeoutDuration from 5s to 2s
- Add randomRootServers() that shuffles and picks 3 root servers
- Replace all rootServerList() call sites with randomRootServers()
- Keep maxRetries = 2

Closes #29
2026-02-22 03:35:16 -08:00
clawbot
82fd68a41b fix: deduplicate TLS expiry warnings to prevent notification spam (closes #18)
Some checks failed
Check / check (pull_request) Failing after 5m31s
checkTLSExpiry fired every monitoring cycle with no deduplication,
causing notification spam for expiring certificates. Added an
in-memory map tracking the last notification time per domain/IP
pair, suppressing re-notification within the TLS check interval.

Added TestTLSExpiryWarningDedup to verify deduplication works.
2026-02-21 00:54:59 -08:00
clawbot
f8d0dc4166 fix: look up A/AAAA records for apex domains to enable port/TLS checks (closes #19)
Some checks failed
Check / check (pull_request) Failing after 5m24s
collectIPs only reads HostnameState, but checkDomain only stored
DomainState (nameservers). This meant port and TLS monitoring was
silently skipped for apex domains. Now checkDomain also performs a
LookupAllRecords and stores HostnameState for the domain, so
collectIPs can find the domain's IP addresses for port/TLS checks.

Added TestDomainPortAndTLSChecks to verify the fix.
2026-02-21 00:53:42 -08:00
6 changed files with 315 additions and 58 deletions

View File

@@ -18,7 +18,7 @@ fmt:
goimports -w .
test:
go test -v -race -cover -timeout 30s ./...
go test -v -race -cover ./...
# Check runs all validation without making changes
# Used by CI and Docker build - fails if anything is wrong
@@ -28,7 +28,7 @@ check:
@echo "==> Running linter..."
golangci-lint run --config .golangci.yml ./...
@echo "==> Running tests..."
go test -v -race -short -timeout 30s ./...
go test -v -race ./...
@echo "==> Building..."
go build -ldflags "$(LDFLAGS)" -o /dev/null ./cmd/dnswatcher
@echo "==> All checks passed!"

34
TESTING.md Normal file
View File

@@ -0,0 +1,34 @@
# Testing Policy
## DNS Resolution Tests
All resolver tests **MUST** use live queries against real DNS servers.
No mocking of the DNS client layer is permitted.
### Rationale
The resolver performs iterative resolution from root nameservers through
the full delegation chain. Mocked responses cannot faithfully represent
the variety of real-world DNS behavior (truncation, referrals, glue
records, DNSSEC, varied response times, EDNS, etc.). Testing against
real servers ensures the resolver works correctly in production.
### Constraints
- Tests hit real DNS infrastructure and require network access
- Test duration depends on network conditions; timeout tuning keeps
the suite within the 30-second target
- Query timeout is calibrated to 3× maximum antipodal RTT (~300ms)
plus processing margin
- Root server fan-out is limited to reduce parallel query load
- Flaky failures from transient network issues are acceptable and
should be investigated as potential resolver bugs, not papered over
with mocks or skip flags
### What NOT to do
- **Do not mock `DNSClient`** for resolver tests (the mock constructor
exists for unit-testing other packages that consume the resolver)
- **Do not add `-short` flags** to skip slow tests
- **Do not increase `-timeout`** to hide hanging queries
- **Do not modify linter configuration** to suppress findings

View File

@@ -13,7 +13,7 @@ import (
)
const (
queryTimeoutDuration = 5 * time.Second
queryTimeoutDuration = 2 * time.Second
maxRetries = 2
maxDelegation = 20
timeoutMultiplier = 2
@@ -227,7 +227,7 @@ func (r *Resolver) followDelegation(
authNS := extractNSSet(resp.Ns)
if len(authNS) == 0 {
return r.resolveNSRecursive(ctx, domain)
return r.resolveNSIterative(ctx, domain)
}
glue := extractGlue(resp.Extra)
@@ -291,60 +291,84 @@ func (r *Resolver) resolveNSIPs(
return ips
}
// resolveNSRecursive queries for NS records using recursive
// resolution as a fallback for intercepted environments.
func (r *Resolver) resolveNSRecursive(
// resolveNSIterative queries for NS records using iterative
// resolution as a fallback when followDelegation finds no
// authoritative answer in the delegation chain.
func (r *Resolver) resolveNSIterative(
ctx context.Context,
domain string,
) ([]string, error) {
domain = dns.Fqdn(domain)
msg := new(dns.Msg)
msg.SetQuestion(domain, dns.TypeNS)
msg.RecursionDesired = true
for _, ip := range rootServerList()[:3] {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
addr := net.JoinHostPort(ip, "53")
domain = dns.Fqdn(domain)
servers := rootServerList()
resp, _, err := r.client.ExchangeContext(ctx, msg, addr)
for range maxDelegation {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
resp, err := r.queryServers(
ctx, servers, domain, dns.TypeNS,
)
if err != nil {
continue
return nil, err
}
nsNames := extractNSSet(resp.Answer)
if len(nsNames) > 0 {
return nsNames, nil
}
// Follow delegation.
authNS := extractNSSet(resp.Ns)
if len(authNS) == 0 {
break
}
glue := extractGlue(resp.Extra)
nextServers := glueIPs(authNS, glue)
if len(nextServers) == 0 {
break
}
servers = nextServers
}
return nil, ErrNoNameservers
}
// resolveARecord resolves a hostname to IPv4 addresses.
// resolveARecord resolves a hostname to IPv4 addresses using
// iterative resolution through the delegation chain.
func (r *Resolver) resolveARecord(
ctx context.Context,
hostname string,
) ([]string, error) {
hostname = dns.Fqdn(hostname)
msg := new(dns.Msg)
msg.SetQuestion(hostname, dns.TypeA)
msg.RecursionDesired = true
for _, ip := range rootServerList()[:3] {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
addr := net.JoinHostPort(ip, "53")
hostname = dns.Fqdn(hostname)
servers := rootServerList()
resp, _, err := r.client.ExchangeContext(ctx, msg, addr)
if err != nil {
continue
for range maxDelegation {
if checkCtx(ctx) != nil {
return nil, ErrContextCanceled
}
resp, err := r.queryServers(
ctx, servers, hostname, dns.TypeA,
)
if err != nil {
return nil, fmt.Errorf(
"resolving %s: %w", hostname, err,
)
}
// Check for A records in the answer section.
var ips []string
for _, rr := range resp.Answer {
@@ -356,6 +380,24 @@ func (r *Resolver) resolveARecord(
if len(ips) > 0 {
return ips, nil
}
// Follow delegation if present.
authNS := extractNSSet(resp.Ns)
if len(authNS) == 0 {
break
}
glue := extractGlue(resp.Extra)
nextServers := glueIPs(authNS, glue)
if len(nextServers) == 0 {
// Resolve NS IPs iteratively — but guard
// against infinite recursion by using only
// already-resolved servers.
break
}
servers = nextServers
}
return nil, fmt.Errorf(

View File

@@ -34,12 +34,8 @@ func newTestResolver(t *testing.T) *resolver.Resolver {
func testContext(t *testing.T) context.Context {
t.Helper()
if testing.Short() {
t.Skip("skipping integration test requiring real DNS")
}
ctx, cancel := context.WithTimeout(
context.Background(), 15*time.Second,
context.Background(), 60*time.Second,
)
t.Cleanup(cancel)

View File

@@ -6,6 +6,7 @@ import (
"log/slog"
"sort"
"strings"
"sync"
"time"
"go.uber.org/fx"
@@ -49,6 +50,8 @@ type Watcher struct {
notify Notifier
cancel context.CancelFunc
firstRun bool
expiryNotifiedMu sync.Mutex
expiryNotified map[string]time.Time
}
// New creates a new Watcher instance wired into the fx lifecycle.
@@ -65,6 +68,7 @@ func New(
tlsCheck: params.TLSCheck,
notify: params.Notify,
firstRun: true,
expiryNotified: make(map[string]time.Time),
}
lifecycle.Append(fx.Hook{
@@ -108,6 +112,7 @@ func NewForTest(
tlsCheck: tc,
notify: n,
firstRun: true,
expiryNotified: make(map[string]time.Time),
}
}
@@ -206,6 +211,28 @@ func (w *Watcher) checkDomain(
Nameservers: nameservers,
LastChecked: now,
})
// Also look up A/AAAA records for the apex domain so that
// port and TLS checks (which read HostnameState) can find
// the domain's IP addresses.
records, err := w.resolver.LookupAllRecords(ctx, domain)
if err != nil {
w.log.Error(
"failed to lookup records for domain",
"domain", domain,
"error", err,
)
return
}
prevHS, hasPrevHS := w.state.GetHostnameState(domain)
if hasPrevHS && !w.firstRun {
w.detectHostnameChanges(ctx, domain, prevHS, records)
}
newState := buildHostnameState(records, now)
w.state.SetHostnameState(domain, newState)
}
func (w *Watcher) detectNSChanges(
@@ -691,6 +718,22 @@ func (w *Watcher) checkTLSExpiry(
return
}
// Deduplicate expiry warnings: don't re-notify for the same
// hostname within the TLS check interval.
dedupKey := fmt.Sprintf("expiry:%s:%s", hostname, ip)
w.expiryNotifiedMu.Lock()
lastNotified, seen := w.expiryNotified[dedupKey]
if seen && time.Since(lastNotified) < w.config.TLSInterval {
w.expiryNotifiedMu.Unlock()
return
}
w.expiryNotified[dedupKey] = time.Now()
w.expiryNotifiedMu.Unlock()
msg := fmt.Sprintf(
"Host: %s\nIP: %s\nCN: %s\n"+
"Expires: %s (%.0f days)",

View File

@@ -273,6 +273,10 @@ func setupBaselineMocks(deps *testDeps) {
"ns1.example.com.",
"ns2.example.com.",
}
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"93.184.216.34"}},
"ns2.example.com.": {"A": {"93.184.216.34"}},
}
deps.resolver.allRecords["www.example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"93.184.216.34"}},
"ns2.example.com.": {"A": {"93.184.216.34"}},
@@ -290,6 +294,14 @@ func setupBaselineMocks(deps *testDeps) {
"www.example.com",
},
}
deps.tlsChecker.certs["93.184.216.34:example.com"] = &tlscheck.CertificateInfo{
CommonName: "example.com",
Issuer: "DigiCert",
NotAfter: time.Now().Add(90 * 24 * time.Hour),
SubjectAlternativeNames: []string{
"example.com",
},
}
}
func assertNoNotifications(
@@ -322,14 +334,74 @@ func assertStatePopulated(
)
}
if len(snap.Hostnames) != 1 {
// Hostnames includes both explicit hostnames and domains
// (domains now also get hostname state for port/TLS checks).
if len(snap.Hostnames) < 1 {
t.Errorf(
"expected 1 hostname in state, got %d",
"expected at least 1 hostname in state, got %d",
len(snap.Hostnames),
)
}
}
func TestDomainPortAndTLSChecks(t *testing.T) {
t.Parallel()
cfg := defaultTestConfig(t)
cfg.Domains = []string{"example.com"}
w, deps := newTestWatcher(t, cfg)
deps.resolver.nsRecords["example.com"] = []string{
"ns1.example.com.",
}
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"93.184.216.34"}},
}
deps.portChecker.results["93.184.216.34:80"] = true
deps.portChecker.results["93.184.216.34:443"] = true
deps.tlsChecker.certs["93.184.216.34:example.com"] = &tlscheck.CertificateInfo{
CommonName: "example.com",
Issuer: "DigiCert",
NotAfter: time.Now().Add(90 * 24 * time.Hour),
SubjectAlternativeNames: []string{
"example.com",
},
}
w.RunOnce(t.Context())
snap := deps.state.GetSnapshot()
// Domain should have port state populated
if len(snap.Ports) == 0 {
t.Error("expected port state for domain, got none")
}
// Domain should have certificate state populated
if len(snap.Certificates) == 0 {
t.Error("expected certificate state for domain, got none")
}
// Verify port checker was actually called
deps.portChecker.mu.Lock()
calls := deps.portChecker.calls
deps.portChecker.mu.Unlock()
if calls == 0 {
t.Error("expected port checker to be called for domain")
}
// Verify TLS checker was actually called
deps.tlsChecker.mu.Lock()
tlsCalls := deps.tlsChecker.calls
deps.tlsChecker.mu.Unlock()
if tlsCalls == 0 {
t.Error("expected TLS checker to be called for domain")
}
}
func TestNSChangeDetection(t *testing.T) {
t.Parallel()
@@ -342,6 +414,12 @@ func TestNSChangeDetection(t *testing.T) {
"ns1.example.com.",
"ns2.example.com.",
}
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"1.2.3.4"}},
"ns2.example.com.": {"A": {"1.2.3.4"}},
}
deps.portChecker.results["1.2.3.4:80"] = false
deps.portChecker.results["1.2.3.4:443"] = false
ctx := t.Context()
w.RunOnce(ctx)
@@ -351,6 +429,10 @@ func TestNSChangeDetection(t *testing.T) {
"ns1.example.com.",
"ns3.example.com.",
}
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"1.2.3.4"}},
"ns3.example.com.": {"A": {"1.2.3.4"}},
}
deps.resolver.mu.Unlock()
w.RunOnce(ctx)
@@ -506,6 +588,61 @@ func TestTLSExpiryWarning(t *testing.T) {
}
}
func TestTLSExpiryWarningDedup(t *testing.T) {
t.Parallel()
cfg := defaultTestConfig(t)
cfg.Hostnames = []string{"www.example.com"}
cfg.TLSInterval = 24 * time.Hour
w, deps := newTestWatcher(t, cfg)
deps.resolver.allRecords["www.example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"1.2.3.4"}},
}
deps.resolver.ipAddresses["www.example.com"] = []string{
"1.2.3.4",
}
deps.portChecker.results["1.2.3.4:80"] = true
deps.portChecker.results["1.2.3.4:443"] = true
deps.tlsChecker.certs["1.2.3.4:www.example.com"] = &tlscheck.CertificateInfo{
CommonName: "www.example.com",
Issuer: "DigiCert",
NotAfter: time.Now().Add(3 * 24 * time.Hour),
SubjectAlternativeNames: []string{
"www.example.com",
},
}
ctx := t.Context()
// First run = baseline, no notifications
w.RunOnce(ctx)
// Second run should fire one expiry warning
w.RunOnce(ctx)
// Third run should NOT fire another warning (dedup)
w.RunOnce(ctx)
notifications := deps.notifier.getNotifications()
expiryCount := 0
for _, n := range notifications {
if n.Title == "TLS Expiry Warning: www.example.com" {
expiryCount++
}
}
if expiryCount != 1 {
t.Errorf(
"expected exactly 1 expiry warning (dedup), got %d",
expiryCount,
)
}
}
func TestGracefulShutdown(t *testing.T) {
t.Parallel()
@@ -519,6 +656,11 @@ func TestGracefulShutdown(t *testing.T) {
deps.resolver.nsRecords["example.com"] = []string{
"ns1.example.com.",
}
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
"ns1.example.com.": {"A": {"1.2.3.4"}},
}
deps.portChecker.results["1.2.3.4:80"] = false
deps.portChecker.results["1.2.3.4:443"] = false
ctx, cancel := context.WithCancel(t.Context())