Compare commits
1 Commit
fix/issue-
...
fix/make-c
| Author | SHA1 | Date | |
|---|---|---|---|
| | a2819c34d9 | | |
4
Makefile
4
Makefile
@@ -18,7 +18,7 @@ fmt:
|
||||
goimports -w .
|
||||
|
||||
test:
|
||||
go test -v -race -cover ./...
|
||||
go test -v -race -cover -timeout 30s ./...
|
||||
|
||||
# Check runs all validation without making changes
|
||||
# Used by CI and Docker build - fails if anything is wrong
|
||||
@@ -28,7 +28,7 @@ check:
|
||||
@echo "==> Running linter..."
|
||||
golangci-lint run --config .golangci.yml ./...
|
||||
@echo "==> Running tests..."
|
||||
go test -v -race ./...
|
||||
go test -v -race -short -timeout 30s ./...
|
||||
@echo "==> Building..."
|
||||
go build -ldflags "$(LDFLAGS)" -o /dev/null ./cmd/dnswatcher
|
||||
@echo "==> All checks passed!"
|
||||
|
||||
34
TESTING.md
34
TESTING.md
@@ -1,34 +0,0 @@
|
||||
# Testing Policy
|
||||
|
||||
## DNS Resolution Tests
|
||||
|
||||
All resolver tests **MUST** use live queries against real DNS servers.
|
||||
No mocking of the DNS client layer is permitted.
|
||||
|
||||
### Rationale
|
||||
|
||||
The resolver performs iterative resolution from root nameservers through
|
||||
the full delegation chain. Mocked responses cannot faithfully represent
|
||||
the variety of real-world DNS behavior (truncation, referrals, glue
|
||||
records, DNSSEC, varied response times, EDNS, etc.). Testing against
|
||||
real servers ensures the resolver works correctly in production.
|
||||
|
||||
### Constraints
|
||||
|
||||
- Tests hit real DNS infrastructure and require network access
|
||||
- Test duration depends on network conditions; timeout tuning keeps
|
||||
the suite within the 30-second target
|
||||
- Query timeout is calibrated to 3× maximum antipodal RTT (~300ms)
|
||||
plus processing margin
|
||||
- Root server fan-out is limited to reduce parallel query load
|
||||
- Flaky failures from transient network issues are acceptable and
|
||||
should be investigated as potential resolver bugs, not papered over
|
||||
with mocks or skip flags
|
||||
|
||||
### What NOT to do
|
||||
|
||||
- **Do not mock `DNSClient`** for resolver tests (the mock constructor
|
||||
exists for unit-testing other packages that consume the resolver)
|
||||
- **Do not add `-short` flags** to skip slow tests
|
||||
- **Do not increase `-timeout`** to hide hanging queries
|
||||
- **Do not modify linter configuration** to suppress findings
|
||||
@@ -24,8 +24,4 @@ var (
|
||||
// ErrContextCanceled wraps context cancellation for the
|
||||
// resolver's iterative queries.
|
||||
ErrContextCanceled = errors.New("context canceled")
|
||||
|
||||
// ErrSERVFAIL is returned when a DNS server responds with
|
||||
// SERVFAIL after all retries are exhausted.
|
||||
ErrSERVFAIL = errors.New("SERVFAIL from server")
|
||||
)
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net"
|
||||
"sort"
|
||||
"strings"
|
||||
@@ -14,7 +13,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
queryTimeoutDuration = 2 * time.Second
|
||||
queryTimeoutDuration = 5 * time.Second
|
||||
maxRetries = 2
|
||||
maxDelegation = 20
|
||||
timeoutMultiplier = 2
|
||||
@@ -42,22 +41,6 @@ func rootServerList() []string {
|
||||
}
|
||||
}
|
||||
|
||||
const maxRootServers = 3
|
||||
|
||||
// randomRootServers returns a shuffled subset of root servers.
|
||||
func randomRootServers() []string {
|
||||
all := rootServerList()
|
||||
rand.Shuffle(len(all), func(i, j int) {
|
||||
all[i], all[j] = all[j], all[i]
|
||||
})
|
||||
|
||||
if len(all) > maxRootServers {
|
||||
return all[:maxRootServers]
|
||||
}
|
||||
|
||||
return all
|
||||
}
|
||||
|
||||
func checkCtx(ctx context.Context) error {
|
||||
err := ctx.Err()
|
||||
if err != nil {
|
||||
@@ -319,7 +302,7 @@ func (r *Resolver) resolveNSRecursive(
|
||||
msg.SetQuestion(domain, dns.TypeNS)
|
||||
msg.RecursionDesired = true
|
||||
|
||||
for _, ip := range randomRootServers() {
|
||||
for _, ip := range rootServerList()[:3] {
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
@@ -350,7 +333,7 @@ func (r *Resolver) resolveARecord(
|
||||
msg.SetQuestion(hostname, dns.TypeA)
|
||||
msg.RecursionDesired = true
|
||||
|
||||
for _, ip := range randomRootServers() {
|
||||
for _, ip := range rootServerList()[:3] {
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
@@ -402,7 +385,7 @@ func (r *Resolver) FindAuthoritativeNameservers(
|
||||
candidate := strings.Join(labels[i:], ".") + "."
|
||||
|
||||
nsNames, err := r.followDelegation(
|
||||
ctx, candidate, randomRootServers(),
|
||||
ctx, candidate, rootServerList(),
|
||||
)
|
||||
if err == nil && len(nsNames) > 0 {
|
||||
sort.Strings(nsNames)
|
||||
@@ -459,15 +442,9 @@ func (r *Resolver) queryAllTypes(
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
const (
|
||||
singleTypeMaxRetries = 3
|
||||
singleTypeInitialBackoff = 100 * time.Millisecond
|
||||
)
|
||||
|
||||
type queryState struct {
|
||||
gotNXDomain bool
|
||||
gotSERVFAIL bool
|
||||
gotTimeout bool
|
||||
hasRecords bool
|
||||
}
|
||||
|
||||
@@ -495,21 +472,6 @@ func (r *Resolver) queryEachType(
|
||||
return state
|
||||
}
|
||||
|
||||
// isTimeout checks whether an error represents a DNS timeout.
|
||||
func isTimeout(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var netErr net.Error
|
||||
if errors.As(err, &netErr) && netErr.Timeout() {
|
||||
return true
|
||||
}
|
||||
|
||||
// Also catch i/o timeout strings from the dns library.
|
||||
return strings.Contains(err.Error(), "i/o timeout")
|
||||
}
|
||||
|
||||
func (r *Resolver) querySingleType(
|
||||
ctx context.Context,
|
||||
nsIP string,
|
||||
@@ -518,95 +480,19 @@ func (r *Resolver) querySingleType(
|
||||
resp *NameserverResponse,
|
||||
state *queryState,
|
||||
) {
|
||||
msg, lastErr := r.querySingleTypeWithRetry(
|
||||
ctx, nsIP, hostname, qtype,
|
||||
)
|
||||
if msg == nil {
|
||||
r.recordRetryFailure(lastErr, state)
|
||||
|
||||
msg, err := r.queryDNS(ctx, nsIP, hostname, qtype)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.handleDNSResponse(msg, resp, state)
|
||||
}
|
||||
if msg.Rcode == dns.RcodeNameError {
|
||||
state.gotNXDomain = true
|
||||
|
||||
func (r *Resolver) querySingleTypeWithRetry(
|
||||
ctx context.Context,
|
||||
nsIP string,
|
||||
hostname string,
|
||||
qtype uint16,
|
||||
) (*dns.Msg, error) {
|
||||
var lastErr error
|
||||
|
||||
backoff := singleTypeInitialBackoff
|
||||
|
||||
for attempt := range singleTypeMaxRetries {
|
||||
if checkCtx(ctx) != nil {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
|
||||
if attempt > 0 {
|
||||
if !waitBackoff(ctx, backoff) {
|
||||
return nil, ErrContextCanceled
|
||||
}
|
||||
|
||||
backoff *= timeoutMultiplier
|
||||
}
|
||||
|
||||
msg, err := r.queryDNS(ctx, nsIP, hostname, qtype)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
|
||||
if !isTimeout(err) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
continue
|
||||
return
|
||||
}
|
||||
|
||||
if msg.Rcode == dns.RcodeServerFailure {
|
||||
lastErr = ErrSERVFAIL
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
return msg, nil
|
||||
}
|
||||
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
func waitBackoff(ctx context.Context, d time.Duration) bool {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
case <-time.After(d):
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Resolver) recordRetryFailure(
|
||||
lastErr error,
|
||||
state *queryState,
|
||||
) {
|
||||
if lastErr == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if isTimeout(lastErr) {
|
||||
state.gotTimeout = true
|
||||
} else if errors.Is(lastErr, ErrSERVFAIL) {
|
||||
state.gotSERVFAIL = true
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Resolver) handleDNSResponse(
|
||||
msg *dns.Msg,
|
||||
resp *NameserverResponse,
|
||||
state *queryState,
|
||||
) {
|
||||
if msg.Rcode == dns.RcodeNameError {
|
||||
state.gotNXDomain = true
|
||||
|
||||
return
|
||||
}
|
||||
@@ -637,12 +523,8 @@ func classifyResponse(resp *NameserverResponse, state queryState) {
|
||||
switch {
|
||||
case state.gotNXDomain && !state.hasRecords:
|
||||
resp.Status = StatusNXDomain
|
||||
case state.gotTimeout && !state.hasRecords:
|
||||
resp.Status = StatusTimeout
|
||||
resp.Error = "all queries timed out after retries"
|
||||
case state.gotSERVFAIL && !state.hasRecords:
|
||||
resp.Status = StatusError
|
||||
resp.Error = "server failure (SERVFAIL) after retries"
|
||||
case !state.hasRecords && !state.gotNXDomain:
|
||||
resp.Status = StatusNoData
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ const (
|
||||
StatusError = "error"
|
||||
StatusNXDomain = "nxdomain"
|
||||
StatusNoData = "nodata"
|
||||
StatusTimeout = "timeout"
|
||||
)
|
||||
|
||||
// MaxCNAMEDepth is the maximum CNAME chain depth to follow.
|
||||
|
||||
@@ -34,8 +34,12 @@ func newTestResolver(t *testing.T) *resolver.Resolver {
|
||||
func testContext(t *testing.T) context.Context {
|
||||
t.Helper()
|
||||
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test requiring real DNS")
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(
|
||||
context.Background(), 60*time.Second,
|
||||
context.Background(), 15*time.Second,
|
||||
)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
"log/slog"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"go.uber.org/fx"
|
||||
@@ -50,8 +49,6 @@ type Watcher struct {
|
||||
notify Notifier
|
||||
cancel context.CancelFunc
|
||||
firstRun bool
|
||||
expiryNotifiedMu sync.Mutex
|
||||
expiryNotified map[string]time.Time
|
||||
}
|
||||
|
||||
// New creates a new Watcher instance wired into the fx lifecycle.
|
||||
@@ -68,7 +65,6 @@ func New(
|
||||
tlsCheck: params.TLSCheck,
|
||||
notify: params.Notify,
|
||||
firstRun: true,
|
||||
expiryNotified: make(map[string]time.Time),
|
||||
}
|
||||
|
||||
lifecycle.Append(fx.Hook{
|
||||
@@ -112,7 +108,6 @@ func NewForTest(
|
||||
tlsCheck: tc,
|
||||
notify: n,
|
||||
firstRun: true,
|
||||
expiryNotified: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,28 +206,6 @@ func (w *Watcher) checkDomain(
|
||||
Nameservers: nameservers,
|
||||
LastChecked: now,
|
||||
})
|
||||
|
||||
// Also look up A/AAAA records for the apex domain so that
|
||||
// port and TLS checks (which read HostnameState) can find
|
||||
// the domain's IP addresses.
|
||||
records, err := w.resolver.LookupAllRecords(ctx, domain)
|
||||
if err != nil {
|
||||
w.log.Error(
|
||||
"failed to lookup records for domain",
|
||||
"domain", domain,
|
||||
"error", err,
|
||||
)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
prevHS, hasPrevHS := w.state.GetHostnameState(domain)
|
||||
if hasPrevHS && !w.firstRun {
|
||||
w.detectHostnameChanges(ctx, domain, prevHS, records)
|
||||
}
|
||||
|
||||
newState := buildHostnameState(records, now)
|
||||
w.state.SetHostnameState(domain, newState)
|
||||
}
|
||||
|
||||
func (w *Watcher) detectNSChanges(
|
||||
@@ -718,22 +691,6 @@ func (w *Watcher) checkTLSExpiry(
|
||||
return
|
||||
}
|
||||
|
||||
// Deduplicate expiry warnings: don't re-notify for the same
|
||||
// hostname within the TLS check interval.
|
||||
dedupKey := fmt.Sprintf("expiry:%s:%s", hostname, ip)
|
||||
|
||||
w.expiryNotifiedMu.Lock()
|
||||
|
||||
lastNotified, seen := w.expiryNotified[dedupKey]
|
||||
if seen && time.Since(lastNotified) < w.config.TLSInterval {
|
||||
w.expiryNotifiedMu.Unlock()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
w.expiryNotified[dedupKey] = time.Now()
|
||||
w.expiryNotifiedMu.Unlock()
|
||||
|
||||
msg := fmt.Sprintf(
|
||||
"Host: %s\nIP: %s\nCN: %s\n"+
|
||||
"Expires: %s (%.0f days)",
|
||||
|
||||
@@ -273,10 +273,6 @@ func setupBaselineMocks(deps *testDeps) {
|
||||
"ns1.example.com.",
|
||||
"ns2.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"93.184.216.34"}},
|
||||
"ns2.example.com.": {"A": {"93.184.216.34"}},
|
||||
}
|
||||
deps.resolver.allRecords["www.example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"93.184.216.34"}},
|
||||
"ns2.example.com.": {"A": {"93.184.216.34"}},
|
||||
@@ -294,14 +290,6 @@ func setupBaselineMocks(deps *testDeps) {
|
||||
"www.example.com",
|
||||
},
|
||||
}
|
||||
deps.tlsChecker.certs["93.184.216.34:example.com"] = &tlscheck.CertificateInfo{
|
||||
CommonName: "example.com",
|
||||
Issuer: "DigiCert",
|
||||
NotAfter: time.Now().Add(90 * 24 * time.Hour),
|
||||
SubjectAlternativeNames: []string{
|
||||
"example.com",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func assertNoNotifications(
|
||||
@@ -334,74 +322,14 @@ func assertStatePopulated(
|
||||
)
|
||||
}
|
||||
|
||||
// Hostnames includes both explicit hostnames and domains
|
||||
// (domains now also get hostname state for port/TLS checks).
|
||||
if len(snap.Hostnames) < 1 {
|
||||
if len(snap.Hostnames) != 1 {
|
||||
t.Errorf(
|
||||
"expected at least 1 hostname in state, got %d",
|
||||
"expected 1 hostname in state, got %d",
|
||||
len(snap.Hostnames),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDomainPortAndTLSChecks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := defaultTestConfig(t)
|
||||
cfg.Domains = []string{"example.com"}
|
||||
|
||||
w, deps := newTestWatcher(t, cfg)
|
||||
|
||||
deps.resolver.nsRecords["example.com"] = []string{
|
||||
"ns1.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"93.184.216.34"}},
|
||||
}
|
||||
deps.portChecker.results["93.184.216.34:80"] = true
|
||||
deps.portChecker.results["93.184.216.34:443"] = true
|
||||
deps.tlsChecker.certs["93.184.216.34:example.com"] = &tlscheck.CertificateInfo{
|
||||
CommonName: "example.com",
|
||||
Issuer: "DigiCert",
|
||||
NotAfter: time.Now().Add(90 * 24 * time.Hour),
|
||||
SubjectAlternativeNames: []string{
|
||||
"example.com",
|
||||
},
|
||||
}
|
||||
|
||||
w.RunOnce(t.Context())
|
||||
|
||||
snap := deps.state.GetSnapshot()
|
||||
|
||||
// Domain should have port state populated
|
||||
if len(snap.Ports) == 0 {
|
||||
t.Error("expected port state for domain, got none")
|
||||
}
|
||||
|
||||
// Domain should have certificate state populated
|
||||
if len(snap.Certificates) == 0 {
|
||||
t.Error("expected certificate state for domain, got none")
|
||||
}
|
||||
|
||||
// Verify port checker was actually called
|
||||
deps.portChecker.mu.Lock()
|
||||
calls := deps.portChecker.calls
|
||||
deps.portChecker.mu.Unlock()
|
||||
|
||||
if calls == 0 {
|
||||
t.Error("expected port checker to be called for domain")
|
||||
}
|
||||
|
||||
// Verify TLS checker was actually called
|
||||
deps.tlsChecker.mu.Lock()
|
||||
tlsCalls := deps.tlsChecker.calls
|
||||
deps.tlsChecker.mu.Unlock()
|
||||
|
||||
if tlsCalls == 0 {
|
||||
t.Error("expected TLS checker to be called for domain")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNSChangeDetection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -414,12 +342,6 @@ func TestNSChangeDetection(t *testing.T) {
|
||||
"ns1.example.com.",
|
||||
"ns2.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
"ns2.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.portChecker.results["1.2.3.4:80"] = false
|
||||
deps.portChecker.results["1.2.3.4:443"] = false
|
||||
|
||||
ctx := t.Context()
|
||||
w.RunOnce(ctx)
|
||||
@@ -429,10 +351,6 @@ func TestNSChangeDetection(t *testing.T) {
|
||||
"ns1.example.com.",
|
||||
"ns3.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
"ns3.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.resolver.mu.Unlock()
|
||||
|
||||
w.RunOnce(ctx)
|
||||
@@ -588,61 +506,6 @@ func TestTLSExpiryWarning(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTLSExpiryWarningDedup(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := defaultTestConfig(t)
|
||||
cfg.Hostnames = []string{"www.example.com"}
|
||||
cfg.TLSInterval = 24 * time.Hour
|
||||
|
||||
w, deps := newTestWatcher(t, cfg)
|
||||
|
||||
deps.resolver.allRecords["www.example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.resolver.ipAddresses["www.example.com"] = []string{
|
||||
"1.2.3.4",
|
||||
}
|
||||
deps.portChecker.results["1.2.3.4:80"] = true
|
||||
deps.portChecker.results["1.2.3.4:443"] = true
|
||||
deps.tlsChecker.certs["1.2.3.4:www.example.com"] = &tlscheck.CertificateInfo{
|
||||
CommonName: "www.example.com",
|
||||
Issuer: "DigiCert",
|
||||
NotAfter: time.Now().Add(3 * 24 * time.Hour),
|
||||
SubjectAlternativeNames: []string{
|
||||
"www.example.com",
|
||||
},
|
||||
}
|
||||
|
||||
ctx := t.Context()
|
||||
|
||||
// First run = baseline, no notifications
|
||||
w.RunOnce(ctx)
|
||||
|
||||
// Second run should fire one expiry warning
|
||||
w.RunOnce(ctx)
|
||||
|
||||
// Third run should NOT fire another warning (dedup)
|
||||
w.RunOnce(ctx)
|
||||
|
||||
notifications := deps.notifier.getNotifications()
|
||||
|
||||
expiryCount := 0
|
||||
|
||||
for _, n := range notifications {
|
||||
if n.Title == "TLS Expiry Warning: www.example.com" {
|
||||
expiryCount++
|
||||
}
|
||||
}
|
||||
|
||||
if expiryCount != 1 {
|
||||
t.Errorf(
|
||||
"expected exactly 1 expiry warning (dedup), got %d",
|
||||
expiryCount,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGracefulShutdown(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -656,11 +519,6 @@ func TestGracefulShutdown(t *testing.T) {
|
||||
deps.resolver.nsRecords["example.com"] = []string{
|
||||
"ns1.example.com.",
|
||||
}
|
||||
deps.resolver.allRecords["example.com"] = map[string]map[string][]string{
|
||||
"ns1.example.com.": {"A": {"1.2.3.4"}},
|
||||
}
|
||||
deps.portChecker.results["1.2.3.4:80"] = false
|
||||
deps.portChecker.results["1.2.3.4:443"] = false
|
||||
|
||||
ctx, cancel := context.WithCancel(t.Context())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user