dnswatcher/internal/notify/retry.go
clawbot 6d8bb86a55
fix: increase retry defaults to 5 retries and 60s max delay
Per sneak's feedback: DefaultMaxRetries 3→5, DefaultMaxDelay 10s→60s.

closes #62
2026-03-17 02:19:31 -07:00


package notify

import (
	"context"
	"math"
	"math/rand/v2"
	"time"
)

// Retry defaults.
const (
	// DefaultMaxRetries is the number of additional attempts
	// after the first failure.
	DefaultMaxRetries = 5

	// DefaultBaseDelay is the initial delay before the first
	// retry attempt.
	DefaultBaseDelay = 1 * time.Second

	// DefaultMaxDelay caps the computed backoff delay.
	DefaultMaxDelay = 60 * time.Second

	// backoffMultiplier is the exponential growth factor.
	backoffMultiplier = 2

	// jitterFraction controls the ±random spread applied
	// to each delay (0.25 = ±25%).
	jitterFraction = 0.25
)

// RetryConfig holds tuning knobs for the retry loop.
// Zero values fall back to the package defaults above.
type RetryConfig struct {
	MaxRetries int
	BaseDelay  time.Duration
	MaxDelay   time.Duration
}

// defaults returns a copy with zero fields replaced by
// package defaults.
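// For example, RetryConfig{MaxRetries: 2} keeps the default
// BaseDelay and MaxDelay.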
func (rc RetryConfig) defaults() RetryConfig {
	if rc.MaxRetries <= 0 {
		rc.MaxRetries = DefaultMaxRetries
	}
	if rc.BaseDelay <= 0 {
		rc.BaseDelay = DefaultBaseDelay
	}
	if rc.MaxDelay <= 0 {
		rc.MaxDelay = DefaultMaxDelay
	}
	return rc
}

// backoff computes the delay for attempt n (0-indexed) with
// jitter. The raw delay is BaseDelay * 2^n, capped at
// MaxDelay, then randomised by ±jitterFraction.
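// With the package defaults the raw delays are 1s, 2s, 4s,
// 8s, 16s (about 31s in total before jitter); the 60s cap
// only binds for larger retry counts.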
func (rc RetryConfig) backoff(attempt int) time.Duration {
	raw := float64(rc.BaseDelay) *
		math.Pow(backoffMultiplier, float64(attempt))
	if raw > float64(rc.MaxDelay) {
		raw = float64(rc.MaxDelay)
	}

	// Apply jitter: uniform in [raw*(1-j), raw*(1+j)].
	lo := raw * (1 - jitterFraction)
	hi := raw * (1 + jitterFraction)
	jittered := lo + rand.Float64()*(hi-lo) //nolint:gosec // jitter does not need crypto/rand
	return time.Duration(jittered)
}

// deliverWithRetry calls fn, retrying on error with
// exponential backoff. It logs every failed attempt and
// returns the last error if all attempts are exhausted.
func (svc *Service) deliverWithRetry(
	ctx context.Context,
	endpoint string,
	fn func(context.Context) error,
) error {
	cfg := svc.retryConfig.defaults()
	var lastErr error

	// attempt 0 is the initial call; attempts 1..MaxRetries
	// are retries.
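	// With the defaults that is at most 6 calls in total.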
	for attempt := range cfg.MaxRetries + 1 {
		lastErr = fn(ctx)
		if lastErr == nil {
			if attempt > 0 {
				svc.log.Info(
					"notification delivered after retry",
					"endpoint", endpoint,
					"attempt", attempt+1,
				)
			}
			return nil
		}

		// Last attempt: don't sleep, just return.
		if attempt == cfg.MaxRetries {
			break
		}

		delay := cfg.backoff(attempt)
		svc.log.Warn(
			"notification delivery failed, retrying",
			"endpoint", endpoint,
			"attempt", attempt+1,
			"maxAttempts", cfg.MaxRetries+1,
			"retryIn", delay,
			"error", lastErr,
		)

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-svc.sleepFunc(delay):
		}
	}
	return lastErr
}

// sleepFunc returns a channel that closes after d.
// It is a field-level indirection so tests can override it.
func (svc *Service) sleepFunc(d time.Duration) <-chan time.Time {
	if svc.sleepFn != nil {
		return svc.sleepFn(d)
	}
	return time.After(d)
}