feat: add retry with exponential backoff for notification delivery
All checks were successful
check / check (push) Successful in 42s

Notifications were fire-and-forget: if Slack, Mattermost, or ntfy was
temporarily down, the notification was silently lost. This adds automatic
retry with exponential backoff and jitter to all notification endpoints.

Implementation:
- New retry.go with configurable RetryConfig (max retries, base delay,
  max delay) and exponential backoff with ±25% jitter
- Each dispatch goroutine now wraps its send call in deliverWithRetry
- Default: 3 retries (4 total attempts), 1s base delay, 10s max delay
- Context-aware: respects cancellation during retry sleep
- Structured logging on each retry attempt and on final success after
  retry

All existing tests continue to pass. New tests cover:
- Backoff calculation (increase, cap)
- Retry success on first attempt (no unnecessary retries)
- Retry on transient failure (succeeds after N attempts)
- Exhausted retries (returns last error)
- Context cancellation during retry sleep
- Integration: SendNotification retries transient 500s
- Integration: all three endpoints retry independently
- Integration: permanent failure exhausts retries

closes #62
This commit is contained in:
clawbot
2026-03-10 11:11:32 -07:00
parent b64db3e10f
commit 31bd6c3228
4 changed files with 693 additions and 12 deletions

139
internal/notify/retry.go Normal file
View File

@@ -0,0 +1,139 @@
package notify
import (
"context"
"math"
"math/rand/v2"
"time"
)
// Package-level retry tuning. The exported Default* values are what
// RetryConfig.defaults substitutes for unset fields; the unexported
// constants fix the shape of the backoff curve.
const (
	// DefaultMaxRetries is how many retries follow a failed first
	// attempt, i.e. total attempts = DefaultMaxRetries + 1.
	DefaultMaxRetries = 3

	// DefaultBaseDelay is the un-jittered wait before the first retry.
	DefaultBaseDelay = time.Second

	// DefaultMaxDelay is the ceiling applied to the un-jittered
	// backoff delay. Jitter is added after capping, so an actual
	// sleep may exceed this by up to jitterFraction.
	DefaultMaxDelay = 10 * time.Second

	// backoffMultiplier is the factor by which the delay grows
	// from one retry to the next.
	backoffMultiplier = 2

	// jitterFraction is the half-width of the random spread applied
	// to every delay, as a fraction of that delay (0.25 means ±25%).
	jitterFraction = 0.25
)
// RetryConfig holds the tuning knobs for the notification retry loop.
// Any field that is zero or negative is replaced by the matching
// package default when defaults() is applied, so the zero value
// RetryConfig{} means "use the package defaults".
type RetryConfig struct {
// MaxRetries is the number of retries made after the initial
// attempt; the total number of attempts is MaxRetries + 1.
// Because non-positive values fall back to DefaultMaxRetries,
// a value of 0 cannot be used to disable retries.
MaxRetries int
// BaseDelay is the un-jittered delay before the first retry.
BaseDelay time.Duration
// MaxDelay caps the un-jittered backoff delay; jitter is applied
// after the cap.
MaxDelay time.Duration
}
// defaults resolves rc against the package defaults and returns the
// result. A field is kept only when it is strictly positive; zero and
// negative values are both treated as "unset" and replaced by the
// corresponding Default* constant. The receiver is a value, so the
// caller's config is never modified.
func (rc RetryConfig) defaults() RetryConfig {
	// Start from the defaults and overlay whatever the caller
	// actually configured.
	resolved := RetryConfig{
		MaxRetries: DefaultMaxRetries,
		BaseDelay:  DefaultBaseDelay,
		MaxDelay:   DefaultMaxDelay,
	}

	if rc.MaxRetries > 0 {
		resolved.MaxRetries = rc.MaxRetries
	}
	if rc.BaseDelay > 0 {
		resolved.BaseDelay = rc.BaseDelay
	}
	if rc.MaxDelay > 0 {
		resolved.MaxDelay = rc.MaxDelay
	}

	return resolved
}
// backoff returns the jittered delay to wait after the given
// 0-indexed attempt has failed.
//
// The un-jittered delay is BaseDelay * backoffMultiplier^attempt,
// clamped to MaxDelay. The result is then drawn uniformly from
// [delay*(1-jitterFraction), delay*(1+jitterFraction)], so it can
// exceed MaxDelay by up to jitterFraction.
func (rc RetryConfig) backoff(attempt int) time.Duration {
	growth := math.Pow(backoffMultiplier, float64(attempt))

	// Work in float64 so very large exponents saturate at the cap
	// instead of overflowing an integer Duration.
	capped := min(float64(rc.BaseDelay)*growth, float64(rc.MaxDelay))

	low := capped * (1 - jitterFraction)
	high := capped * (1 + jitterFraction)
	offset := rand.Float64() * (high - low) //nolint:gosec // jitter does not need crypto/rand

	return time.Duration(low + offset)
}
// deliverWithRetry invokes fn until it succeeds or the retry budget
// from svc.retryConfig (after defaults are applied) is spent.
//
// fn is called at most MaxRetries+1 times. After each failure except
// the last, a warning is logged and the call waits for the backoff
// delay before trying again. A success that needed at least one retry
// is logged at info level.
//
// It returns nil on success, ctx.Err() if ctx is done while waiting
// between attempts, and otherwise the error from the final attempt.
// endpoint is used only to label log entries.
func (svc *Service) deliverWithRetry(
	ctx context.Context,
	endpoint string,
	fn func(context.Context) error,
) error {
	cfg := svc.retryConfig.defaults()
	totalAttempts := cfg.MaxRetries + 1

	var err error

	// n counts attempts from 1: n == 1 is the initial call, every
	// later value is a retry.
	for n := 1; n <= totalAttempts; n++ {
		err = fn(ctx)
		if err == nil {
			if n > 1 {
				svc.log.Info(
					"notification delivered after retry",
					"endpoint", endpoint,
					"attempt", n,
				)
			}

			return nil
		}

		// Budget exhausted: report the failure without sleeping.
		if n == totalAttempts {
			return err
		}

		// backoff is indexed by the 0-based attempt that just failed.
		wait := cfg.backoff(n - 1)

		svc.log.Warn(
			"notification delivery failed, retrying",
			"endpoint", endpoint,
			"attempt", n,
			"maxAttempts", totalAttempts,
			"retryIn", wait,
			"error", err,
		)

		// Sleep, but give up immediately if the context ends first.
		select {
		case <-svc.sleepFunc(wait):
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	return err
}
// sleepFunc returns a channel to wait on for a delay of d.
// When svc.sleepFn is set (tests use this to avoid real sleeps) the
// channel comes from that hook; otherwise it is time.After(d), which
// delivers the current time once d has elapsed.
func (svc *Service) sleepFunc(d time.Duration) <-chan time.Time {
	if hook := svc.sleepFn; hook != nil {
		return hook(d)
	}

	return time.After(d)
}