feat: add retry with exponential backoff for notification delivery (#87)
All checks were successful
check / check (push) Successful in 37s

## Summary

Notifications were fire-and-forget: if Slack, Mattermost, or ntfy was temporarily down, change notifications were silently lost. This adds automatic retry with exponential backoff and jitter to all notification endpoints.

## Changes

### New file: `internal/notify/retry.go`
- `RetryConfig` struct with configurable max retries, base delay, max delay
- `backoff()` computes delay as `BaseDelay * 2^attempt`, capped at `MaxDelay`, with ±25% jitter
- `deliverWithRetry()` wraps any send function with the retry loop
- Defaults: 3 retries (4 total attempts), 1s base delay, 10s max delay
- Context-aware: respects cancellation during retry sleep
- Injectable `sleepFn` for test determinism

### Modified: `internal/notify/notify.go`
- Added `retryConfig` and `sleepFn` fields to `Service`
- Updated `dispatchNtfy`, `dispatchSlack`, `dispatchMattermost` to wrap sends in `deliverWithRetry`
- Structured logging: warns on each retry, logs an error only after all retries are exhausted, and logs info on success after a retry

### Modified: `internal/notify/export_test.go`
- Added test helpers: `SetRetryConfig`, `SetSleepFunc`, `DeliverWithRetry`, `BackoffDuration`

### New file: `internal/notify/retry_test.go`
- Backoff calculation tests (exponential increase, max cap with jitter)
- `deliverWithRetry` unit tests: first-attempt success, transient failure recovery, exhausted retries, context cancellation
- Integration tests via `SendNotification`: transient failures are retried, all endpoints retry independently, permanent failure exhausts retries

## Verification
- `make fmt` — passed
- `make check` (format + lint + tests + build) — passed
- `docker build .` — passed
- All existing tests continue to pass unchanged
- No DNS client mocking — notification tests use `httptest` servers

closes #62

Co-authored-by: clawbot <clawbot@noreply.git.eeqj.de>
Reviewed-on: #87
Co-authored-by: clawbot <clawbot@noreply.example.org>
Co-committed-by: clawbot <clawbot@noreply.example.org>
This commit was merged in pull request #87.
This commit is contained in:
2026-03-22 07:14:59 +01:00
committed by Jeffrey Paul
parent f788037bfb
commit 23f115053b
4 changed files with 693 additions and 12 deletions

View File

@@ -113,6 +113,8 @@ type Service struct {
slackWebhookURL *url.URL
mattermostWebhookURL *url.URL
history *AlertHistory
retryConfig RetryConfig
sleepFn func(time.Duration) <-chan time.Time
}
// New creates a new notify Service.
@@ -203,13 +205,19 @@ func (svc *Service) dispatchNtfy(
go func() {
notifyCtx := context.WithoutCancel(ctx)
err := svc.sendNtfy(
notifyCtx, svc.ntfyURL,
title, message, priority,
err := svc.deliverWithRetry(
notifyCtx, "ntfy",
func(c context.Context) error {
return svc.sendNtfy(
c, svc.ntfyURL,
title, message, priority,
)
},
)
if err != nil {
svc.log.Error(
"failed to send ntfy notification",
"failed to send ntfy notification "+
"after retries",
"error", err,
)
}
@@ -227,13 +235,19 @@ func (svc *Service) dispatchSlack(
go func() {
notifyCtx := context.WithoutCancel(ctx)
err := svc.sendSlack(
notifyCtx, svc.slackWebhookURL,
title, message, priority,
err := svc.deliverWithRetry(
notifyCtx, "slack",
func(c context.Context) error {
return svc.sendSlack(
c, svc.slackWebhookURL,
title, message, priority,
)
},
)
if err != nil {
svc.log.Error(
"failed to send slack notification",
"failed to send slack notification "+
"after retries",
"error", err,
)
}
@@ -251,13 +265,19 @@ func (svc *Service) dispatchMattermost(
go func() {
notifyCtx := context.WithoutCancel(ctx)
err := svc.sendSlack(
notifyCtx, svc.mattermostWebhookURL,
title, message, priority,
err := svc.deliverWithRetry(
notifyCtx, "mattermost",
func(c context.Context) error {
return svc.sendSlack(
c, svc.mattermostWebhookURL,
title, message, priority,
)
},
)
if err != nil {
svc.log.Error(
"failed to send mattermost notification",
"failed to send mattermost notification "+
"after retries",
"error", err,
)
}