feat: add retry with exponential backoff for notification delivery
All checks were successful
check / check (push) Successful in 42s

Notifications were fire-and-forget: if Slack, Mattermost, or ntfy was
temporarily down, notifications of changes were silently lost. This adds
automatic retry with exponential backoff and jitter to all notification
endpoints.

Implementation:
- New retry.go with configurable RetryConfig (max retries, base delay,
  max delay) and exponential backoff with ±25% jitter
- Each dispatch goroutine now wraps its send call in deliverWithRetry
- Default: 3 retries (4 total attempts), 1s base delay, 10s max delay
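- Context-aware: deliverWithRetry respects cancellation during retry
  sleep; note that the dispatch goroutines pass
  context.WithoutCancel(ctx), so cancelling the caller's context does
  not interrupt their retries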
- Structured logging on each retry attempt and on final success after
  retry

All existing tests continue to pass. New tests cover:
- Backoff calculation (increase, cap)
- Retry success on first attempt (no unnecessary retries)
- Retry on transient failure (succeeds after N attempts)
- Exhausted retries (returns last error)
- Context cancellation during retry sleep
- Integration: SendNotification retries transient 500s
- Integration: all three endpoints retry independently
- Integration: permanent failure exhausts retries

closes #62
This commit is contained in:
clawbot
2026-03-10 11:11:32 -07:00
parent b64db3e10f
commit 31bd6c3228
4 changed files with 693 additions and 12 deletions

View File

@@ -113,6 +113,8 @@ type Service struct {
slackWebhookURL *url.URL
mattermostWebhookURL *url.URL
history *AlertHistory
retryConfig RetryConfig
sleepFn func(time.Duration) <-chan time.Time
}
// New creates a new notify Service.
@@ -203,13 +205,19 @@ func (svc *Service) dispatchNtfy(
go func() {
notifyCtx := context.WithoutCancel(ctx)
err := svc.sendNtfy(
notifyCtx, svc.ntfyURL,
title, message, priority,
err := svc.deliverWithRetry(
notifyCtx, "ntfy",
func(c context.Context) error {
return svc.sendNtfy(
c, svc.ntfyURL,
title, message, priority,
)
},
)
if err != nil {
svc.log.Error(
"failed to send ntfy notification",
"failed to send ntfy notification "+
"after retries",
"error", err,
)
}
@@ -227,13 +235,19 @@ func (svc *Service) dispatchSlack(
go func() {
notifyCtx := context.WithoutCancel(ctx)
err := svc.sendSlack(
notifyCtx, svc.slackWebhookURL,
title, message, priority,
err := svc.deliverWithRetry(
notifyCtx, "slack",
func(c context.Context) error {
return svc.sendSlack(
c, svc.slackWebhookURL,
title, message, priority,
)
},
)
if err != nil {
svc.log.Error(
"failed to send slack notification",
"failed to send slack notification "+
"after retries",
"error", err,
)
}
@@ -251,13 +265,19 @@ func (svc *Service) dispatchMattermost(
go func() {
notifyCtx := context.WithoutCancel(ctx)
err := svc.sendSlack(
notifyCtx, svc.mattermostWebhookURL,
title, message, priority,
err := svc.deliverWithRetry(
notifyCtx, "mattermost",
func(c context.Context) error {
return svc.sendSlack(
c, svc.mattermostWebhookURL,
title, message, priority,
)
},
)
if err != nil {
svc.log.Error(
"failed to send mattermost notification",
"failed to send mattermost notification "+
"after retries",
"error", err,
)
}