refactor: event-driven delivery engine with channel notifications and timer-based retries
All checks were successful
check / check (push) Successful in 58s

Replace the polling-based delivery engine with a fully event-driven
architecture using Go channels and goroutines:

- Webhook handler notifies engine via buffered channel after creating
  delivery records, with inline event data for payloads < 16KB
- Large payloads (>= 16KB) use pointer semantics (Body *string = nil)
  and are fetched from DB on demand, keeping channel memory bounded
- Failed retry-target deliveries schedule Go timers with exponential
  backoff; timers fire into a separate retry channel when ready
- On startup, engine scans DB once to recover interrupted deliveries
  (pending processed immediately, retrying get timers for remaining
  backoff)
- DB stores delivery status for crash recovery only, not for
  inter-component communication during normal operation
- delivery.Notifier interface decouples handlers from engine; fx wires
  *Engine as Notifier

No more periodic polling. No more wasted cycles when idle.
This commit is contained in:
clawbot 2026-03-01 21:46:16 -08:00
parent 8f62fde8e9
commit 5e683af2a4
6 changed files with 404 additions and 53 deletions

View File

@ -463,11 +463,12 @@ External Service
1. Look up Entrypoint by UUID
2. Capture full request as Event
3. Queue Delivery to each active Target
4. Notify Engine via channel
┌──────────────┐
│ Delivery │◄── retry timers
│ Engine │ (backoff)
└──────┬───────┘
┌────────────────────┼────────────────────┐
@ -577,7 +578,7 @@ webhooker/
│ ├── globals/
│ │ └── globals.go # Build-time variables (appname, version, arch)
│ ├── delivery/
│ │ └── engine.go # Background delivery engine (fx lifecycle)
│ │ └── engine.go # Event-driven delivery engine (channel + timer based)
│ ├── handlers/
│ │ ├── handlers.go # Base handler struct, JSON helpers, template rendering
│ │ ├── auth.go # Login, logout handlers
@ -627,11 +628,14 @@ Components are wired via Uber fx in this order:
7. `session.New` — Cookie-based session manager
8. `handlers.New` — HTTP handlers
9. `middleware.New` — HTTP middleware
10. `delivery.New` — Background delivery engine
11. `server.New` — HTTP server and router
10. `delivery.New` — Event-driven delivery engine
11. `delivery.Engine` → `handlers.DeliveryNotifier` — interface bridge
12. `server.New` — HTTP server and router
The server starts via `fx.Invoke(func(*server.Server, *delivery.Engine)
{})` which triggers the fx lifecycle hooks in dependency order. The
`DeliveryNotifier` interface allows the webhook handler to notify the
delivery engine of new work without a direct package dependency.
### Middleware Stack
@ -720,7 +724,7 @@ linted, tested, and compiled.
- [x] Per-webhook database lifecycle management (create on webhook
creation, delete on webhook removal)
- [x] `WebhookDBManager` component with lazy connection pooling
- [x] Delivery engine polls all per-webhook DBs for pending deliveries
- [x] Event-driven delivery engine (channel notifications + timer-based retries)
- [x] Database target type marks delivery as immediately successful
(events are already in the per-webhook DB)

View File

@ -39,6 +39,9 @@ func main() {
handlers.New,
middleware.New,
delivery.New,
// Wire *delivery.Engine as delivery.Notifier so the
// webhook handler can notify the engine of new deliveries.
func(e *delivery.Engine) delivery.Notifier { return e },
server.New,
),
fx.Invoke(func(*server.Server, *delivery.Engine) {}),

View File

@ -18,8 +18,19 @@ import (
)
const (
// pollInterval is how often the engine checks for pending deliveries.
pollInterval = 2 * time.Second
// notifyChannelSize is the buffer size for the delivery notification channel.
// Sized large enough that the webhook handler should never block.
notifyChannelSize = 1000
// retryChannelSize is the buffer size for the retry channel. Timer-fired
// retries are sent here for processing by the engine goroutine.
retryChannelSize = 1000
// MaxInlineBodySize is the maximum event body size that will be carried
// inline in a Notification through the channel. Bodies at or above this
// size are left nil and fetched from the per-webhook database on demand.
// This keeps channel buffer memory bounded under high traffic.
MaxInlineBodySize = 16 * 1024
// httpClientTimeout is the timeout for outbound HTTP requests.
httpClientTimeout = 30 * time.Second
@ -28,6 +39,33 @@ const (
maxBodyLog = 4096
)
// Notification carries event data through the delivery notification channel.
// The Body field is a pointer: non-nil for payloads under MaxInlineBodySize
// (16 KB), nil for larger payloads. When nil, the engine fetches the body
// from the per-webhook database using EventID. This keeps channel buffer
// memory bounded regardless of payload sizes during high traffic.
type Notification struct {
	WebhookID   string  // webhook the event arrived on; selects the per-webhook DB
	EventID     string  // ID of the stored Event row; used to fetch the body when Body is nil
	Method      string  // HTTP method of the original request
	Headers     string  // request headers in string form (as stored on the Event)
	ContentType string  // Content-Type of the original request
	Body        *string // nil if body >= MaxInlineBodySize; fetch from DB by EventID
}

// Notifier is the interface for notifying the delivery engine about new
// deliveries. Implemented by Engine and injected into handlers.
type Notifier interface {
	// Notify signals that new delivery records exist for the event described
	// by n. The Engine implementation is non-blocking (drops with a warning
	// when the channel is full; recovery happens on the next restart).
	Notify(n Notification)
}

// retryRequest carries the information needed to retry a specific delivery.
// Sent from timer goroutines to the engine's retry channel.
type retryRequest struct {
	webhookID  string // webhook whose per-webhook DB holds the delivery
	deliveryID string // delivery row to re-attempt
}
// HTTPTargetConfig holds configuration for http and retry target types.
type HTTPTargetConfig struct {
URL string `json:"url"`
@ -45,9 +83,14 @@ type EngineParams struct {
Logger *logger.Logger
}
// Engine processes queued deliveries in the background.
// It iterates over all active webhooks and polls each webhook's
// per-webhook database for pending deliveries.
// Engine processes queued deliveries in the background using an
// event-driven architecture. New deliveries are signaled via a buffered
// channel from the webhook handler and processed immediately. Failed
// deliveries that need retry are scheduled via Go timers with exponential
// backoff — each timer fires into a separate retry channel when the
// backoff period expires. The database stores delivery status for crash
// recovery only; on startup the engine scans for interrupted deliveries
// and re-queues them.
type Engine struct {
database *database.Database
dbManager *database.WebhookDBManager
@ -55,6 +98,8 @@ type Engine struct {
client *http.Client
cancel context.CancelFunc
wg sync.WaitGroup
notifyCh chan Notification
retryCh chan retryRequest
}
// New creates and registers the delivery engine with the fx lifecycle.
@ -66,6 +111,8 @@ func New(lc fx.Lifecycle, params EngineParams) *Engine {
client: &http.Client{
Timeout: httpClientTimeout,
},
notifyCh: make(chan Notification, notifyChannelSize),
retryCh: make(chan retryRequest, retryChannelSize),
}
lc.Append(fx.Hook{
@ -97,29 +144,52 @@ func (e *Engine) stop() {
e.log.Info("delivery engine stopped")
}
// Notify signals the delivery engine that new deliveries are available.
// This is called by the webhook handler after creating delivery records.
// The notification carries the event data inline (with body pointer
// semantics for memory efficiency). The call is non-blocking; if the
// channel is full, a warning is logged and the deliveries will be
// recovered on the next engine restart.
func (e *Engine) Notify(n Notification) {
	// Attempt a non-blocking send so the webhook handler never stalls.
	select {
	case e.notifyCh <- n:
		return
	default:
	}
	// Channel saturated: drop the notification. The delivery rows remain
	// in the DB and are picked up by the startup recovery scan.
	e.log.Warn("delivery notification channel full, deliveries will be recovered on restart",
		"webhook_id", n.WebhookID,
		"event_id", n.EventID,
	)
}
// run is the engine's main goroutine. It first recovers deliveries that
// were interrupted by a previous shutdown, then services the notification
// and retry channels until the context is cancelled. There is no periodic
// polling: all work arrives through channels.
//
// Fix: removed leftover polling remnants (a pollInterval ticker and a
// `case <-ticker.C: e.processPending(ctx)` branch) that referenced symbols
// deleted by this change and contradicted the event-driven design.
func (e *Engine) run(ctx context.Context) {
	defer e.wg.Done()

	// On startup, recover any pending or retrying deliveries that were
	// interrupted by an unexpected shutdown. Pending deliveries are
	// processed immediately; retrying deliveries get timers scheduled
	// for their remaining backoff.
	e.recoverInFlight(ctx)

	for {
		select {
		case <-ctx.Done():
			return
		case n := <-e.notifyCh:
			e.processNotification(ctx, n)
		case req := <-e.retryCh:
			e.processRetryDelivery(ctx, req)
		}
	}
}
// processPending iterates over all active webhooks and processes pending
// deliveries from each webhook's per-webhook database.
func (e *Engine) processPending(ctx context.Context) {
// Get all active webhook IDs from the main application database
// recoverInFlight scans all webhooks on startup for deliveries that were
// interrupted by an unexpected shutdown. Pending deliveries are processed
// immediately; retrying deliveries get timers scheduled for their
// remaining backoff period.
func (e *Engine) recoverInFlight(ctx context.Context) {
var webhookIDs []string
if err := e.database.DB().Model(&database.Webhook{}).Pluck("id", &webhookIDs).Error; err != nil {
e.log.Error("failed to query webhook IDs", "error", err)
e.log.Error("failed to query webhook IDs for recovery", "error", err)
return
}
@ -128,18 +198,200 @@ func (e *Engine) processPending(ctx context.Context) {
case <-ctx.Done():
return
default:
// Only process webhooks that have an event database file
if !e.dbManager.DBExists(webhookID) {
}
if !e.dbManager.DBExists(webhookID) {
continue
}
e.recoverWebhookDeliveries(ctx, webhookID)
}
}
// recoverWebhookDeliveries recovers pending and retrying deliveries for
// a single webhook. Pending deliveries are processed directly (loading
// event data from DB); retrying deliveries get timers scheduled based on
// the elapsed time since the last attempt.
func (e *Engine) recoverWebhookDeliveries(ctx context.Context, webhookID string) {
	webhookDB, err := e.dbManager.GetDB(webhookID)
	if err != nil {
		e.log.Error("failed to get webhook database for recovery",
			"webhook_id", webhookID,
			"error", err,
		)
		return
	}

	// Check for pending deliveries and process them immediately.
	// NOTE(review): the Count error is ignored here — on a query failure
	// pendingCount stays 0 and pending recovery is silently skipped.
	var pendingCount int64
	webhookDB.Model(&database.Delivery{}).
		Where("status = ?", database.DeliveryStatusPending).
		Count(&pendingCount)
	if pendingCount > 0 {
		e.log.Info("recovering pending deliveries",
			"webhook_id", webhookID,
			"count", pendingCount,
		)
		e.processWebhookPendingDeliveries(ctx, webhookID)
	}

	// Schedule timers for retrying deliveries based on remaining backoff.
	var retrying []database.Delivery
	if err := webhookDB.Where("status = ?", database.DeliveryStatusRetrying).
		Find(&retrying).Error; err != nil {
		e.log.Error("failed to query retrying deliveries for recovery",
			"webhook_id", webhookID,
			"error", err,
		)
		return
	}
	for i := range retrying {
		d := &retrying[i]

		// The attempt number is derived from how many DeliveryResult rows
		// exist for this delivery (one row per past attempt).
		var resultCount int64
		webhookDB.Model(&database.DeliveryResult{}).
			Where("delivery_id = ?", d.ID).
			Count(&resultCount)
		attemptNum := int(resultCount)

		// Calculate remaining backoff from last attempt. Backoff is
		// 2^(attemptNum-1) seconds, with the shift clamped to [0, 30] so
		// the Duration cannot overflow. If the last result cannot be
		// loaded, remaining stays 0 and the retry fires immediately.
		remaining := time.Duration(0)
		var lastResult database.DeliveryResult
		if err := webhookDB.Where("delivery_id = ?", d.ID).
			Order("created_at DESC").
			First(&lastResult).Error; err == nil {
			shift := attemptNum - 1
			if shift < 0 {
				shift = 0
			}
			if shift > 30 {
				shift = 30
			}
			backoff := time.Duration(1<<uint(shift)) * time.Second //nolint:gosec // bounded above
			elapsed := time.Since(lastResult.CreatedAt)
			remaining = backoff - elapsed
			if remaining < 0 {
				remaining = 0
			}
		}
		e.log.Info("recovering retrying delivery",
			"webhook_id", webhookID,
			"delivery_id", d.ID,
			"attempt", attemptNum,
			"remaining_backoff", remaining,
		)
		e.scheduleRetry(webhookID, d.ID, remaining)
	}
}
// processNotification handles a delivery notification from the webhook
// handler. It uses the inline event data from the notification (avoiding
// a DB round-trip for the event) and only fetches the body from DB when
// it was too large to carry inline (Body pointer is nil).
//
// Fix: removed a stray leftover line `e.processWebhookDeliveries(ctx,
// webhookID)` inside the delivery loop — `webhookID` is not defined in
// this scope and the call belonged to the removed polling path.
func (e *Engine) processNotification(ctx context.Context, n Notification) {
	webhookDB, err := e.dbManager.GetDB(n.WebhookID)
	if err != nil {
		e.log.Error("failed to get webhook database",
			"webhook_id", n.WebhookID,
			"error", err,
		)
		return
	}

	// Build the Event from the notification's inline data.
	event := database.Event{
		Method:      n.Method,
		Headers:     n.Headers,
		ContentType: n.ContentType,
	}
	event.ID = n.EventID
	event.WebhookID = n.WebhookID
	if n.Body != nil {
		event.Body = *n.Body
	} else {
		// Body was too large for inline transport — fetch from DB.
		var dbEvent database.Event
		if err := webhookDB.Select("body").
			First(&dbEvent, "id = ?", n.EventID).Error; err != nil {
			e.log.Error("failed to fetch event body from database",
				"event_id", n.EventID,
				"error", err,
			)
			return
		}
		event.Body = dbEvent.Body
	}

	// Query pending deliveries for this specific event.
	var deliveries []database.Delivery
	result := webhookDB.
		Where("event_id = ? AND status = ?", n.EventID, database.DeliveryStatusPending).
		Find(&deliveries)
	if result.Error != nil {
		e.log.Error("failed to query pending deliveries",
			"webhook_id", n.WebhookID,
			"event_id", n.EventID,
			"error", result.Error,
		)
		return
	}
	if len(deliveries) == 0 {
		return
	}

	// Collect unique target IDs and load targets from the main DB
	// (Target rows live in the main application DB, not per-webhook DBs).
	seen := make(map[string]bool)
	targetIDs := make([]string, 0, len(deliveries))
	for _, d := range deliveries {
		if !seen[d.TargetID] {
			targetIDs = append(targetIDs, d.TargetID)
			seen[d.TargetID] = true
		}
	}
	var targets []database.Target
	if err := e.database.DB().Where("id IN ?", targetIDs).Find(&targets).Error; err != nil {
		e.log.Error("failed to load targets from main DB", "error", err)
		return
	}
	targetMap := make(map[string]database.Target, len(targets))
	for _, t := range targets {
		targetMap[t.ID] = t
	}

	for i := range deliveries {
		select {
		case <-ctx.Done():
			return
		default:
			target, ok := targetMap[deliveries[i].TargetID]
			if !ok {
				e.log.Error("target not found for delivery",
					"delivery_id", deliveries[i].ID,
					"target_id", deliveries[i].TargetID,
				)
				continue
			}
			deliveries[i].Event = event
			deliveries[i].Target = target
			e.processDelivery(ctx, webhookDB, &deliveries[i])
		}
	}
}
// processWebhookDeliveries polls a single webhook's database for pending
// deliveries and processes them.
func (e *Engine) processWebhookDeliveries(ctx context.Context, webhookID string) {
// processWebhookPendingDeliveries queries a single webhook's database for
// all pending deliveries and processes them. Used for crash recovery where
// we don't have inline event data — everything is loaded from the DB.
func (e *Engine) processWebhookPendingDeliveries(ctx context.Context, webhookID string) {
webhookDB, err := e.dbManager.GetDB(webhookID)
if err != nil {
e.log.Error("failed to get webhook database",
@ -149,14 +401,9 @@ func (e *Engine) processWebhookDeliveries(ctx context.Context, webhookID string)
return
}
// Query pending and retrying deliveries from the per-webhook DB.
// Preload Event (same DB) but NOT Target (Target is in the main DB).
var deliveries []database.Delivery
result := webhookDB.
Where("status IN ?", []database.DeliveryStatus{
database.DeliveryStatusPending,
database.DeliveryStatusRetrying,
}).
Where("status = ?", database.DeliveryStatusPending).
Preload("Event").
Find(&deliveries)
@ -212,6 +459,79 @@ func (e *Engine) processWebhookDeliveries(ctx context.Context, webhookID string)
}
}
// processRetryDelivery handles a single retry delivery triggered by a
// backoff timer. It loads the delivery and target from the database and
// re-attempts delivery.
func (e *Engine) processRetryDelivery(ctx context.Context, req retryRequest) {
	webhookDB, dbErr := e.dbManager.GetDB(req.webhookID)
	if dbErr != nil {
		e.log.Error("failed to get webhook database for retry",
			"webhook_id", req.webhookID,
			"delivery_id", req.deliveryID,
			"error", dbErr,
		)
		return
	}

	var dlv database.Delivery
	loadErr := webhookDB.Preload("Event").
		First(&dlv, "id = ?", req.deliveryID).Error
	if loadErr != nil {
		e.log.Error("failed to load delivery for retry",
			"delivery_id", req.deliveryID,
			"error", loadErr,
		)
		return
	}

	// The delivery may have been cancelled or manually resolved while the
	// timer was pending — only proceed if it is still marked retrying.
	if dlv.Status != database.DeliveryStatusRetrying {
		e.log.Debug("skipping retry for delivery no longer in retrying status",
			"delivery_id", dlv.ID,
			"status", dlv.Status,
		)
		return
	}

	// Targets live in the main application DB, not the per-webhook DB.
	var tgt database.Target
	if err := e.database.DB().First(&tgt, "id = ?", dlv.TargetID).Error; err != nil {
		e.log.Error("failed to load target for retry",
			"delivery_id", dlv.ID,
			"target_id", dlv.TargetID,
			"error", err,
		)
		return
	}
	dlv.Target = tgt

	e.processDelivery(ctx, webhookDB, &dlv)
}
// scheduleRetry creates a Go timer that fires after the given delay and
// sends a retry request to the engine's retry channel. This is the
// mechanism for exponential backoff — no periodic DB scanning needed.
func (e *Engine) scheduleRetry(webhookID, deliveryID string, delay time.Duration) {
	e.log.Debug("scheduling delivery retry",
		"webhook_id", webhookID,
		"delivery_id", deliveryID,
		"delay", delay,
	)

	req := retryRequest{webhookID: webhookID, deliveryID: deliveryID}
	time.AfterFunc(delay, func() {
		// Non-blocking send: if the retry channel is saturated, the
		// delivery stays in retrying status in the DB and is picked up
		// by the startup recovery scan instead.
		select {
		case e.retryCh <- req:
		default:
			e.log.Warn("retry channel full, delivery will be recovered on restart",
				"delivery_id", deliveryID,
			)
		}
	})
}
func (e *Engine) processDelivery(ctx context.Context, webhookDB *gorm.DB, d *database.Delivery) {
switch d.Target.Type {
case database.TargetTypeHTTP:
@ -277,24 +597,8 @@ func (e *Engine) deliverRetry(_ context.Context, webhookDB *gorm.DB, d *database
webhookDB.Model(&database.DeliveryResult{}).Where("delivery_id = ?", d.ID).Count(&resultCount)
attemptNum := int(resultCount) + 1
// Check if we should wait before retrying (exponential backoff)
if attemptNum > 1 {
var lastResult database.DeliveryResult
lookupErr := webhookDB.Where("delivery_id = ?", d.ID).Order("created_at DESC").First(&lastResult).Error
if lookupErr == nil {
shift := attemptNum - 2
if shift > 30 {
shift = 30
}
backoff := time.Duration(1<<uint(shift)) * time.Second //nolint:gosec // bounded above
nextAttempt := lastResult.CreatedAt.Add(backoff)
if time.Now().UTC().Before(nextAttempt) {
// Not time to retry yet
return
}
}
}
// Attempt delivery immediately — backoff is handled by the timer
// that triggered this call, not by polling.
statusCode, respBody, duration, err := e.doHTTPRequest(cfg, &d.Event)
success := err == nil && statusCode >= 200 && statusCode < 300
@ -319,6 +623,16 @@ func (e *Engine) deliverRetry(_ context.Context, webhookDB *gorm.DB, d *database
e.updateDeliveryStatus(webhookDB, d, database.DeliveryStatusFailed)
} else {
e.updateDeliveryStatus(webhookDB, d, database.DeliveryStatusRetrying)
// Schedule a timer for the next retry with exponential backoff.
// The timer will fire and send a retryRequest to the engine's
// retry channel, which triggers processRetryDelivery.
shift := attemptNum - 1
if shift > 30 {
shift = 30
}
backoff := time.Duration(1<<uint(shift)) * time.Second //nolint:gosec // bounded above
e.scheduleRetry(d.Event.WebhookID, d.ID, backoff)
}
}

View File

@ -9,6 +9,7 @@ import (
"go.uber.org/fx"
"sneak.berlin/go/webhooker/internal/database"
"sneak.berlin/go/webhooker/internal/delivery"
"sneak.berlin/go/webhooker/internal/globals"
"sneak.berlin/go/webhooker/internal/healthcheck"
"sneak.berlin/go/webhooker/internal/logger"
@ -25,6 +26,7 @@ type HandlersParams struct {
WebhookDBMgr *database.WebhookDBManager
Healthcheck *healthcheck.Healthcheck
Session *session.Session
Notifier delivery.Notifier
}
type Handlers struct {
@ -34,6 +36,7 @@ type Handlers struct {
db *database.Database
dbMgr *database.WebhookDBManager
session *session.Session
notifier delivery.Notifier
templates map[string]*template.Template
}
@ -57,6 +60,7 @@ func New(lc fx.Lifecycle, params HandlersParams) (*Handlers, error) {
s.db = params.Database
s.dbMgr = params.WebhookDBMgr
s.session = params.Session
s.notifier = params.Notifier
// Parse all page templates once at startup
s.templates = map[string]*template.Template{

View File

@ -12,12 +12,18 @@ import (
"go.uber.org/fx/fxtest"
"sneak.berlin/go/webhooker/internal/config"
"sneak.berlin/go/webhooker/internal/database"
"sneak.berlin/go/webhooker/internal/delivery"
"sneak.berlin/go/webhooker/internal/globals"
"sneak.berlin/go/webhooker/internal/healthcheck"
"sneak.berlin/go/webhooker/internal/logger"
"sneak.berlin/go/webhooker/internal/session"
)
// noopNotifier is a no-op delivery.Notifier for tests. It satisfies the
// Notifier dependency of handlers.New without requiring a running
// delivery engine.
type noopNotifier struct{}

// Notify discards the notification.
func (n *noopNotifier) Notify(delivery.Notification) {}
func TestHandleIndex(t *testing.T) {
var h *Handlers
@ -41,6 +47,7 @@ func TestHandleIndex(t *testing.T) {
database.NewWebhookDBManager,
healthcheck.New,
session.New,
func() delivery.Notifier { return &noopNotifier{} },
New,
),
fx.Populate(&h),
@ -76,6 +83,7 @@ func TestRenderTemplate(t *testing.T) {
database.NewWebhookDBManager,
healthcheck.New,
session.New,
func() delivery.Notifier { return &noopNotifier{} },
New,
),
fx.Populate(&h),

View File

@ -7,6 +7,7 @@ import (
"github.com/go-chi/chi"
"sneak.berlin/go/webhooker/internal/database"
"sneak.berlin/go/webhooker/internal/delivery"
)
const (
@ -117,12 +118,12 @@ func (h *Handlers) HandleWebhook() http.HandlerFunc {
// Create delivery records for each active target
for i := range targets {
delivery := &database.Delivery{
dlv := &database.Delivery{
EventID: event.ID,
TargetID: targets[i].ID,
Status: database.DeliveryStatusPending,
}
if err := tx.Create(delivery).Error; err != nil {
if err := tx.Create(dlv).Error; err != nil {
tx.Rollback()
h.log.Error("failed to create delivery",
"target_id", targets[i].ID,
@ -139,6 +140,23 @@ func (h *Handlers) HandleWebhook() http.HandlerFunc {
return
}
// Notify the delivery engine with inline event data so it can
// process deliveries immediately without a DB round-trip.
// Large bodies (>= 16KB) are left nil to keep channel memory
// bounded; the engine fetches them from DB on demand.
n := delivery.Notification{
WebhookID: entrypoint.WebhookID,
EventID: event.ID,
Method: event.Method,
Headers: event.Headers,
ContentType: event.ContentType,
}
bodyStr := string(body)
if len(body) < delivery.MaxInlineBodySize {
n.Body = &bodyStr
}
h.notifier.Notify(n)
h.log.Info("webhook event created",
"event_id", event.ID,
"webhook_id", entrypoint.WebhookID,