feat: parallel fan-out delivery + circuit breaker for retry targets
All checks were successful
check / check (push) Successful in 1m52s
All checks were successful
check / check (push) Successful in 1m52s
- Fan out all targets for an event in parallel goroutines (fire-and-forget) - Add per-target circuit breaker for retry targets (closed/open/half-open) - Circuit breaker trips after 5 consecutive failures, 30s cooldown - Open circuit skips delivery and reschedules after cooldown - Half-open allows one probe delivery to test recovery - HTTP/database/log targets unaffected (no circuit breaker) - Recovery path also fans out in parallel - Update README with parallel delivery and circuit breaker docs
This commit is contained in:
@@ -104,6 +104,11 @@ type EngineParams struct {
|
||||
// DeliveryTask so retries also avoid unnecessary DB reads. The database
|
||||
// stores delivery status for crash recovery only; on startup the engine
|
||||
// scans for interrupted deliveries and re-queues them.
|
||||
//
|
||||
// All targets for a single event are delivered in parallel — each
|
||||
// DeliveryTask is dispatched in its own goroutine for maximum fan-out
|
||||
// speed. Retry targets are protected by a per-target circuit breaker
|
||||
// that stops hammering a down target after consecutive failures.
|
||||
type Engine struct {
|
||||
database *database.Database
|
||||
dbManager *database.WebhookDBManager
|
||||
@@ -113,6 +118,11 @@ type Engine struct {
|
||||
wg sync.WaitGroup
|
||||
notifyCh chan []DeliveryTask
|
||||
retryCh chan DeliveryTask
|
||||
|
||||
// circuitBreakers stores a *CircuitBreaker per target ID. Only used
|
||||
// for retry targets — HTTP, database, and log targets do not need
|
||||
// circuit breakers because they either fire once or are local ops.
|
||||
circuitBreakers sync.Map
|
||||
}
|
||||
|
||||
// New creates and registers the delivery engine with the fx lifecycle.
|
||||
@@ -347,10 +357,12 @@ func (e *Engine) recoverWebhookDeliveries(ctx context.Context, webhookID string)
|
||||
}
|
||||
|
||||
// processDeliveryTasks handles a batch of delivery tasks from the webhook
|
||||
// handler. In the happy path (body ≤ 16KB), the engine delivers without
|
||||
// reading from any database — it trusts the handler's inline data and
|
||||
// only touches the DB to record results. For large bodies (body > 16KB),
|
||||
// the body is fetched from the per-webhook database on demand.
|
||||
// handler. Each task is dispatched in its own goroutine for parallel
|
||||
// fan-out — all targets for a single event start delivering simultaneously.
|
||||
// In the happy path (body ≤ 16KB), the engine delivers without reading
|
||||
// from any database — it trusts the handler's inline data and only touches
|
||||
// the DB to record results. For large bodies (body > 16KB), the body is
|
||||
// fetched once and shared across all goroutines in the batch.
|
||||
func (e *Engine) processDeliveryTasks(ctx context.Context, tasks []DeliveryTask) {
|
||||
if len(tasks) == 0 {
|
||||
return
|
||||
@@ -367,10 +379,25 @@ func (e *Engine) processDeliveryTasks(ctx context.Context, tasks []DeliveryTask)
|
||||
return
|
||||
}
|
||||
|
||||
// For the large-body case, we may need to fetch the event body once
|
||||
// for all tasks sharing the same event. Cache it here.
|
||||
// For the large-body case, pre-fetch the event body once before
|
||||
// fanning out so all goroutines share the same data.
|
||||
var fetchedBody *string
|
||||
if tasks[0].Body == nil {
|
||||
var dbEvent database.Event
|
||||
if err := webhookDB.Select("body").
|
||||
First(&dbEvent, "id = ?", tasks[0].EventID).Error; err != nil {
|
||||
e.log.Error("failed to fetch event body from database",
|
||||
"event_id", tasks[0].EventID,
|
||||
"error", err,
|
||||
)
|
||||
return
|
||||
}
|
||||
fetchedBody = &dbEvent.Body
|
||||
}
|
||||
|
||||
// Fan out: spin up a goroutine per task for parallel delivery.
|
||||
// Each goroutine is independent (fire-and-forget) and records its
|
||||
// own result. No need to wait for all goroutines to finish.
|
||||
for i := range tasks {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -378,60 +405,61 @@ func (e *Engine) processDeliveryTasks(ctx context.Context, tasks []DeliveryTask)
|
||||
default:
|
||||
}
|
||||
|
||||
task := &tasks[i]
|
||||
task := tasks[i] // copy for goroutine closure safety
|
||||
|
||||
// Build Event from task data
|
||||
event := database.Event{
|
||||
Method: task.Method,
|
||||
Headers: task.Headers,
|
||||
ContentType: task.ContentType,
|
||||
}
|
||||
event.ID = task.EventID
|
||||
event.WebhookID = task.WebhookID
|
||||
|
||||
if task.Body != nil {
|
||||
// Happy path: body inline, no DB read needed
|
||||
event.Body = *task.Body
|
||||
} else {
|
||||
// Large body path: fetch from per-webhook DB (once per batch)
|
||||
if fetchedBody == nil {
|
||||
var dbEvent database.Event
|
||||
if err := webhookDB.Select("body").
|
||||
First(&dbEvent, "id = ?", task.EventID).Error; err != nil {
|
||||
e.log.Error("failed to fetch event body from database",
|
||||
"event_id", task.EventID,
|
||||
"error", err,
|
||||
)
|
||||
continue
|
||||
}
|
||||
fetchedBody = &dbEvent.Body
|
||||
}
|
||||
event.Body = *fetchedBody
|
||||
}
|
||||
|
||||
// Build Target from task data (no main DB query needed)
|
||||
target := database.Target{
|
||||
Name: task.TargetName,
|
||||
Type: task.TargetType,
|
||||
Config: task.TargetConfig,
|
||||
MaxRetries: task.MaxRetries,
|
||||
}
|
||||
target.ID = task.TargetID
|
||||
|
||||
// Build Delivery struct for the processing chain
|
||||
d := &database.Delivery{
|
||||
EventID: task.EventID,
|
||||
TargetID: task.TargetID,
|
||||
Status: database.DeliveryStatusPending,
|
||||
Event: event,
|
||||
Target: target,
|
||||
}
|
||||
d.ID = task.DeliveryID
|
||||
|
||||
e.processDelivery(ctx, webhookDB, d, task)
|
||||
go func() {
|
||||
e.deliverTask(ctx, webhookDB, &task, fetchedBody)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// deliverTask prepares and executes a single delivery task. Called from
|
||||
// a dedicated goroutine for parallel fan-out.
|
||||
func (e *Engine) deliverTask(ctx context.Context, webhookDB *gorm.DB, task *DeliveryTask, fetchedBody *string) {
|
||||
// Build Event from task data
|
||||
event := database.Event{
|
||||
Method: task.Method,
|
||||
Headers: task.Headers,
|
||||
ContentType: task.ContentType,
|
||||
}
|
||||
event.ID = task.EventID
|
||||
event.WebhookID = task.WebhookID
|
||||
|
||||
switch {
|
||||
case task.Body != nil:
|
||||
event.Body = *task.Body
|
||||
case fetchedBody != nil:
|
||||
event.Body = *fetchedBody
|
||||
default:
|
||||
e.log.Error("no body available for delivery task",
|
||||
"delivery_id", task.DeliveryID,
|
||||
"event_id", task.EventID,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
// Build Target from task data (no main DB query needed)
|
||||
target := database.Target{
|
||||
Name: task.TargetName,
|
||||
Type: task.TargetType,
|
||||
Config: task.TargetConfig,
|
||||
MaxRetries: task.MaxRetries,
|
||||
}
|
||||
target.ID = task.TargetID
|
||||
|
||||
// Build Delivery struct for the processing chain
|
||||
d := &database.Delivery{
|
||||
EventID: task.EventID,
|
||||
TargetID: task.TargetID,
|
||||
Status: database.DeliveryStatusPending,
|
||||
Event: event,
|
||||
Target: target,
|
||||
}
|
||||
d.ID = task.DeliveryID
|
||||
|
||||
e.processDelivery(ctx, webhookDB, d, task)
|
||||
}
|
||||
|
||||
// processRetryTask handles a single delivery task fired by a retry timer.
|
||||
// The task carries all data needed for delivery (same as the initial
|
||||
// notification). The only DB read is a status check to verify the delivery
|
||||
@@ -562,41 +590,47 @@ func (e *Engine) processWebhookPendingDeliveries(ctx context.Context, webhookID
|
||||
targetMap[t.ID] = t
|
||||
}
|
||||
|
||||
// Fan out recovered deliveries in parallel — same as the normal
|
||||
// delivery path, each task gets its own goroutine.
|
||||
for i := range deliveries {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
target, ok := targetMap[deliveries[i].TargetID]
|
||||
if !ok {
|
||||
e.log.Error("target not found for delivery",
|
||||
"delivery_id", deliveries[i].ID,
|
||||
"target_id", deliveries[i].TargetID,
|
||||
)
|
||||
continue
|
||||
}
|
||||
deliveries[i].Target = target
|
||||
|
||||
// Build task from DB data for the recovery path
|
||||
bodyStr := deliveries[i].Event.Body
|
||||
task := &DeliveryTask{
|
||||
DeliveryID: deliveries[i].ID,
|
||||
EventID: deliveries[i].EventID,
|
||||
WebhookID: webhookID,
|
||||
TargetID: target.ID,
|
||||
TargetName: target.Name,
|
||||
TargetType: target.Type,
|
||||
TargetConfig: target.Config,
|
||||
MaxRetries: target.MaxRetries,
|
||||
Method: deliveries[i].Event.Method,
|
||||
Headers: deliveries[i].Event.Headers,
|
||||
ContentType: deliveries[i].Event.ContentType,
|
||||
Body: &bodyStr,
|
||||
AttemptNum: 1,
|
||||
}
|
||||
|
||||
e.processDelivery(ctx, webhookDB, &deliveries[i], task)
|
||||
}
|
||||
|
||||
target, ok := targetMap[deliveries[i].TargetID]
|
||||
if !ok {
|
||||
e.log.Error("target not found for delivery",
|
||||
"delivery_id", deliveries[i].ID,
|
||||
"target_id", deliveries[i].TargetID,
|
||||
)
|
||||
continue
|
||||
}
|
||||
deliveries[i].Target = target
|
||||
|
||||
// Build task from DB data for the recovery path
|
||||
bodyStr := deliveries[i].Event.Body
|
||||
task := DeliveryTask{
|
||||
DeliveryID: deliveries[i].ID,
|
||||
EventID: deliveries[i].EventID,
|
||||
WebhookID: webhookID,
|
||||
TargetID: target.ID,
|
||||
TargetName: target.Name,
|
||||
TargetType: target.Type,
|
||||
TargetConfig: target.Config,
|
||||
MaxRetries: target.MaxRetries,
|
||||
Method: deliveries[i].Event.Method,
|
||||
Headers: deliveries[i].Event.Headers,
|
||||
ContentType: deliveries[i].Event.ContentType,
|
||||
Body: &bodyStr,
|
||||
AttemptNum: 1,
|
||||
}
|
||||
|
||||
d := deliveries[i] // copy for goroutine closure safety
|
||||
go func() {
|
||||
e.processDelivery(ctx, webhookDB, &d, &task)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -683,6 +717,26 @@ func (e *Engine) deliverRetry(_ context.Context, webhookDB *gorm.DB, d *database
|
||||
return
|
||||
}
|
||||
|
||||
// Check the circuit breaker for this target before attempting delivery.
|
||||
cb := e.getCircuitBreaker(task.TargetID)
|
||||
if !cb.Allow() {
|
||||
// Circuit is open — skip delivery, mark as retrying, and
|
||||
// schedule a retry for after the cooldown expires.
|
||||
remaining := cb.CooldownRemaining()
|
||||
e.log.Info("circuit breaker open, skipping delivery",
|
||||
"target_id", task.TargetID,
|
||||
"target_name", task.TargetName,
|
||||
"delivery_id", d.ID,
|
||||
"cooldown_remaining", remaining,
|
||||
)
|
||||
e.updateDeliveryStatus(webhookDB, d, database.DeliveryStatusRetrying)
|
||||
|
||||
retryTask := *task
|
||||
// Don't increment AttemptNum — this wasn't a real attempt
|
||||
e.scheduleRetry(retryTask, remaining)
|
||||
return
|
||||
}
|
||||
|
||||
attemptNum := task.AttemptNum
|
||||
|
||||
// Attempt delivery immediately — backoff is handled by the timer
|
||||
@@ -698,10 +752,14 @@ func (e *Engine) deliverRetry(_ context.Context, webhookDB *gorm.DB, d *database
|
||||
e.recordResult(webhookDB, d, attemptNum, success, statusCode, respBody, errMsg, duration)
|
||||
|
||||
if success {
|
||||
cb.RecordSuccess()
|
||||
e.updateDeliveryStatus(webhookDB, d, database.DeliveryStatusDelivered)
|
||||
return
|
||||
}
|
||||
|
||||
// Delivery failed — record failure in circuit breaker
|
||||
cb.RecordFailure()
|
||||
|
||||
maxRetries := d.Target.MaxRetries
|
||||
if maxRetries <= 0 {
|
||||
maxRetries = 5 // default
|
||||
@@ -727,6 +785,20 @@ func (e *Engine) deliverRetry(_ context.Context, webhookDB *gorm.DB, d *database
|
||||
}
|
||||
}
|
||||
|
||||
// getCircuitBreaker returns the circuit breaker for the given target ID,
|
||||
// creating one if it doesn't exist yet. Circuit breakers are in-memory
|
||||
// only and reset on restart (startup recovery rescans the DB anyway).
|
||||
func (e *Engine) getCircuitBreaker(targetID string) *CircuitBreaker {
|
||||
if val, ok := e.circuitBreakers.Load(targetID); ok {
|
||||
cb, _ := val.(*CircuitBreaker) //nolint:errcheck // type is guaranteed by LoadOrStore below
|
||||
return cb
|
||||
}
|
||||
fresh := NewCircuitBreaker()
|
||||
actual, _ := e.circuitBreakers.LoadOrStore(targetID, fresh)
|
||||
cb, _ := actual.(*CircuitBreaker) //nolint:errcheck // we only store *CircuitBreaker values
|
||||
return cb
|
||||
}
|
||||
|
||||
// deliverDatabase handles the database target type. Since events are already
|
||||
// stored in the per-webhook database (that's the whole point of per-webhook
|
||||
// databases), the database target simply marks the delivery as successful.
|
||||
|
||||
Reference in New Issue
Block a user