Webhook Delivery System Low-Level Design: Fan-out, Retry, HMAC Signing, and Failure Handling

A webhook delivery system pushes event notifications to customer-configured HTTP endpoints — the foundation of the integration ecosystems of Stripe, GitHub, Shopify, and Twilio. Core challenges: reliable delivery with retries and exponential backoff, signing payloads so receivers can verify authenticity, handling slow or failing endpoints without blocking other deliveries, and giving customers visibility into delivery attempts and failures.

Core Data Model

-- One row per customer-configured HTTP endpoint that can receive webhook events.
CREATE TABLE WebhookEndpoint (
    endpoint_id  UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id      UUID NOT NULL,
    -- Destination the delivery worker POSTs the signed payload to.
    url          TEXT NOT NULL,
    signing_secret TEXT NOT NULL,           -- HMAC key, shown once at creation
    event_types  TEXT[] NOT NULL DEFAULT '{}',  -- subscribed event types; the fan-out query treats an empty array as "all events"
    -- Only rows with is_active = TRUE are selected at fan-out time.
    is_active    BOOLEAN NOT NULL DEFAULT TRUE,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Supports looking up endpoints by user_id.
CREATE INDEX idx_webhook_user ON WebhookEndpoint (user_id);

-- Lifecycle states of a single delivery row.
CREATE TYPE delivery_status AS ENUM ('pending','succeeded','failed','retrying');

-- One row per (endpoint, event) pair. The delivery worker polls this table
-- directly, so it doubles as the delivery queue.
CREATE TABLE WebhookDelivery (
    delivery_id    UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    endpoint_id    UUID NOT NULL REFERENCES WebhookEndpoint(endpoint_id),
    event_type     TEXT NOT NULL,
    event_id       TEXT NOT NULL,          -- idempotency key
    payload        JSONB NOT NULL,
    status         delivery_status NOT NULL DEFAULT 'pending',
    -- Incremented by the worker each time it claims the row for an attempt.
    attempt_count  SMALLINT NOT NULL DEFAULT 0,
    -- When the row becomes due; set to the enqueue time on insert.
    next_attempt   TIMESTAMPTZ,
    last_http_code SMALLINT,
    last_error     TEXT,
    created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Set when an attempt receives a 2xx response.
    delivered_at   TIMESTAMPTZ
);
-- Partial index over rows still awaiting delivery, keyed by due time; matches
-- the worker's polling predicate (status IN ('pending','retrying') AND
-- next_attempt <= now).
CREATE INDEX idx_delivery_due ON WebhookDelivery (next_attempt)
    WHERE status IN ('pending','retrying');
-- At most one delivery row per endpoint per event; this is the conflict target
-- of the fan-out INSERT ... ON CONFLICT (endpoint_id, event_id) DO NOTHING.
CREATE UNIQUE INDEX idx_delivery_idempotency ON WebhookDelivery (endpoint_id, event_id);

Enqueuing Deliveries on Event

import hashlib, hmac, json, time
from uuid import uuid4

def enqueue_webhook_deliveries(conn, event_type: str, event_id: str, payload: dict):
    """
    Fan an event out to every subscribed endpoint as pending delivery rows.

    Selects all active endpoints whose ``event_types`` is empty (subscribed to
    everything) or contains ``event_type``, then inserts one WebhookDelivery
    row per endpoint with ``next_attempt`` set to the current UTC time. No HTTP
    request is made here; the delivery worker picks the rows up later.

    Args:
        conn: psycopg2-style connection (``cursor()``, ``commit()``, ``rollback()``).
        event_type: Name of the event, matched against each endpoint's subscriptions.
        event_id: Idempotency key; a repeated call with the same ``event_id`` is
            a no-op for endpoints that already have a row (ON CONFLICT DO NOTHING).
        payload: JSON-serialisable event body stored in the JSONB ``payload`` column.

    Returns:
        None. When no endpoint matches, returns without inserting or committing.

    Raises:
        Whatever the database driver raises; the insert transaction is rolled
        back before the exception propagates.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT endpoint_id FROM WebhookEndpoint
            WHERE is_active = TRUE
              AND (event_types = '{}' OR %s = ANY(event_types))
        """, (event_type,))
        endpoints = [row[0] for row in cur.fetchall()]

    if not endpoints:
        return

    # `psycopg2.extras` is a submodule that a plain `import psycopg2` does not
    # load, so it must be imported explicitly rather than reached via attribute
    # access on the top-level package.
    from datetime import datetime, timezone
    from psycopg2.extras import Json

    now_utc = datetime.now(timezone.utc)
    json_payload = Json(payload)  # same payload for every endpoint; adapt once
    try:
        with conn.cursor() as cur:
            for endpoint_id in endpoints:
                cur.execute("""
                    INSERT INTO WebhookDelivery
                    (delivery_id, endpoint_id, event_type, event_id, payload, next_attempt)
                    VALUES (%s,%s,%s,%s,%s,%s)
                    ON CONFLICT (endpoint_id, event_id) DO NOTHING
                """, (str(uuid4()), endpoint_id, event_type, event_id,
                      json_payload, now_utc))
        conn.commit()
    except Exception:
        # Leave the connection usable for the caller instead of stuck in an
        # aborted transaction.
        conn.rollback()
        raise

Delivery Worker with Exponential Backoff

import requests, time
from datetime import datetime, timezone, timedelta

# Backoff delays; the entries sum to roughly 26.6 hours.
# NOTE(review): the article indexes this list by attempt count
# (next_delay = RETRY_SCHEDULE[attempt_count]) and quotes "~27 hours total",
# which implies each delay is measured from the previous attempt, not from the
# first one — confirm against the scheduling code.
RETRY_SCHEDULE = [0, 30, 60, 300, 1800, 7200, 86400]  # delay in seconds before each attempt (0 = immediately)
# One attempt per schedule entry.
MAX_ATTEMPTS = len(RETRY_SCHEDULE)
# Passed as the `timeout=` argument of the outgoing HTTP POST.
DELIVERY_TIMEOUT = 30  # seconds

def run_delivery_worker(conn):
    """Poll forever: process every due delivery, pause five seconds, repeat.

    Never returns. Any exception raised by ``deliver_due`` propagates and ends
    the loop.
    """
    from time import sleep as _pause

    poll_interval = 5  # seconds between polling passes
    while True:
        deliver_due(conn)
        _pause(poll_interval)

def deliver_due(conn, *, batch_size=50, lease_seconds=1800):
    """Claim a batch of due deliveries and attempt each one in turn.

    The claim is a single UPDATE that marks up to ``batch_size`` due rows as
    'retrying', bumps ``attempt_count`` and pushes ``next_attempt`` forward by
    ``lease_seconds``. The lease matters because the row locks taken by
    ``FOR UPDATE SKIP LOCKED`` are released as soon as the claim is committed:
    without it the rows would still look due and a concurrent worker could
    claim and send them a second time. If this worker dies mid-batch, the
    leased rows become due again once the lease expires.

    Args:
        conn: psycopg2-style connection.
        batch_size: Maximum number of deliveries claimed per call.
        lease_seconds: How long claimed rows stay invisible to other pollers.
            The default is a little above 50 sequential deliveries of 30 s each;
            raise it if the batch size or HTTP timeout grows.

    Returns:
        None.
    """
    now = datetime.now(timezone.utc)
    lease_until = now + timedelta(seconds=lease_seconds)
    with conn.cursor() as cur:
        cur.execute("""
            UPDATE WebhookDelivery
            SET status = 'retrying',
                attempt_count = attempt_count + 1,
                next_attempt = %s
            WHERE delivery_id IN (
                SELECT delivery_id FROM WebhookDelivery
                WHERE status IN ('pending','retrying')
                  AND next_attempt <= %s
                ORDER BY next_attempt ASC
                LIMIT %s
                FOR UPDATE SKIP LOCKED
            )
            RETURNING delivery_id, endpoint_id, payload, event_type, event_id, attempt_count
        """, (lease_until, now, batch_size))
        jobs = cur.fetchall()
    conn.commit()

    for delivery_id, endpoint_id, payload, event_type, event_id, attempt in jobs:
        endpoint = load_endpoint(conn, endpoint_id)
        if not endpoint:
            # Nothing to deliver to. Close the row out so it is not re-claimed
            # (and its attempt_count incremented) on every later poll.
            with conn.cursor() as cur:
                cur.execute("""
                    UPDATE WebhookDelivery
                    SET status='failed', last_error=%s
                    WHERE delivery_id=%s
                """, ("endpoint unavailable", delivery_id))
            conn.commit()
            continue
        attempt_delivery(conn, delivery_id, endpoint, payload, event_type, event_id, attempt)

def attempt_delivery(conn, delivery_id, endpoint, payload, event_type, event_id, attempt):
    """Send one signed webhook POST and record the outcome on the delivery row.

    The body is the JSON-serialised payload. The signature is HMAC-SHA256 over
    ``"<timestamp>.<body>"`` keyed with the endpoint's signing secret, sent as
    ``X-Webhook-Signature: sha256=<hex>`` alongside ``X-Webhook-Timestamp``.

    Outcomes:
      * 2xx response: row marked 'succeeded' with ``delivered_at`` set.
      * anything else while attempts remain: row stays 'retrying' and
        ``next_attempt`` is set to now + ``RETRY_SCHEDULE[attempt]``.
      * anything else on the last attempt: row marked 'failed', then the
        endpoint's recent failures are checked and it may be auto-disabled.

    Args:
        conn: psycopg2-style connection.
        delivery_id: Primary key of the WebhookDelivery row being attempted.
        endpoint: Mapping with at least ``'url'`` and ``'signing_secret'``.
        payload: JSON-serialisable event body.
        event_type: Sent as the ``X-Event-Type`` header.
        event_id: Sent as the ``X-Event-ID`` header.
        attempt: Attempt number as returned by the claim query, i.e. already
            incremented for this attempt (1 on the first try).
    """
    url = endpoint['url']
    secret = endpoint['signing_secret']
    # Encode once so the bytes that are signed are exactly the bytes sent.
    body = json.dumps(payload, default=str).encode("utf-8")
    timestamp = str(int(time.time()))

    # HMAC-SHA256 signature over "<timestamp>.<body>"
    sig_payload = timestamp.encode() + b"." + body
    signature = hmac.new(secret.encode(), sig_payload, hashlib.sha256).hexdigest()

    headers = {
        "Content-Type": "application/json",
        "X-Webhook-Timestamp": timestamp,
        "X-Webhook-Signature": f"sha256={signature}",
        "X-Event-Type": event_type,
        "X-Event-ID": event_id,
    }

    http_code = None
    error = None
    try:
        resp = requests.post(url, data=body, headers=headers,
                             timeout=DELIVERY_TIMEOUT, allow_redirects=False)
    except requests.exceptions.RequestException as e:
        success = False
        error = f"{type(e).__name__}: {e}"
    else:
        # Read the code explicitly: a Response object is falsy for 4xx/5xx,
        # so `resp.status_code if resp else None` would lose it.
        http_code = resp.status_code
        success = 200 <= http_code < 300
        if not success:
            error = f"HTTP {http_code}"

    now = datetime.now(timezone.utc)
    if success:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE WebhookDelivery
                SET status='succeeded', delivered_at=%s,
                    last_http_code=%s
                WHERE delivery_id=%s
            """, (now, http_code, delivery_id))
        conn.commit()
        return

    # `attempt` is 1-based, so RETRY_SCHEDULE[attempt] is the delay before the
    # following attempt; once attempt reaches MAX_ATTEMPTS there is none left.
    next_delay = RETRY_SCHEDULE[attempt] if attempt < MAX_ATTEMPTS else None
    if next_delay is not None:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE WebhookDelivery
                SET status='retrying', next_attempt=%s,
                    last_http_code=%s, last_error=%s
                WHERE delivery_id=%s
            """, (now + timedelta(seconds=next_delay), http_code, error, delivery_id))
        conn.commit()
        return

    with conn.cursor() as cur:
        # RETURNING gives the endpoint id without assuming the `endpoint`
        # mapping carries it.
        cur.execute("""
            UPDATE WebhookDelivery
            SET status='failed', last_http_code=%s, last_error=%s
            WHERE delivery_id=%s
            RETURNING endpoint_id
        """, (http_code, error, delivery_id))
        row = cur.fetchone()
    conn.commit()
    if row:
        check_endpoint_health(conn, row[0])


# NOTE(review): this helper's original name and signature are not recoverable
# from the article text; only its body survives. Confirm against the source.
def check_endpoint_health(conn, endpoint_id):
    """Disable an endpoint that has accumulated failed deliveries recently.

    Counts the endpoint's deliveries with status 'failed' created in the last
    7 days. At 5 or more, sets ``is_active = FALSE`` on the endpoint and calls
    ``notify_endpoint_disabled``.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT COUNT(*) FROM WebhookDelivery
            WHERE endpoint_id = %s
              AND status = 'failed'
              AND created_at > NOW() - interval '7 days'
        """, (endpoint_id,))
        fail_count = cur.fetchone()[0]

    if fail_count >= 5:
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE WebhookEndpoint SET is_active=FALSE WHERE endpoint_id=%s",
                (endpoint_id,)
            )
        conn.commit()
        notify_endpoint_disabled(endpoint_id)

Key Interview Points

Fan-out at event time, deliver async: Creating delivery records synchronously (during the triggering event) is fast — just DB inserts. Actual HTTP delivery is asynchronous via the worker. This means the original event handler returns immediately regardless of how many endpoints are subscribed or how slow they are. Fan-out records serve as the durable queue — no separate message broker needed for webhook delivery.
Exponential backoff schedule: Retry delays of 0, 30s, 60s, 5m, 30m, 2h, 24h give a slow endpoint time to recover without retrying every second. After 7 attempts (~27 hours total), mark as failed. Notify the user: their endpoint is unreachable and deliveries are being dropped. The retry schedule is stored as a constant array — next_delay = RETRY_SCHEDULE[attempt_count].
HMAC signature verification: Include the request timestamp in the signed payload (timestamp.body) and validate that the timestamp is within ±5 minutes. This prevents replay attacks: a valid webhook captured and re-sent 10 minutes later fails the timestamp check. Receivers verify: parse the X-Webhook-Signature header, compute HMAC(secret, timestamp.body), compare with constant-time hmac.compare_digest().
Slow endpoint isolation: A single slow endpoint (30s timeouts) must not delay deliveries to other endpoints. The worker shown above processes each claimed batch sequentially, so one unresponsive endpoint stalls every delivery queued behind it in that batch. To isolate slow endpoints, run separate delivery worker pools per endpoint (or per user tier), or combine SKIP LOCKED with per-endpoint parallelism limits. Alternatively, use an async HTTP client (e.g. httpx) with concurrency limits so one slow endpoint doesn’t block worker threads.
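Delivery dashboard for customers: Show recent deliveries with status, HTTP response code, response body (first 500 chars), and retry timeline. Allow manual re-delivery of any delivery (creates a new delivery record for the same event_id — the receiver must be idempotent). Note that the unique index on (endpoint_id, event_id) shown above rejects a second row for the same event, so manual re-delivery must either reset the existing row to 'pending' or the index must include an extra redelivery discriminator. This is what makes Stripe’s and GitHub’s webhook UX excellent — debugging failed webhooks is self-service.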

{
  "@context": "https://schema.org",
  "@type": "FAQPage",
  "mainEntity": [
    {
      "@type": "Question",
      "name": "Why fan out webhook deliveries into a queue instead of calling endpoints inline?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Inline delivery blocks the event producer until every subscriber has responded. If subscriber A times out after 30 seconds and subscriber B is instant, the caller blocks for 30 seconds — and if the caller is the payments service accepting a transaction, that 30-second block is unacceptable. Fan-out into a delivery queue decouples the publisher: the event is recorded and queued atomically in milliseconds, and the delivery worker sends HTTP requests asynchronously at its own pace. Workers can retry failed deliveries independently without blocking new events. The queue also provides backpressure: if a subscriber is slow, its WebhookDelivery rows pile up without affecting other subscribers’ queues."
      }
    },
    {
      "@type": "Question",
      "name": "How does HMAC-SHA256 signing prevent replay attacks on webhook deliveries?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The HMAC alone doesn’t prevent replays — it only proves the payload came from you. Add a timestamp: include X-Webhook-Timestamp: <unix_epoch> in the signed headers. The signature covers both the payload and the timestamp: HMAC-SHA256(secret, timestamp + \".\" + payload_json). On the subscriber side: (1) verify the signature; (2) reject if |now - timestamp| > 300 seconds. This 5-minute window means a captured webhook cannot be replayed after 5 minutes. The X-Event-ID header provides idempotency: if the subscriber receives the same event ID twice (due to retry after a delivery that was received but whose 200 response was lost), they can detect the duplicate and skip reprocessing."
      }
    },
    {
      "@type": "Question",
      "name": "What retry schedule works best for failed webhook deliveries?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Exponential backoff: [0, 30, 60, 300, 1800, 7200, 86400] seconds (immediate, 30s, 1m, 5m, 30m, 2h, 24h). This gives the subscriber time to recover from transient failures (brief deployment, overloaded server) while not retrying infinitely. 7 attempts spanning roughly 27 hours cover all common downtime scenarios. After the 7th attempt with no success, mark status='failed' and increment the endpoint’s consecutive_failures counter. Auto-disable the endpoint after 5 consecutive delivery failures (not 5 attempts on one event — 5 separate events all failing). Auto-disable prevents wasting retry cycles on a permanently dead endpoint. Email the endpoint owner on disable: \"We’ve paused deliveries to https://api.example.com/webhook — please re-enable after fixing.\""
      }
    },
    {
      "@type": "Question",
      "name": "How do you ensure exactly-once processing on the subscriber side?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The webhook system guarantees at-least-once delivery (it retries until it gets a 2xx or the retry schedule is exhausted). Subscribers must implement idempotency to make it effectively exactly-once. The X-Event-ID header is the idempotency key. Subscriber pattern: INSERT INTO processed_webhooks (webhook_id) VALUES ($1) ON CONFLICT (webhook_id) DO NOTHING. If rowcount=0, this event was already processed — return 200 immediately without re-executing the business logic. The INSERT and the business logic must be in the same database transaction: if the transaction commits, the webhook is processed and recorded atomically. If the process crashes after processing but before the 200 response, the delivery worker retries; the next attempt hits the ON CONFLICT and returns 200 without reprocessing."
      }
    },
    {
      "@type": "Question",
      "name": "How do you handle slow subscribers that accept webhooks but process them too slowly?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The webhook delivery system measures end-to-end HTTP response latency and enforces a response_timeout (e.g. 30 seconds). Subscribers must acknowledge receipt immediately (return 200) and process asynchronously — the 200 means \"I received it,\" not \"I processed it.\" If a subscriber regularly returns 200 in 25 seconds, consider it a latency problem: log high-latency endpoints, alert the subscriber’s team. For subscribers that queue internally: the webhook payload is written to their own queue (Kafka, SQS) in the HTTP handler and the 200 is returned in milliseconds. For subscribers that can’t guarantee async processing (e.g. serverless functions with cold start): increase the timeout to 60 seconds and add a circuit breaker — if p99 latency exceeds 45 seconds for 10% of deliveries over 5 minutes, pause delivery to that endpoint."
      }
    }
  ]
}

Webhook delivery and event notification system design is discussed in Stripe system design interview questions.

Webhook delivery and merchant event notification design is covered in Shopify system design interview preparation.

Webhook delivery and integration event system design is discussed in the Atlassian system design interview guide.