Webhook Gateway: Low-Level Design
A webhook gateway decouples internal event producers from external consumers. It accepts domain events, routes them to registered endpoints, transforms payloads as needed, fans out in parallel, and retries failed deliveries with exponential backoff. This post covers the full design including circuit breaking and delivery auditing.
Event Routing
Every registered endpoint declares a topic_pattern: either an exact match (order.completed) or a wildcard pattern (order.*). When an event arrives, the router queries all endpoints whose pattern matches the topic and enqueues one delivery record per endpoint.
import re
from datetime import datetime, timezone
def route_event(topic: str, payload: dict, db, queue) -> list[int]:
    """Match ``topic`` against active endpoints and enqueue one delivery each.

    Every row of ``webhook_endpoint`` with ``status = 'active'`` is loaded and
    its ``topic_pattern`` is compared against the whole of ``topic``. A ``*``
    in a pattern is a wildcard for any run of characters; every other
    character (including ``.``) is matched literally.

    For each matching endpoint the payload is passed through
    ``_apply_transform`` with the endpoint's template, a ``webhook_delivery``
    row is inserted (attempt 0, status 'pending', next_retry_at NOW()) and a
    ``"deliver_webhook"`` job is enqueued with the new delivery ID.

    Args:
        topic: Topic of the incoming event, e.g. ``"order.completed"``.
        payload: Event payload; ``payload["event_id"]`` is stored on the
            delivery row.
        db: Database handle exposing ``fetchall(sql)`` and
            ``fetchone(sql, params)``.
        queue: Job queue exposing ``enqueue(job_name, delivery_id)``.

    Returns:
        IDs of the delivery rows created, in the order endpoints were returned
        by the database.

    Raises:
        KeyError: If an endpoint matches and ``payload`` has no ``"event_id"``.
    """
    endpoints = db.fetchall(
        "SELECT id, url, topic_pattern, transform_template, secret "
        "FROM webhook_endpoint WHERE status = 'active'"
    )
    delivery_ids = []
    for ep in endpoints:
        # Escape the literal segments so that "." (and any other regex
        # metacharacter stored in a pattern) only matches itself; "*" is the
        # sole wildcard. Without this, "order.*" would also match "orders".
        pattern = ".*".join(
            re.escape(segment) for segment in ep.topic_pattern.split("*")
        )
        if not re.fullmatch(pattern, topic):
            continue
        transformed = _apply_transform(payload, ep.transform_template)
        delivery_id = db.fetchone(
            """INSERT INTO webhook_delivery
            (endpoint_id, event_id, payload, attempt, status, next_retry_at)
            VALUES (%s, %s, %s, 0, 'pending', NOW())
            RETURNING id""",
            (ep.id, payload["event_id"], transformed)
        )[0]
        queue.enqueue("deliver_webhook", delivery_id)
        delivery_ids.append(delivery_id)
    return delivery_ids
Payload Transformation
Each endpoint can declare a Jinja2 template that reshapes the canonical event payload into whatever structure the downstream service expects. This keeps the event producer ignorant of each consumer's schema.
from jinja2 import Environment, BaseLoader
import json
def _apply_transform(payload: dict, template_str: str | None) -> str:
if not template_str:
return json.dumps(payload)
env = Environment(loader=BaseLoader())
tmpl = env.from_string(template_str)
rendered = tmpl.render(**payload)
return rendered # expected to be valid JSON string
HMAC-SHA256 Signing
Before each delivery the gateway computes an HMAC-SHA256 signature over the raw payload body using the endpoint's secret. The signature is sent in the X-Webhook-Signature header. The receiving service recomputes the signature and rejects requests where the values do not match.
import hmac, hashlib
def _sign_payload(body: str, secret: str) -> str:
return "sha256=" + hmac.new(
secret.encode(), body.encode(), hashlib.sha256
).hexdigest()
Delivery with Exponential Backoff
The delivery worker fetches the delivery record, sends the HTTP POST, and updates the status. On failure it schedules the next retry at now + 2^(attempt − 1) seconds (1 s, 2 s, 4 s, ...), and marks the delivery as failed once it has made the maximum of 5 attempts.
import requests
from datetime import timedelta
# Delay in seconds before the next retry; entry ``attempt - 1`` is used after
# the ``attempt``-th failed try.
BACKOFF_SECONDS = [1, 2, 4, 8, 16]
# A delivery is marked 'failed' once this many attempts have been made.
MAX_ATTEMPTS = 5


def deliver_webhook(delivery_id: int, db) -> bool:
    """Attempt one HTTP delivery and record the outcome.

    Loads the delivery row together with its endpoint's URL and secret, signs
    the stored payload, and POSTs it with a 10 second timeout. Any 2xx
    response counts as success; every other status and any
    ``requests.RequestException`` counts as a failure.

    On failure the attempt counter is incremented. Once it reaches
    ``MAX_ATTEMPTS`` the row is marked 'failed'; otherwise ``next_retry_at``
    is moved forward by the matching ``BACKOFF_SECONDS`` entry.

    Args:
        delivery_id: Primary key of the ``webhook_delivery`` row.
        db: Database handle exposing ``fetchone(sql, params)`` and
            ``execute(sql, params)``.

    Returns:
        ``True`` if the endpoint answered with a 2xx status, ``False``
        otherwise (including when the delivery row does not exist).
    """
    # NOTE(review): the row is selected FOR UPDATE and the HTTP call happens
    # before the status UPDATE; if ``db`` keeps one transaction open across
    # these calls the row lock is held for the duration of the request --
    # confirm that is intended.
    row = db.fetchone(
        """SELECT d.id, d.payload, d.attempt, e.url, e.secret
        FROM webhook_delivery d
        JOIN webhook_endpoint e ON e.id = d.endpoint_id
        WHERE d.id = %s FOR UPDATE""",
        (delivery_id,)
    )
    if row is None:
        # Nothing to deliver (unknown or deleted delivery ID).
        return False
    body = row.payload
    sig = _sign_payload(body, row.secret)
    try:
        resp = requests.post(
            row.url, data=body,
            headers={"Content-Type": "application/json",
                     "X-Webhook-Signature": sig},
            timeout=10
        )
        success = 200 <= resp.status_code < 300
    except requests.RequestException:
        # Connection errors and timeouts are retried like non-2xx responses.
        success = False
    attempt = row.attempt + 1
    if success:
        # NOTE(review): the success branch was lost in the published snippet;
        # the 'delivered' status value is a reconstruction -- confirm against
        # the real schema/consumers.
        db.execute(
            "UPDATE webhook_delivery SET status='delivered', attempt=%s WHERE id=%s",
            (attempt, delivery_id)
        )
        return True
    if attempt >= MAX_ATTEMPTS:
        db.execute(
            "UPDATE webhook_delivery SET status='failed', attempt=%s WHERE id=%s",
            (attempt, delivery_id)
        )
        return False
    # Clamp the index so a longer MAX_ATTEMPTS reuses the last backoff value.
    delay = BACKOFF_SECONDS[min(attempt - 1, len(BACKOFF_SECONDS) - 1)]
    next_retry = datetime.now(timezone.utc) + timedelta(seconds=delay)
    db.execute(
        "UPDATE webhook_delivery SET attempt=%s, next_retry_at=%s WHERE id=%s",
        (attempt, next_retry, delivery_id)
    )
    return False
Circuit Breaker per Endpoint
If an endpoint returns consecutive failures above a threshold (e.g., 5 in a row), the circuit opens and deliveries are skipped until a cooldown period passes. This prevents the retry queue from flooding with requests to a permanently-down endpoint.
# Number of consecutive failures at which the breaker reports 'open'.
TRIP_THRESHOLD = 5
# NOTE(review): not referenced by evaluate_circuit_breaker; presumably read by
# whatever decides to skip deliveries while the breaker is open -- confirm.
COOLDOWN_SECONDS = 300


def evaluate_circuit_breaker(endpoint_id: int, success: bool, db) -> str:
    """Record one delivery outcome and return the resulting breaker state.

    A success upserts the endpoint's breaker row with zero failures and a
    NULL ``tripped_at``. A failure increments the stored failure count
    (starting from 0 when no row exists) and, once the count reaches
    ``TRIP_THRESHOLD``, stamps ``tripped_at`` with the current UTC time.

    Args:
        endpoint_id: Primary key of the endpoint the delivery targeted.
        success: Whether the delivery succeeded.
        db: Database handle exposing ``fetchone(sql, params)`` and
            ``execute(sql, params)``.

    Returns:
        ``"open"`` when the failure count is at or above ``TRIP_THRESHOLD``
        after this update, otherwise ``"closed"``.
    """
    if success:
        # The reset does not depend on the current row, so skip the SELECT on
        # the success path.
        db.execute(
            """INSERT INTO webhook_circuit_breaker
            (endpoint_id, consecutive_failures, tripped_at)
            VALUES (%s, 0, NULL)
            ON CONFLICT (endpoint_id)
            DO UPDATE SET consecutive_failures=0, tripped_at=NULL""",
            (endpoint_id,)
        )
        return "closed"
    # NOTE(review): read-then-upsert is not atomic; two workers failing for
    # the same endpoint at once can both write the same count. Consider
    # incrementing inside the UPDATE if that matters.
    cb = db.fetchone(
        "SELECT consecutive_failures, tripped_at "
        "FROM webhook_circuit_breaker WHERE endpoint_id=%s",
        (endpoint_id,)
    )
    failures = (cb.consecutive_failures if cb else 0) + 1
    tripped_at = datetime.now(timezone.utc) if failures >= TRIP_THRESHOLD else None
    # COALESCE keeps an existing tripped_at when this call supplies NULL.
    db.execute(
        """INSERT INTO webhook_circuit_breaker
        (endpoint_id, consecutive_failures, tripped_at)
        VALUES (%s, %s, %s)
        ON CONFLICT (endpoint_id)
        DO UPDATE SET consecutive_failures=%s, tripped_at=COALESCE(EXCLUDED.tripped_at, webhook_circuit_breaker.tripped_at)""",
        (endpoint_id, failures, tripped_at, failures)
    )
    return "open" if failures >= TRIP_THRESHOLD else "closed"
Database Schema
-- Registered webhook consumers; one row per endpoint.
CREATE TABLE webhook_endpoint (
id BIGSERIAL PRIMARY KEY,
-- Target that deliver_webhook() POSTs the payload to.
url TEXT NOT NULL,
-- Topic matcher evaluated by route_event(); may contain '*' wildcards.
topic_pattern VARCHAR(255) NOT NULL,
-- Optional Jinja2 template; when NULL/empty the payload is sent as json.dumps(payload).
transform_template TEXT,
-- Key used for the HMAC-SHA256 signature sent in X-Webhook-Signature.
secret VARCHAR(255) NOT NULL,
-- route_event() only routes to rows whose status is 'active'.
status VARCHAR(32) NOT NULL DEFAULT 'active',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Index on the column route_event() filters by.
CREATE INDEX idx_we_status ON webhook_endpoint(status);
-- One row per (event, matching endpoint) pair, inserted by route_event().
CREATE TABLE webhook_delivery (
id BIGSERIAL PRIMARY KEY,
endpoint_id BIGINT NOT NULL REFERENCES webhook_endpoint(id),
-- Taken from payload["event_id"] at routing time.
event_id UUID NOT NULL,
-- Already-transformed request body; deliver_webhook() signs and sends it as stored.
payload TEXT NOT NULL,
-- Inserted as 0; deliver_webhook() writes the updated count after a failed try.
attempt SMALLINT NOT NULL DEFAULT 0,
-- 'pending' on insert; set to 'failed' once the attempt limit is reached.
status VARCHAR(32) NOT NULL DEFAULT 'pending',
-- NOTE(review): presumably the HTTP status of the last attempt; no code shown
-- in this document writes it -- confirm.
response_code SMALLINT,
-- NOW() on insert; pushed forward by the backoff delay after each failed try.
next_retry_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Partial index: only rows with status = 'pending' are indexed, keyed by next_retry_at.
CREATE INDEX idx_wd_retry ON webhook_delivery(next_retry_at)
WHERE status = 'pending';
-- Circuit-breaker state, at most one row per endpoint (endpoint_id is the primary key).
-- Rows are upserted by evaluate_circuit_breaker().
CREATE TABLE webhook_circuit_breaker (
endpoint_id BIGINT PRIMARY KEY REFERENCES webhook_endpoint(id),
-- Reset to 0 on a successful delivery, incremented on each failure.
consecutive_failures INT NOT NULL DEFAULT 0,
-- Set when consecutive_failures reaches the trip threshold; NULL after a success.
tripped_at TIMESTAMPTZ
);
Fan-Out and Ordering
Fan-out to N endpoints is parallelized via a thread pool. Ordering guarantees only apply within a single endpoint: the delivery queue for each endpoint is processed in created_at order. Across endpoints there is no global ordering requirement because each endpoint is an independent consumer.
See also: Stripe Interview Guide 2026: Process, Bug Bash Round, and Payment Systems
See also: Scale AI Interview Guide 2026: Data Infrastructure, RLHF Pipelines, and ML Engineering
See also: Anthropic Interview Guide 2026: Process, Questions, and AI Safety