Push Notification System Low-Level Design: Multi-Platform Delivery, Fan-Out, and Analytics

Push Notification System Low-Level Design

A push notification system must reliably deliver time-sensitive messages to millions of devices across Android (FCM) and iOS (APNs), respect user quiet hours, fan out to topic subscribers efficiently, and track delivery receipts so product teams can measure engagement. This article covers the full low-level design from device token registration through fan-out workers, retry logic, and analytics.

Device Token Registration

Every device receives a push token from the OS after the user grants permission. The token changes on app reinstall and periodically on iOS. The registration endpoint must handle upserts gracefully:

from dataclasses import dataclass
from enum import Enum
from typing import Optional
import uuid

class Platform(str, Enum):
    """Push provider that issued a device token.

    Members are also ``str`` instances, so they compare equal to their raw
    value (``Platform.FCM == "fcm"``).
    """

    FCM = "fcm"    # Android devices, delivered through FCM
    APNS = "apns"  # iOS devices, delivered through APNs

@dataclass
class DeviceTokenRegistration:
    """Data a client submits when registering (or refreshing) its push token."""

    user_id: int
    # Stable app-level UUID stored on the device.
    device_id: str
    # FCM registration token or APNs device token.
    token: str
    platform: Platform
    app_version: str
    # Locale tag, e.g. "en-US".
    locale: str
    # Timezone name, e.g. "America/New_York".
    timezone: str

def register_device_token(db, reg: DeviceTokenRegistration) -> None:
    """Upsert a device's push token; handles token rotation transparently.

    The row is keyed on ``reg.device_id``. On first registration a new row is
    inserted; on later registrations the existing row is refreshed in place.

    Args:
        db: Database handle exposing ``execute(sql, params)``.
        reg: Registration data reported by the client.
    """
    db.execute(
        """
        INSERT INTO device_token
            (device_id, user_id, token, platform, app_version, locale, timezone, updated_at)
        VALUES
            (%s, %s, %s, %s, %s, %s, %s, NOW())
        ON CONFLICT (device_id) DO UPDATE SET
            user_id     = EXCLUDED.user_id,     -- device may now be signed in as a different user
            token       = EXCLUDED.token,
            platform    = EXCLUDED.platform,    -- keep platform in step with the token it describes
            app_version = EXCLUDED.app_version,
            locale      = EXCLUDED.locale,
            timezone    = EXCLUDED.timezone,
            updated_at  = NOW(),
            invalid_at  = NULL          -- clear any previous invalidation
        """,
        (reg.device_id, reg.user_id, reg.token, reg.platform.value,
         reg.app_version, reg.locale, reg.timezone)
    )

SQL Schema

-- One row per device, keyed by the app-assigned device_id. The registration
-- upsert targets the UNIQUE constraint on device_id, so a rotated token
-- overwrites the previous one instead of creating a second row.
CREATE TABLE device_token (
    id          BIGSERIAL PRIMARY KEY,
    device_id   UUID        NOT NULL UNIQUE,   -- app-assigned stable ID; upsert conflict target
    user_id     BIGINT      NOT NULL,
    token       TEXT        NOT NULL,          -- FCM registration token or APNs device token
    platform    TEXT        NOT NULL CHECK (platform IN ('fcm','apns')),
    app_version TEXT,
    locale      TEXT        NOT NULL DEFAULT 'en-US',
    timezone    TEXT        NOT NULL DEFAULT 'UTC',   -- passed to is_quiet_hour() during fan-out
    invalid_at  TIMESTAMPTZ DEFAULT NULL,      -- set when APNs/FCM reports token dead; reset to NULL on re-registration
    created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Partial index: covers only rows whose token has not been invalidated,
-- for per-user lookups of usable devices.
CREATE INDEX idx_dt_user_valid ON device_token (user_id)
    WHERE invalid_at IS NULL;

-- Named, reusable notification content. Fan-out looks a template up by name
-- and renders title_tmpl / body_tmpl with the caller-supplied context.
CREATE TABLE notification_template (
    id          BIGSERIAL PRIMARY KEY,
    name        TEXT        NOT NULL UNIQUE,   -- lookup key used by fan-out
    title_tmpl  TEXT        NOT NULL,          -- Jinja2 template
    body_tmpl   TEXT        NOT NULL,          -- rendered the same way as title_tmpl
    data_schema JSONB,                         -- expected payload keys
    ttl_seconds INT         NOT NULL DEFAULT 86400,   -- default is 86400 s = 24 h
    created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- One row per (notification, device) send. Rows start as 'pending' and double
-- as the work queue: next_attempt says when the row is next due.
CREATE TABLE notification_delivery (
    id           BIGSERIAL PRIMARY KEY,
    device_id    UUID        NOT NULL REFERENCES device_token(device_id),
    template_id  BIGINT      REFERENCES notification_template(id),
    idempotency_key TEXT     NOT NULL UNIQUE,  -- prevents double-send; fan-out inserts use ON CONFLICT DO NOTHING on it
    payload      JSONB       NOT NULL,         -- rendered title/body plus data, as written by fan-out
    status       TEXT        NOT NULL DEFAULT 'pending'
                             CHECK (status IN ('pending','sent','delivered','failed','expired')),
    attempt_count INT        NOT NULL DEFAULT 0,   -- incremented each time a retry is scheduled
    next_attempt  TIMESTAMPTZ,                 -- pushed into the future by the retry backoff
    sent_at       TIMESTAMPTZ,
    delivered_at  TIMESTAMPTZ,
    failed_at     TIMESTAMPTZ,
    provider_msg_id TEXT,                      -- FCM message_id or APNs apns-id
    created_at   TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Partial index over the pending rows only, ordered by when they are next due.
CREATE INDEX idx_nd_pending ON notification_delivery (next_attempt)
    WHERE status = 'pending';

FCM/APNs Abstraction Layer

Abstract both providers behind a single interface so worker code is provider-agnostic:

import httpx
import jwt
import time
from typing import Optional

class PushResult:
    """Provider-agnostic outcome of a single push send attempt."""

    def __init__(
        self,
        success: bool,
        provider_id: Optional[str],
        error: Optional[str],
        token_invalid: bool = False,
    ):
        # Outcome flag and the provider's message id (None on failure).
        self.success, self.provider_id = success, provider_id
        # Error text reported for a failed attempt (None on success).
        self.error = error
        # True signals that the device token should be marked dead.
        self.token_invalid = token_invalid

async def send_fcm(token: str, title: str, body: str, data: dict, ttl: int,
                   project_id: str = "MY_PROJECT") -> PushResult:
    """Send one notification through the FCM HTTP v1 API.

    Args:
        token: FCM registration token of the target device.
        title: Notification title.
        body: Notification body text.
        data: Extra key/value payload; every value is stringified before sending.
        ttl: Time-to-live in seconds, sent as the Android ``ttl`` option.
        project_id: Firebase project id used in the request URL.

    Returns:
        A PushResult. ``success`` is True on HTTP 200. ``token_invalid`` is
        True when FCM reports UNREGISTERED or INVALID_ARGUMENT. Transport
        failures are returned as an unsuccessful result with
        ``token_invalid`` False rather than raised.
    """
    payload = {
        "message": {
            "token": token,
            "notification": {"title": title, "body": body},
            "data": {k: str(v) for k, v in data.items()},
            "android": {"ttl": f"{ttl}s", "priority": "high"}
        }
    }
    url = f"https://fcm.googleapis.com/v1/projects/{project_id}/messages:send"
    headers = {"Authorization": f"Bearer {get_fcm_access_token()}"}
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(url, json=payload, headers=headers)
    except httpx.HTTPError as exc:
        # Timeouts, connection resets and similar failures say nothing about
        # the token itself, so report a plain failure the caller can retry.
        return PushResult(False, None, f"transport error: {exc}")

    try:
        body_json = resp.json()
    except ValueError:
        # Some error responses carry a non-JSON body; treat it as "no details".
        body_json = {}
    if not isinstance(body_json, dict):
        body_json = {}

    if resp.status_code == 200:
        return PushResult(True, body_json.get("name"), None)
    err = body_json.get("error", {})
    # NOTE(review): INVALID_ARGUMENT can presumably also be caused by a
    # malformed payload, not only a bad token - confirm against the FCM error
    # reference before relying on it to invalidate tokens.
    invalid = err.get("status") in ("UNREGISTERED", "INVALID_ARGUMENT")
    message = err.get("message") or f"HTTP {resp.status_code}"
    return PushResult(False, None, message, invalid)

async def send_push(db, delivery_id: int) -> None:
    """Fetch a delivery record and dispatch it to the correct provider.

    Only rows still in ``pending`` status are sent. After the send the row is
    marked ``sent``, marked ``failed`` (and the device token invalidated), or
    handed to ``schedule_retry``.

    Args:
        db: Database handle exposing ``fetchone`` and ``execute``.
        delivery_id: Primary key of the notification_delivery row.

    Raises:
        LookupError: If no notification_delivery row has this id.
    """
    row = db.fetchone("SELECT * FROM notification_delivery WHERE id=%s", (delivery_id,))
    if row is None:
        raise LookupError(f"notification_delivery {delivery_id} not found")
    if row['status'] != 'pending':
        # Already sent/failed/expired (e.g. a duplicate dispatch of the same
        # id): sending again would notify the user twice.
        return

    device = db.fetchone("SELECT * FROM device_token WHERE device_id=%s", (row['device_id'],))
    if device['invalid_at'] is not None:
        # The token was marked dead after this delivery was enqueued; do not
        # spend a provider call on it.
        db.execute("UPDATE notification_delivery SET status='failed', failed_at=NOW() WHERE id=%s", (delivery_id,))
        return

    payload = row['payload']

    if device['platform'] == 'fcm':
        result = await send_fcm(device['token'], payload['title'], payload['body'],
                                payload.get('data', {}), payload.get('ttl', 86400))
    else:
        result = await send_apns(device['token'], payload)

    if result.success:
        db.execute("UPDATE notification_delivery SET status='sent', sent_at=NOW(), "
                   "provider_msg_id=%s WHERE id=%s", (result.provider_id, delivery_id))
    elif result.token_invalid:
        # Provider says the token is dead: stop future fan-outs from using it
        # and give up on this delivery, since retrying cannot succeed.
        db.execute("UPDATE device_token SET invalid_at=NOW() WHERE device_id=%s", (device['device_id'],))
        db.execute("UPDATE notification_delivery SET status='failed', failed_at=NOW() WHERE id=%s", (delivery_id,))
    else:
        schedule_retry(db, delivery_id, row['attempt_count'])

Topic Fan-Out Workers

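Sending a notification to a topic (e.g., “breaking_news”) means fanning out to potentially millions of devices. The fan-out step below pages through the subscribed devices with keyset pagination on device_id and enqueues one notification_delivery row per device in batches of 1000. The notification_delivery table then acts as a PostgreSQL job queue: send workers can claim pending rows with SELECT ... FOR UPDATE SKIP LOCKED so multiple workers share the load without contention: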

async def fan_out_topic(db, topic: str, template_name: str, context: dict) -> int:
    """
    Enqueues one notification_delivery row per active device subscribed to topic.

    Devices are read in keyset-paginated batches of 1000 ordered by device_id.
    Devices with an invalidated token, or whose subscription has quiet hours
    enabled while is_quiet_hour() is true for the device timezone, are skipped.

    Args:
        db: Database handle exposing ``fetchone``, ``fetchall`` and ``executemany``.
        topic: Topic name to fan out to.
        template_name: Name of the notification_template to render.
        context: Template variables; also stored as the payload ``data``.
            ``context['event_id']`` makes the fan-out idempotent: re-running
            with the same event_id does not create duplicate rows. Without it
            a random id is used, so each call is treated as a distinct event.

    Returns:
        Number of device rows processed. Rows skipped by ON CONFLICT because
        they were already enqueued are included in this count.

    Raises:
        LookupError: If no template named ``template_name`` exists.
    """
    import json
    import uuid
    from datetime import datetime, timezone

    template = db.fetchone(
        "SELECT * FROM notification_template WHERE name=%s", (template_name,)
    )
    if template is None:
        raise LookupError(f"notification template {template_name!r} not found")
    title = render_template(template['title_tmpl'], context)
    body  = render_template(template['body_tmpl'],  context)

    # A constant placeholder here would give every event-less fan-out the same
    # idempotency key per device, silently dropping all but the first one.
    event_id = context.get('event_id')
    if event_id is None:
        event_id = uuid.uuid4().hex

    # The payload is identical for every device, so serialize it once. The
    # template TTL is carried along under the 'ttl' key that the sender reads.
    payload_json = json.dumps({
        "title": title,
        "body": body,
        "data": context,
        "ttl": template['ttl_seconds'],
    })

    # Insert in batches of 1000 to avoid lock escalation
    count = 0
    cursor = None
    while True:
        rows = db.fetchall(
            """
            SELECT dt.device_id FROM topic_subscription ts
            JOIN device_token dt ON dt.user_id = ts.user_id
            WHERE ts.topic = %s
              AND dt.invalid_at IS NULL
              AND (NOT ts.quiet_hours_enabled OR NOT is_quiet_hour(dt.timezone))
              AND (%s IS NULL OR dt.device_id > %s)
            ORDER BY dt.device_id
            LIMIT 1000
            """,
            (topic, cursor, cursor)
        )
        if not rows:
            break
        # Timezone-aware UTC timestamp: a naive value would be interpreted in
        # the database session's timezone when stored into TIMESTAMPTZ.
        due_at = datetime.now(timezone.utc)
        values = [
            (row['device_id'], template['id'],
             f"{topic}:{event_id}:{row['device_id']}",
             payload_json,
             due_at)
            for row in rows
        ]
        db.executemany(
            """INSERT INTO notification_delivery
               (device_id, template_id, idempotency_key, payload, next_attempt)
               VALUES (%s,%s,%s,%s,%s)
               ON CONFLICT (idempotency_key) DO NOTHING""",
            values
        )
        count += len(rows)
        cursor = rows[-1]['device_id']

    return count

Quiet Hours Enforcement

Users in timezone “America/Los_Angeles” should not receive marketing pushes at 2 AM. The is_quiet_hour function is a PostgreSQL helper that returns true when the current local time in the given timezone falls between 22:00 and 07:59:

CREATE OR REPLACE FUNCTION is_quiet_hour(tz TEXT)
RETURNS BOOLEAN LANGUAGE sql STABLE AS $$
    -- Quiet hours are local hours 22-23 and 0-7, i.e. every hour
    -- outside the 8..21 daytime range in the given timezone.
    SELECT EXTRACT(HOUR FROM NOW() AT TIME ZONE tz) NOT BETWEEN 8 AND 21;
$$;

Retry with Exponential Backoff

import math

MAX_ATTEMPTS = 5  # total send attempts allowed per delivery, including the first

def schedule_retry(db, delivery_id: int, attempt_count: int) -> None:
    """Record a failed send attempt and either reschedule or give up.

    Args:
        db: Database handle exposing ``execute(sql, params)``.
        delivery_id: Primary key of the notification_delivery row.
        attempt_count: Attempts recorded on the row *before* the attempt that
            just failed (0 for the first failure).

    The delay doubles per attempt starting at 30 seconds and is capped at one
    hour. Once the failed attempt brings the total to MAX_ATTEMPTS the row is
    marked ``failed`` instead of being rescheduled.
    """
    # The attempt that just failed has not been counted yet.
    attempts_made = attempt_count + 1
    if attempts_made >= MAX_ATTEMPTS:
        db.execute(
            "UPDATE notification_delivery SET status='failed', failed_at=NOW(), "
            "attempt_count = attempt_count + 1 WHERE id=%s",
            (delivery_id,)
        )
        return
    delay_seconds = min(30 * (2 ** attempt_count), 3600)  # cap at 1 hour
    db.execute(
        """UPDATE notification_delivery
           SET attempt_count = attempt_count + 1,
               next_attempt  = NOW() + make_interval(secs => %s)
           WHERE id = %s""",
        (delay_seconds, delivery_id)
    )

{
“@context”: “https://schema.org”,
“@type”: “FAQPage”,
“mainEntity”: [
{
“@type”: “Question”,
“name”: “How do you handle stale or invalid device tokens in a push system?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Both FCM and APNs return specific error codes when a token is no longer valid (FCM: UNREGISTERED; APNs: 410 Gone). On receiving these errors the worker sets invalid_at on the device_token row so future fan-outs skip it. The invalid_at marker is cleared on the next app launch, when a fresh registration upsert arrives with the current token.”
}
},
{
“@type”: “Question”,
“name”: “What prevents duplicate pushes during a fan-out retry?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Each notification_delivery row has an idempotency_key composed of topic + event_id + device_id. The INSERT uses ON CONFLICT (idempotency_key) DO NOTHING, so re-running a fan-out job after a partial failure only inserts rows that were not already created.”
}
},
{
“@type”: “Question”,
“name”: “How should quiet hours be enforced without storing per-user schedules?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Store the device timezone during registration. At fan-out time, filter out devices where the current UTC time maps to quiet hours in their timezone using a lightweight SQL function. This avoids per-user schedule tables and keeps the logic in the database close to the data.”
}
},
{
“@type”: “Question”,
“name”: “How do you scale fan-out to millions of devices?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Partition fan-out across multiple workers using cursor-based pagination over device_id. Each worker claims a batch using SELECT … SKIP LOCKED and inserts delivery rows independently. For very large topics (100M+ devices), pre-shard the device_token table by user_id mod N and run one fan-out worker per shard.”
}
}
]
}

{
“@context”: “https://schema.org”,
“@type”: “FAQPage”,
“mainEntity”: [
{
“@type”: “Question”,
“name”: “How is FCM/APNs abstraction implemented?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Each provider has its own send function (send_fcm, send_apns) that returns a common PushResult; the send_push dispatcher selects the function based on the device's platform field, so the rest of the worker code is provider-agnostic.”
}
},
{
“@type”: “Question”,
“name”: “How does topic fan-out work at scale?”,
“acceptedAnswer”: {
“@type”: “Answer”,
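“text”: “The fan-out job pages through the devices subscribed to a topic with keyset pagination on device_id and bulk-inserts notification_delivery rows in batches of 1000. Send workers then claim pending rows (for example with SELECT ... FOR UPDATE SKIP LOCKED) and dispatch them to the push provider in parallel threads or async tasks.”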
}
},
{
“@type”: “Question”,
“name”: “How are delivery receipts tracked?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “The push provider's response (success/failure/token_invalid) is written to NotificationDelivery; invalid tokens are deactivated to prevent future sends.”
}
},
{
“@type”: “Question”,
“name”: “What triggers quiet hours enforcement?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “At fan-out time, for subscriptions with quiet_hours_enabled set, the query calls is_quiet_hour() with the device's stored timezone; devices whose current local hour falls in the quiet window (22:00 to 07:59) are skipped and no delivery row is enqueued for them.”
}
}
]
}

See also: Meta Interview Guide 2026: Facebook, Instagram, WhatsApp Engineering

See also: Apple Interview Guide 2026: iOS Systems, Hardware-Software Integration, and iCloud Architecture

See also: Snap Interview Guide

Scroll to Top