Designing an e-commerce platform at scale appears frequently in system design interviews at Amazon, Shopify, Stripe, and DoorDash. The challenge is coordinating product catalog, inventory, ordering, payments, and recommendations under high-concurrency flash-sale conditions.
Requirements Clarification
- Scale: 100M DAU, 1M products, 10M orders/day, peak 100k concurrent users during sales events
- Consistency needs: Inventory must be strongly consistent (no oversell); recommendations can be eventually consistent
- Latency targets: Product search <200ms p99; checkout <500ms p99
- Availability: 99.99% uptime — 52 minutes downtime/year maximum
High-Level Architecture
┌─────────────────────────────────────────────────────────────────┐
│       CDN (CloudFront) ←── Static assets, product images        │
└───────────────────────────┬─────────────────────────────────────┘
                            │
┌───────────────────────────▼─────────────────────────────────────┐
│                   API Gateway / Load Balancer                   │
└──┬──────────────┬──────────────┬──────────────┬─────────────────┘
   │              │              │              │
   ▼              ▼              ▼              ▼
Product        Search       Cart/Order       Payment
Service        Service        Service        Service
   │              │              │              │
PostgreSQL  Elasticsearch   PostgreSQL    Stripe/Adyen
+ Redis cache  + Redis        + Redis      + Ledger DB
Product Catalog and Search
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
import elasticsearch
@dataclass
class ProductSearchRequest:
    """Parameters for a single product search.

    Only ``query`` is required. Every other field has a default, so a bare
    ``ProductSearchRequest("shoes")`` describes the first page of in-stock
    results ordered by relevance.
    """

    # Free-text search string.
    query: str

    # Optional narrowing criteria; None means "not specified".
    category_id: Optional[int] = None
    price_min: Optional[float] = None
    price_max: Optional[float] = None
    brand: Optional[str] = None

    # When True, only products flagged as in stock are requested.
    in_stock_only: bool = True

    # One of: relevance | price_asc | price_desc | newest
    sort_by: str = "relevance"

    # 1-based page number and number of results per page.
    page: int = 1
    page_size: int = 24
class ProductSearchService:
    """Builds and runs product search queries against the ``products`` index."""

    def __init__(self, es_client: "elasticsearch.Elasticsearch"):
        # Client used for every search call; it only needs a
        # ``search(index=..., body=...)`` method.
        self.es = es_client

    def search(self, req: "ProductSearchRequest") -> Dict[str, Any]:
        """Run a faceted full-text search described by ``req``.

        The free-text query is matched (with fuzziness) against title,
        description, brand and tags. Category, price, brand and stock
        criteria are applied as non-scoring filters. The response also
        carries category/brand term aggregations and price statistics.

        Args:
            req: Search parameters. ``page`` values below 1 are treated as 1
                and negative ``page_size`` values as 0. An unknown
                ``sort_by`` falls back to relevance ordering.

        Returns:
            Whatever the client's ``search`` call returns.
        """
        must_clauses = [
            {
                "multi_match": {
                    "query": req.query,
                    "fields": ["title^3", "description^1", "brand^2", "tags^1.5"],
                    "type": "best_fields",
                    "fuzziness": "AUTO",
                }
            }
        ]

        filter_clauses = []
        # Compare against None so that a category id of 0 is still applied.
        if req.category_id is not None:
            filter_clauses.append({"term": {"category_ids": req.category_id}})

        # Only send the bounds the caller actually supplied.
        price_bounds = {
            op: bound
            for op, bound in (("gte", req.price_min), ("lte", req.price_max))
            if bound is not None
        }
        if price_bounds:
            filter_clauses.append({"range": {"price": price_bounds}})

        # Exact brand match on the same keyword field the brand facet uses.
        if req.brand:
            filter_clauses.append({"term": {"brand.keyword": req.brand}})

        if req.in_stock_only:
            filter_clauses.append({"term": {"in_stock": True}})

        sort_map = {
            "relevance": [{"_score": "desc"}, {"sold_count": "desc"}],
            "price_asc": [{"price": "asc"}],
            "price_desc": [{"price": "desc"}],
            "newest": [{"created_at": "desc"}],
        }

        # Guard against a negative offset or size reaching the search backend.
        page = max(req.page, 1)
        page_size = max(req.page_size, 0)

        body = {
            "query": {"bool": {"must": must_clauses, "filter": filter_clauses}},
            "sort": sort_map.get(req.sort_by, sort_map["relevance"]),
            "from": (page - 1) * page_size,
            "size": page_size,
            "aggs": {
                "categories": {"terms": {"field": "category_ids", "size": 20}},
                "brands": {"terms": {"field": "brand.keyword", "size": 20}},
                "price_range": {"stats": {"field": "price"}},
            },
        }
        return self.es.search(index="products", body=body)
Inventory Management — Preventing Oversell
import redis
import psycopg2
class InventoryService:
    """
    Two-layer inventory: Redis for fast reservation, Postgres for ground truth.
    Lua script makes reserve operation atomic — no race conditions.

    NOTE(review): the tail of RESERVE_SCRIPT, the constructor and the
    ``reserve`` signature were reconstructed from how the rest of the class
    uses them — confirm the constructor parameter names against callers.
    """

    # Check-and-decrement executed atomically inside Redis. Returns the
    # remaining count, or -1 when the counter is lower than the requested
    # quantity. A missing key is read as zero stock, so the counter has to be
    # loaded (see warm_cache) before reservations can succeed.
    RESERVE_SCRIPT = """
    local key = KEYS[1]
    local qty = tonumber(ARGV[1])
    local current = tonumber(redis.call("GET", key) or "0")
    if current < qty then
        return -1
    end
    redis.call("DECRBY", key, qty)
    return current - qty
    """

    def __init__(self, redis_client, pg_conn):
        """Store the two backends and register the reserve script.

        Args:
            redis_client: Redis client exposing ``register_script``,
                ``incrby`` and ``set``.
            pg_conn: DB-API connection exposing ``cursor``, ``commit`` and
                ``rollback``.
        """
        self.redis = redis_client
        self.pg = pg_conn
        # Callable invoked as ``fn(keys=[...], args=[...])``.
        self._reserve_fn = self.redis.register_script(self.RESERVE_SCRIPT)

    def reserve(self, product_id: str, qty: int, order_id: str) -> bool:
        """Atomically reserve inventory. Returns True if successful.

        Args:
            product_id: Product whose counter is decremented.
            qty: Units to reserve; must be positive.
            order_id: Order the reservation belongs to.

        Returns:
            False when the Redis counter holds fewer than ``qty`` units,
            True once a reservation row exists for the order/product pair.

        Raises:
            ValueError: If ``qty`` is not positive.
            Exception: Whatever the Postgres write raises; the Redis
                decrement is undone before the error propagates.
        """
        # A non-positive quantity would turn the decrement into a stock
        # increase.
        if qty <= 0:
            raise ValueError("qty must be a positive integer")

        key = f"inventory:{product_id}"
        # Atomic Lua reserve on Redis
        result = self._reserve_fn(keys=[key], args=[qty])
        if result == -1:
            return False  # Insufficient stock

        # Write reservation to Postgres for durability
        try:
            with self.pg.cursor() as cur:
                cur.execute("""
                    INSERT INTO inventory_reservations (order_id, product_id, qty, expires_at)
                    VALUES (%s, %s, %s, NOW() + INTERVAL '15 minutes')
                    ON CONFLICT (order_id, product_id) DO NOTHING
                """, (order_id, product_id, qty))
                inserted = cur.rowcount
            self.pg.commit()
        except Exception:
            # The units were already taken out of Redis; give them back so a
            # failed durable write does not permanently shrink stock.
            try:
                self.pg.rollback()
            finally:
                self.redis.incrby(key, qty)
            raise

        if inserted == 0:
            # The order already holds a reservation for this product, so the
            # insert was skipped; undo this call's extra decrement.
            self.redis.incrby(key, qty)
        return True

    def release(self, product_id: str, qty: int, order_id: str):
        """Release reservation on cancel/timeout.

        Stock is credited back to Redis only when a reservation row was
        actually removed, so releasing the same order twice does not inflate
        the counter.
        """
        key = f"inventory:{product_id}"
        with self.pg.cursor() as cur:
            cur.execute("""
                DELETE FROM inventory_reservations
                WHERE order_id = %s AND product_id = %s
            """, (order_id, product_id))
            removed = cur.rowcount
        self.pg.commit()
        if removed != 0:
            self.redis.incrby(key, qty)

    def warm_cache(self, product_id: str):
        """Load inventory from Postgres into Redis on cache miss."""
        with self.pg.cursor() as cur:
            cur.execute("SELECT available_qty FROM inventory WHERE product_id = %s", (product_id,))
            row = cur.fetchone()
        if row:
            # The counter is stored with a one-hour expiry.
            self.redis.set(f"inventory:{product_id}", row[0], ex=3600)
Order Orchestration — Saga Pattern
from enum import Enum
from typing import Callable, List
class OrderState(Enum):
    """Named states an order can be in; each value is the lowercase state name."""

    # Order exists but nothing has been secured for it yet.
    PENDING = "pending"

    # Intermediate states, named after the inventory and payment stages.
    INVENTORY_HELD = "inventory_held"
    PAYMENT_PENDING = "payment_pending"
    PAYMENT_COMPLETE = "payment_complete"

    # Terminal outcomes.
    CONFIRMED = "confirmed"
    FAILED = "failed"
class SagaStep:
    """One unit of saga work paired with the callable that undoes it."""

    def __init__(self, name: str, action: Callable, compensate: Callable):
        """Record the step's label, its forward action and its compensation.

        Args:
            name: Label identifying the step.
            action: Zero-argument callable that performs the step.
            compensate: Zero-argument callable that reverses the step.
        """
        self.name, self.action, self.compensate = name, action, compensate
class OrderOrchestrator:
    """
    Saga pattern for distributed order transaction.
    Each step has a compensating action for rollback.
    """

    def __init__(self, inventory_svc, payment_svc, notification_svc, db):
        """Keep references to the collaborating services and the order store."""
        self.inventory = inventory_svc
        self.payment = payment_svc
        self.notification = notification_svc
        self.db = db

    def place_order(self, order: dict) -> dict:
        """Run the order saga and report the outcome.

        Steps run in order: validate cart, reserve inventory, process
        payment, confirm order. A step fails when it raises or when it
        returns ``False`` (which is how the inventory reservation reports
        insufficient stock). On failure, every previously completed step is
        compensated in reverse order.

        Args:
            order: Mapping with the keys read here: ``id``, ``product_id``,
                ``qty``, ``payment_token`` and ``total``.

        Returns:
            ``{"success": True, "order_id": ...}`` when all steps succeed,
            otherwise ``{"success": False, "error": <message>}``.
        """
        import logging

        log = logging.getLogger(__name__)
        order_id = order["id"]

        steps = [
            SagaStep(
                name="validate_cart",
                action=lambda: self._validate_cart(order),
                compensate=lambda: None,  # no-op
            ),
            SagaStep(
                name="reserve_inventory",
                action=lambda: self.inventory.reserve(
                    order["product_id"], order["qty"], order_id),
                compensate=lambda: self.inventory.release(
                    order["product_id"], order["qty"], order_id),
            ),
            SagaStep(
                name="process_payment",
                action=lambda: self.payment.charge(
                    order["payment_token"], order["total"]),
                compensate=lambda: self.payment.refund(order_id),
            ),
            SagaStep(
                name="confirm_order",
                action=lambda: self._confirm_order(order_id),
                compensate=lambda: self._cancel_order(order_id),
            ),
        ]

        completed_steps: List[SagaStep] = []
        for step in steps:
            try:
                outcome = step.action()
                # An explicit False is a rejection (e.g. insufficient stock);
                # continuing would charge the customer for unreserved goods.
                if outcome is False:
                    raise RuntimeError(f"{step.name} was rejected")
            except Exception as exc:
                log.warning("Order %s: step %s failed: %s", order_id, step.name, exc)
                self._rollback(completed_steps, order_id, log)
                return {"success": False, "error": str(exc)}
            completed_steps.append(step)

        # The order is already confirmed at this point; a notification
        # failure must not be reported to the caller as a failed order.
        try:
            self.notification.send_confirmation(order_id)
        except Exception:
            log.exception("Order %s: confirmation notification failed", order_id)
        return {"success": True, "order_id": order_id}

    def _rollback(self, completed_steps, order_id, log):
        """Compensate completed steps in reverse order, logging any that fail."""
        for done in reversed(completed_steps):
            try:
                done.compensate()
            except Exception:
                # Keep unwinding the remaining steps; this one needs manual
                # intervention.
                log.exception(
                    "Order %s: compensation for step %s failed",
                    order_id, done.name,
                )

    def _validate_cart(self, order: dict):
        """Reject orders whose quantity or total is not positive.

        Raises:
            ValueError: If ``qty`` or ``total`` is zero or negative.
        """
        if order["qty"] <= 0:
            raise ValueError("Invalid quantity")
        if order["total"] <= 0:
            raise ValueError("Invalid total")

    def _confirm_order(self, order_id: str):
        """Mark the order as confirmed in the order store."""
        self.db.execute(
            "UPDATE orders SET state = %s WHERE id = %s",
            (OrderState.CONFIRMED.value, order_id)
        )

    def _cancel_order(self, order_id: str):
        """Compensation for ``_confirm_order``: mark the order as failed."""
        self.db.execute(
            "UPDATE orders SET state = %s WHERE id = %s",
            (OrderState.FAILED.value, order_id)
        )
Flash Sale Architecture
Flash sales create 100x normal traffic spikes. Key strategies:
- Pre-warm caches: Load inventory into Redis before sale starts
- Queue admission: Virtual waiting room — users get a token, queue drains at controlled rate
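- Read replica routing: Catalog and other read-heavy traffic goes to replicas; inventory checks and all writes go to the primary, since inventory must stay strongly consistent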
- Per-user limits: Rate-limit requests per user and cap purchases at 2 items per user per sale to blunt bots
- Circuit breakers: Degrade gracefully — disable recommendations before dropping orders
Key Design Decisions
| Decision | Choice | Rationale |
|---|---|---|
| Inventory consistency | Strong (Redis Lua + Postgres) | Oversell = financial loss + customer trust |
| Search | Elasticsearch | Fuzzy, faceted, ranked search at scale |
| Order state machine | Saga pattern | Distributed transactions without 2PC latency |
| Session/cart storage | Redis with TTL | O(1) read, auto-expire abandoned carts |
| Images/assets | S3 + CloudFront CDN | Offload static traffic; global edge delivery |
| Recommendations | Offline ML + Redis serving | Pre-compute overnight; serve from cache |
Companies That Ask This System Design Question
This problem type commonly appears in interviews at Amazon, Shopify, Stripe, and DoorDash.
See our company interview guides for full interview process, compensation, and preparation tips.