Feature Rollout System: Low-Level Design
A feature rollout system lets engineering teams deploy code to production while controlling which users see the new behavior. It decouples deployment from release: code ships to all servers, but a feature flag gate determines whether each request activates the new path. This design covers the flag evaluation engine, targeting rules, gradual percentage rollouts, kill switches, and the observability needed to roll back safely.
Core Data Model
-- One row per feature flag: the global rollout state read by the evaluator.
-- The CHECK constraints enforce the value sets that the column comments
-- document, so a bad write cannot store an unknown status or a percentage
-- outside 0-100.
CREATE TABLE Feature (
    feature_key VARCHAR(100) PRIMARY KEY,                 -- "checkout_v2", "dark_mode"
    description TEXT,
    status      VARCHAR(20) NOT NULL DEFAULT 'off'
                CHECK (status IN ('off', 'rolling', 'on')),   -- off, rolling, on
    rollout_pct SMALLINT NOT NULL DEFAULT 0
                CHECK (rollout_pct BETWEEN 0 AND 100),        -- 0-100
    sticky      BOOLEAN NOT NULL DEFAULT TRUE,            -- same user always same bucket
    created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()        -- no trigger here; writers must set it
);
-- Per-target exceptions to a flag's global state. Each row forces the flag on
-- or off for exactly one (target_type, target_id) pair.
CREATE TABLE FeatureOverride (
    feature_key VARCHAR(100) NOT NULL
        REFERENCES Feature (feature_key) ON DELETE CASCADE,   -- overrides are removed with their flag
    target_type VARCHAR(20)  NOT NULL,                        -- user, org, country, plan
    target_id   VARCHAR(200) NOT NULL,                        -- user_id, org_id, "US", "enterprise"
    enabled     BOOLEAN      NOT NULL,                        -- TRUE forces on, FALSE forces off
    created_at  TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    PRIMARY KEY (feature_key, target_type, target_id)         -- at most one override per target
);
-- History of rollout changes: which flag changed, who changed it, the state
-- before and after, and the stated reason.
CREATE TABLE FeatureAuditLog (
    log_id      BIGSERIAL    PRIMARY KEY,
    feature_key VARCHAR(100) NOT NULL,                -- no foreign key declared on this column
    changed_by  BIGINT       NOT NULL,                -- user_id of engineer
    old_value   JSONB,                                -- state before the change
    new_value   JSONB,                                -- state after the change
    reason      TEXT,
    changed_at  TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);
-- NOTE(review): the FeatureOverride primary key already begins with
-- (feature_key, target_type), so this index looks redundant with the
-- primary-key index — confirm it is needed before keeping it.
CREATE INDEX ON FeatureOverride(feature_key, target_type);
-- Serves per-flag history queries ordered newest first.
CREATE INDEX ON FeatureAuditLog(feature_key, changed_at DESC);
Flag Evaluation Algorithm
import hashlib, redis, json
from dataclasses import dataclass
from typing import Optional
# Shared Redis connection used for the flag and override caches.
# decode_responses=True is relied on by the cache readers, which compare
# cached values against plain str literals.
redis_client = redis.Redis(host='localhost', decode_responses=True)
# Expiry applied to every cached flag and override entry.
FLAG_CACHE_TTL = 60 # seconds
@dataclass
class EvalContext:
    """Request attributes that flag targeting can match against.

    Only ``user_id`` is required. The remaining fields default to ``None``
    and are skipped during override lookup when left unset.
    """

    # Identity used for user-level overrides and for percentage bucketing.
    user_id: int
    # Optional targeting dimensions; each maps to one override target type.
    org_id: Optional[int] = None
    country: Optional[str] = None
    plan: Optional[str] = None
def is_enabled(feature_key: str, ctx: EvalContext) -> bool:
    """
    Decide whether `feature_key` is active for the request described by `ctx`.

    Evaluation order (first match wins):
    1. Unknown flag: always False.
    2. Explicit overrides, tried in the order produced by _override_targets
       (user, then org, country, plan). The first target that has an
       override decides the result, whether it enables or disables.
    3. Percentage rollout bucket, when status is 'rolling' and rollout_pct > 0.
    4. Global status: True only when the flag is 'on'.
    """
    flag = _load_flag(feature_key)
    if flag is None:
        # unknown flag → off
        return False

    # Overrides take precedence over the flag's own state.
    for kind, ident in _override_targets(ctx):
        decision = _get_override(feature_key, kind, str(ident))
        if decision is None:
            continue  # no override for this target; try the next one
        return decision

    status = flag['status']
    if status == 'rolling' and flag['rollout_pct'] > 0:
        return _in_rollout_bucket(
            feature_key, ctx.user_id, flag['rollout_pct'], flag['sticky']
        )

    # Neither overridden nor mid-rollout: fall back to the global switch.
    return status == 'on'
def _override_targets(ctx: EvalContext):
    """Return the ``(target_type, target_id)`` pairs to look up, in priority order.

    The user pair is always first. The org, country and plan pairs follow in
    that order, each included only when the context value is truthy.
    """
    candidates = [
        ('user', ctx.user_id),
        ('org', ctx.org_id),
        ('country', ctx.country),
        ('plan', ctx.plan),
    ]
    # The user entry is unconditional; the rest are filtered on truthiness.
    return candidates[:1] + [pair for pair in candidates[1:] if pair[1]]
def _in_rollout_bucket(feature_key: str, user_id: int, pct: int, sticky: bool) -> bool:
    """
    Decide whether a user falls inside the rollout percentage of a flag.

    The bucket is the first 8 hex digits of MD5(seed), read as an integer,
    modulo 100, which gives a value in 0-99. The user is in the rollout when
    bucket < pct, so pct=0 admits nobody and pct=100 admits everybody.

    Sticky = same user always gets the same bucket for the same flag
    (seed is "feature_key:user_id").
    Non-sticky (rare) appends the current date to the seed, so assignment
    is reshuffled daily.
    """
    seed = f"{feature_key}:{user_id}" if sticky else f"{feature_key}:{user_id}:{_today()}"
    digest = hashlib.md5(seed.encode()).hexdigest()
    bucket = int(digest[:8], 16) % 100  # 0-99
    return bucket < pct


def _today() -> str:
    """Return the current UTC date as ``YYYY-MM-DD`` for non-sticky seeds.

    UTC is used so that servers in different time zones agree on the date
    and therefore on the bucket.
    """
    # Local import keeps this helper self-contained.
    from datetime import datetime, timezone

    return datetime.now(timezone.utc).date().isoformat()


def _load_flag(feature_key: str) -> Optional[dict]:
    """Fetch a flag's status, rollout_pct and sticky setting, via the Redis cache.

    On a cache miss the Feature table is queried and the result is cached
    for FLAG_CACHE_TTL seconds. Returns None when no Feature row exists;
    that miss is not cached.
    """
    cache_key = f"flag:{feature_key}"
    cached = redis_client.get(cache_key)
    if cached:
        return json.loads(cached)
    row = db.fetchone(
        "SELECT status, rollout_pct, sticky FROM Feature WHERE feature_key=%s",
        (feature_key,)
    )
    if not row:
        return None
    flag = {'status': row['status'], 'rollout_pct': row['rollout_pct'], 'sticky': row['sticky']}
    redis_client.setex(cache_key, FLAG_CACHE_TTL, json.dumps(flag))
    return flag
def _get_override(feature_key: str, target_type: str, target_id: str) -> Optional[bool]:
    """Look up the explicit override for one target, via the Redis cache.

    Returns True or False when a FeatureOverride row exists for the
    (feature_key, target_type, target_id) triple, and None when it does not.
    Both outcomes are cached for FLAG_CACHE_TTL seconds; the absence of a row
    is cached as the marker string 'null'.
    """
    cache_key = f"flag_override:{feature_key}:{target_type}:{target_id}"
    cached = redis_client.get(cache_key)
    if cached is not None:
        # The negative-cache marker must map back to "no override". Comparing
        # it against 'true' would yield False and turn a cached miss into a
        # forced-off override.
        if cached == 'null':
            return None
        return cached == 'true'
    row = db.fetchone(
        "SELECT enabled FROM FeatureOverride WHERE feature_key=%s AND target_type=%s AND target_id=%s",
        (feature_key, target_type, target_id)
    )
    if row is None:
        redis_client.setex(cache_key, FLAG_CACHE_TTL, 'null')
        return None
    redis_client.setex(cache_key, FLAG_CACHE_TTL, 'true' if row['enabled'] else 'false')
    return row['enabled']
Gradual Rollout API
def set_rollout(feature_key: str, pct: int, changed_by: int, reason: str):
    """Advance or retract a percentage rollout. Audit-logged.

    The status is derived from the percentage: 0 gives 'off', 100 gives 'on',
    and anything in between gives 'rolling'. The flag's cache entry is deleted
    after the write.

    Args:
        feature_key: Key of the flag to change.
        pct: New rollout percentage, 0-100 inclusive.
        changed_by: user_id recorded in the audit log.
        reason: Free-text justification recorded in the audit log.

    Raises:
        ValueError: If pct is outside 0-100.
        LookupError: If no Feature row exists for feature_key.
    """
    # Explicit check instead of assert: asserts are stripped under `python -O`,
    # which would let an out-of-range percentage reach the database.
    if not 0 <= pct <= 100:
        raise ValueError(f"pct must be between 0 and 100, got {pct!r}")

    old = db.fetchone("SELECT status, rollout_pct FROM Feature WHERE feature_key=%s", (feature_key,))
    if not old:
        # Without this guard the UPDATE would match no rows, yet an audit
        # entry would still be written for a flag that does not exist.
        raise LookupError(f"unknown feature flag: {feature_key!r}")

    if 0 < pct < 100:
        new_status = 'rolling'
    elif pct == 100:
        new_status = 'on'
    else:
        new_status = 'off'

    # NOTE(review): the update and the audit insert are separate statements;
    # whether they share a transaction depends on how `db` is configured — confirm.
    db.execute("""
    UPDATE Feature SET rollout_pct=%s, status=%s, updated_at=NOW()
    WHERE feature_key=%s
    """, (pct, new_status, feature_key))
    db.execute("""
    INSERT INTO FeatureAuditLog (feature_key, changed_by, old_value, new_value, reason)
    VALUES (%s, %s, %s, %s, %s)
    """, (
        feature_key, changed_by,
        json.dumps(old), json.dumps({'status': new_status, 'rollout_pct': pct}),
        reason
    ))
    _invalidate_flag_cache(feature_key)
def kill_switch(feature_key: str, changed_by: int, reason: str):
    """Turn the flag off globally by setting its rollout to 0 (audit-logged).

    NOTE(review): is_enabled consults explicit overrides before the flag's
    status, so a target with an enabling override is not switched off by
    this call.
    """
    set_rollout(feature_key, pct=0, changed_by=changed_by, reason=reason)
def _invalidate_flag_cache(feature_key: str):
    """Drop the cached state of one flag so the next evaluation reloads it.

    Only the flag entry is removed. Cached override entries are left to
    expire on their own within FLAG_CACHE_TTL.
    """
    flag_cache_key = f"flag:{feature_key}"
    redis_client.delete(flag_cache_key)
SDK Usage in Application Code
# In any application handler:
from feature_flags import is_enabled, EvalContext
def checkout_handler(request):
    """Serve checkout through the v2 flow when 'checkout_v2' is enabled for the caller.

    Builds the evaluation context from the request's user and geo data, then
    dispatches to checkout_v2 or checkout_v1.
    """
    user = request.user
    ctx = EvalContext(
        user_id=user.id,
        org_id=user.org_id,
        country=request.geo.country_code,
        plan=user.plan,
    )
    handler = checkout_v2 if is_enabled('checkout_v2', ctx) else checkout_v1
    return handler(request)
Observability: Exposure Logging
# Log every flag evaluation for analytics and guardrail metrics
def is_enabled_logged(feature_key: str, ctx: EvalContext) -> bool:
    """Evaluate a flag and record the exposure for analytics.

    Returns exactly what is_enabled returns. A 'flag_evaluated' event carrying
    the flag key, the user id, the result and a Unix timestamp is sent through
    analytics.track.
    """
    # None of the snippets import `time`; importing it here keeps the function
    # from failing with NameError when it builds the event.
    import time

    result = is_enabled(feature_key, ctx)
    # Fire-and-forget async log (Kafka or in-process queue)
    analytics.track('flag_evaluated', {
        'feature_key': feature_key,
        'user_id': ctx.user_id,
        'enabled': result,
        'ts': time.time(),
    })
    return result
# Downstream: join flag exposures with conversion events in data warehouse.
# SELECT f.feature_key, f.enabled, COUNT(*) AS users, SUM(c.converted) AS conversions
# FROM FlagExposure f JOIN ConversionEvent c USING (user_id, session_id)
# GROUP BY 1, 2 ORDER BY 1, 2;
Key Design Decisions
- Deterministic hash bucketing: the bucket is the first 32 bits of MD5("feature_key:user_id") taken modulo 100, and a user is in the rollout when that bucket is below the rollout percentage. This ensures the same user always gets the same treatment for a given flag — no flicker across page loads or API calls. Using the feature key in the seed means user 42 can be in the 5% bucket for “checkout_v2” but not for “dark_mode” — flags are independent.
- Override priority order: user > org > country > plan > rollout > global. Kill switches use user/org overrides with enabled=False — they fire before the percentage bucket is checked, so beta users get the flag but a banned user does not.
- 60-second Redis cache: flag state reads hit Redis, not Postgres — evaluation adds ~0.5ms. On update, the flag's own cache entry is invalidated synchronously (delete on write); cached override entries are not deleted and can stay stale for up to 60 seconds, which is acceptable for gradual rollouts. For emergency kill switches, explicitly flush the cached override entries as well after writing.
- Audit log for compliance: every rollout change records who changed it, from what state, and why. Enables post-incident review (“who turned on feature X at 2 AM?”).
Feature rollout and flag evaluation system design is discussed in Google system design interview questions.
Feature rollout and gradual deployment system design is covered in Meta system design interview preparation.
Feature rollout and canary deployment design is discussed in LinkedIn system design interview guide.
See also: Scale AI Interview Guide 2026: Data Infrastructure, RLHF Pipelines, and ML Engineering
See also: Anthropic Interview Guide 2026: Process, Questions, and AI Safety