Tenant onboarding provisions all the resources a new customer needs to use a multi-tenant SaaS product: creating their account, workspace, and initial user; provisioning infrastructure resources (database schema, S3 bucket prefix, Stripe customer); and configuring defaults. The key design requirement is that onboarding completes atomically — a failure halfway through leaves no orphaned resources that must be manually cleaned up. This calls for the saga pattern, with compensation for each provisioned resource.
Core Data Model
-- One row per customer (tenant). A row starts in status 'provisioning' and
-- the activate_tenant onboarding step flips it to 'active'.
CREATE TABLE Tenant (
    tenant_id          UUID         NOT NULL DEFAULT gen_random_uuid(),
    slug               VARCHAR(100) NOT NULL,  -- URL-safe identifier
    name               VARCHAR(255) NOT NULL,
    plan               VARCHAR(50)  NOT NULL DEFAULT 'trial',
    -- status is one of: provisioning, active, suspended, cancelled
    status             VARCHAR(20)  NOT NULL DEFAULT 'provisioning',
    owner_user_id      BIGINT,
    stripe_customer_id VARCHAR(100),
    s3_prefix          VARCHAR(200),
    created_at         TIMESTAMPTZ  DEFAULT NOW(),
    activated_at       TIMESTAMPTZ,            -- written together with status='active'
    PRIMARY KEY (tenant_id),
    UNIQUE (slug)
);
-- Tracks the progress of one onboarding saga run for a tenant.
CREATE TABLE TenantOnboardingJob (
    job_id          UUID        NOT NULL DEFAULT gen_random_uuid(),
    tenant_id       UUID        NOT NULL,
    -- status is one of: pending, running, completed, failed, compensating, rolled_back
    status          VARCHAR(20) NOT NULL DEFAULT 'pending',
    current_step    VARCHAR(50),                  -- step being executed right now
    completed_steps JSONB       NOT NULL DEFAULT '[]',  -- array of finished step names
    error_message   TEXT,
    started_at      TIMESTAMPTZ,
    completed_at    TIMESTAMPTZ,
    created_at      TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (job_id),
    FOREIGN KEY (tenant_id) REFERENCES Tenant (tenant_id)
);
Onboarding Saga Orchestrator
# Forward saga steps, in execution order.
#
# Every reversible step runs before the single irreversible one: the welcome
# email cannot be unsent, so it goes last. That way a failure in any earlier
# step (including activation) can still be fully compensated without the
# customer having been told their workspace is ready.
ONBOARDING_STEPS = [
    'create_db_schema',
    'create_s3_prefix',
    'create_stripe_customer',
    'seed_default_data',
    'activate_tenant',
    'send_welcome_email',
]

# Forward step -> name of the step that undoes it.
# None means there is nothing that can be undone for that step.
COMPENSATION = {
    'create_db_schema': 'drop_db_schema',
    'create_s3_prefix': 'delete_s3_prefix',
    'create_stripe_customer': 'archive_stripe_customer',
    'seed_default_data': 'delete_seeded_data',
    'send_welcome_email': None,  # cannot unsend an email
    'activate_tenant': 'deactivate_tenant',
}
def run_onboarding(job_id: str):
    """Run, or resume, the onboarding saga for one TenantOnboardingJob row.

    Steps from ONBOARDING_STEPS are executed in order. Steps already listed
    in the job's ``completed_steps`` are skipped, so calling this again after
    a worker crash resumes where the previous run stopped.

    If a step raises, the job is marked ``failed`` with the error text and
    ``compensate`` is called for the steps that ran, in execution order
    (``compensate`` walks them in reverse).

    Args:
        job_id: Primary key of the TenantOnboardingJob row to process.

    Raises:
        ValueError: If no job row exists for ``job_id``.
    """
    job = db.fetchone("SELECT * FROM TenantOnboardingJob WHERE job_id=%s", [job_id])
    if job is None:
        raise ValueError(f"Onboarding job {job_id} not found")

    # Never run forward from a finished or (partly) rolled-back job: its
    # completed_steps would be skipped even though their resources may have
    # been compensated away, and the job would be marked 'completed' with
    # nothing provisioned.
    if job['status'] in ('completed', 'compensating', 'rolled_back'):
        return

    tenant = db.fetchone("SELECT * FROM Tenant WHERE tenant_id=%s", [job['tenant_id']])
    completed = set(job['completed_steps'])

    for step in ONBOARDING_STEPS:
        if step in completed:
            continue  # already done — resume from here

        db.execute("""
            UPDATE TenantOnboardingJob SET current_step=%s, status='running'
            WHERE job_id=%s
        """, [step, job_id])

        try:
            result = execute_step(step, tenant)
            # Track in memory first so a failure below still compensates it.
            completed.add(step)
            # Persist the step output on the tenant record BEFORE writing the
            # checkpoint. If the worker dies between the two writes the step
            # is re-run on resume, instead of being skipped with its output
            # (e.g. the Stripe customer id) never saved.
            apply_step_result(tenant['tenant_id'], step, result)
            db.execute("""
                UPDATE TenantOnboardingJob
                SET completed_steps = completed_steps || %s::jsonb
                WHERE job_id=%s
            """, [json.dumps([step]), job_id])
        except Exception as e:
            db.execute("""
                UPDATE TenantOnboardingJob
                SET status='failed', error_message=%s
                WHERE job_id=%s
            """, [str(e), job_id])
            # `completed` is a set, so list(completed) has arbitrary order.
            # Rebuild the real execution order so the rollback is a true
            # reverse of what ran.
            executed_in_order = [s for s in ONBOARDING_STEPS if s in completed]
            # Re-read the tenant so compensations see values written by
            # apply_step_result; the local dict predates them.
            fresh_tenant = db.fetchone(
                "SELECT * FROM Tenant WHERE tenant_id=%s", [tenant['tenant_id']]
            )
            # NOTE(review): the step that raised is not compensated, so any
            # partial side effect it left behind (e.g. a schema created before
            # migrations failed) is not cleaned up here — confirm whether the
            # compensations are safe to run for a step that never completed.
            compensate(job_id, fresh_tenant or tenant, executed_in_order)
            return

    db.execute("""
        UPDATE TenantOnboardingJob SET status='completed', completed_at=NOW()
        WHERE job_id=%s
    """, [job_id])
def compensate(job_id: str, tenant: dict, completed_steps: list):
    """Undo the given onboarding steps in reverse execution order.

    The job is first marked ``compensating``. For each step, the compensating
    step named in COMPENSATION (if any) is run via ``execute_step``. A failing
    compensation is reported through ``alert_ops`` and the remaining
    compensations still run; nothing is retried automatically.

    The job is marked ``rolled_back`` only if every compensation succeeded.
    Otherwise it stays ``compensating`` and the failures are appended to
    ``error_message``, so a job with orphaned resources is never reported as
    cleanly rolled back.

    Args:
        job_id: Primary key of the TenantOnboardingJob row.
        tenant: Tenant row passed through to each compensating step.
        completed_steps: Names of the forward steps to undo, in any order.
    """
    db.execute("UPDATE TenantOnboardingJob SET status='compensating' WHERE job_id=%s", [job_id])

    # Do not trust the caller's ordering (it may come from a set): sort by the
    # canonical position in ONBOARDING_STEPS. Unknown names sort last and the
    # sort is stable, so nothing is dropped.
    position = {name: idx for idx, name in enumerate(ONBOARDING_STEPS)}
    in_run_order = sorted(completed_steps, key=lambda s: position.get(s, len(position)))

    failures = []
    for step in reversed(in_run_order):
        comp = COMPENSATION.get(step)
        if not comp:
            continue  # nothing to undo for this step (e.g. a sent email)
        try:
            execute_step(comp, tenant)
        except Exception as e:
            # Compensation failure — alert ops, do not retry automatically
            alert_ops(f"Compensation failed for {step} on tenant {tenant['tenant_id']}: {e}")
            failures.append(f"{step}: {e}")

    if failures:
        # Leave the job in 'compensating' and keep the original error text.
        db.execute("""
            UPDATE TenantOnboardingJob
            SET error_message = COALESCE(error_message || '; ', '') || %s
            WHERE job_id=%s
        """, ["compensation failed for " + ", ".join(failures), job_id])
        return

    db.execute("UPDATE TenantOnboardingJob SET status='rolled_back' WHERE job_id=%s", [job_id])
Individual Step Implementations
def execute_step(step: str, tenant: dict) -> dict:
    """Execute one onboarding step for ``tenant`` and return its output.

    Args:
        step: Step name, e.g. one of the entries in ONBOARDING_STEPS.
        tenant: Tenant row; this function reads ``slug``, ``tenant_id`` and
            ``name`` from it.

    Returns:
        A dict of values produced by the step (``schema_name``,
        ``s3_prefix`` or ``stripe_customer_id``), or ``{}`` for
        ``activate_tenant``.

    Raises:
        ValueError: If the schema name derived from the tenant slug is not a
            plain identifier or exceeds PostgreSQL's 63-byte limit.
    """
    import re  # local import keeps this block self-contained

    if step == 'create_db_schema':
        schema = f"tenant_{tenant['slug'].replace('-', '_')}"
        # The name is interpolated into SQL (identifiers cannot be bound as
        # parameters), so accept only characters that are safe unquoted.
        if not re.fullmatch(r'[A-Za-z0-9_]+', schema):
            raise ValueError(f"Tenant slug {tenant['slug']!r} is not a valid schema name")
        # PostgreSQL silently truncates identifiers longer than 63 bytes,
        # which could make two long slugs resolve to the same schema.
        if len(schema) > 63:
            raise ValueError(f"Schema name for slug {tenant['slug']!r} exceeds 63 characters")
        db.execute(f"CREATE SCHEMA IF NOT EXISTS {schema}")
        db.execute(f"SET search_path TO {schema}")
        try:
            run_migrations(schema)  # apply base tables for this tenant
        finally:
            # Do not leave the session pointing at the tenant schema: later
            # unqualified queries (Tenant, TenantOnboardingJob) would no
            # longer resolve. NOTE(review): assumes `db` reuses one session
            # and that the session default search_path is the desired one.
            db.execute("RESET search_path")
        return {'schema_name': schema}

    if step == 'create_s3_prefix':
        prefix = f"tenants/{tenant['tenant_id']}/"
        # Create a "folder" by uploading a zero-byte marker
        s3.put_object(Bucket=S3_BUCKET, Key=f"{prefix}.keep", Body=b'')
        return {'s3_prefix': prefix}

    if step == 'create_stripe_customer':
        customer = stripe.Customer.create(
            name=tenant['name'],
            metadata={'tenant_id': str(tenant['tenant_id'])},
            # A retry of this step must not create a second customer for the
            # same tenant, so derive a stable key from the tenant id.
            idempotency_key=f"tenant-onboarding-customer-{tenant['tenant_id']}",
        )
        return {'stripe_customer_id': customer.id}

    if step == 'activate_tenant':
        db.execute("""
            UPDATE Tenant SET status='active', activated_at=NOW()
            WHERE tenant_id=%s
        """, [tenant['tenant_id']])
        return {}

    # NOTE(review): any other step name (including 'seed_default_data',
    # 'send_welcome_email' and every compensation step) falls through and
    # returns None without doing anything — confirm these are handled
    # elsewhere before relying on this function for them.
Key Interview Points
- The saga pattern is essential here — onboarding touches DB, S3, Stripe, and email. No single transaction can span these systems; each step must have a compensating action.
- Idempotency at each step: CREATE SCHEMA IF NOT EXISTS and S3 put_object are naturally idempotent — safe to retry. Stripe customer creation requires an idempotency key header to prevent duplicate customers on retry.
- Compensation cannot unsend emails or undo notifications — design the onboarding sequence so irreversible steps (email, notifications) come last, after all reversible infrastructure steps succeed.
- Resumability: the completed_steps JSON array checkpoints progress. If the job worker crashes mid-onboarding, re-running the job skips already-completed steps.
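- Tenant schema isolation (CREATE SCHEMA per tenant) provides schema-level (namespace) isolation at the DB level, which is stronger than row-level filtering on a shared table — queries must be namespaced (schema-qualified, or run with the tenant's search_path), and cross-tenant data leakage is prevented by schema boundaries as long as each connection's search_path and privileges are scoped to the correct tenant.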
- Async onboarding UX: return immediately from the POST /tenants endpoint with tenant_id and status=provisioning. Poll GET /tenants/{id}/status or use a webhook to notify when activation is complete.
Tenant onboarding and multi-step payment saga design is discussed in Stripe system design interview questions.
Tenant onboarding and SaaS multi-tenant provisioning design is covered in Atlassian system design interview preparation.
Tenant onboarding and distributed saga orchestration design is discussed in Amazon system design interview guide.