Low-Level Design: Customer Support Ticketing System (SLA, Routing, State Machine)

Low-Level Design: Customer Support Ticketing System

A ticketing system manages customer support requests from creation through resolution. It involves ticket routing, priority queuing, SLA tracking, and agent assignment. Asked at Atlassian, Zendesk, and Stripe.

Core Entities


from dataclasses import dataclass, field
from enum import Enum
from datetime import datetime, timedelta
from typing import Optional
import uuid

class Priority(Enum):
    """Ticket urgency level; a larger value means a more urgent ticket.

    The member selects the SLA windows in SLA_RESPONSE / SLA_RESOLUTION, and
    its integer value feeds the ordering score in TicketQueue._score.
    """
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    URGENT = 4

class TicketStatus(Enum):
    """Lifecycle state of a ticket.

    The moves allowed between states are listed in
    TicketService.VALID_TRANSITIONS.
    """
    OPEN = "open"  # not placed with an agent (auto-assignment leaves tickets here)
    ASSIGNED = "assigned"  # set by TicketService._assign
    IN_PROGRESS = "in_progress"
    PENDING_CUSTOMER = "pending_customer"  # presumably waiting on a customer reply
    RESOLVED = "resolved"  # TicketService.transition stamps resolved_at on entry
    CLOSED = "closed"  # terminal: no outgoing transitions

class Category(Enum):
    """Topic of a ticket; matched against Agent.skills during auto-assignment."""
    BILLING = "billing"
    TECHNICAL = "technical"
    ACCOUNT = "account"
    GENERAL = "general"

# SLA windows keyed by priority, both measured from ticket creation.
# SLA_RESPONSE: maximum time until the first agent response.
SLA_RESPONSE = {
    level: timedelta(hours=hours)
    for level, hours in (
        (Priority.URGENT, 1),
        (Priority.HIGH, 4),
        (Priority.MEDIUM, 8),
        (Priority.LOW, 24),
    )
}
# SLA_RESOLUTION: maximum time until the ticket is resolved.
SLA_RESOLUTION = {
    level: timedelta(hours=hours)
    for level, hours in (
        (Priority.URGENT, 4),
        (Priority.HIGH, 24),
        (Priority.MEDIUM, 72),   # 3 days
        (Priority.LOW, 168),     # 7 days
    )
}

@dataclass
class Customer:
    """A customer who files support tickets."""
    customer_id: str
    name: str
    email: str
    # One of "standard", "premium", "enterprise". TicketQueue._score reads this
    # to add a tier bonus; any other value gets no bonus.
    tier: str = "standard"  # "standard", "premium", "enterprise"

@dataclass
class Agent:
    """Support agent who can take tickets in the categories listed in ``skills``.

    ``current_load`` is the number of tickets currently counted against the
    agent; TicketService adjusts it on assignment and resolution.
    """
    agent_id: str
    name: str
    email: str
    skills: list[Category]
    current_load: int = 0
    max_load: int = 10

    @property
    def is_available(self) -> bool:
        """Return True while the agent is below ``max_load``."""
        return self.current_load < self.max_load

# NOTE(review): the published listing lost everything between "<" and the next
# ">" here. Comment and Ticket below are rebuilt from how TicketService and
# TicketQueue construct and read them; confirm against the original source.

@dataclass
class Comment:
    """A single message attached to a ticket."""
    comment_id: str
    author_id: str
    # TicketService.add_comment treats "agent" as a staff reply for SLA
    # purposes; other values (presumably "customer") are not interpreted.
    author_type: str
    body: str
    # Internal notes do not count as a first response to the customer.
    is_internal: bool = False
    # Not read by the visible code; kept so comments can be ordered.
    created_at: datetime = field(default_factory=datetime.utcnow)

@dataclass
class Ticket:
    """A customer support request with SLA deadlines derived from its priority.

    Timestamps are naive UTC (``datetime.utcnow``), matching the values that
    TicketService writes into ``first_response_at`` and ``resolved_at``.
    """
    ticket_id: str
    customer_id: str
    subject: str
    description: str
    priority: Priority
    category: Category
    status: TicketStatus = TicketStatus.OPEN
    assigned_agent_id: Optional[str] = None
    created_at: datetime = field(default_factory=datetime.utcnow)
    first_response_at: Optional[datetime] = None
    resolved_at: Optional[datetime] = None
    comments: list[Comment] = field(default_factory=list)

    @property
    def response_due_at(self) -> datetime:
        """Deadline for the first agent response."""
        return self.created_at + SLA_RESPONSE[self.priority]

    @property
    def resolution_due_at(self) -> datetime:
        """Deadline for resolving the ticket."""
        return self.created_at + SLA_RESOLUTION[self.priority]

    @property
    def is_response_breached(self) -> bool:
        """Return True if no response has been recorded and the deadline passed."""
        if self.first_response_at is not None:
            return False  # already responded
        return datetime.utcnow() > self.response_due_at

    @property
    def is_resolution_breached(self) -> bool:
        """Return True if the ticket is unresolved and the deadline passed."""
        if self.resolved_at is not None:
            return False  # already resolved
        return datetime.utcnow() > self.resolution_due_at

Ticket Service and Routing


class TicketService:
    """In-memory ticket service: creation, routing, status changes, comments.

    Tickets are kept in a dict owned by the service; agents come from the
    ``agent_store`` mapping passed in. No synchronization is performed here.
    """

    # Allowed next statuses for each current status; CLOSED is terminal.
    VALID_TRANSITIONS = {
        TicketStatus.OPEN:             {TicketStatus.ASSIGNED},
        TicketStatus.ASSIGNED:         {TicketStatus.IN_PROGRESS, TicketStatus.OPEN},
        TicketStatus.IN_PROGRESS:      {TicketStatus.PENDING_CUSTOMER, TicketStatus.RESOLVED},
        TicketStatus.PENDING_CUSTOMER: {TicketStatus.IN_PROGRESS, TicketStatus.RESOLVED},
        TicketStatus.RESOLVED:         {TicketStatus.CLOSED, TicketStatus.IN_PROGRESS},
        TicketStatus.CLOSED:           set(),
    }

    def __init__(self, agent_store, notifier):
        """Create a service.

        Args:
            agent_store: Mapping of agent_id to Agent; read and mutated
                (``current_load``) by this service.
            notifier: Object providing ``notify_customer(customer_id, msg)``
                and ``notify_agent(agent_id, msg)``.
        """
        self._tickets: dict[str, Ticket] = {}
        self._agents: dict[str, Agent] = agent_store
        self.notifier = notifier

    def create_ticket(self, customer_id: str, subject: str, description: str,
                       category: Category, priority: Priority = Priority.MEDIUM) -> Ticket:
        """Create and store a ticket, try to auto-assign it, and notify the customer.

        Returns:
            The new ticket; its status is ASSIGNED if an agent was found,
            otherwise it keeps its initial status.
        """
        ticket = Ticket(
            ticket_id=str(uuid.uuid4()),
            customer_id=customer_id,
            subject=subject,
            description=description,
            priority=priority,
            category=category,
        )
        self._tickets[ticket.ticket_id] = ticket
        self._auto_assign(ticket)
        self.notifier.notify_customer(customer_id, f"Ticket #{ticket.ticket_id} created")
        return ticket

    def _auto_assign(self, ticket: Ticket) -> None:
        """Assign to least-loaded available agent skilled in the ticket's category."""
        candidates = [
            a for a in self._agents.values()
            if a.is_available and ticket.category in a.skills
        ]
        if not candidates:
            return  # ticket stays OPEN, will be manually assigned or retried
        best = min(candidates, key=lambda a: a.current_load)
        self._assign(ticket, best.agent_id)

    def _assign(self, ticket: Ticket, agent_id: str) -> None:
        """Bind the ticket to the agent, count it against their load, notify them.

        Raises:
            KeyError: If ``agent_id`` is not in the agent store.
        """
        agent = self._agents[agent_id]
        ticket.assigned_agent_id = agent_id
        ticket.status = TicketStatus.ASSIGNED
        agent.current_load += 1
        self.notifier.notify_agent(agent_id, f"Ticket #{ticket.ticket_id} assigned to you")

    def _assigned_agent(self, ticket: Ticket) -> Optional[Agent]:
        """Return the ticket's agent, or None if unassigned or no longer in the store."""
        if not ticket.assigned_agent_id:
            return None
        return self._agents.get(ticket.assigned_agent_id)

    def transition(self, ticket_id: str, new_status: TicketStatus,
                    actor_id: str) -> Ticket:
        """Move a ticket to ``new_status`` and keep agent load in step.

        Load accounting: a ticket counts against its agent from assignment
        until it is resolved or handed back to OPEN; reopening a resolved
        ticket counts it again.

        Args:
            ticket_id: Id of the ticket to change.
            new_status: Target status; must be allowed from the current one.
            actor_id: Who requested the change. Not used by this implementation.

        Returns:
            The updated ticket.

        Raises:
            ValueError: If the ticket is unknown or the transition is not
                listed in VALID_TRANSITIONS.
        """
        ticket = self._tickets.get(ticket_id)
        if ticket is None:
            raise ValueError(f"Ticket {ticket_id} not found")
        if new_status not in self.VALID_TRANSITIONS[ticket.status]:
            raise ValueError(f"Cannot transition {ticket.status} -> {new_status}")
        old_status = ticket.status
        ticket.status = new_status
        agent = self._assigned_agent(ticket)
        if new_status == TicketStatus.RESOLVED:
            ticket.resolved_at = datetime.utcnow()
            if agent is not None:
                agent.current_load -= 1
        elif old_status == TicketStatus.RESOLVED and new_status == TicketStatus.IN_PROGRESS:
            # Reopen: the ticket is live again, so it must be unresolved for
            # SLA checks and must count against the agent once more;
            # otherwise the next resolve would decrement the load twice.
            # This may push the agent above max_load.
            ticket.resolved_at = None
            if agent is not None:
                agent.current_load += 1
        elif old_status == TicketStatus.ASSIGNED and new_status == TicketStatus.OPEN:
            # Handed back to the pool: free the agent's slot and drop the
            # stale assignment so the ticket can be routed again.
            if agent is not None:
                agent.current_load -= 1
            ticket.assigned_agent_id = None
        return ticket

    def add_comment(self, ticket_id: str, author_id: str, author_type: str,
                     body: str, is_internal: bool = False) -> Comment:
        """Append a comment to a ticket and record the first public agent reply.

        Only a non-internal comment with ``author_type == "agent"`` sets
        ``first_response_at``, and only if it is not already set.

        Raises:
            ValueError: If the ticket is unknown.
        """
        ticket = self._tickets.get(ticket_id)
        if ticket is None:
            raise ValueError(f"Ticket {ticket_id} not found")
        comment = Comment(
            comment_id=str(uuid.uuid4()),
            author_id=author_id,
            author_type=author_type,
            body=body,
            is_internal=is_internal,
        )
        ticket.comments.append(comment)
        if author_type == "agent" and ticket.first_response_at is None and not is_internal:
            ticket.first_response_at = datetime.utcnow()
        return comment

    def get_sla_breached_tickets(self) -> list[Ticket]:
        """Return every stored ticket whose response or resolution SLA is breached."""
        return [t for t in self._tickets.values()
                if t.is_response_breached or t.is_resolution_breached]

Priority Queue for Unassigned Tickets


import heapq

class TicketQueue:
    """Priority queue weighting urgent tickets from premium customers highest.

    Ordering is by score (priority value * 10 plus a customer-tier bonus),
    then by ``created_at`` (older first), then by insertion order. Because
    the smallest tier step (50) exceeds the largest priority term (40), a
    higher tier always outranks a lower tier regardless of priority.
    """

    # Score bonus per customer tier; tiers not listed here get no bonus.
    _TIER_BONUS = {"enterprise": 100, "premium": 50, "standard": 0}

    def __init__(self, customer_store):
        """Create an empty queue.

        Args:
            customer_store: Mapping-like object whose ``get(customer_id)``
                returns an object with a ``tier`` attribute, or None.
        """
        self._heap = []  # entries: (score, created_at, seq, ticket)
        # Monotonic tie-breaker: guarantees heapq never has to compare two
        # ticket objects (which define no ordering) when score and
        # created_at are equal, and keeps such ties first-in, first-out.
        self._seq = 0
        self.customers = customer_store

    def _score(self, ticket: "Ticket") -> int:
        """Return the heap key for a ticket; more negative means served sooner."""
        customer = self.customers.get(ticket.customer_id)
        tier = customer.tier if customer else "standard"
        return -(ticket.priority.value * 10 + self._TIER_BONUS.get(tier, 0))

    def push(self, ticket: "Ticket") -> None:
        """Add a ticket to the queue."""
        entry = (self._score(ticket), ticket.created_at, self._seq, ticket)
        self._seq += 1
        heapq.heappush(self._heap, entry)

    def pop(self) -> "Ticket":
        """Remove and return the highest-ranked ticket.

        Raises:
            IndexError: If the queue is empty.
        """
        if not self._heap:
            raise IndexError("Queue empty")
        return heapq.heappop(self._heap)[-1]

Design Decisions

| Decision | Choice | Rationale |
|---|---|---|
| Status transitions | State machine with VALID_TRANSITIONS dict | Prevents illegal status changes at class level |
| SLA tracking | Computed properties on Ticket | No background job needed; checked lazily |
| Agent load balancing | Min current_load among skilled agents | Simple, fair; extensible with skill rating |
| Internal comments | is_internal flag on Comment | Agents share notes without exposing to customer |

{
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": [
{
"@type": "Question",
"name": "What design patterns are used in a ticketing system?",
"acceptedAnswer": {
"@type": "Answer",
"text": "State machine for ticket lifecycle (OPEN → ASSIGNED → IN_PROGRESS → RESOLVED → CLOSED) — VALID_TRANSITIONS dict prevents illegal status changes. Strategy pattern for routing algorithms (round-robin, least-loaded, skill-based). Observer pattern for notifications (on status change, notify customer and agent without coupling notification code to ticket logic). Priority queue for unassigned ticket dispatching — weighting by urgency and customer tier. Factory pattern for creating tickets with different default priorities based on category."
}
},
{
"@type": "Question",
"name": "How do you implement SLA tracking without a background job?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Compute SLA deadlines as properties derived from ticket creation time and priority: response_due_at = created_at + SLA_RESPONSE[priority]. Check is_breached lazily when the ticket is accessed: return datetime.utcnow() > response_due_at and first_response_at is None. For alerting before breach, run a lightweight periodic job (every 5 minutes) that queries unresolved tickets with response_due_at < now + 30min and publishes alerts. This avoids per-ticket timers while keeping deadline logic simple and testable."
}
},
{
"@type": "Question",
"name": "How do you implement skill-based ticket routing?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Each Agent has a list of categories they handle (skills). When a ticket is created, filter agents by: (1) skills includes ticket.category, (2) current_load < max_load (available). Among candidates, pick the least-loaded agent (min current_load). This is least-connection routing adapted for skill filtering. For priority override: always assign URGENT tickets to the available agent with the most relevant skill, ignoring current load (emergency agent). Unassignable tickets (no available skilled agents) enter a priority queue for manual dispatch or auto-retry."
}
},
{
"@type": "Question",
"name": "How do you handle the re-open flow when a customer replies after resolution?",
"acceptedAnswer": {
"@type": "Answer",
"text": "When a customer adds a comment on a RESOLVED ticket, automatically transition to IN_PROGRESS and re-assign to the last agent (if available) or route to the queue. This is the RESOLVED → IN_PROGRESS transition in the state machine. Track reopened_count on the Ticket to identify chronically unresolved issues. If a ticket is reopened more than N times, escalate to a senior tier. Reset SLA clock on reopen: update created_at or add a separate sla_start_at field so the next response/resolution deadline is measured from the reopen time."
}
},
{
"@type": "Question",
"name": "How would you scale a ticketing system to handle 1M tickets/day?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Shard tickets by customer_id (most reads are customer-scoped). Store ticket content in a document store (Elasticsearch) for full-text search (search by customer name, keyword in description). Use a relational database for ticket metadata and state (status, priority, agent assignment) with proper indexes on (status, priority, created_at). Agent workload data lives in Redis (fast read/write for current_load). Routing decisions are made by a stateless routing service reading from Redis. Background job handles SLA breach alerts and aging escalations."
}
}
]
}

Asked at: Atlassian Interview Guide

Asked at: Shopify Interview Guide

Asked at: Stripe Interview Guide

Asked at: Airbnb Interview Guide

Scroll to Top