Tagging System Low-Level Design

Tagging System — Low-Level Design

A tagging system allows attaching labels to content for categorization, filtering, and discovery. It must support fast tag lookup, tag autocomplete, tag co-occurrence, and flexible many-to-many associations. This design appears in interviews at Stack Overflow, Medium, and any content platform.

Core Data Model

Tag
  id              BIGSERIAL PK
  name            TEXT UNIQUE NOT NULL   -- normalized: lowercase, trimmed
  slug            TEXT UNIQUE NOT NULL   -- URL-safe: 'machine-learning'
  usage_count     INT DEFAULT 0          -- denormalized: how many items use this tag
  created_at      TIMESTAMPTZ

ContentTag
  content_id      BIGINT NOT NULL
  content_type    TEXT NOT NULL          -- 'post', 'question', 'product'
  tag_id          BIGINT FK NOT NULL
  created_at      TIMESTAMPTZ
  PRIMARY KEY (content_id, content_type, tag_id)

-- Indexes
CREATE INDEX idx_content_tags ON ContentTag(tag_id, content_id);
CREATE INDEX idx_tag_search ON Tag USING GIN(name gin_trgm_ops);

Tag Normalization

def normalize_tag(raw_tag):
    """Normalize user input to canonical tag form."""
    tag = raw_tag.strip().lower()
    # Replace spaces and underscores with hyphens
    tag = re.sub(r'[s_]+', '-', tag)
    # Remove non-alphanumeric characters except hyphens
    tag = re.sub(r'[^a-z0-9-]', '', tag)
    # Collapse multiple hyphens
    tag = re.sub(r'-+', '-', tag).strip('-')
    if not tag:
        raise ValueError('Tag name is empty after normalization')
    if len(tag) > 50:
        raise ValueError('Tag name too long')
    return tag

def get_or_create_tag(raw_name):
    name = normalize_tag(raw_name)
    slug = name  # slug == normalized name in this scheme
    result = db.execute("""
        INSERT INTO Tag (name, slug, created_at)
        VALUES (%(name)s, %(slug)s, NOW())
        ON CONFLICT (name) DO UPDATE SET name=EXCLUDED.name
        RETURNING *
    """, {'name': name, 'slug': slug})
    return result

Adding and Removing Tags

def set_tags(content_id, content_type, raw_tags, max_tags=10):
    """Replace the full tag set for a piece of content."""
    if len(raw_tags) > max_tags:
        raise ValueError(f'Maximum {max_tags} tags allowed')

    tag_ids = [get_or_create_tag(t).id for t in raw_tags]

    with db.transaction():
        # Get current tags
        current_ids = {row.tag_id for row in db.execute("""
            SELECT tag_id FROM ContentTag
            WHERE content_id=%(cid)s AND content_type=%(ctype)s
        """, {'cid': content_id, 'ctype': content_type})}

        to_add = set(tag_ids) - current_ids
        to_remove = current_ids - set(tag_ids)

        # Add new tags
        for tag_id in to_add:
            db.execute("""
                INSERT INTO ContentTag (content_id, content_type, tag_id, created_at)
                VALUES (%(cid)s, %(ctype)s, %(tid)s, NOW())
                ON CONFLICT DO NOTHING
            """, {'cid': content_id, 'ctype': content_type, 'tid': tag_id})
            db.execute("UPDATE Tag SET usage_count=usage_count+1 WHERE id=%(id)s",
                       {'id': tag_id})

        # Remove old tags
        for tag_id in to_remove:
            db.execute("""
                DELETE FROM ContentTag
                WHERE content_id=%(cid)s AND content_type=%(ctype)s AND tag_id=%(tid)s
            """, {'cid': content_id, 'ctype': content_type, 'tid': tag_id})
            db.execute("UPDATE Tag SET usage_count=GREATEST(0, usage_count-1) WHERE id=%(id)s",
                       {'id': tag_id})

Tag Autocomplete

-- Trigram index enables fast ILIKE prefix and fuzzy matching
def autocomplete_tags(query, limit=10):
    normalized = query.strip().lower()
    return db.execute("""
        SELECT name, slug, usage_count
        FROM Tag
        WHERE name ILIKE %(prefix)s
        ORDER BY
          -- Exact prefix match first, then by popularity
          CASE WHEN name LIKE %(prefix)s THEN 0 ELSE 1 END,
          usage_count DESC
        LIMIT %(limit)s
    """, {'prefix': normalized + '%', 'limit': limit})

-- For fuzzy matching (handles typos): use similarity()
SELECT name, slug, similarity(name, %(query)s) AS sim
FROM Tag
WHERE name % %(query)s  -- trigram similarity threshold (default 0.3)
ORDER BY sim DESC, usage_count DESC
LIMIT 10;

Browsing Content by Tag

def get_content_by_tag(tag_slug, content_type, cursor=None, limit=20):
    tag = db.get_by(Tag, slug=tag_slug)
    if not tag:
        return []

    cursor_clause = 'AND ct.created_at < %(cursor)s' if cursor else ''
    return db.execute(f"""
        SELECT c.*, ct.created_at as tagged_at
        FROM ContentTag ct
        JOIN Content c ON ct.content_id=c.id
        WHERE ct.tag_id=%(tid)s
          AND ct.content_type=%(ctype)s
          AND c.status='published'
          {cursor_clause}
        ORDER BY ct.created_at DESC
        LIMIT %(limit)s
    """, {'tid': tag.id, 'ctype': content_type, 'limit': limit})
-- Tags that frequently appear together with a given tag
def get_related_tags(tag_id, limit=10):
    return db.execute("""
        SELECT t2.tag_id, t.name, t.slug, COUNT(*) as co_count
        FROM ContentTag t1
        JOIN ContentTag t2 ON t1.content_id=t2.content_id
                          AND t1.content_type=t2.content_type
                          AND t2.tag_id != t1.tag_id
        JOIN Tag t ON t2.tag_id=t.id
        WHERE t1.tag_id=%(tag_id)s
        GROUP BY t2.tag_id, t.name, t.slug
        ORDER BY co_count DESC
        LIMIT %(limit)s
    """, {'tag_id': tag_id, 'limit': limit})

-- Cache this result (TTL=1 hour) — it's expensive to compute and changes slowly

Key Interview Points

  • Normalize on write: Normalize tags (lowercase, trim, replace spaces) when they are created, not on every read. This ensures “Machine Learning”, “machine-learning”, and “machine_learning” all resolve to the same tag record.
  • usage_count denormalization: Counting ContentTag rows per tag is O(N). Maintain usage_count on Tag to support “popular tags” queries in O(1).
  • Set semantics for tag assignment: Use set_tags (replace entire tag set) rather than add_tag/remove_tag operations to avoid race conditions when the UI sends the updated list.
  • Trigram index for autocomplete: GIN(name gin_trgm_ops) supports both prefix and fuzzy matching efficiently. Without it, ILIKE queries require a full table scan.

Tagging system and hashtag design is discussed in Twitter system design interview questions.

Product tagging and categorization system design is covered in Shopify system design interview preparation.

Tagging and skill system design is discussed in LinkedIn system design interview guide.

Scroll to Top