A content tagging system lets users and systems attach descriptive labels to content — blog posts, products, images, videos — enabling discovery, filtering, and recommendations. Core challenges: normalizing tag input (avoiding “nodejs”, “Node.js”, “node js” proliferating as separate tags), enforcing tag cardinality limits, efficient tag-based search, tag suggestion/autocomplete, and managing machine-generated vs user-generated tags.
Core Data Model
-- Canonical tag registry (one row per unique tag)
CREATE TABLE Tag (
tag_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
slug TEXT NOT NULL UNIQUE, -- normalized: 'nodejs', 'machine-learning'
display_name TEXT NOT NULL, -- original casing: 'Node.js', 'Machine Learning'
description TEXT,
usage_count INT NOT NULL DEFAULT 0, -- denormalized for autocomplete ranking
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_tag_slug ON Tag (slug);
CREATE INDEX idx_tag_usage ON Tag (usage_count DESC);
-- Many-to-many: content ↔ tags
CREATE TABLE ContentTag (
content_type TEXT NOT NULL, -- 'post', 'product', 'video'
content_id TEXT NOT NULL,
tag_id UUID NOT NULL REFERENCES Tag(tag_id),
source TEXT NOT NULL DEFAULT 'user', -- 'user', 'ml_model', 'admin'
confidence NUMERIC(3,2), -- 0.0–1.0 for ML-generated tags
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (content_type, content_id, tag_id)
);
CREATE INDEX idx_contenttag_tag ON ContentTag (tag_id, content_type);
-- Synonym mapping for tag normalization
CREATE TABLE TagSynonym (
raw_input TEXT NOT NULL PRIMARY KEY, -- 'nodejs', 'node js', 'node.js'
canonical_tag_id UUID NOT NULL REFERENCES Tag(tag_id)
);
Normalizing and Applying Tags
import re
from uuid import uuid4
import psycopg2
MAX_TAGS_PER_CONTENT = 20
MAX_TAG_LENGTH = 50
def normalize_tag_input(raw: str) -> str:
"""
Normalize user tag input to a canonical slug.
'Node.JS' → 'nodejs', 'machine learning' → 'machine-learning'
"""
slug = raw.lower().strip()
slug = re.sub(r'[^a-z0-9]+', '-', slug) # replace non-alphanumeric with hyphen
slug = slug.strip('-') # strip leading/trailing hyphens
slug = re.sub(r'-+', '-', slug) # collapse multiple hyphens
return slug[:MAX_TAG_LENGTH]
def get_or_create_tag(conn, display_name: str) -> str:
"""
Get existing tag by slug or create new one.
Returns tag_id. Checks synonym table first.
"""
slug = normalize_tag_input(display_name)
with conn.cursor() as cur:
# Check synonym table first
cur.execute("SELECT canonical_tag_id FROM TagSynonym WHERE raw_input=%s", (slug,))
row = cur.fetchone()
if row:
return str(row[0])
# Check direct tag match
cur.execute("SELECT tag_id FROM Tag WHERE slug=%s", (slug,))
row = cur.fetchone()
if row:
return str(row[0])
# Create new tag
tag_id = str(uuid4())
cur.execute(
"INSERT INTO Tag (tag_id, slug, display_name) VALUES (%s,%s,%s)",
(tag_id, slug, display_name.strip())
)
conn.commit()
return tag_id
def apply_tags(conn, content_type: str, content_id: str,
tag_names: list[str], source: str = 'user') -> list[str]:
"""
Apply a list of tags to a content item.
Enforces tag limit, normalizes, creates missing tags.
Returns list of applied tag_ids.
"""
if len(tag_names) > MAX_TAGS_PER_CONTENT:
raise ValueError(f"Maximum {MAX_TAGS_PER_CONTENT} tags allowed")
tag_ids = []
with conn.cursor() as cur:
for name in tag_names:
if not name.strip():
continue
tag_id = get_or_create_tag(conn, name)
tag_ids.append(tag_id)
cur.execute("""
INSERT INTO ContentTag (content_type, content_id, tag_id, source)
VALUES (%s,%s,%s,%s)
ON CONFLICT (content_type, content_id, tag_id) DO NOTHING
""", (content_type, content_id, tag_id, source))
# Update usage counts
if tag_ids:
cur.execute(
"UPDATE Tag SET usage_count = usage_count + 1 WHERE tag_id = ANY(%s)",
(tag_ids,)
)
conn.commit()
return tag_ids
Tag Autocomplete (Redis Sorted Sets)
import redis
r = redis.Redis(host='redis', decode_responses=True)
AUTOCOMPLETE_KEY = "tags:autocomplete"
def index_tag_for_autocomplete(tag_id: str, slug: str, usage_count: int):
"""
Index all prefixes of the tag slug into a sorted set scored by usage.
'python' → index 'p', 'py', 'pyt', 'pyth', 'pytho', 'python'
"""
pipeline = r.pipeline()
for i in range(1, len(slug) + 1):
prefix = slug[:i]
pipeline.zadd(f"tags:prefix:{prefix}", {slug: usage_count})
pipeline.zremrangebyrank(f"tags:prefix:{prefix}", 0, -21) # keep top 20
pipeline.execute()
def autocomplete_tags(prefix: str, limit: int = 10) -> list[str]:
"""Return top matching tag slugs for a prefix, ordered by usage."""
key = f"tags:prefix:{normalize_tag_input(prefix)}"
return r.zrevrange(key, 0, limit - 1) # highest score = most used
def rebuild_autocomplete_index(conn):
"""
Batch job: rebuild autocomplete index from Tag table.
Run on deploy or after bulk tag imports.
"""
with conn.cursor() as cur:
cur.execute("SELECT tag_id, slug, usage_count FROM Tag WHERE usage_count > 0 ORDER BY usage_count DESC LIMIT 50000")
tags = cur.fetchall()
for tag_id, slug, count in tags:
index_tag_for_autocomplete(tag_id, slug, count)
Tag-Based Content Search
def search_by_tags(conn, content_type: str, tag_slugs: list[str],
operator: str = 'AND', limit: int = 20,
before_id: str | None = None) -> list[str]:
"""
Find content items that have ALL (AND) or ANY (OR) of the specified tags.
Cursor-paginated by content_id.
"""
# Resolve slugs to tag_ids
with conn.cursor() as cur:
cur.execute("SELECT tag_id FROM Tag WHERE slug = ANY(%s)", (tag_slugs,))
tag_ids = [row[0] for row in cur.fetchall()]
if not tag_ids:
return []
if operator == 'AND':
# Content must have ALL specified tags
with conn.cursor() as cur:
cur.execute("""
SELECT content_id
FROM ContentTag
WHERE content_type = %s AND tag_id = ANY(%s)
AND (%s IS NULL OR content_id > %s)
GROUP BY content_id
HAVING COUNT(DISTINCT tag_id) = %s
ORDER BY content_id
LIMIT %s
""", (content_type, tag_ids, before_id, before_id, len(tag_ids), limit))
return [row[0] for row in cur.fetchall()]
else: # OR
with conn.cursor() as cur:
cur.execute("""
SELECT DISTINCT content_id
FROM ContentTag
WHERE content_type = %s AND tag_id = ANY(%s)
AND (%s IS NULL OR content_id > %s)
ORDER BY content_id
LIMIT %s
""", (content_type, tag_ids, before_id, before_id, limit))
return [row[0] for row in cur.fetchall()]
Key Interview Points
- Normalization prevents tag fragmentation: Without normalization, “Node.js”, “nodejs”, “node js”, and “NodeJS” become four separate tags each with a fraction of the true count. The slug (lowercase, alphanumeric + hyphen) is the canonical form. The synonym table handles user-expected inputs: “k8s” → “kubernetes”, “js” → “javascript”. Run a periodic job to merge tags with high overlap in content (tags that appear on the same content 80%+ of the time are likely synonyms).
- Usage count for autocomplete ranking: The autocomplete sorted set is scored by usage_count — popular tags float to the top. Update usage_count in a background job (not inline) to avoid write amplification: every tag application would update the autocomplete sorted set’s score for all 6–15 prefixes of the tag. Instead, increment usage_count in PostgreSQL and re-sync to Redis hourly.
- ML-generated tags with confidence: Vision models auto-tag images; NLP models tag text. Store source=’ml_model’ and confidence=0.85 in ContentTag. Display ML tags with lower prominence than user-applied tags. Periodic review: find ML tags with confidence < 0.7 that users frequently remove — add to a negative feedback loop to retrain the model. ML tags above 0.9 can be auto-applied without user confirmation.
- AND search performance: Tag-AND searches require content that has all K tags. The HAVING COUNT(DISTINCT tag_id) = K approach works but scans all content with ANY of the tags. Optimization: start with the rarest tag (lowest usage_count), get its content IDs, then check if those content items have the other tags: SELECT content_id FROM ContentTag WHERE tag_id = rarest_tag_id, then filter by IN (SELECT content_id FROM ContentTag WHERE tag_id = next_rarest …). Intersect the smaller sets first.
- Tag moderation: Users apply offensive or misleading tags. Solutions: (1) allowlist for sensitive content types (product tags on a marketplace); (2) flagging + auto-blocking of tags matching a blocklist; (3) tag review queue for new tags by new users; (4) shadow-banning: hide flagged tags from other users while still showing them to the tagger (reduces gaming). Track most-applied tags daily for anomaly detection.
Content tagging and project organization system design is discussed in Atlassian system design interview questions.
Content tagging and product catalog design is covered in Shopify system design interview preparation.
Content tagging and skill taxonomy design is discussed in LinkedIn system design interview guide.
See also: Meta Interview Guide 2026: Facebook, Instagram, WhatsApp Engineering
See also: Anthropic Interview Guide 2026: Process, Questions, and AI Safety