Knowledge Base System Low-Level Design
A knowledge base must do three things well: let authors maintain accurate articles without losing history, let users find relevant content quickly, and continuously improve based on whether users found the content helpful. This article covers article versioning with an immutable version history, full-text search using PostgreSQL tsvector with ranking, category and tag taxonomy, helpful/unhelpful feedback aggregation, and related article suggestions via tag overlap scoring.
Article Versioning Design
The core invariant: published versions are immutable. An article is a pointer to its current published version; editors create new draft versions that replace the pointer on publish. This gives you a complete audit trail and the ability to roll back without any soft-delete trickery:
CREATE TABLE article (
id BIGSERIAL PRIMARY KEY,
slug TEXT NOT NULL UNIQUE,
current_version_id BIGINT, -- FK set after first publish
status TEXT NOT NULL DEFAULT 'draft'
CHECK (status IN ('draft','published','archived')),
created_by BIGINT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
published_at TIMESTAMPTZ,
archived_at TIMESTAMPTZ
);
CREATE TABLE article_version (
id BIGSERIAL PRIMARY KEY,
article_id BIGINT NOT NULL REFERENCES article(id),
version_number INT NOT NULL,
title TEXT NOT NULL,
body_markdown TEXT NOT NULL,
body_html TEXT NOT NULL, -- pre-rendered HTML
change_summary TEXT,
author_id BIGINT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
search_vector TSVECTOR, -- populated by trigger
UNIQUE (article_id, version_number)
);
-- Circular FK: article.current_version_id → article_version.id
ALTER TABLE article ADD CONSTRAINT fk_current_version
FOREIGN KEY (current_version_id) REFERENCES article_version(id)
DEFERRABLE INITIALLY DEFERRED;
CREATE INDEX idx_av_article ON article_version (article_id, version_number DESC);
CREATE INDEX idx_av_search ON article_version USING gin (search_vector);
CREATE TABLE article_tag (
article_id BIGINT NOT NULL REFERENCES article(id) ON DELETE CASCADE,
tag TEXT NOT NULL,
PRIMARY KEY (article_id, tag)
);
CREATE INDEX idx_at_tag ON article_tag (tag);
CREATE TABLE article_feedback (
id BIGSERIAL PRIMARY KEY,
article_id BIGINT NOT NULL REFERENCES article(id),
version_id BIGINT NOT NULL REFERENCES article_version(id),
user_id BIGINT, -- NULL for anonymous
session_id TEXT NOT NULL,
helpful BOOLEAN NOT NULL,
comment TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (article_id, session_id) -- one vote per session
);
CREATE INDEX idx_af_article ON article_feedback (article_id, created_at DESC);
tsvector Full-Text Search Setup
PostgreSQL’s tsvector stores pre-processed lexemes for fast GIN index lookups. Populate it via trigger so it stays current automatically:
CREATE OR REPLACE FUNCTION update_article_search_vector()
RETURNS TRIGGER LANGUAGE plpgsql AS $$
BEGIN
NEW.search_vector :=
setweight(to_tsvector('english', coalesce(NEW.title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(NEW.body_markdown, '')), 'B');
RETURN NEW;
END;
$$;
CREATE TRIGGER trg_article_search_vector
BEFORE INSERT OR UPDATE OF title, body_markdown ON article_version
FOR EACH ROW EXECUTE FUNCTION update_article_search_vector();
Full-Text Search with Ranking
from typing import List, Dict, Any, Optional
def search_articles(
db,
query: str,
tag: Optional[str] = None,
limit: int = 20,
offset: int = 0
) -> List[Dict[str, Any]]:
"""
Full-text search over current published versions.
Returns results sorted by ts_rank_cd (cover density ranking).
"""
if not query.strip():
return []
# Convert user query to tsquery; use plainto_tsquery for natural language
params: list = [query, query]
tag_clause = ""
if tag:
tag_clause = "AND EXISTS (SELECT 1 FROM article_tag WHERE article_id=a.id AND tag=%s)"
params.append(tag)
params += [limit, offset]
rows = db.fetchall(
f"""
SELECT
a.id,
a.slug,
av.title,
av.version_number,
ts_rank_cd(av.search_vector, plainto_tsquery('english', %s)) AS rank,
ts_headline('english', av.body_markdown,
plainto_tsquery('english', %s),
'MaxWords=35, MinWords=15, StartSel=, StopSel='
) AS excerpt,
a.published_at,
COALESCE(f.helpful_count, 0) AS helpful_count,
COALESCE(f.unhelpful_count, 0) AS unhelpful_count
FROM article a
JOIN article_version av ON av.id = a.current_version_id
LEFT JOIN (
SELECT article_id,
COUNT(*) FILTER (WHERE helpful=TRUE) AS helpful_count,
COUNT(*) FILTER (WHERE helpful=FALSE) AS unhelpful_count
FROM article_feedback
GROUP BY article_id
) f ON f.article_id = a.id
WHERE a.status = 'published'
AND av.search_vector @@ plainto_tsquery('english', %s)
{tag_clause}
ORDER BY rank DESC, a.published_at DESC
LIMIT %s OFFSET %s
""",
(query, query, query) + ((tag,) if tag else ()) + (limit, offset)
)
return [dict(r) for r in rows]
Publishing a New Version
def publish_version(
db,
article_id: int,
title: str,
body_markdown: str,
author_id: int,
change_summary: str = ''
) -> int:
"""
Creates a new immutable article_version and updates the article pointer.
Returns new version_id.
"""
import markdown # pip install markdown
body_html = markdown.markdown(body_markdown, extensions=['tables', 'fenced_code', 'toc'])
# Next version number
row = db.fetchone(
"SELECT COALESCE(MAX(version_number), 0) + 1 AS next_v FROM article_version WHERE article_id=%s",
(article_id,)
)
next_v = row['next_v']
# Insert immutable version (trigger populates search_vector)
db.execute(
"""INSERT INTO article_version
(article_id, version_number, title, body_markdown, body_html, author_id, change_summary)
VALUES (%s,%s,%s,%s,%s,%s,%s)""",
(article_id, next_v, title, body_markdown, body_html, author_id, change_summary)
)
version_id = db.fetchone("SELECT lastval()")['lastval']
# Update current pointer and mark published (DEFERRABLE FK allows this order)
db.execute(
"""UPDATE article
SET current_version_id=%s, status='published', published_at=NOW()
WHERE id=%s""",
(version_id, article_id)
)
db.commit()
return version_id
Related Articles via Tag Overlap
def get_related_articles(db, article_id: int, limit: int = 5) -> List[Dict]:
"""
Returns articles sharing the most tags with the given article,
excluding the article itself.
"""
return db.fetchall(
"""
SELECT
a.id,
a.slug,
av.title,
COUNT(*)::int AS shared_tags,
a.published_at
FROM article_tag source_tag
JOIN article_tag related_tag ON related_tag.tag = source_tag.tag
AND related_tag.article_id != source_tag.article_id
JOIN article a ON a.id = related_tag.article_id AND a.status = 'published'
JOIN article_version av ON av.id = a.current_version_id
WHERE source_tag.article_id = %s
GROUP BY a.id, a.slug, av.title, a.published_at
ORDER BY shared_tags DESC, a.published_at DESC
LIMIT %s
""",
(article_id, limit)
)
Feedback Aggregation and Usefulness Score
A simple helpful/unhelpful ratio lets you surface low-quality articles for editorial review:
def record_feedback(db, article_id: int, version_id: int,
session_id: str, helpful: bool,
user_id: Optional[int] = None,
comment: str = '') -> bool:
"""
Records feedback. Returns False if session already voted (idempotent).
"""
try:
db.execute(
"""INSERT INTO article_feedback
(article_id, version_id, session_id, helpful, user_id, comment)
VALUES (%s,%s,%s,%s,%s,%s)""",
(article_id, version_id, session_id, helpful, user_id, comment or None)
)
db.commit()
return True
except Exception:
db.rollback()
return False # duplicate session_id
def get_low_quality_articles(db, min_votes: int = 20, max_helpful_ratio: float = 0.40):
"""
Returns articles where fewer than 40% of votes are helpful
and total votes exceed threshold — candidates for revision.
"""
return db.fetchall(
"""
SELECT a.id, a.slug, av.title,
COUNT(*) AS total_votes,
ROUND(AVG(CASE WHEN helpful THEN 1.0 ELSE 0.0 END), 3) AS helpful_ratio
FROM article_feedback af
JOIN article a ON a.id = af.article_id
JOIN article_version av ON av.id = a.current_version_id
WHERE a.status = 'published'
GROUP BY a.id, a.slug, av.title
HAVING COUNT(*) >= %s
AND AVG(CASE WHEN helpful THEN 1.0 ELSE 0.0 END) <= %s
ORDER BY helpful_ratio ASC
""",
(min_votes, max_helpful_ratio)
)
{
“@context”: “https://schema.org”,
“@type”: “FAQPage”,
“mainEntity”: [
{
“@type”: “Question”,
“name”: “How does PostgreSQL tsvector full-text search compare to Elasticsearch for a knowledge base?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “For a knowledge base with up to a few million articles, PostgreSQL tsvector with a GIN index provides excellent full-text search performance with no additional infrastructure. It supports stemming, stop words, phrase search, and result ranking via ts_rank_cd. Elasticsearch is worth the operational overhead when you need custom analyzers, fuzzy matching, faceted search across billions of documents, or multi-language tokenization that PostgreSQL does not handle well.”
}
},
{
“@type”: “Question”,
“name”: “Why store pre-rendered HTML alongside markdown in article_version?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Rendering markdown on every page view is wasteful and introduces rendering latency. Store the HTML alongside the source markdown at write time (when a version is published). The markdown remains the authoritative source for editing and search indexing; the HTML is a cached rendering artifact. If your markdown rendering library changes, regenerate the HTML in a background migration without touching the source.”
}
},
{
“@type”: “Question”,
“name”: “How do you prevent article versioning from causing table bloat?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Set a retention policy for old versions: keep the current version, all versions from the last 90 days, and every 10th version as a milestone. Archive older intermediate versions to cold storage (S3) as JSON. A simple background job enforces this policy. For most knowledge bases, the version table grows slowly enough that no special measures are needed for years.”
}
},
{
“@type”: “Question”,
“name”: “How do you weight the helpful/unhelpful feedback signal to avoid gaming?”,
“acceptedAnswer”: {
“@type”: “Answer”,
“text”: “Limit feedback to one vote per user session (enforced by UNIQUE constraint on article_id + session_id). For logged-in users, additionally deduplicate by user_id. Apply a minimum vote threshold (e.g., 20 votes) before using the ratio for editorial prioritization, so a single negative vote on a new article does not immediately flag it for review. Weight recent feedback more heavily by windowing to the last 90 days.”
}
}
]
}
{“@context”:”https://schema.org”,”@type”:”FAQPage”,”mainEntity”:[{“@type”:”Question”,”name”:”How is article versioning implemented?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Each publish creates an immutable ArticleVersion row; the Article table holds a pointer to the current_version_id; rollback is a pointer update not a content change.”}},{“@type”:”Question”,”name”:”How does full-text search work in PostgreSQL?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”A tsvector column is computed from title and body using to_tsvector(); queries use to_tsquery() with ts_rank() for relevance ordering and GIN index for performance.”}},{“@type”:”Question”,”name”:”How is article feedback aggregated?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”ArticleFeedback rows (helpful/not_helpful) are counted and cached in the Article row as helpful_count/unhelpful_count; a scheduled job refreshes these periodically.”}},{“@type”:”Question”,”name”:”How are related articles suggested?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Articles sharing the most tags are ranked by Jaccard similarity of their tag sets; the top-N are stored in a precomputed related_articles JSONB column updated on publish.”}}]}
See also: Atlassian Interview Guide
See also: Airbnb Interview Guide 2026: Search Systems, Trust and Safety, and Full-Stack Engineering