Document Versioning System: Low-Level Design
A document versioning system preserves every historical state of a document so users can view, compare, and restore previous versions. The design challenge is balancing storage efficiency (storing full snapshots vs. deltas) against read performance (reconstructing a version at any point in time). This design covers the snapshot-with-delta storage model, three-way merge for concurrent edits, and an efficient diffing strategy.
Core Data Model
CREATE TABLE Document (
doc_id BIGSERIAL PRIMARY KEY,
title VARCHAR(500) NOT NULL,
owner_id BIGINT NOT NULL,
current_version_id BIGINT, -- FK set after first version insert
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE DocumentVersion (
version_id BIGSERIAL PRIMARY KEY,
doc_id BIGINT NOT NULL REFERENCES Document(doc_id) ON DELETE CASCADE,
version_number INT NOT NULL, -- 1, 2, 3 ... monotonically increasing
author_id BIGINT NOT NULL,
commit_message VARCHAR(500),
content_type VARCHAR(20) NOT NULL DEFAULT 'snapshot', -- snapshot, delta
content TEXT, -- full content if snapshot; NULL if delta stored separately
parent_version_id BIGINT REFERENCES DocumentVersion(version_id),
checksum VARCHAR(64) NOT NULL, -- SHA-256 of full content (for integrity)
size_bytes INT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (doc_id, version_number)
);
CREATE TABLE VersionDelta (
delta_id BIGSERIAL PRIMARY KEY,
version_id BIGINT NOT NULL REFERENCES DocumentVersion(version_id) ON DELETE CASCADE,
patch TEXT NOT NULL, -- unified diff format patch
UNIQUE (version_id)
);
CREATE INDEX ON DocumentVersion(doc_id, version_number DESC);
CREATE INDEX ON DocumentVersion(doc_id, content_type);
Write: Snapshot-with-Delta Storage
import hashlib, difflib
from typing import Optional
SNAPSHOT_INTERVAL = 10 # store full snapshot every N versions
def save_version(doc_id: int, new_content: str, author_id: int,
commit_message: str = None) -> int:
"""
Save a new version. Every Nth version is a snapshot; others are stored as deltas.
Returns the new version_id.
"""
latest = db.fetchone("""
SELECT version_id, version_number, content, content_type
FROM DocumentVersion
WHERE doc_id=%s ORDER BY version_number DESC LIMIT 1
""", (doc_id,))
new_version_number = (latest['version_number'] + 1) if latest else 1
checksum = hashlib.sha256(new_content.encode()).hexdigest()
size_bytes = len(new_content.encode('utf-8'))
# Store snapshot every SNAPSHOT_INTERVAL versions, or for the first version
is_snapshot = (new_version_number == 1 or new_version_number % SNAPSHOT_INTERVAL == 0)
if is_snapshot:
version_id = db.fetchone("""
INSERT INTO DocumentVersion
(doc_id, version_number, author_id, commit_message,
content_type, content, parent_version_id, checksum, size_bytes)
VALUES (%s,%s,%s,%s,'snapshot',%s,%s,%s,%s)
RETURNING version_id
""", (doc_id, new_version_number, author_id, commit_message,
new_content, latest['version_id'] if latest else None,
checksum, size_bytes))['version_id']
else:
# Reconstruct current content for diffing
current_content = _reconstruct_version(doc_id, latest['version_id'])
patch = _make_patch(current_content, new_content)
version_id = db.fetchone("""
INSERT INTO DocumentVersion
(doc_id, version_number, author_id, commit_message,
content_type, content, parent_version_id, checksum, size_bytes)
VALUES (%s,%s,%s,%s,'delta',NULL,%s,%s,%s)
RETURNING version_id
""", (doc_id, new_version_number, author_id, commit_message,
latest['version_id'], checksum, size_bytes))['version_id']
db.execute("""
INSERT INTO VersionDelta (version_id, patch) VALUES (%s, %s)
""", (version_id, patch))
# Update document's current version pointer
db.execute("""
UPDATE Document SET current_version_id=%s, updated_at=NOW()
WHERE doc_id=%s
""", (version_id, doc_id))
return version_id
def _make_patch(old: str, new: str) -> str:
old_lines = old.splitlines(keepends=True)
new_lines = new.splitlines(keepends=True)
return ''.join(difflib.unified_diff(old_lines, new_lines, lineterm=''))
def _apply_patch(base: str, patch: str) -> str:
"""Apply a unified diff patch to base content."""
import patch as patch_lib # python-patch library
p = patch_lib.fromstring(patch.encode())
lines = base.splitlines(keepends=True)
patched = p.apply(lines)
return ''.join(patched)
Read: Version Reconstruction
def _reconstruct_version(doc_id: int, target_version_id: int) -> str:
"""
Reconstruct the content of any version efficiently.
Finds the nearest snapshot ancestor, then applies deltas forward.
"""
# Walk back through parent chain to find nearest snapshot
chain = [] # [(version_id, content_type, patch or content), ...]
vid = target_version_id
snapshot_content = None
while vid:
row = db.fetchone("""
SELECT v.version_id, v.content_type, v.content, v.parent_version_id,
d.patch
FROM DocumentVersion v
LEFT JOIN VersionDelta d ON d.version_id=v.version_id
WHERE v.version_id=%s
""", (vid,))
if not row:
break
if row['content_type'] == 'snapshot':
snapshot_content = row['content']
break
chain.append(row)
vid = row['parent_version_id']
if snapshot_content is None:
raise VersionReconstructError(f"No snapshot ancestor found for version {target_version_id}")
# Apply deltas in order (chain is in reverse order — reverse it)
content = snapshot_content
for row in reversed(chain):
content = _apply_patch(content, row['patch'])
return content
def get_version(doc_id: int, version_number: int) -> dict:
version = db.fetchone("""
SELECT version_id, version_number, author_id, commit_message, created_at, checksum
FROM DocumentVersion WHERE doc_id=%s AND version_number=%s
""", (doc_id, version_number))
if not version:
raise VersionNotFoundError(f"Version {version_number} not found")
content = _reconstruct_version(doc_id, version['version_id'])
# Verify integrity
actual_checksum = hashlib.sha256(content.encode()).hexdigest()
if actual_checksum != version['checksum']:
raise IntegrityError(f"Version {version_number} content checksum mismatch")
version['content'] = content
return version
Key Design Decisions
- Snapshot every 10 versions: with only deltas, reconstructing version N requires replaying all N-1 patches from the first snapshot — O(N) reconstruction time. A snapshot every 10 versions caps reconstruction at 9 delta applications (worst case). The storage cost: full snapshots add ~10× the average delta size every 10 versions, typically a 30–50% total storage increase over delta-only — a worthwhile trade for O(1) bounded reconstruction.
- SHA-256 checksum for integrity: every version stores a checksum of the full reconstructed content. On read, verify the reconstructed content matches the checksum — this detects both storage corruption and bugs in the patch apply logic.
- Unified diff format: standard unified diff (difflib.unified_diff) is human-readable, compact, and supported by many libraries. For binary or structured documents (JSON, XML), use a type-specific differ (e.g., jsondiff for JSON) that produces semantic patches rather than line-level patches.
Document versioning and revision history system design is discussed in Google system design interview questions.
Document versioning and collaborative editing design is covered in Atlassian system design interview preparation.
Document versioning and file history system design is discussed in Amazon system design interview guide.