Document Versioning System Low-Level Design: Snapshot-Delta Storage, Efficient Reconstruction, and Integrity Checks

Document Versioning System: Low-Level Design

A document versioning system preserves every historical state of a document so users can view, compare, and restore previous versions. The design challenge is balancing storage efficiency (storing full snapshots vs. deltas) against read performance (reconstructing a version at any point in time). This design covers the snapshot-with-delta storage model, three-way merge for concurrent edits, and an efficient diffing strategy.

Core Data Model

CREATE TABLE Document (
    doc_id         BIGSERIAL PRIMARY KEY,
    title          VARCHAR(500) NOT NULL,
    owner_id       BIGINT NOT NULL,
    current_version_id BIGINT,          -- FK set after first version insert
    created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at     TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE TABLE DocumentVersion (
    version_id     BIGSERIAL PRIMARY KEY,
    doc_id         BIGINT NOT NULL REFERENCES Document(doc_id) ON DELETE CASCADE,
    version_number INT NOT NULL,        -- 1, 2, 3 ... monotonically increasing
    author_id      BIGINT NOT NULL,
    commit_message VARCHAR(500),
    content_type   VARCHAR(20) NOT NULL DEFAULT 'snapshot',  -- snapshot, delta
    content        TEXT,               -- full content if snapshot; NULL if delta stored separately
    parent_version_id BIGINT REFERENCES DocumentVersion(version_id),
    checksum       VARCHAR(64) NOT NULL,  -- SHA-256 of full content (for integrity)
    size_bytes     INT NOT NULL,
    created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (doc_id, version_number)
);

CREATE TABLE VersionDelta (
    delta_id       BIGSERIAL PRIMARY KEY,
    version_id     BIGINT NOT NULL REFERENCES DocumentVersion(version_id) ON DELETE CASCADE,
    patch          TEXT NOT NULL,       -- unified diff format patch
    UNIQUE (version_id)
);

CREATE INDEX ON DocumentVersion(doc_id, version_number DESC);
CREATE INDEX ON DocumentVersion(doc_id, content_type);

Write: Snapshot-with-Delta Storage

import hashlib, difflib
from typing import Optional

SNAPSHOT_INTERVAL = 10  # store full snapshot every N versions

def save_version(doc_id: int, new_content: str, author_id: int,
                 commit_message: str = None) -> int:
    """
    Save a new version. Every Nth version is a snapshot; others are stored as deltas.
    Returns the new version_id.
    """
    latest = db.fetchone("""
        SELECT version_id, version_number, content, content_type
        FROM DocumentVersion
        WHERE doc_id=%s ORDER BY version_number DESC LIMIT 1
    """, (doc_id,))

    new_version_number = (latest['version_number'] + 1) if latest else 1
    checksum = hashlib.sha256(new_content.encode()).hexdigest()
    size_bytes = len(new_content.encode('utf-8'))

    # Store snapshot every SNAPSHOT_INTERVAL versions, or for the first version
    is_snapshot = (new_version_number == 1 or new_version_number % SNAPSHOT_INTERVAL == 0)

    if is_snapshot:
        version_id = db.fetchone("""
            INSERT INTO DocumentVersion
                (doc_id, version_number, author_id, commit_message,
                 content_type, content, parent_version_id, checksum, size_bytes)
            VALUES (%s,%s,%s,%s,'snapshot',%s,%s,%s,%s)
            RETURNING version_id
        """, (doc_id, new_version_number, author_id, commit_message,
              new_content, latest['version_id'] if latest else None,
              checksum, size_bytes))['version_id']
    else:
        # Reconstruct current content for diffing
        current_content = _reconstruct_version(doc_id, latest['version_id'])
        patch = _make_patch(current_content, new_content)

        version_id = db.fetchone("""
            INSERT INTO DocumentVersion
                (doc_id, version_number, author_id, commit_message,
                 content_type, content, parent_version_id, checksum, size_bytes)
            VALUES (%s,%s,%s,%s,'delta',NULL,%s,%s,%s)
            RETURNING version_id
        """, (doc_id, new_version_number, author_id, commit_message,
              latest['version_id'], checksum, size_bytes))['version_id']

        db.execute("""
            INSERT INTO VersionDelta (version_id, patch) VALUES (%s, %s)
        """, (version_id, patch))

    # Update document's current version pointer
    db.execute("""
        UPDATE Document SET current_version_id=%s, updated_at=NOW()
        WHERE doc_id=%s
    """, (version_id, doc_id))

    return version_id

def _make_patch(old: str, new: str) -> str:
    old_lines = old.splitlines(keepends=True)
    new_lines = new.splitlines(keepends=True)
    return ''.join(difflib.unified_diff(old_lines, new_lines, lineterm=''))

def _apply_patch(base: str, patch: str) -> str:
    """Apply a unified diff patch to base content."""
    import patch as patch_lib  # python-patch library
    p = patch_lib.fromstring(patch.encode())
    lines = base.splitlines(keepends=True)
    patched = p.apply(lines)
    return ''.join(patched)

Read: Version Reconstruction

def _reconstruct_version(doc_id: int, target_version_id: int) -> str:
    """
    Reconstruct the content of any version efficiently.
    Finds the nearest snapshot ancestor, then applies deltas forward.
    """
    # Walk back through parent chain to find nearest snapshot
    chain = []  # [(version_id, content_type, patch or content), ...]
    vid = target_version_id
    snapshot_content = None

    while vid:
        row = db.fetchone("""
            SELECT v.version_id, v.content_type, v.content, v.parent_version_id,
                   d.patch
            FROM DocumentVersion v
            LEFT JOIN VersionDelta d ON d.version_id=v.version_id
            WHERE v.version_id=%s
        """, (vid,))
        if not row:
            break
        if row['content_type'] == 'snapshot':
            snapshot_content = row['content']
            break
        chain.append(row)
        vid = row['parent_version_id']

    if snapshot_content is None:
        raise VersionReconstructError(f"No snapshot ancestor found for version {target_version_id}")

    # Apply deltas in order (chain is in reverse order — reverse it)
    content = snapshot_content
    for row in reversed(chain):
        content = _apply_patch(content, row['patch'])

    return content

def get_version(doc_id: int, version_number: int) -> dict:
    version = db.fetchone("""
        SELECT version_id, version_number, author_id, commit_message, created_at, checksum
        FROM DocumentVersion WHERE doc_id=%s AND version_number=%s
    """, (doc_id, version_number))
    if not version:
        raise VersionNotFoundError(f"Version {version_number} not found")
    content = _reconstruct_version(doc_id, version['version_id'])
    # Verify integrity
    actual_checksum = hashlib.sha256(content.encode()).hexdigest()
    if actual_checksum != version['checksum']:
        raise IntegrityError(f"Version {version_number} content checksum mismatch")
    version['content'] = content
    return version

Key Design Decisions

  • Snapshot every 10 versions: with only deltas, reconstructing version N requires replaying all N-1 patches from the first snapshot — O(N) reconstruction time. A snapshot every 10 versions caps reconstruction at 9 delta applications (worst case). The storage cost: full snapshots add ~10× the average delta size every 10 versions, typically a 30–50% total storage increase over delta-only — a worthwhile trade for O(1) bounded reconstruction.
  • SHA-256 checksum for integrity: every version stores a checksum of the full reconstructed content. On read, verify the reconstructed content matches the checksum — this detects both storage corruption and bugs in the patch apply logic.
  • Unified diff format: standard unified diff (difflib.unified_diff) is human-readable, compact, and supported by many libraries. For binary or structured documents (JSON, XML), use a type-specific differ (e.g., jsondiff for JSON) that produces semantic patches rather than line-level patches.

Document versioning and revision history system design is discussed in Google system design interview questions.

Document versioning and collaborative editing design is covered in Atlassian system design interview preparation.

Document versioning and file history system design is discussed in Amazon system design interview guide.

Scroll to Top