System Design Interview: Dropbox / Google Drive File Storage

Designing a file sync and storage service like Dropbox or Google Drive is a popular system design interview question that tests chunked upload, deduplication, sync protocols, and conflict resolution. This question appears at Apple, Atlassian, Dropbox, and Microsoft interviews.

Requirements Clarification

  • Scale: 500M users, 200M daily active, 10PB stored data, 100M sync events/day
  • Features: Upload/download files, sync across devices, share files/folders, version history, offline access
  • File sizes: Up to 50GB per file; median 1MB
  • Latency: Small file (<1MB) sync within 2s; large file progress visible immediately
  • Consistency: Strong consistency within a device; eventual consistency across devices (with conflict resolution)

Chunked Upload with Deduplication

import hashlib
import os
from typing import List, Optional
from dataclasses import dataclass

CHUNK_SIZE = 4 * 1024 * 1024  # 4MB chunks

@dataclass
class Chunk:
    """One fixed-size piece of a file, addressed by its content hash."""
    index:    int   # 0-based position of this chunk within the file
    data:     bytes
    checksum: str   # hex SHA-256 digest of `data`

    @classmethod
    def from_data(cls, index: int, data: bytes) -> "Chunk":
        """Build a chunk, computing its SHA-256 checksum from the payload."""
        digest = hashlib.sha256(data).hexdigest()
        return cls(index=index, data=data, checksum=digest)

def split_file_into_chunks(file_path: str) -> List[Chunk]:
    """Split a file into CHUNK_SIZE pieces for parallel upload and dedup.

    Each piece becomes a Chunk (carrying its SHA-256 checksum) so the
    uploader can skip pieces the server already has.
    """
    with open(file_path, "rb") as f:
        # iter() with a b"" sentinel yields successive reads until EOF.
        return [
            Chunk.from_data(i, piece)
            for i, piece in enumerate(iter(lambda: f.read(CHUNK_SIZE), b""))
        ]

class ChunkStore:
    """
    Content-addressable chunk store.

    Chunks are keyed by the SHA-256 of their contents, which gives
    cross-user deduplication for free: if a chunk with the same hash
    already exists, it is never uploaded or stored twice.
    """
    def __init__(self, object_storage, db):
        self.storage = object_storage  # S3 or similar blob store
        self.db = db                   # relational DB holding the chunk index

    def check_chunks(self, checksums: List[str]) -> List[str]:
        """Return which checksums are NOT yet in store (client must upload those)."""
        existing = self.db.execute(
            "SELECT checksum FROM chunks WHERE checksum = ANY(%s)",
            (checksums,)
        ).fetchall()
        existing_set = {row[0] for row in existing}
        # Preserve caller order so the client can map results back to chunk indices.
        return [c for c in checksums if c not in existing_set]

    def upload_chunk(self, chunk: "Chunk") -> bool:
        """Upload `chunk` unless it is already stored.

        Returns True if this call stored the chunk, False on a dedup hit.

        The commit uses INSERT ... ON CONFLICT DO NOTHING so that two
        clients uploading the same chunk concurrently cannot both insert:
        the original check-then-insert was a TOCTOU race that raised a
        unique-violation for the losing writer.
        """
        exists = self.db.execute(
            "SELECT 1 FROM chunks WHERE checksum = %s", (chunk.checksum,)
        ).fetchone()

        if exists:
            return False  # Already stored — dedup hit, skip the blob upload

        # Write the blob first: content-addressed puts are idempotent, so a
        # crash here at worst leaves an orphan object, never a dangling row.
        self.storage.put(f"chunks/{chunk.checksum}", chunk.data)
        claimed = self.db.execute(
            "INSERT INTO chunks (checksum, size, created_at) VALUES (%s, %s, NOW()) "
            "ON CONFLICT (checksum) DO NOTHING RETURNING checksum",
            (chunk.checksum, len(chunk.data))
        ).fetchone()
        # None means a concurrent uploader committed first — also a dedup hit.
        return claimed is not None

    def get_chunk(self, checksum: str) -> bytes:
        """Fetch raw chunk bytes by content hash."""
        return self.storage.get(f"chunks/{checksum}")

class FileUploadService:
    """Two-phase chunked upload: negotiate missing chunks, then commit."""

    def __init__(self, chunk_store: "ChunkStore", metadata_db, event_bus):
        self.chunks = chunk_store  # content-addressed chunk index
        self.db = metadata_db      # file/session metadata store
        self.events = event_bus    # publishes sync notifications

    def initiate_upload(self, user_id: str, path: str, file_size: int, checksums: List[str]) -> dict:
        """
        Phase 1: Tell client which chunks it needs to upload.

        Client sends all chunk checksums; server returns missing ones.
        This enables delta sync — only upload changed chunks.
        """
        missing = self.chunks.check_chunks(checksums)
        upload_id = self._create_upload_session(user_id, path, checksums)
        return {
            "upload_id": upload_id,
            "missing_chunks": missing,  # Only upload these
        }

    def complete_upload(self, upload_id: str, user_id: str) -> str:
        """Phase 2: Commit file record once all chunks are uploaded.

        Raises:
            KeyError: `upload_id` does not match a known session.
            ValueError: some chunks were never uploaded — committing now
                would create a file whose data cannot be reassembled.
        """
        session = self._get_session(upload_id)
        if session is None:
            raise KeyError(f"unknown upload session: {upload_id}")

        # Verify every chunk actually landed before committing the manifest;
        # the original trusted the client and could commit a dangling file.
        still_missing = self.chunks.check_chunks(session["checksums"])
        if still_missing:
            raise ValueError(
                f"upload {upload_id} incomplete: {len(still_missing)} chunks missing"
            )

        file_id = self.db.insert_file(
            user_id=user_id,
            path=session["path"],
            chunk_checksums=session["checksums"],
            version=1,  # new file; later edits bump the revision
        )
        self.events.publish("file.created", {
            "user_id": user_id,
            "file_id": file_id,
            "path":    session["path"],
        })
        return file_id

    def _create_upload_session(self, user_id, path, checksums) -> str:
        """Persist a session keyed by a fresh UUID; returned as the upload_id."""
        import uuid
        sid = str(uuid.uuid4())
        self.db.insert_session(sid, user_id, path, checksums)
        return sid

    def _get_session(self, upload_id: str) -> dict:
        """Look up a previously created upload session (None if unknown)."""
        return self.db.get_session(upload_id)

Sync Protocol and Conflict Resolution

from enum import Enum
from dataclasses import dataclass, field
from typing import Optional

class ConflictResolution(Enum):
    """Strategies for resolving concurrent edits to the same path."""

    KEEP_MINE = "keep_mine"              # client version wins outright
    KEEP_THEIRS = "keep_theirs"          # server version wins outright
    KEEP_BOTH = "keep_both"  # Dropbox default: create "file (conflicted copy)"
    LAST_WRITE_WINS = "last_write_wins"  # newest timestamp wins

@dataclass
class SyncState:
    """Per-device sync cursor.

    Records the last change a device has acknowledged (`last_cursor`,
    an opaque server-issued token) and a Lamport clock used to order
    this device's edits relative to other devices.
    """
    device_id:   str
    user_id:     str
    last_cursor: str
    local_clock: int

class SyncService:
    """
    Long-poll based sync — client holds connection open,
    server sends delta when changes occur.

    NOTE(review): `self.db` is injected elsewhere (no __init__ visible);
    confirm how instances are wired before refactoring construction.
    """

    # Delta page size; must match the limit db.get_changes_since applies.
    PAGE_SIZE = 1000

    def get_latest_cursor(self, user_id: str) -> str:
        """Get an opaque cursor representing the user's current state."""
        latest = self.db.get_latest_change_id(user_id)
        return self._encode_cursor(latest)

    def list_changes(self, user_id: str, cursor: str) -> dict:
        """Return all changes since `cursor` plus a new cursor for the next poll."""
        since_id = self._decode_cursor(cursor)
        changes = self.db.get_changes_since(user_id, since_id)

        # Empty delta: keep the old position so the client re-polls from it.
        new_cursor = self._encode_cursor(
            changes[-1]["change_id"] if changes else since_id
        )
        return {
            "changes": [self._format_change(c) for c in changes],
            "cursor":  new_cursor,
            # A full page means there may be more — client should page again.
            "has_more": len(changes) == self.PAGE_SIZE,
        }

    def apply_change(self, user_id: str, device_id: str, change: dict) -> dict:
        """
        Apply a client change with optimistic-concurrency conflict detection.

        The client declares which revision it based its edit on
        (`parent_revision`). If the server's revision matches, the commit
        is clean; otherwise the client's version is preserved under a
        conflict filename (Dropbox-style) rather than overwriting.
        """
        path = change["path"]
        client_revision = change["parent_revision"]

        current = self.db.get_file_metadata(user_id, path)

        if current is None:
            # New file — nothing to conflict with.
            return self._commit_change(user_id, device_id, change)

        if current["revision"] == client_revision:
            # Client edited on top of the latest revision — clean commit.
            return self._commit_change(user_id, device_id, change)

        # Conflict: client is behind.
        # Dropbox strategy: rename client version with conflict suffix,
        # so neither side's data is lost.
        conflict_path = self._make_conflict_path(path, device_id)
        self._commit_change(user_id, device_id, {**change, "path": conflict_path})
        return {
            "result":         "conflict",
            "conflict_path":  conflict_path,
            "server_revision": current["revision"],
        }

    def _make_conflict_path(self, path: str, device_id: str) -> str:
        """Build a Dropbox-style conflict name, e.g.
        "report (conflicted copy from dev-1 2024-01-01 120000).txt".
        """
        import os
        from datetime import datetime, timezone
        base, ext = os.path.splitext(path)
        # Timezone-aware "now"; datetime.utcnow() is deprecated and naive.
        ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H%M%S")
        # Include device_id (the original accepted but ignored it) so the
        # user can tell which device produced the conflicted copy.
        return f"{base} (conflicted copy from {device_id} {ts}){ext}"

    def _commit_change(self, user_id, device_id, change) -> dict:
        """Write the change to the metadata DB and publish a sync event."""
        pass  # Placeholder — implementation lives in the persistence layer.

    def _format_change(self, change: dict) -> dict:
        """Shape a raw change row for the wire.

        NOTE(review): this was referenced by list_changes but never defined
        in the original (AttributeError at runtime). A plain copy is the
        safest default; TODO confirm the client's expected field set.
        """
        return dict(change)

    def _encode_cursor(self, change_id) -> str:
        """Wrap a change id in an opaque base64(JSON) token."""
        import base64, json
        return base64.b64encode(json.dumps({"id": change_id}).encode()).decode()

    def _decode_cursor(self, cursor: str) -> int:
        """Inverse of _encode_cursor."""
        import base64, json
        return json.loads(base64.b64decode(cursor))["id"]

Metadata Storage Schema

METADATA_SCHEMA = """
CREATE TABLE files (
    file_id     UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id     UUID NOT NULL,
    path        TEXT NOT NULL,          -- Full path including filename
    revision    INTEGER NOT NULL DEFAULT 1,
    size_bytes  BIGINT NOT NULL,
    is_deleted  BOOLEAN DEFAULT FALSE,
    created_at  TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    modified_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    UNIQUE (user_id, path)              -- One file per path per user
);

-- Chunk manifest: ordered list of chunk checksums per file
CREATE TABLE file_chunks (
    file_id   UUID NOT NULL REFERENCES files(file_id),
    chunk_idx INTEGER NOT NULL,
    checksum  VARCHAR(64) NOT NULL REFERENCES chunks(checksum),
    PRIMARY KEY (file_id, chunk_idx)
);

-- Change log for sync delta
CREATE TABLE file_changes (
    change_id   BIGSERIAL PRIMARY KEY,
    user_id     UUID NOT NULL,
    file_id     UUID NOT NULL,
    change_type VARCHAR(20) NOT NULL,   -- created | modified | deleted | moved
    path        TEXT NOT NULL,
    revision    INTEGER NOT NULL,
    changed_at  TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    INDEX idx_changes_user_cursor (user_id, change_id)
);
"""

Architecture and Key Decisions

| Component | Technology | Rationale |
| --- | --- | --- |
| Chunk storage | S3 (or similar) | Cheap, durable, CDN-accessible; content-addressed keys |
| Metadata DB | PostgreSQL | Strong consistency for file state; ACID transactions |
| Sync notification | Long-poll or WebSocket | Push deltas to connected clients in real time |
| Upload URL generation | Pre-signed S3 URLs | Client uploads directly to S3 — no proxy overhead |
| Search | Elasticsearch | Full-text file name and content search |
| Deduplication | SHA-256 content hash | Same data stored once across all users |
| CDN | CloudFront | Popular files served from edge; reduces origin load |

Companies That Ask This System Design Question

This problem type commonly appears in interviews at Apple, Atlassian, Dropbox, and Microsoft.

See our company interview guides for details on each company's interview process, compensation, and preparation tips.

Scroll to Top