Designing a file sync and storage service like Dropbox or Google Drive is a popular system design interview question that tests chunked upload, deduplication, sync protocols, and conflict resolution. This question appears in interviews at Apple, Atlassian, Dropbox, and Microsoft.
Requirements Clarification
- Scale: 500M users, 200M daily active, 10PB stored data, 100M sync events/day
- Features: Upload/download files, sync across devices, share files/folders, version history, offline access
- File sizes: Up to 50GB per file; median 1MB
- Latency: Small file (<1MB) sync within 2s; large file progress visible immediately
- Consistency: Strong consistency within a device; eventual consistency across devices (with conflict resolution)
Chunked Upload with Deduplication
import hashlib
import os
import uuid
from dataclasses import dataclass
from typing import List, Optional
CHUNK_SIZE = 4 * 1024 * 1024 # 4MB fixed-size chunks — simple offsets for parallel upload; dedup key is the chunk's SHA-256
@dataclass
class Chunk:
    """One fixed-size piece of a file, addressed by its content hash."""

    index: int     # 0-based position of this chunk within the file
    data: bytes    # raw chunk payload
    checksum: str  # SHA-256 of chunk data (hex digest)

    @classmethod
    def from_data(cls, index: int, data: bytes) -> "Chunk":
        """Alternate constructor: compute the SHA-256 checksum from *data*."""
        digest = hashlib.sha256(data).hexdigest()
        return cls(index=index, data=data, checksum=digest)
def split_file_into_chunks(file_path: str) -> List[Chunk]:
    """Split file into fixed-size chunks for parallel upload and dedup."""
    with open(file_path, "rb") as handle:
        # iter() with a b"" sentinel yields successive CHUNK_SIZE reads
        # until EOF — same traversal as a manual while/read loop.
        reader = iter(lambda: handle.read(CHUNK_SIZE), b"")
        return [Chunk.from_data(i, block) for i, block in enumerate(reader)]
class ChunkStore:
    """
    Content-addressable chunk store.
    Deduplication: if chunk with same SHA-256 exists, no re-upload needed.
    Different users uploading same file → only stored once.
    """

    def __init__(self, object_storage, db):
        self.storage = object_storage  # S3 or similar
        self.db = db

    def check_chunks(self, checksums: List[str]) -> List[str]:
        """Return which checksums are NOT yet in store (need upload)."""
        rows = self.db.execute(
            "SELECT checksum FROM chunks WHERE checksum = ANY(%s)",
            (checksums,),
        ).fetchall()
        known = {row[0] for row in rows}
        # Preserve the caller's ordering of the missing checksums.
        return [checksum for checksum in checksums if checksum not in known]

    def upload_chunk(self, chunk: "Chunk") -> bool:
        """Upload chunk; skip if already exists (dedup)."""
        row = self.db.execute(
            "SELECT 1 FROM chunks WHERE checksum = %s", (chunk.checksum,)
        ).fetchone()
        if row is not None:
            return False  # Already stored — dedup hit
        # Key objects by content hash so identical data maps to one object.
        self.storage.put(f"chunks/{chunk.checksum}", chunk.data)
        self.db.execute(
            "INSERT INTO chunks (checksum, size, created_at) VALUES (%s, %s, NOW())",
            (chunk.checksum, len(chunk.data)),
        )
        return True

    def get_chunk(self, checksum: str) -> bytes:
        """Fetch raw chunk bytes by content hash."""
        return self.storage.get(f"chunks/{checksum}")
class FileUploadService:
    """Two-phase chunked upload: negotiate missing chunks (phase 1), then
    commit the file record once every chunk is stored (phase 2)."""

    def __init__(self, chunk_store: "ChunkStore", metadata_db, event_bus):
        self.chunks = chunk_store   # content-addressable chunk store (dedup)
        self.db = metadata_db       # file + upload-session metadata
        self.events = event_bus     # publishes sync notifications

    def initiate_upload(self, user_id: str, path: str, file_size: int, checksums: List[str]) -> dict:
        """
        Phase 1: Tell client which chunks it needs to upload.
        Client sends all chunk checksums; server returns missing ones.
        This enables delta sync — only upload changed chunks.

        NOTE(review): file_size is accepted but never recorded — presumably
        reserved for quota/size validation; confirm and wire into the session.
        """
        missing = self.chunks.check_chunks(checksums)
        upload_id = self._create_upload_session(user_id, path, checksums)
        return {
            "upload_id": upload_id,
            "missing_chunks": missing,  # Only upload these
        }

    def complete_upload(self, upload_id: str, user_id: str) -> str:
        """Phase 2: Commit file record once all chunks are uploaded.

        Raises:
            ValueError: if the session is unknown, or if any chunk in the
                manifest was never uploaded — committing anyway would create
                a file whose manifest points at nonexistent chunks.
        """
        session = self._get_session(upload_id)
        if session is None:
            raise ValueError(f"unknown upload session: {upload_id}")
        # Bug fix: previously the file record was committed without checking
        # that the chunks actually arrived in the chunk store.
        still_missing = self.chunks.check_chunks(session["checksums"])
        if still_missing:
            raise ValueError(
                f"upload {upload_id} incomplete: {len(still_missing)} chunks not uploaded"
            )
        file_id = self.db.insert_file(
            user_id=user_id,
            path=session["path"],
            chunk_checksums=session["checksums"],
            version=1,
        )
        self.events.publish("file.created", {
            "user_id": user_id,
            "file_id": file_id,
            "path": session["path"],
        })
        return file_id

    def _create_upload_session(self, user_id, path, checksums) -> str:
        """Persist an upload session under a fresh UUID; return the session id."""
        sid = str(uuid.uuid4())
        self.db.insert_session(sid, user_id, path, checksums)
        return sid

    def _get_session(self, upload_id: str) -> Optional[dict]:
        """Fetch the phase-1 session (None assumed for unknown ids — TODO confirm db contract)."""
        return self.db.get_session(upload_id)
Sync Protocol and Conflict Resolution
from enum import Enum
from dataclasses import dataclass, field
from typing import Optional
class ConflictResolution(Enum):
    """Strategies for reconciling concurrent edits to the same file."""

    KEEP_MINE = "keep_mine"              # client's version wins
    KEEP_THEIRS = "keep_theirs"          # server's version wins
    KEEP_BOTH = "keep_both"              # Dropbox default: create "file (conflicted copy)"
    LAST_WRITE_WINS = "last_write_wins"  # most recent write replaces the other
@dataclass
class SyncState:
    """Per-device sync cursor — records how far a device has caught up."""

    device_id: str
    user_id: str
    # Opaque token (from SyncService._encode_cursor) marking the last
    # change this device has observed.
    last_cursor: str
    # Lamport clock used to order events originating from this device.
    local_clock: int
class SyncService:
    """
    Long-poll based sync — client holds connection open,
    server sends delta when changes occur.

    NOTE(review): no __init__ is visible in this excerpt; self.db and
    _format_change are expected to be provided elsewhere in the class —
    confirm against the full implementation.
    """

    # has_more heuristic: a full page implies more changes may follow.
    # Must equal the limit applied by db.get_changes_since — TODO confirm.
    PAGE_SIZE = 1000

    def get_latest_cursor(self, user_id: str) -> str:
        """Get opaque cursor representing current state."""
        latest = self.db.get_latest_change_id(user_id)
        return self._encode_cursor(latest)

    def list_changes(self, user_id: str, cursor: str) -> dict:
        """Return all changes since cursor, plus a new cursor to resume from."""
        since_id = self._decode_cursor(cursor)
        changes = self.db.get_changes_since(user_id, since_id)
        # Advance the cursor to the last change delivered; if nothing
        # changed, re-issue the same position so the client can poll again.
        new_cursor = self._encode_cursor(
            changes[-1]["change_id"] if changes else since_id
        )
        return {
            "changes": [self._format_change(c) for c in changes],
            "cursor": new_cursor,
            "has_more": len(changes) == self.PAGE_SIZE,  # Paginate large deltas
        }

    def apply_change(self, user_id: str, device_id: str, change: dict) -> dict:
        """
        Apply client change with conflict detection.
        Uses last-writer-wins with conflict file creation as fallback.
        """
        path = change["path"]
        client_revision = change["parent_revision"]
        current = self.db.get_file_metadata(user_id, path)
        if current is None:
            # New file — no conflict
            return self._commit_change(user_id, device_id, change)
        if current["revision"] == client_revision:
            # Client is up to date — clean commit
            return self._commit_change(user_id, device_id, change)
        # Conflict: client is behind the server revision.
        # Dropbox strategy: rename client version with conflict suffix so
        # neither side's data is lost.
        conflict_path = self._make_conflict_path(path, device_id)
        self._commit_change(user_id, device_id, {**change, "path": conflict_path})
        return {
            "result": "conflict",
            "conflict_path": conflict_path,
            "server_revision": current["revision"],
        }

    def _make_conflict_path(self, path: str, device_id: str) -> str:
        """Build a '<name> (conflicted copy from <timestamp>)<ext>' sibling path.

        NOTE(review): device_id is accepted but unused — presumably meant to
        appear in the suffix (Dropbox embeds the device name); confirm.
        """
        import os
        from datetime import datetime, timezone

        base, ext = os.path.splitext(path)
        # Bug fix: datetime.utcnow() is deprecated (naive UTC). An aware
        # UTC "now" produces the identical strftime output.
        ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H%M%S")
        return f"{base} (conflicted copy from {ts}){ext}"

    def _commit_change(self, user_id, device_id, change) -> dict:
        # Stub: write the new revision to the metadata DB and publish a
        # sync event so other devices pick it up via list_changes.
        pass

    def _encode_cursor(self, change_id) -> str:
        """Wrap a change id in an opaque base64(JSON) token."""
        import base64, json
        return base64.b64encode(json.dumps({"id": change_id}).encode()).decode()

    def _decode_cursor(self, cursor: str) -> int:
        """Inverse of _encode_cursor; raises on malformed tokens."""
        import base64, json
        return json.loads(base64.b64decode(cursor))["id"]
Metadata Storage Schema
METADATA_SCHEMA = """
CREATE TABLE files (
file_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL,
path TEXT NOT NULL, -- Full path including filename
revision INTEGER NOT NULL DEFAULT 1,
size_bytes BIGINT NOT NULL,
is_deleted BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
modified_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
UNIQUE (user_id, path) -- One file per path per user
);
-- Chunk manifest: ordered list of chunk checksums per file
CREATE TABLE file_chunks (
file_id UUID NOT NULL REFERENCES files(file_id),
chunk_idx INTEGER NOT NULL,
checksum VARCHAR(64) NOT NULL REFERENCES chunks(checksum),
PRIMARY KEY (file_id, chunk_idx)
);
-- Change log for sync delta
CREATE TABLE file_changes (
change_id BIGSERIAL PRIMARY KEY,
user_id UUID NOT NULL,
file_id UUID NOT NULL,
change_type VARCHAR(20) NOT NULL, -- created | modified | deleted | moved
path TEXT NOT NULL,
revision INTEGER NOT NULL,
changed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
INDEX idx_changes_user_cursor (user_id, change_id)
);
"""
Architecture and Key Decisions
| Component | Technology | Rationale |
|---|---|---|
| Chunk storage | S3 (or similar) | Cheap, durable, CDN-accessible; content-addressed keys |
| Metadata DB | PostgreSQL | Strong consistency for file state; ACID transactions |
| Sync notification | Long-poll or WebSocket | Push deltas to connected clients in real-time |
| Upload URL generation | Pre-signed S3 URLs | Client uploads directly to S3 — no proxy overhead |
| Search | Elasticsearch | Full-text file name and content search |
| Deduplication | SHA-256 content hash | Same data stored once across all users |
| CDN | CloudFront | Popular files served from edge; reduces origin load |
Companies That Ask This System Design Question
This problem type commonly appears in interviews at Apple, Atlassian, Dropbox, and Microsoft.
See our company interview guides for the full interview process, compensation data, and preparation tips.