A file deduplication system eliminates redundant storage by detecting files with identical content and storing each unique payload exactly once. This is the foundation of services like Dropbox, Backblaze, and enterprise backup systems. Key challenges: computing content hashes efficiently for large files, chunk-level dedup for partial matches, and safely managing reference counts so data is deleted only when no references remain.
Core Data Model
-- One row per unique file content (content-addressed)
CREATE TABLE FileBlob (
blob_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
sha256_hash CHAR(64) NOT NULL UNIQUE, -- hex SHA-256 of full content; UNIQUE creates a btree index usable for lookups
size_bytes BIGINT NOT NULL,
s3_key TEXT NOT NULL, -- e.g. "blobs/{sha256_hash[:2]}/{sha256_hash}"
ref_count INT NOT NULL DEFAULT 0, -- number of FileRecord rows pointing here
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- No separate CREATE INDEX on sha256_hash: the UNIQUE constraint above already
-- builds a unique index, so an extra idx_blob_hash would only duplicate it and
-- add write amplification on every insert.
-- One row per user-facing file (can share a FileBlob)
CREATE TABLE FileRecord (
file_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL, -- owner; ownership is checked on delete
blob_id UUID NOT NULL REFERENCES FileBlob(blob_id), -- shared content: many records may point at one blob
filename TEXT NOT NULL, -- user-visible name; not part of dedup identity
folder_id UUID, -- nullable: file may live at the user's root
uploaded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMPTZ -- soft-delete marker; NULL means live
);
-- Supports the common "list a user's live files" query (user_id, deleted_at IS NULL)
CREATE INDEX idx_file_user ON FileRecord (user_id, deleted_at);
-- For large files: chunk-level deduplication
CREATE TABLE FileChunk (
chunk_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
sha256_hash CHAR(64) NOT NULL UNIQUE, -- content address of the chunk; UNIQUE also provides the lookup index
size_bytes INT NOT NULL, -- chunk payload size (<= CHUNK_SIZE)
s3_key TEXT NOT NULL, -- e.g. "chunks/{sha256_hash[:2]}/{sha256_hash}"
ref_count INT NOT NULL DEFAULT 0 -- number of references; chunk is GC-eligible at 0
);
-- Ordered mapping from a blob to the chunks that reconstruct it
CREATE TABLE BlobChunk (
blob_id UUID NOT NULL REFERENCES FileBlob(blob_id),
chunk_order INT NOT NULL, -- 0-based position of the chunk within the blob
chunk_id UUID NOT NULL REFERENCES FileChunk(chunk_id), -- same chunk may appear in many blobs (dedup)
PRIMARY KEY (blob_id, chunk_order)
);
Upload Flow: Hash, Check, Upload
import hashlib
import boto3
from uuid import uuid4
# Module-wide S3 client and target bucket shared by all upload/delete paths below.
s3 = boto3.client('s3', region_name='us-east-1')
BUCKET = 'techinterview-blobs'
def sha256_of_file(path: str) -> tuple[str, int]:
    """Stream the file at *path* through SHA-256.

    Returns (hex digest, total size in bytes). Reads in 64 KiB blocks so
    arbitrarily large files are hashed in constant memory.
    """
    digest = hashlib.sha256()
    total = 0
    with open(path, 'rb') as fh:
        while block := fh.read(65536):
            digest.update(block)
            total += len(block)
    return digest.hexdigest(), total
def upload_file(conn, user_id: str, local_path: str, filename: str, folder_id: str | None = None) -> str:
    """
    Deduplicating upload:
      1. Hash the file locally.
      2. Check if the blob already exists in DB (skip S3 upload if so).
      3. Upsert the blob row — the unique index on sha256_hash serializes
         concurrent uploads of identical content.
      4. Create a FileRecord for the user.
    Returns file_id.
    """
    sha256, size_bytes = sha256_of_file(local_path)
    s3_key = f"blobs/{sha256[:2]}/{sha256}"
    with conn.cursor() as cur:
        # Optimistic existence check so we can skip the S3 transfer entirely
        # for known content. This check alone is racy — correctness comes from
        # the upsert below, not from this SELECT.
        cur.execute("SELECT blob_id FROM FileBlob WHERE sha256_hash = %s", (sha256,))
        if cur.fetchone() is None:
            # Content-addressed key: if a concurrent client also uploads the
            # same bytes to the same key, the duplicate PUT is harmless.
            s3.upload_file(local_path, BUCKET, s3_key)
        # Fixes the check-then-insert race: two clients finding "no blob" both
        # reach this statement, the unique index serializes them, and the loser
        # lands in ON CONFLICT and bumps ref_count instead of erroring out.
        cur.execute(
            """INSERT INTO FileBlob (blob_id, sha256_hash, size_bytes, s3_key, ref_count)
               VALUES (%s, %s, %s, %s, 1)
               ON CONFLICT (sha256_hash)
               DO UPDATE SET ref_count = FileBlob.ref_count + 1
               RETURNING blob_id""",
            (str(uuid4()), sha256, size_bytes, s3_key)
        )
        blob_id = cur.fetchone()[0]
        # User-facing record pointing at the (possibly shared) blob.
        file_id = str(uuid4())
        cur.execute(
            """INSERT INTO FileRecord (file_id, user_id, blob_id, filename, folder_id)
               VALUES (%s, %s, %s, %s, %s)""",
            (file_id, user_id, blob_id, filename, folder_id)
        )
        conn.commit()
    return file_id
Delete Flow: Reference Counting and S3 Cleanup
def delete_file(conn, user_id: str, file_id: str):
    """
    Soft-delete the FileRecord. Decrement blob ref_count.
    If ref_count reaches 0, schedule S3 deletion.

    Raises ValueError if the file does not exist, is already deleted,
    or is not owned by user_id.
    """
    with conn.cursor() as cur:
        # Ownership check + soft-delete as ONE conditional UPDATE. With the
        # original SELECT-then-UPDATE, two concurrent deletes of the same file
        # could both pass the check and decrement ref_count twice; here only
        # the transaction that actually flips deleted_at gets a row back.
        cur.execute(
            """UPDATE FileRecord SET deleted_at = NOW()
               WHERE file_id = %s AND user_id = %s AND deleted_at IS NULL
               RETURNING blob_id""",
            (file_id, user_id)
        )
        row = cur.fetchone()
        if not row:
            raise ValueError("File not found or not owned by user")
        blob_id = row[0]
        # Decrement ref count and learn whether the blob is now orphaned.
        cur.execute(
            "UPDATE FileBlob SET ref_count = ref_count - 1 WHERE blob_id = %s RETURNING ref_count, s3_key",
            (blob_id,)
        )
        ref_count, s3_key = cur.fetchone()
        conn.commit()
    if ref_count == 0:
        # Enqueue S3 deletion (async — do not delete synchronously to avoid inconsistency on rollback)
        enqueue_blob_deletion(blob_id, s3_key)
def enqueue_blob_deletion(blob_id: str, s3_key: str):
    """Enqueue a background job to delete blob from S3 after a safety delay."""
    import json, redis
    payload = {"blob_id": blob_id, "s3_key": s3_key, "delete_after": "2h"}
    queue = redis.Redis()
    queue.rpush("blob_deletion_queue", json.dumps(payload))
Chunk-Level Deduplication for Large Files
CHUNK_SIZE = 4 * 1024 * 1024 # 4 MiB fixed-size chunks; content-defined chunking (CDC) would yield better dedup ratios
def upload_chunked(conn, user_id: str, local_path: str, filename: str, folder_id: str | None = None) -> str:
    """
    Split file into CHUNK_SIZE chunks, dedup each chunk independently.
    Only upload chunks that don't already exist in S3.

    Fixes vs. the naive version:
      - The whole-file hash is checked FIRST. If the blob already exists we
        only bump its ref_count; the old flow also incremented every chunk's
        ref_count without adding BlobChunk rows, corrupting chunk accounting.
      - The file is read once up front for the hash, and a second time only
        when the blob is genuinely new (instead of always reading it twice).
      - folder_id is supported (defaults to None), matching upload_file.
    Returns file_id.
    """
    full_hash, size = sha256_of_file(local_path)
    with conn.cursor() as cur:
        cur.execute("SELECT blob_id FROM FileBlob WHERE sha256_hash = %s", (full_hash,))
        row = cur.fetchone()
        if row:
            # Identical content already stored: one blob-level ref is enough.
            # Chunk ref_counts track blobs, not files, so they stay untouched.
            blob_id = row[0]
            cur.execute("UPDATE FileBlob SET ref_count = ref_count + 1 WHERE blob_id = %s", (blob_id,))
        else:
            blob_id = str(uuid4())
            cur.execute(
                "INSERT INTO FileBlob (blob_id, sha256_hash, size_bytes, s3_key, ref_count) VALUES (%s,%s,%s,%s,1)",
                (blob_id, full_hash, size, f"blobs/{full_hash[:2]}/{full_hash}")
            )
            # New blob: chunk the file and dedup each chunk independently.
            with open(local_path, 'rb') as f:
                order = 0
                while data := f.read(CHUNK_SIZE):
                    chunk_hash = hashlib.sha256(data).hexdigest()
                    chunk_id = _get_or_create_chunk(conn, chunk_hash, data)
                    cur.execute(
                        "INSERT INTO BlobChunk (blob_id, chunk_order, chunk_id) VALUES (%s,%s,%s)",
                        (blob_id, order, chunk_id)
                    )
                    order += 1
        file_id = str(uuid4())
        cur.execute(
            "INSERT INTO FileRecord (file_id, user_id, blob_id, filename, folder_id) VALUES (%s,%s,%s,%s,%s)",
            (file_id, user_id, blob_id, filename, folder_id)
        )
        conn.commit()
    return file_id
def _get_or_create_chunk(conn, chunk_hash: str, data: bytes) -> str:
    """Return the chunk_id for chunk_hash, uploading and inserting if new.

    Bumps FileChunk.ref_count in either path. Does NOT commit — the caller
    owns the transaction, so a failed multi-chunk upload rolls back as a unit
    (the old per-chunk commit left partial blob state behind on failure).
    """
    with conn.cursor() as cur:
        # Fast path: chunk content already known.
        cur.execute("SELECT chunk_id FROM FileChunk WHERE sha256_hash = %s", (chunk_hash,))
        row = cur.fetchone()
        if row:
            cur.execute("UPDATE FileChunk SET ref_count = ref_count + 1 WHERE chunk_id = %s", (row[0],))
            return row[0]
        s3_key = f"chunks/{chunk_hash[:2]}/{chunk_hash}"
        # Content-addressed key: a concurrent duplicate PUT of identical bytes
        # is harmless, so uploading before the insert is safe.
        s3.put_object(Bucket=BUCKET, Key=s3_key, Body=data)
        # Closes the SELECT-then-INSERT race: if another transaction inserted
        # this hash first, ON CONFLICT bumps its ref_count and RETURNING hands
        # back the existing chunk_id instead of raising a unique violation.
        cur.execute(
            """INSERT INTO FileChunk (chunk_id, sha256_hash, size_bytes, s3_key, ref_count)
               VALUES (%s,%s,%s,%s,1)
               ON CONFLICT (sha256_hash) DO UPDATE SET ref_count = FileChunk.ref_count + 1
               RETURNING chunk_id""",
            (str(uuid4()), chunk_hash, len(data), s3_key)
        )
        return cur.fetchone()[0]
Key Interview Points
- Hash collision probability: SHA-256 has 2^256 possible values. The birthday problem probability of a collision among 10 billion files is approximately 10^-58 — effectively zero. In practice, use SHA-256 for dedup keys and verify content equality on first access if paranoid about hardware corruption (bitrot).
- Race condition on upload: Two clients uploading the same file simultaneously may both find “blob not found” and try to insert. Mitigate with: INSERT … ON CONFLICT (sha256_hash) DO UPDATE SET ref_count = ref_count + 1. The unique index on sha256_hash serializes concurrent inserts safely.
- S3 key design: Use the full SHA-256 hex as the S3 key (prefix with first 2 chars for key distribution: blobs/ab/abc123…). This is content-addressed storage — uploading the same content twice to the same key is idempotent in S3 (PUT is idempotent).
- Fixed vs content-defined chunking: Fixed 4MiB chunks are simple but sensitive to insertion — inserting 1 byte at the start shifts all chunk boundaries. Content-Defined Chunking (CDC) uses a rolling hash (Rabin fingerprint) to identify natural split points in content, giving much better dedup ratios for partially modified files. Used by Borg, Restic, and Duplicati.
- Storage savings calculation: For a 1,000-user system where each user uploads similar documents, dedup can reduce storage by 60–80%. Note that size_bytes lives on FileBlob, not FileRecord — so track savings as: SUM(b.size_bytes) over live FileRecord rows joined to FileBlob (logical bytes) minus SUM(size_bytes) across FileBlob (physical bytes). Report in the admin dashboard.
{“@context”:”https://schema.org”,”@type”:”FAQPage”,”mainEntity”:[{“@type”:”Question”,”name”:”Why is SHA-256 sufficient for deduplication keys instead of a faster hash?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”SHA-256 produces a 256-bit hash — the probability of a collision among 10 billion files is approximately 10^-58, making it effectively zero for practical use. Faster hashes like MD5 (128-bit) or CRC32 (32-bit) have higher collision probabilities and known collision attacks. In a deduplication system, a false positive (two different files with the same hash) means data corruption: user A’s file is silently served when user B requests their file. SHA-256 eliminates this risk at the cost of ~2x slower hashing vs. MD5 — negligible for files where upload I/O is the bottleneck. For performance-sensitive chunk-level dedup, xxHash128 is used in some systems because it provides 128-bit output with no known attacks and is ~10x faster than SHA-256.”}},{“@type”:”Question”,”name”:”How does the reference count prevent premature S3 deletion?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Each FileBlob row has a ref_count tracking how many FileRecord rows point to it. When a file is uploaded and matched to an existing blob, ref_count is incremented. When a file is deleted, ref_count is decremented. S3 deletion only happens when ref_count reaches 0 — no FileRecord references the blob. Critically, the S3 deletion is asynchronous: it’s enqueued as a background job with a safety delay (e.g., 2 hours). This delay handles the race condition where: (1) user A deletes their copy (ref_count → 0, deletion enqueued); (2) user B uploads the same file and inserts a new FileBlob row (ref_count → 1) within 2 hours; (3) the deletion job checks ref_count before deleting — finds ref_count > 0 and aborts. 
Without the delay, user B gets a dangling S3 reference.”}},{“@type”:”Question”,”name”:”How does Content-Defined Chunking improve dedup ratios over fixed-size chunking?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Fixed 4MiB chunks are sensitive to prepend insertions: inserting 1 byte at the start of a 100MiB file shifts all chunk boundaries, making every chunk unique even though 99.99% of content is unchanged — 0% dedup for a nearly identical file. Content-Defined Chunking (CDC) uses a rolling hash (Rabin-Karp fingerprint) over a sliding window (typically 64 bytes) to identify natural split points in the content — places where the hash matches a bitmask pattern. These split points are content-anchored rather than position-anchored, so inserting 1 byte only affects the chunk containing the insertion; all other chunks remain identical. Borg, Restic, and Casync all use CDC. Typical dedup improvement: CDC achieves 70%+ dedup on modified files; fixed-size achieves 10–40%.”}},{“@type”:”Question”,”name”:”How do you handle the race condition when two users upload the same file simultaneously?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Both clients compute SHA-256, both query FileBlob and find no row, both attempt to INSERT. Without guards, the second insert fails with a unique constraint violation on sha256_hash. Correct handling: INSERT INTO FileBlob (…) ON CONFLICT (sha256_hash) DO UPDATE SET ref_count = ref_count + 1 RETURNING blob_id, ref_count. The first insert wins; the second hits the conflict and increments ref_count instead. Both transactions get the correct blob_id and proceed to create their FileRecord. The S3 upload of the same key from both clients is also safe: S3 PUT is idempotent — uploading the same content to the same key twice results in no data corruption, just a redundant upload. 
In production, add a pre-signed URL upload path that checks FileBlob existence before issuing the S3 upload URL.”}},{“@type”:”Question”,”name”:”How do you calculate and report storage savings from deduplication?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Storage used without dedup (logical bytes) = SUM of blob size_bytes over all FileRecord rows, joining each record to its FileBlob since size_bytes is stored on FileBlob (one row per user upload). Storage used with dedup (physical bytes) = SUM(size_bytes) across all FileBlob rows (one row per unique content hash). Savings = logical total – physical total. Savings ratio = (logical – physical) / logical. Track this in a daily report: SELECT (SELECT SUM(b.size_bytes) FROM FileRecord r JOIN FileBlob b ON b.blob_id = r.blob_id WHERE r.deleted_at IS NULL) AS logical_bytes, (SELECT SUM(size_bytes) FROM FileBlob WHERE ref_count > 0) AS physical_bytes. For per-user reporting: a user who uploads a 1GB file that already exists in the system uses 0 physical bytes of new storage but consumes 1GB of logical quota. Charge users based on logical bytes (fair); save physical costs via dedup.”}}]}
File deduplication and content-addressed storage design is discussed in Google system design interview questions.
File deduplication and document storage system design is covered in Atlassian system design interview preparation.
File deduplication and distributed storage design is discussed in Amazon system design interview guide.