Chunked File Upload System — Low-Level Design
A chunked file upload system breaks large files into smaller pieces, uploads each piece independently, and reassembles them on the server side. It supports resumable uploads, parallel chunk transfers, and progress tracking. This design question comes up in interviews at Google, Dropbox, and other companies whose platforms handle large file uploads.
Core Data Model
UploadSession
id TEXT PK -- UUID, returned to client on session create
user_id BIGINT NOT NULL -- uploader; also embedded in s3_key
filename TEXT NOT NULL -- client-supplied name; last segment of s3_key
total_size BIGINT NOT NULL -- bytes
chunk_size INT NOT NULL -- bytes per chunk (e.g., 5MB)
total_chunks INT NOT NULL -- ceil(total_size / chunk_size), computed at session create
mime_type TEXT -- nullable; sent to S3 as ContentType and read by post-processing
s3_upload_id TEXT -- S3 multipart upload ID
s3_key TEXT -- destination key in S3: uploads/{user_id}/{session_id}/{filename}
status TEXT DEFAULT 'in_progress' -- in_progress, complete, failed
created_at TIMESTAMPTZ
completed_at TIMESTAMPTZ -- set to NOW() when status becomes 'complete'
UploadChunk
session_id TEXT FK NOT NULL -- presumably references UploadSession.id
chunk_number INT NOT NULL -- 0-indexed; the S3 part number is chunk_number + 1
size_bytes INT NOT NULL -- length of the chunk body the server received
checksum_md5 TEXT NOT NULL -- client-computed MD5 of chunk bytes (base64-encoded)
s3_etag TEXT -- returned by S3 on part upload
uploaded_at TIMESTAMPTZ -- NOW() at insert, refreshed when the chunk is re-uploaded
PRIMARY KEY (session_id, chunk_number) -- one row per chunk; re-uploads upsert on this key
Phase 1: Initiate Upload Session
POST /uploads/sessions
{
"filename": "video.mp4",
"total_size": 524288000, // 500MB
"chunk_size": 5242880, // 5MB (S3 minimum for multipart)
"mime_type": "video/mp4"
}
def create_upload_session(user_id, filename, total_size, chunk_size, mime_type):
    """Open an S3 multipart upload and persist the matching UploadSession row.

    Args:
        user_id: ID of the uploading user; becomes part of the S3 key.
        filename: Client-supplied file name; last segment of the S3 key.
        total_size: Size of the whole file in bytes; must be positive.
        chunk_size: Bytes per chunk; must be positive and, when the file
            needs more than one chunk, at least the S3 minimum part size.
        mime_type: Content type of the file, or None/empty if unknown.

    Returns:
        dict with 'session_id' and 'total_chunks'.

    Raises:
        ValueError: if the sizes are non-positive or violate the S3
            multipart limits (5 MiB minimum part size, 10,000 parts).
    """
    from datetime import datetime, timezone

    # S3 multipart limits: every part except the last must be >= 5 MiB,
    # and one upload may contain at most 10,000 parts.
    min_part_size = 5 * 1024 * 1024
    max_parts = 10_000

    # Validate before touching S3 or the database so a bad request leaves
    # nothing behind.
    if total_size <= 0:
        raise ValueError('total_size must be a positive number of bytes')
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive number of bytes')

    # Integer ceiling division: exact for any size, unlike ceil(a / b),
    # which goes through a float.
    total_chunks = -(-total_size // chunk_size)

    if total_chunks > 1 and chunk_size < min_part_size:
        raise ValueError(
            f'chunk_size must be at least {min_part_size} bytes '
            'for a multi-chunk upload'
        )
    if total_chunks > max_parts:
        raise ValueError(
            f'upload would need {total_chunks} chunks; '
            f'the maximum is {max_parts}, use a larger chunk_size'
        )

    session_id = generate_uuid()
    s3_key = f'uploads/{user_id}/{session_id}/{filename}'

    # Initiate S3 multipart upload. ContentType is only sent when known:
    # the S3 client rejects None for string parameters.
    multipart_args = {
        'Bucket': 'my-bucket',
        'Key': s3_key,
        'Metadata': {'user_id': str(user_id), 'session_id': session_id},
    }
    if mime_type:
        multipart_args['ContentType'] = mime_type
    response = s3.create_multipart_upload(**multipart_args)
    upload_id = response['UploadId']

    try:
        db.insert(UploadSession, {
            'id': session_id, 'user_id': user_id, 'filename': filename,
            'total_size': total_size, 'chunk_size': chunk_size,
            'total_chunks': total_chunks,
            # Stored because complete_upload reads session.mime_type.
            'mime_type': mime_type,
            's3_upload_id': upload_id,
            's3_key': s3_key,
            # The schema shows no default for created_at, so set it here.
            'created_at': datetime.now(timezone.utc),
        })
    except Exception:
        # Without a session row nothing would ever complete or abort this
        # multipart upload, so release it before propagating the error.
        s3.abort_multipart_upload(
            Bucket='my-bucket', Key=s3_key, UploadId=upload_id
        )
        raise

    return {'session_id': session_id, 'total_chunks': total_chunks}
Phase 2: Upload Individual Chunks
PUT /uploads/sessions/{session_id}/chunks/{chunk_number}
Content-Type: application/octet-stream
X-Checksum-MD5: {base64_md5_of_chunk}
[chunk bytes in body]
def upload_chunk(session_id, chunk_number, chunk_bytes, client_checksum):
    """Verify one chunk, store it as an S3 part, and record it in UploadChunk.

    Re-uploading the same chunk_number is allowed: the S3 part is
    overwritten and the UploadChunk row is refreshed.

    Args:
        session_id: ID of an UploadSession in status 'in_progress'.
        chunk_number: 0-indexed position of the chunk in the file.
        chunk_bytes: Raw bytes of the chunk.
        client_checksum: Base64-encoded MD5 of chunk_bytes computed by the
            client.

    Returns:
        dict with 'chunk_number' and 'uploaded': True.

    Raises:
        ValueError: if chunk_number is outside the session's chunk range or
            the chunk length does not match the session's chunk layout.
        InvalidState: if the session is not 'in_progress'.
        ChecksumMismatch: if the server-side MD5 differs from
            client_checksum.
    """
    # Cheap check first: a negative index would map to an invalid S3 part
    # number (parts are 1-indexed).
    if chunk_number < 0:
        raise ValueError(f'Chunk number must be >= 0, got {chunk_number}')

    session = db.get(UploadSession, session_id)
    if session.status != 'in_progress':
        raise InvalidState()

    if chunk_number >= session.total_chunks:
        raise ValueError(
            f'Chunk {chunk_number} is out of range; '
            f'session has {session.total_chunks} chunks'
        )

    # Every chunk is exactly chunk_size bytes except the last, which holds
    # the remainder. A wrong length would silently corrupt the assembled
    # file, so reject it here.
    if chunk_number == session.total_chunks - 1:
        expected_size = session.total_size - chunk_number * session.chunk_size
    else:
        expected_size = session.chunk_size
    if len(chunk_bytes) != expected_size:
        raise ValueError(
            f'Chunk {chunk_number} has {len(chunk_bytes)} bytes, '
            f'expected {expected_size}'
        )

    # Verify checksum
    server_checksum = base64.b64encode(md5(chunk_bytes).digest()).decode()
    if server_checksum != client_checksum:
        raise ChecksumMismatch(f'Chunk {chunk_number} checksum mismatch')

    # S3 multipart part numbers are 1-indexed
    part_number = chunk_number + 1
    response = s3.upload_part(
        Bucket='my-bucket',
        Key=session.s3_key,
        UploadId=session.s3_upload_id,
        PartNumber=part_number,
        Body=chunk_bytes,
        # Lets S3 re-verify the bytes it receives, extending the integrity
        # check from the server to storage.
        ContentMD5=server_checksum,
    )

    # On re-upload, refresh size and checksum too so the row always
    # describes the bytes behind the stored ETag.
    db.execute("""
        INSERT INTO UploadChunk
            (session_id, chunk_number, size_bytes, checksum_md5, s3_etag, uploaded_at)
        VALUES (%(sid)s, %(cn)s, %(size)s, %(cs)s, %(etag)s, NOW())
        ON CONFLICT (session_id, chunk_number) DO UPDATE
        SET size_bytes=EXCLUDED.size_bytes,
            checksum_md5=EXCLUDED.checksum_md5,
            s3_etag=EXCLUDED.s3_etag,
            uploaded_at=NOW()
    """, {'sid': session_id, 'cn': chunk_number, 'size': len(chunk_bytes),
          'cs': client_checksum, 'etag': response['ETag']})

    return {'chunk_number': chunk_number, 'uploaded': True}
Phase 3: Complete Upload
POST /uploads/sessions/{session_id}/complete
def complete_upload(session_id):
    """Assemble the uploaded parts into the final S3 object and close the session.

    Safe to retry: if the session is already 'complete' the stored result is
    returned without calling S3 or re-queuing post-processing.

    Args:
        session_id: ID of the UploadSession to finish.

    Returns:
        dict with 'status': 'complete' and the object's 's3_key'.

    Raises:
        InvalidState: if the session is neither 'in_progress' nor 'complete'.
        IncompleteUpload: if any chunk in range(total_chunks) has not been
            recorded.
    """
    session = db.get(UploadSession, session_id)

    # A client that timed out waiting for the first response will retry;
    # the S3 upload ID is gone after completion, so answer from the DB.
    if session.status == 'complete':
        return {'status': 'complete', 's3_key': session.s3_key}
    if session.status != 'in_progress':
        raise InvalidState()

    chunks = db.query("""
        SELECT chunk_number, s3_etag FROM UploadChunk
        WHERE session_id=%(sid)s ORDER BY chunk_number
    """, {'sid': session_id})

    etag_by_chunk = {c.chunk_number: c.s3_etag for c in chunks}

    # Compare against the exact expected set instead of a row count: a
    # stray out-of-range row must not mask a genuinely missing chunk.
    missing = sorted(set(range(session.total_chunks)) - etag_by_chunk.keys())
    if missing:
        raise IncompleteUpload(f'Missing chunks: {missing}')

    # Complete S3 multipart upload; parts are listed in ascending order and
    # S3 part numbers are 1-indexed.
    parts = [
        {'PartNumber': number + 1, 'ETag': etag_by_chunk[number]}
        for number in range(session.total_chunks)
    ]
    s3.complete_multipart_upload(
        Bucket='my-bucket',
        Key=session.s3_key,
        UploadId=session.s3_upload_id,
        MultipartUpload={'Parts': parts}
    )

    db.execute("""
        UPDATE UploadSession SET status='complete', completed_at=NOW()
        WHERE id=%(id)s
    """, {'id': session_id})

    # Trigger post-processing (virus scan, transcoding, thumbnail generation)
    queue_post_processing(session_id, session.s3_key, session.mime_type)

    return {'status': 'complete', 's3_key': session.s3_key}
Resuming an Interrupted Upload
GET /uploads/sessions/{session_id}/status
def get_upload_status(session_id):
    """Report which chunks of a session are stored and which are still missing.

    A client resumes an interrupted upload by sending only 'missing_chunks'.

    Args:
        session_id: ID of the UploadSession to inspect.

    Returns:
        dict with 'session_id', 'uploaded_chunks' (ascending), 
        'missing_chunks' (ascending) and 'progress_pct' (0-100).
    """
    # Look up the session first so an unknown ID fails before the chunk
    # query runs.
    session = db.get(UploadSession, session_id)
    chunks = db.query("""
        SELECT chunk_number FROM UploadChunk WHERE session_id=%(sid)s
    """, {'sid': session_id})

    expected = range(session.total_chunks)
    # Ignore rows outside the expected range so progress never exceeds 100%.
    uploaded = {c.chunk_number for c in chunks if c.chunk_number in expected}
    missing = [number for number in expected if number not in uploaded]

    # A session with no chunks has nothing left to upload; report it as
    # done rather than dividing by zero.
    if session.total_chunks:
        progress_pct = len(uploaded) / session.total_chunks * 100
    else:
        progress_pct = 100.0

    return {
        'session_id': session_id,
        # Sorted so the response is deterministic across calls.
        'uploaded_chunks': sorted(uploaded),
        'missing_chunks': missing,
        'progress_pct': progress_pct
    }
# Client resumes by uploading only the missing_chunks
Key Interview Points
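- Minimum S3 chunk size is 5MB: S3 multipart upload requires each part (except the last) to be at least 5MB. Use 5MB or 10MB as the chunk size. S3 also caps a multipart upload at 10,000 parts, so 5MB chunks limit a file to roughly 50GB; choose a larger chunk size for bigger files.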
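- MD5 checksum per chunk: Verifies the integrity of each chunk in transit from the client to the server (forwarding the same value to S3 as Content-MD5 extends the check all the way to storage). A corrupted chunk is detected at upload time and can be re-uploaded before the session completes, rather than discovering corruption after assembly.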
- Parallel chunk uploads: The client can upload multiple chunks concurrently (e.g., 3 in parallel). The server handles them independently. This dramatically reduces total upload time for large files on fast connections.
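- Abort incomplete sessions: Run a background job to abort S3 multipart uploads for sessions not completed within 24 hours. Incomplete multipart uploads incur S3 storage costs even though no final object exists. An S3 bucket lifecycle rule (AbortIncompleteMultipartUpload) is a useful safety net in case the job misses a session.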
Chunked file upload and large file handling design is discussed in Google system design interview questions.
S3 multipart upload and file handling system design is covered in Amazon system design interview preparation.