Chunked File Upload System Low-Level Design

Chunked File Upload System — Low-Level Design

A chunked file upload system breaks large files into smaller pieces, uploads each piece independently, and reassembles them on the server side. It supports resumable uploads, parallel chunk transfers, and progress tracking. This design question comes up in interviews at Google, Dropbox, and other companies that handle large file uploads.

Core Data Model

UploadSession
  id              TEXT PK             -- UUID, returned to client on session create
  user_id         BIGINT NOT NULL
  filename        TEXT NOT NULL
  total_size      BIGINT NOT NULL     -- bytes
  chunk_size      INT NOT NULL        -- bytes per chunk (e.g., 5MB)
  total_chunks    INT NOT NULL
  mime_type       TEXT
  s3_upload_id    TEXT                -- S3 multipart upload ID
  s3_key          TEXT                -- destination key in S3
  status          TEXT DEFAULT 'in_progress'  -- in_progress, complete, failed
  created_at      TIMESTAMPTZ
  completed_at    TIMESTAMPTZ

UploadChunk
  session_id      TEXT FK NOT NULL
  chunk_number    INT NOT NULL        -- 0-indexed
  size_bytes      INT NOT NULL
  checksum_md5    TEXT NOT NULL       -- client-computed MD5 of chunk bytes
  s3_etag         TEXT                -- returned by S3 on part upload
  uploaded_at     TIMESTAMPTZ
  PRIMARY KEY (session_id, chunk_number)

Phase 1: Initiate Upload Session

POST /uploads/sessions
{
  "filename": "video.mp4",
  "total_size": 524288000,   // 500MB
  "chunk_size": 5242880,     // 5MB (S3 minimum for multipart)
  "mime_type": "video/mp4"
}

def create_upload_session(user_id, filename, total_size, chunk_size, mime_type):
    """Open an S3 multipart upload and persist the matching UploadSession row.

    Args:
        user_id: Owner of the upload; becomes part of the S3 key.
        filename: Client-supplied file name; becomes the last key segment.
        total_size: Size of the complete file in bytes (must be positive).
        chunk_size: Bytes per chunk (must be positive, and at least 5 MiB
            when the file needs more than one chunk).
        mime_type: Content type of the file, or None when unknown.

    Returns:
        dict with the new ``session_id`` and the ``total_chunks`` the client
        is expected to upload.

    Raises:
        ValueError: If the filename is empty, a size is not positive, or the
            requested chunking would violate S3's multipart limits.
    """
    from datetime import datetime, timezone

    # S3 multipart limits: every part except the last must be >= 5 MiB and
    # one upload may contain at most 10,000 parts. Rejecting bad layouts
    # here is cheaper than failing at completion time.
    min_part_size = 5 * 1024 * 1024
    max_parts = 10_000

    if not filename:
        raise ValueError('filename must not be empty')
    if total_size <= 0:
        raise ValueError('total_size must be a positive number of bytes')
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive number of bytes')

    # Integer ceiling division avoids float rounding on very large sizes.
    total_chunks = -(-total_size // chunk_size)
    if total_chunks > 1 and chunk_size < min_part_size:
        raise ValueError(
            f'chunk_size must be at least {min_part_size} bytes '
            'for a multi-part upload'
        )
    if total_chunks > max_parts:
        raise ValueError(
            f'upload would need {total_chunks} parts; the limit is {max_parts}'
        )

    session_id = generate_uuid()
    s3_key = f'uploads/{user_id}/{session_id}/{filename}'

    # Initiate S3 multipart upload. ContentType is only sent when known,
    # because mime_type is optional.
    s3_args = {
        'Bucket': 'my-bucket',
        'Key': s3_key,
        'Metadata': {'user_id': str(user_id), 'session_id': session_id},
    }
    if mime_type is not None:
        s3_args['ContentType'] = mime_type
    response = s3.create_multipart_upload(**s3_args)
    upload_id = response['UploadId']

    try:
        db.insert(UploadSession, {
            'id': session_id, 'user_id': user_id, 'filename': filename,
            'total_size': total_size, 'chunk_size': chunk_size,
            'total_chunks': total_chunks,
            # Stored so completion can hand it to post-processing.
            'mime_type': mime_type,
            's3_upload_id': upload_id,
            's3_key': s3_key,
            # Needed to find sessions that were started but never completed.
            'created_at': datetime.now(timezone.utc),
        })
    except Exception:
        # Without a session row nothing would ever complete or abort the
        # multipart upload, so release it before propagating the error.
        s3.abort_multipart_upload(
            Bucket='my-bucket', Key=s3_key, UploadId=upload_id
        )
        raise

    return {'session_id': session_id, 'total_chunks': total_chunks}

Phase 2: Upload Individual Chunks

PUT /uploads/sessions/{session_id}/chunks/{chunk_number}
Content-Type: application/octet-stream
X-Checksum-MD5: {base64_md5_of_chunk}
[chunk bytes in body]

def upload_chunk(session_id, chunk_number, chunk_bytes, client_checksum):
    """Validate one chunk, store it as an S3 part, and record it in the DB.

    Uploading the same chunk number again is allowed; the stored row is
    overwritten with the data of the latest upload.

    Args:
        session_id: ID of the UploadSession the chunk belongs to.
        chunk_number: 0-indexed position of the chunk within the file.
        chunk_bytes: Raw bytes of the chunk.
        client_checksum: Base64-encoded MD5 digest computed by the client.

    Returns:
        dict echoing ``chunk_number`` with ``uploaded`` set to True.

    Raises:
        ValueError: If chunk_number is outside the session's range or the
            chunk length does not match the session's chunk layout.
        LookupError: If no session exists for session_id.
        InvalidState: If the session is not in progress.
        ChecksumMismatch: If the server-side MD5 differs from the client's.
    """
    # Cheap sanity check before any I/O.
    if chunk_number < 0:
        raise ValueError(f'chunk_number must be >= 0, got {chunk_number}')

    session = db.get(UploadSession, session_id)
    # Guards the case where the lookup yields None for an unknown ID.
    if session is None:
        raise LookupError(f'Upload session {session_id} not found')
    if session.status != 'in_progress':
        raise InvalidState()

    if chunk_number >= session.total_chunks:
        raise ValueError(
            f'chunk_number {chunk_number} out of range '
            f'(session has {session.total_chunks} chunks)'
        )

    # Every chunk is exactly chunk_size bytes except possibly the last,
    # which carries whatever remains of the file.
    expected_size = min(
        session.chunk_size,
        session.total_size - chunk_number * session.chunk_size,
    )
    if len(chunk_bytes) != expected_size:
        raise ValueError(
            f'Chunk {chunk_number} has {len(chunk_bytes)} bytes, '
            f'expected {expected_size}'
        )

    # Verify checksum
    server_checksum = base64.b64encode(md5(chunk_bytes).digest()).decode()
    if server_checksum != client_checksum:
        raise ChecksumMismatch(f'Chunk {chunk_number} checksum mismatch')

    # S3 multipart part numbers are 1-indexed
    part_number = chunk_number + 1
    response = s3.upload_part(
        Bucket='my-bucket',
        Key=session.s3_key,
        UploadId=session.s3_upload_id,
        PartNumber=part_number,
        Body=chunk_bytes,
        # Lets S3 re-verify the part, covering the server -> S3 hop too.
        ContentMD5=server_checksum,
    )

    # On re-upload refresh every column derived from the chunk's content,
    # not only the ETag, so the row never describes a previous attempt.
    db.execute("""
        INSERT INTO UploadChunk
          (session_id, chunk_number, size_bytes, checksum_md5, s3_etag, uploaded_at)
        VALUES (%(sid)s, %(cn)s, %(size)s, %(cs)s, %(etag)s, NOW())
        ON CONFLICT (session_id, chunk_number) DO UPDATE
          SET size_bytes=EXCLUDED.size_bytes,
              checksum_md5=EXCLUDED.checksum_md5,
              s3_etag=EXCLUDED.s3_etag,
              uploaded_at=NOW()
    """, {'sid': session_id, 'cn': chunk_number, 'size': len(chunk_bytes),
          'cs': client_checksum, 'etag': response['ETag']})

    return {'chunk_number': chunk_number, 'uploaded': True}

Phase 3: Complete Upload

POST /uploads/sessions/{session_id}/complete

def complete_upload(session_id):
    """Assemble the uploaded parts into the final S3 object.

    Calling this again for a session that is already complete returns the
    same result without touching S3 or re-queuing post-processing.

    Args:
        session_id: ID of the UploadSession to finalize.

    Returns:
        dict with ``status`` set to 'complete' and the object's ``s3_key``.

    Raises:
        LookupError: If no session exists for session_id.
        InvalidState: If the session is neither in progress nor complete.
        IncompleteUpload: If one or more chunks have not been uploaded.
    """
    session = db.get(UploadSession, session_id)
    # Guards the case where the lookup yields None for an unknown ID.
    if session is None:
        raise LookupError(f'Upload session {session_id} not found')
    if session.status == 'complete':
        # A retried request must not hit S3 again: the multipart upload ID
        # is no longer usable once the upload has been completed.
        return {'status': 'complete', 's3_key': session.s3_key}
    if session.status != 'in_progress':
        raise InvalidState()

    chunks = db.query("""
        SELECT chunk_number, s3_etag FROM UploadChunk
        WHERE session_id=%(sid)s ORDER BY chunk_number
    """, {'sid': session_id})

    # Compare against the exact set of expected chunk numbers instead of a
    # row count, so a stray out-of-range row cannot hide a missing chunk.
    recorded = {c.chunk_number: c for c in chunks}
    missing = [n for n in range(session.total_chunks) if n not in recorded]
    if missing:
        raise IncompleteUpload(f'Missing chunks: {missing}')

    # Complete S3 multipart upload (part numbers are 1-indexed, ascending).
    parts = [
        {'PartNumber': n + 1, 'ETag': recorded[n].s3_etag}
        for n in range(session.total_chunks)
    ]
    s3.complete_multipart_upload(
        Bucket='my-bucket',
        Key=session.s3_key,
        UploadId=session.s3_upload_id,
        MultipartUpload={'Parts': parts}
    )

    # Only move sessions that are still in progress, so a concurrent state
    # change is never overwritten.
    db.execute("""
        UPDATE UploadSession SET status='complete', completed_at=NOW()
        WHERE id=%(id)s AND status='in_progress'
    """, {'id': session_id})

    # Trigger post-processing (virus scan, transcoding, thumbnail generation)
    queue_post_processing(session_id, session.s3_key, session.mime_type)

    return {'status': 'complete', 's3_key': session.s3_key}

Resuming an Interrupted Upload

GET /uploads/sessions/{session_id}/status

def get_upload_status(session_id):
    """Report which chunks of a session are stored and which are missing.

    Args:
        session_id: ID of the UploadSession to inspect.

    Returns:
        dict with ``session_id``, the session ``status``, the ascending
        ``uploaded_chunks`` and ``missing_chunks`` lists, and
        ``progress_pct`` in the range 0-100.

    Raises:
        LookupError: If no session exists for session_id.
    """
    # Load the session first so an unknown ID fails before the chunk query.
    session = db.get(UploadSession, session_id)
    # Guards the case where the lookup yields None for an unknown ID.
    if session is None:
        raise LookupError(f'Upload session {session_id} not found')

    chunks = db.query("""
        SELECT chunk_number FROM UploadChunk WHERE session_id=%(sid)s
    """, {'sid': session_id})

    total = session.total_chunks
    # Ignore rows outside the expected range so progress cannot exceed 100%.
    done = {c.chunk_number for c in chunks if 0 <= c.chunk_number < total}
    missing = [i for i in range(total) if i not in done]
    # A session with no chunks has nothing left to upload.
    progress_pct = len(done) / total * 100 if total else 100.0

    return {
        'session_id': session_id,
        'status': session.status,
        # Sorted so clients get a stable, ascending list.
        'uploaded_chunks': sorted(done),
        'missing_chunks': missing,
        'progress_pct': progress_pct
    }
# Client resumes by uploading only the missing_chunks

Key Interview Points

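  • Minimum S3 chunk size is 5MB: S3 multipart upload requires each part (except the last) to be at least 5 MiB and allows at most 10,000 parts per upload. 5MB or 10MB chunks work for most files, but a 5 MiB chunk size caps a single file at roughly 48.8 GiB, so increase the chunk size for larger files.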
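  • MD5 checksum per chunk: Verifies the integrity of each chunk between client and server; forwarding the same digest to S3 as the Content-MD5 of the part extends the check all the way to storage. A corrupted chunk is detected at upload time and can be re-uploaded before the session completes, rather than discovering corruption after assembly.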
  • Parallel chunk uploads: The client can upload multiple chunks concurrently (e.g., 3 in parallel). The server handles them independently. This dramatically reduces total upload time for large files on fast connections.
  • Abort incomplete sessions: Run a background job to abort S3 multipart uploads for sessions not completed within 24 hours. Incomplete multipart uploads incur S3 storage costs even though no final object exists.

Chunked file upload and large file handling design is discussed in Google system design interview questions.

S3 multipart upload and file handling system design is covered in Amazon system design interview preparation.

Scroll to Top