A link preview service fetches URL metadata (title, description, image) to render rich embeds when users share links — familiar from Slack, Twitter, iMessage, and Notion. Core challenges: fetching external URLs safely (SSRF prevention), extracting Open Graph / Twitter Card metadata reliably, caching aggressively to avoid refetching on every view, and handling slow or malicious origin servers.
Core Data Model
-- One row per normalized URL: cached preview metadata plus fetch state.
-- The SHA-256 hash keeps the primary key fixed-width regardless of URL length.
CREATE TABLE LinkPreview (
    url_hash CHAR(64) PRIMARY KEY, -- SHA-256 of normalized URL
    url TEXT NOT NULL,             -- the normalized URL itself (for debugging / re-fetch)
    title TEXT,
    description TEXT,
    image_url TEXT,
    site_name TEXT,
    favicon_url TEXT,
    content_type TEXT, -- 'article', 'video', 'image', 'website'
    status TEXT NOT NULL DEFAULT 'pending', -- pending, ready, failed, blocked
    fetch_status SMALLINT, -- HTTP status code from origin
    fetched_at TIMESTAMPTZ,
    expires_at TIMESTAMPTZ NOT NULL, -- cache TTL
    error_message TEXT               -- populated for status = 'failed' / 'blocked'
);
-- Partial index: expiry scans only ever look at rows that are serving previews.
CREATE INDEX idx_preview_expires ON LinkPreview (expires_at) WHERE status = 'ready';
SSRF Prevention and URL Validation
import ipaddress, socket, urllib.parse
from typing import NamedTuple

# Special-use ranges a preview fetcher must never reach. The per-IP property
# checks in is_ssrf_safe() are the primary guard; this list makes the policy
# explicit and covers ranges the properties classify inconsistently.
BLOCKED_RANGES = [
    ipaddress.ip_network("0.0.0.0/8"),      # "this network" — 0.0.0.0 reaches localhost on Linux
    ipaddress.ip_network("10.0.0.0/8"),
    ipaddress.ip_network("100.64.0.0/10"),  # carrier-grade NAT (RFC 6598)
    ipaddress.ip_network("172.16.0.0/12"),
    ipaddress.ip_network("192.168.0.0/16"),
    ipaddress.ip_network("127.0.0.0/8"),
    ipaddress.ip_network("169.254.0.0/16"), # link-local (AWS metadata: 169.254.169.254)
    ipaddress.ip_network("::1/128"),
    ipaddress.ip_network("fc00::/7"),
    ipaddress.ip_network("fe80::/10"),      # IPv6 link-local
]
ALLOWED_SCHEMES = {'http', 'https'}
MAX_REDIRECTS = 3
FETCH_TIMEOUT_SEC = 5

def is_ssrf_safe(url: str) -> tuple[bool, str]:
    """
    Returns (is_safe, reason).

    Blocks non-http(s) schemes, resolves the hostname, and rejects the URL
    if ANY resolved address is private, loopback, link-local, multicast,
    reserved, unspecified, or inside BLOCKED_RANGES. Fails closed when
    resolution yields no parseable address.

    NOTE(review): this validates only the URL as given — redirect targets
    and DNS rebinding (re-resolution at connect time) must be re-checked by
    the fetcher itself.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except Exception:
        return False, "Invalid URL"
    if parsed.scheme not in ALLOWED_SCHEMES:
        return False, f"Scheme {parsed.scheme} not allowed"
    hostname = parsed.hostname
    if not hostname:
        return False, "No hostname"
    # Resolve DNS to IP; numeric hosts resolve locally without network access.
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False, "DNS resolution failed"
    saw_valid_ip = False
    for info in infos:
        ip_str = info[4][0]
        try:
            ip = ipaddress.ip_address(ip_str)
        except ValueError:
            continue
        saw_valid_ip = True
        # Catch-all via address properties: covers multicast, reserved, and
        # any special-purpose range the explicit list might miss.
        if (ip.is_private or ip.is_loopback or ip.is_link_local
                or ip.is_multicast or ip.is_reserved or ip.is_unspecified):
            return False, f"IP {ip} is in a blocked range"
        for blocked in BLOCKED_RANGES:
            if ip in blocked:
                return False, f"IP {ip} is in blocked range {blocked}"
    if not saw_valid_ip:
        # Fail closed: the original returned True here, allowing a fetch to a
        # host with no checkable address.
        return False, "No resolvable IP address"
    return True, ""
Fetching and Parsing Metadata
import hashlib, requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
import psycopg2
# How long a stored preview is served before re-fetching (see store_preview).
PREVIEW_CACHE_TTL_HOURS = 24
MAX_RESPONSE_BYTES = 1 * 1024 * 1024  # 1 MiB — don't fetch entire large pages
def normalize_url(url: str) -> str:
    """
    Canonicalize a URL so equivalent links hash to the same cache key.

    Lowercases the scheme and netloc, drops the fragment, strips tracking
    parameters (the entire utm_* family plus fbclid/gclid), and sorts the
    remaining query parameters alphabetically.
    """
    parsed = urllib.parse.urlparse(url.strip())
    # Click identifiers appended by ad platforms. utm_* is matched by prefix
    # below, so keys the old fixed set missed (utm_id, utm_name, ...) are
    # stripped too — matching the documented "strip utm_*" intent.
    TRACKING_PARAMS = {'fbclid', 'gclid'}
    qs = urllib.parse.parse_qs(parsed.query, keep_blank_values=True)
    qs_clean = {k: v for k, v in qs.items()
                if k not in TRACKING_PARAMS and not k.startswith('utm_')}
    clean_query = urllib.parse.urlencode(sorted(qs_clean.items()), doseq=True)
    normalized = parsed._replace(
        scheme=parsed.scheme.lower(),
        netloc=parsed.netloc.lower(),
        query=clean_query,
        fragment=""
    )
    return urllib.parse.urlunparse(normalized)
def fetch_link_preview(conn, url: str) -> dict:
    """
    Return preview metadata for *url*: serve a cached LinkPreview row when a
    non-expired one exists, otherwise fetch the page, parse Open Graph /
    Twitter Card tags, and store the result.

    Raises ValueError when the URL (or a redirect target) fails SSRF
    validation; re-raises requests errors after recording a 'failed' row.
    """
    normalized = normalize_url(url)
    url_hash = hashlib.sha256(normalized.encode()).hexdigest()
    # Cache hit: serve the stored row without touching the network.
    with conn.cursor() as cur:
        cur.execute(
            "SELECT title, description, image_url, site_name, status FROM LinkPreview WHERE url_hash = %s AND expires_at > NOW()",
            (url_hash,)
        )
        cached = cur.fetchone()
        if cached:
            return {"title": cached[0], "description": cached[1], "image_url": cached[2],
                    "site_name": cached[3], "status": cached[4], "cached": True}
    # SSRF check before any outbound request.
    safe, reason = is_ssrf_safe(normalized)
    if not safe:
        store_preview(conn, url_hash, normalized, status='blocked', error=reason)
        raise ValueError(f"URL blocked: {reason}")
    # Fetch with timeout and size limit.
    try:
        # BUG FIX: requests.get() has no max_redirects parameter — passing it
        # raised TypeError. The limit must be set on a Session.
        session = requests.Session()
        session.max_redirects = MAX_REDIRECTS
        resp = session.get(
            normalized,
            timeout=(FETCH_TIMEOUT_SEC, FETCH_TIMEOUT_SEC),  # (connect, read)
            headers={"User-Agent": "TechInterviewBot/1.0 (+https://techinterview.org)"},
            allow_redirects=True,
            stream=True,
        )
        try:
            # A redirect may land on a different host: re-validate the final
            # URL so a public URL cannot bounce us into a private range.
            if resp.history:
                redirect_safe, redirect_reason = is_ssrf_safe(resp.url)
                if not redirect_safe:
                    store_preview(conn, url_hash, normalized, status='blocked',
                                  error=redirect_reason)
                    raise ValueError(f"URL blocked: {redirect_reason}")
            # Read only the first MAX_RESPONSE_BYTES — huge pages and binary
            # files are useless for a preview.
            content = b""
            for chunk in resp.iter_content(chunk_size=8192):
                content += chunk
                if len(content) > MAX_RESPONSE_BYTES:
                    break
        finally:
            # stream=True keeps the pooled connection open until released.
            resp.close()
        resp_content = content.decode('utf-8', errors='replace')
    except requests.exceptions.Timeout:
        store_preview(conn, url_hash, normalized, status='failed', error='Fetch timeout')
        raise
    except requests.exceptions.RequestException as exc:
        # Connection errors, TooManyRedirects, etc. previously left the row
        # as 'pending' forever — record them as failures too.
        store_preview(conn, url_hash, normalized, status='failed', error=str(exc))
        raise
    # Parse Open Graph and Twitter Card tags.
    soup = BeautifulSoup(resp_content, 'html.parser')

    def og(prop):
        # Prefer the Open Graph tag, fall back to the Twitter Card equivalent.
        tag = soup.find('meta', property=f'og:{prop}') or soup.find('meta', attrs={'name': f'twitter:{prop}'})
        return tag['content'].strip() if tag and tag.get('content') else None

    # Guard soup.title.string: an empty <title></title> has string=None and
    # the original .strip() call raised AttributeError.
    title = og('title') or (soup.title.string.strip()
                            if soup.title and soup.title.string else None)
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    # Return None (not '') when absent, consistent with the og() path.
    description = og('description') or (meta_desc.get('content') if meta_desc else None)
    image_url = og('image')
    site_name = og('site_name') or urllib.parse.urlparse(normalized).netloc
    # og:image is attacker-controlled too — apply the same SSRF policy.
    if image_url:
        img_safe, _ = is_ssrf_safe(image_url)
        if not img_safe:
            image_url = None
    preview = store_preview(conn, url_hash, normalized, status='ready',
                            title=title, description=description,
                            image_url=image_url, site_name=site_name,
                            fetch_status=resp.status_code)
    return preview
def store_preview(conn, url_hash, url, status, title=None, description=None,
                  image_url=None, site_name=None, fetch_status=None, error=None) -> dict:
    """
    Upsert one LinkPreview row and return its preview fields as a dict.

    The row expires PREVIEW_CACHE_TTL_HOURS from now; a conflicting row for
    the same url_hash is overwritten with the new metadata and timestamps.
    """
    expiry = datetime.now(timezone.utc) + timedelta(hours=PREVIEW_CACHE_TTL_HOURS)
    upsert_sql = """
    INSERT INTO LinkPreview (url_hash, url, title, description, image_url, site_name,
    status, fetch_status, fetched_at, expires_at, error_message)
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,NOW(),%s,%s)
    ON CONFLICT (url_hash) DO UPDATE SET
    title=EXCLUDED.title, description=EXCLUDED.description,
    image_url=EXCLUDED.image_url, site_name=EXCLUDED.site_name,
    status=EXCLUDED.status, fetch_status=EXCLUDED.fetch_status,
    fetched_at=NOW(), expires_at=EXCLUDED.expires_at,
    error_message=EXCLUDED.error_message
    """
    row = (url_hash, url, title, description, image_url, site_name,
           status, fetch_status, expiry, error)
    with conn.cursor() as cur:
        cur.execute(upsert_sql, row)
    conn.commit()
    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "site_name": site_name,
        "status": status,
    }
Key Interview Points
- SSRF is the critical security issue: Without validation, an attacker submits URLs like http://169.254.169.254/latest/meta-data/ (AWS metadata endpoint) or http://localhost:6379/ (Redis). Your server fetches these internal endpoints and returns credentials or data. Defenses: (1) resolve DNS and check the resulting IP against blocked ranges before fetching; (2) use a dedicated egress proxy (Smokescreen, Squid with ACLs) that enforces allowlists; (3) disable redirects to private IPs even if the initial URL passes.
- URL normalization for cache deduplication: https://example.com/post?id=1&utm_source=twitter and https://example.com/post?id=1 point to the same page. Normalize before hashing: lowercase scheme/host, sort query params, strip tracking params, remove fragment. This maximizes cache hit rate — critical because every cache miss is an outbound HTTP request.
- Cache TTL tradeoffs: 24-hour TTL is a reasonable default — og:image and titles rarely change. For news articles, shorter TTL (1-2 hours) captures headline changes. For user-generated content, respect Cache-Control: max-age from the origin server. Never cache status='failed' permanently — retry after 1 hour.
- Asynchronous fetch pattern: Don't block the user's message send waiting for the fetch. Return the message immediately with status='pending'. Fetch in a background job. Push a WebSocket update when the preview is ready. This decouples perceived latency (instant send) from actual fetch time (up to 5 seconds). Prefetch on paste in the client-side editor — the preview is usually ready by the time the user hits Send.
- Image proxying: Serve og:image through your own proxy (proxy.example.com/img?url=…) rather than embedding third-party URLs. Benefits: (1) security — prevent mixed content warnings (http:// images on https:// pages); (2) privacy — the origin doesn’t see your users’ IP addresses; (3) reliability — cache the image so it persists even if origin deletes it. Apply same SSRF validation to proxied image URLs.
{“@context”:”https://schema.org”,”@type”:”FAQPage”,”mainEntity”:[{“@type”:”Question”,”name”:”What is SSRF and how does it specifically threaten a link preview service?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Server-Side Request Forgery (SSRF) occurs when an attacker causes your server to make HTTP requests to unintended targets by supplying crafted URLs. A link preview service is a natural SSRF target: it fetches arbitrary user-supplied URLs. Attacks: (1) http://169.254.169.254/latest/meta-data/iam/security-credentials/ — AWS metadata endpoint returns IAM credentials, giving the attacker full cloud access; (2) http://redis:6379/ — internal Redis instance responds to raw HTTP with error messages revealing topology; (3) http://10.0.0.1/admin — internal admin panel; (4) file:///etc/passwd — file:// scheme reads local files. Defense: resolve the URL’s hostname to an IP address before fetching and block private ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 127.0.0.0/8, 169.254.0.0/16). Also block non-http(s) schemes. Check after every redirect — a redirect from a public URL to a private IP is a common bypass.”}},{“@type”:”Question”,”name”:”How does URL normalization increase cache hit rate?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”The same article might be shared with different URLs: https://Example.com/post?id=1&utm_source=twitter vs https://example.com/post?id=1&utm_medium=email. Without normalization, both are cache misses, both trigger an outbound HTTP fetch. Normalized, they both reduce to https://example.com/post?id=1 — the same cache key. Normalization steps: lowercase scheme and hostname, remove fragment (#section — not sent to server), strip known tracking parameters (utm_*, fbclid, gclid), sort remaining query parameters alphabetically, remove trailing slash inconsistencies. This can double cache hit rates for URLs shared via social media (which always append utm_* parameters). 
Hash the normalized URL with SHA-256 for a fixed-length primary key.”}},{“@type”:”Question”,”name”:”How do you handle slow or hanging origin servers?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”An origin server that accepts the TCP connection but never sends data will hang the worker thread for the full timeout duration. Mitigations: (1) Set a connect timeout (5 seconds) and a read timeout (5 seconds) separately — requests library: requests.get(url, timeout=(5, 5)). (2) Use streaming mode (stream=True) and read only the first 1 MiB — large pages and binary files are not useful for preview. (3) Run fetches in a worker pool — if one fetch hangs, other workers continue. (4) Use a circuit breaker per domain: if example.com fails 5 times in 60 seconds, stop fetching from it for 10 minutes. Store blocked domains in Redis: SET blocked:domain:example.com EX 600. (5) Set maximum redirects (3) to prevent redirect loops.”}},{“@type”:”Question”,”name”:”What is the async fetch pattern and why does it improve user experience?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Synchronous fetch: user sends message with URL → server fetches URL (up to 5 seconds) → message sent. The user waits up to 5 seconds for their message to be delivered. Asynchronous fetch: user sends message with URL → message sent immediately with preview status=’pending’ → background job fetches URL → pushes preview data via WebSocket when ready. The user sees their message instantly; the preview populates within 1–3 seconds on fast origins without blocking the send flow. Prefetch optimization: trigger the fetch when the user pastes the URL in the composer (before send). By the time they click Send, the preview is often already cached. This hides the latency entirely for typical typing cadence. 
Store previews keyed by normalized URL hash — any future message with the same URL gets an instant cached preview.”}},{“@type”:”Question”,”name”:”Why proxy og:image URLs instead of embedding third-party image URLs directly?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Embedding a third-party image URL directly creates four problems: (1) Mixed content: if the og:image is http:// and your page is https://, modern browsers block the image. (2) Privacy: the third-party server logs your users’ IP addresses when their browser loads the image. (3) Broken images: if the origin deletes or moves the image (common), your preview shows a broken image icon indefinitely. (4) Tracking pixels: the image request can contain tracking parameters. Proxy solution: fetch the image from your server during preview generation, store it in S3 or your CDN, and embed your CDN URL in the preview. Now: the image is always https://, your users’ IPs are not exposed, the image persists in your cache even if origin deletes it, and tracking parameters are stripped. Apply the same SSRF validation to og:image URLs.”}}]}
Link preview and URL unfurling system design is discussed in Twitter system design interview questions.
Link preview and content sharing metadata design is covered in LinkedIn system design interview preparation.
Link preview and rich media embed design is discussed in Snap system design interview guide.
See also: Meta Interview Guide 2026: Facebook, Instagram, WhatsApp Engineering
See also: Atlassian Interview Guide