A follower graph stores directed relationships between users — Alice follows Bob, but Bob may not follow Alice. It powers feed generation, notification targeting, and social recommendations. Core challenges: efficiently querying follower/following counts, generating personalized feeds without O(followers) fanout per write, handling celebrity accounts with millions of followers, and supporting mutual follow (friendship) detection.
Core Data Model
-- Directed follow relationship
-- One row per edge (Alice -> Bob). The composite primary key both
-- deduplicates edges and serves as the "does A follow B?" point lookup.
CREATE TABLE Follow (
follower_id UUID NOT NULL,
followee_id UUID NOT NULL,
followed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (follower_id, followee_id)
);
-- Lookup both directions efficiently
-- The PK covers follower-side queries; this index covers followee-side
-- queries ("who follows X?") newest-first, matching cursor pagination order.
CREATE INDEX idx_follow_followee ON Follow (followee_id, followed_at DESC);
-- Denormalized counts for O(1) reads
-- Maintained by follow_user/unfollow_user; avoids COUNT(*) per profile view.
CREATE TABLE UserStats (
user_id UUID PRIMARY KEY,
follower_count INT NOT NULL DEFAULT 0,
following_count INT NOT NULL DEFAULT 0,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Feed: pre-computed fan-out (for non-celebrity accounts)
-- Key in Redis: feed:{user_id} — sorted set scored by post_timestamp
-- Member: post_id
-- Only maintained for users with < CELEBRITY_THRESHOLD followers
Follow and Unfollow
import redis
from uuid import uuid4
# Shared module-level Redis client used by fan-out and feed reads.
# decode_responses=True makes sorted-set members come back as str, not bytes.
r = redis.Redis(host='redis', decode_responses=True)
# Accounts AT OR ABOVE this follower count use the pull-based feed path
# (the code checks follower_count >= CELEBRITY_THRESHOLD, not strictly >).
CELEBRITY_THRESHOLD = 10_000
def follow_user(conn, follower_id: str, followee_id: str) -> bool:
    """
    Create a directed follow edge from follower_id to followee_id.

    Returns True if newly followed, False if already following (idempotent).
    The Follow insert and both denormalized UserStats updates commit in a
    single transaction, so edges and counts can never diverge.

    Args:
        conn: DB-API connection (psycopg-style, %s placeholders).
        follower_id: the user initiating the follow.
        followee_id: the user being followed.

    Raises:
        ValueError: if a user attempts to follow themselves.
    """
    if follower_id == followee_id:
        raise ValueError("Users cannot follow themselves")
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO Follow (follower_id, followee_id)
            VALUES (%s, %s)
            ON CONFLICT (follower_id, followee_id) DO NOTHING
            RETURNING followed_at
        """, (follower_id, followee_id))
        # RETURNING yields a row only when the insert actually happened.
        inserted = cur.fetchone() is not None
    if not inserted:
        # Bug fix: the INSERT opened a transaction even on conflict. Without
        # ending it here the connection sits idle-in-transaction holding locks.
        conn.rollback()
        return False  # Already following
    with conn.cursor() as cur:
        # Upsert so first-ever follow works even if no UserStats row exists yet.
        cur.execute("""
            INSERT INTO UserStats (user_id, following_count) VALUES (%s, 1)
            ON CONFLICT (user_id) DO UPDATE SET following_count = UserStats.following_count + 1
        """, (follower_id,))
        cur.execute("""
            INSERT INTO UserStats (user_id, follower_count) VALUES (%s, 1)
            ON CONFLICT (user_id) DO UPDATE SET follower_count = UserStats.follower_count + 1
        """, (followee_id,))
    conn.commit()
    return True
def unfollow_user(conn, follower_id: str, followee_id: str) -> bool:
    """
    Remove the follow edge from follower_id to followee_id.

    Returns True if the edge existed and was removed, False if there was
    nothing to remove (idempotent). Delete and count decrements commit in a
    single transaction. GREATEST(0, ...) guards against a count ever going
    negative if it drifted due to a past race.
    """
    with conn.cursor() as cur:
        cur.execute(
            "DELETE FROM Follow WHERE follower_id=%s AND followee_id=%s RETURNING followed_at",
            (follower_id, followee_id)
        )
        deleted = cur.fetchone() is not None
    if not deleted:
        # Bug fix: end the transaction the DELETE opened; otherwise the
        # connection remains idle-in-transaction on the no-op path.
        conn.rollback()
        return False
    with conn.cursor() as cur:
        cur.execute(
            "UPDATE UserStats SET following_count = GREATEST(0, following_count - 1) WHERE user_id=%s",
            (follower_id,)
        )
        cur.execute(
            "UPDATE UserStats SET follower_count = GREATEST(0, follower_count - 1) WHERE user_id=%s",
            (followee_id,)
        )
    conn.commit()
    return True
Feed Fan-out on Post
import time
# Per-user feed cap: each Redis sorted set is trimmed to this many entries
# after every fan-out write (see fan_out_post), bounding memory per user.
FEED_MAX_SIZE = 200  # keep only the 200 most recent posts in feed
def fan_out_post(conn, post_id: str, author_id: str, post_timestamp: float,
                 batch_size: int = 1000):
    """
    Push fan-out: write post_id into each follower's Redis feed sorted set.

    Celebrity authors (follower_count >= CELEBRITY_THRESHOLD) are skipped
    entirely — their followers pull posts at read time (see get_user_feed).

    Bug fix vs. the original: followers are now streamed from the cursor in
    batches of `batch_size` (the old code fetchall()'d every follower id and
    queued one giant pipeline, defeating the "batch for large follower
    counts" intent). Each batch is flushed through one pipeline round-trip.

    Args:
        post_id: id of the newly created post.
        author_id: the post's author.
        post_timestamp: Unix timestamp used as the sorted-set score.
        batch_size: followers processed per DB fetch / Redis pipeline flush.
    """
    # Check if the author is a celebrity; missing stats row means 0 followers.
    with conn.cursor() as cur:
        cur.execute("SELECT follower_count FROM UserStats WHERE user_id = %s", (author_id,))
        row = cur.fetchone()
    follower_count = row[0] if row else 0
    if follower_count >= CELEBRITY_THRESHOLD:
        return  # Celebrity: pull-based feed handles this
    with conn.cursor() as cur:
        cur.execute("SELECT follower_id FROM Follow WHERE followee_id = %s", (author_id,))
        while True:
            rows = cur.fetchmany(batch_size)
            if not rows:
                break
            pipeline = r.pipeline()
            for (follower_id,) in rows:
                feed_key = f"feed:{follower_id}"
                pipeline.zadd(feed_key, {post_id: post_timestamp})
                # Trim oldest entries so the set holds at most FEED_MAX_SIZE.
                pipeline.zremrangebyrank(feed_key, 0, -(FEED_MAX_SIZE + 1))
            pipeline.execute()
def get_user_feed(conn, user_id: str, before_ts: float | None = None, limit: int = 20) -> list[str]:
    """
    Return up to `limit` post IDs for a user's feed, newest first.

    Merges the pre-computed push feed (Redis sorted set) with celebrity
    posts pulled from the database at read time.

    Args:
        before_ts: pagination cursor — only posts with score strictly below
            this Unix timestamp are returned; None means "now" (first page).
        limit: maximum number of post IDs to return.
    """
    # Explicit None check: `before_ts or time.time()` would wrongly treat a
    # legitimate 0.0 cursor as "no cursor".
    now_ts = time.time() if before_ts is None else before_ts
    feed_key = f"feed:{user_id}"
    # Push-based posts. Bug fix: ZREVRANGEBYSCORE walks scores high->low, so
    # the first `limit` members are the NEWEST posts before the cursor. The
    # original ZRANGEBYSCORE(start=0, num=limit) took the OLDEST posts in the
    # window and merely reversed them.
    push_post_ids = r.zrevrangebyscore(
        feed_key, f'({now_ts}', '-inf',
        start=0, num=limit, withscores=False
    )
    # Pull-based: recent posts from celebrity accounts this user follows.
    celebrity_post_ids = get_celebrity_posts(conn, user_id, before_ts=now_ts, limit=limit)
    # Merge both streams by timestamp and keep the newest `limit`.
    return merge_by_timestamp(push_post_ids, celebrity_post_ids, limit)
def get_celebrity_posts(conn, user_id: str, before_ts: float, limit: int) -> list[str]:
    """
    Return post IDs from celebrity accounts the user follows, newest first.

    A followee counts as a celebrity when its denormalized follower_count is
    at or above CELEBRITY_THRESHOLD. Only posts created strictly before
    before_ts are returned, capped at `limit`.
    """
    sql = """
        SELECT p.post_id FROM Follow f
        JOIN UserStats s ON f.followee_id = s.user_id
        JOIN Post p ON f.followee_id = p.author_id
        WHERE f.follower_id = %s
        AND s.follower_count >= %s
        AND p.created_at < to_timestamp(%s)
        ORDER BY p.created_at DESC
        LIMIT %s
    """
    params = (user_id, CELEBRITY_THRESHOLD, before_ts, limit)
    with conn.cursor() as cur:
        cur.execute(sql, params)
        rows = cur.fetchall()
    return [post_id for (post_id,) in rows]
Mutual Follow (Friendship) Detection
def are_mutual_followers(conn, user_a: str, user_b: str) -> bool:
    """Return True when both follow edges A->B and B->A exist (friendship)."""
    sql = """
        SELECT COUNT(*) FROM Follow
        WHERE (follower_id=%s AND followee_id=%s)
        OR (follower_id=%s AND followee_id=%s)
    """
    with conn.cursor() as cur:
        cur.execute(sql, (user_a, user_b, user_b, user_a))
        (edge_count,) = cur.fetchone()
    # The PK makes each direction at most one row; two rows = mutual follow.
    return edge_count == 2
def get_mutual_followers(conn, user_id: str, limit: int = 20) -> list[str]:
    """
    Return up to `limit` users in a mutual-follow relationship with user_id.

    f1 finds users who follow user_id; the self-join on f2 keeps only those
    whom user_id follows back. Ordered by most recent follow first.
    """
    sql = """
        SELECT f1.follower_id
        FROM Follow f1
        JOIN Follow f2 ON f1.follower_id = f2.followee_id
        AND f2.follower_id = %s
        AND f1.followee_id = %s
        ORDER BY f1.followed_at DESC
        LIMIT %s
    """
    with conn.cursor() as cur:
        cur.execute(sql, (user_id, user_id, limit))
        rows = cur.fetchall()
    return [friend_id for (friend_id,) in rows]
Key Interview Points
- The celebrity problem: When Cristiano Ronaldo (500M followers) posts, fan-out to 500M Redis sorted sets takes hours. Solution: hybrid push-pull. Define a celebrity threshold (10K followers). Non-celebrities: push fan-out on post (writes post_id to each follower’s feed sorted set). Celebrities: no fan-out — each follower pulls celebrity posts on feed read by querying the Post table filtered by celebrity followees. Most users follow few celebrities, so the per-user pull query is fast.
- Feed sorted set design: Score by Unix timestamp, member is post_id. ZREVRANGEBYSCORE feed:{user_id} (before_ts -inf gives cursor-paginated reads (newest first — plain ZRANGEBYSCORE iterates low-to-high scores, i.e. oldest first). ZREMRANGEBYRANK trims to MAX_SIZE entries to bound memory: each feed uses ~20 bytes/post × 200 posts = 4KB/user × 10M users = 40GB Redis — manageable with a dedicated Redis instance.
- Denormalized counts: follower_count and following_count are updated on every follow/unfollow. This is a write-time cost that buys O(1) read performance. Alternative: COUNT(*) at read time — accurate but adds a query for every profile page load. At scale (10M profile views/day), pre-computed counts are necessary. Ensure GREATEST(0, …) in decrement to prevent negative counts from race conditions.
- Graph pagination: GET /users/{id}/followers should return cursor-paginated results, not offset-paginated. The followers list changes in real time — offset pagination skips or repeats users as the list shifts. Cursor: ORDER BY followed_at DESC, follower_id — opaque cursor encodes (followed_at, follower_id) for the last seen row.
- “People you may know” recommendations: Find second-degree connections (users followed by people you follow but not by you): SELECT followee_id FROM Follow WHERE follower_id IN (my followees) AND followee_id NOT IN (my followees) GROUP BY followee_id ORDER BY COUNT(*) DESC LIMIT 10. This is expensive — run offline daily and cache in Redis as a recommendation list.
{“@context”:”https://schema.org”,”@type”:”FAQPage”,”mainEntity”:[{“@type”:”Question”,”name”:”What is the celebrity problem in social graph systems and how do you solve it?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”The celebrity problem: when a user with millions of followers posts, the fan-out system must write their post ID into millions of followers’ feed sorted sets. At 1M followers, 1 write per follower = 1M Redis writes per post. If the celebrity posts 10 times per hour, that is 10M writes/hour just for one user. This overwhelms the fan-out worker. Solution: hybrid push-pull. Define a threshold (e.g., 10K followers). Non-celebrities use push fan-out — their post is written to each follower’s feed at post time. Celebrities bypass fan-out — their followers’ feeds are NOT updated when they post. Instead, when a user loads their feed, the system pulls recent posts from any celebrities they follow directly from the Post table, then merges these with the pre-built push feed. The read-time pull is bounded by: SELECT FROM Post WHERE author_id IN (celebrity followees of user) LIMIT 20.”}},{“@type”:”Question”,”name”:”How does the Redis feed sorted set work for timeline generation?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Each user has a feed sorted set: feed:{user_id}. Members are post_ids; scores are Unix timestamps of the post. ZADD feed:{user_id} {timestamp} {post_id} on fan-out. ZREVRANGEBYSCORE feed:{user_id} -inf (before_ts LIMIT 0 20 for cursor-paginated reads (newest first, 20 at a time). The cursor is the timestamp of the last seen post — passing it as the BEFORE parameter to the next page request prevents duplicates as new posts arrive. ZREMRANGEBYRANK feed:{user_id} 0 -(MAX+1) trims the feed to MAX_SIZE entries after each add, bounding memory: 200 posts × 32 bytes = 6.4KB per user. At 10M users: 64GB Redis — manageable with a dedicated instance. 
Use RESP3 protocol and Redis 7+ for better pipeline performance.”}},{“@type”:”Question”,”name”:”How do you paginate through a user’s followers list without offset pagination problems?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Offset pagination (LIMIT 20 OFFSET 400) is problematic for follower lists: as new users follow during pagination, entries shift positions — some followers appear on two pages, others are skipped. Use cursor pagination instead. The Follow table has an index on (followee_id, followed_at DESC, follower_id). Cursor: encode the last seen (followed_at, follower_id) as an opaque token. Query: WHERE followee_id=X AND (followed_at, follower_id) < (cursor_time, cursor_id) ORDER BY followed_at DESC, follower_id DESC LIMIT 20. This is a stable keyset that doesn’t shift as new follows are added (new follows have newer timestamps and appear at the start of the list, not in the middle of already-paginated pages).”}},{“@type”:”Question”,”name”:”How do you efficiently check if user A follows user B (for follow button state)?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”Profile page must show: "Follow" or "Unfollow" button. SELECT 1 FROM Follow WHERE follower_id = A AND followee_id = B — O(1) with the primary key (follower_id, followee_id). For bulk follow state (loading a list of 20 profiles): SELECT followee_id FROM Follow WHERE follower_id = A AND followee_id = ANY([list of 20 ids]). Cache the result in Redis: SET follows:{user_a}:{user_b} 1 EX 300 (5-minute cache). On follow/unfollow, invalidate: DEL follows:{user_a}:{user_b}. This avoids N individual DB queries when rendering a list of profiles with follow buttons.”}},{“@type”:”Question”,”name”:”How do you compute suggested "people you may know" at scale?”,”acceptedAnswer”:{“@type”:”Answer”,”text”:”The SQL query for second-degree connections — users followed by my followees whom I don’t follow — is expensive at scale: it involves two joins on a large Follow table. 
Efficient approach: (1) precompute offline (daily Spark/Flink job); (2) for each user, find their followees’ followees, exclude existing connections, rank by mutual connection count; (3) store top-N suggestions in Redis (suggestions:{user_id}) with a 24-hour TTL; (4) serve from Redis on request. The offline job can process the entire social graph using graph algorithms (common neighbor count, Jaccard similarity). For smaller systems: run the query with a depth limit (only look 2 hops out, not 3+) and cache at the application level.”}}]}
Follower graph and social feed design is discussed in Twitter system design interview questions.
Follower graph and friend network design is covered in Snap system design interview preparation.
Follower graph and professional network design is discussed in LinkedIn system design interview guide.