
This tutorial guides you through building an elastic vector database simulator, demonstrating how modern RAG systems distribute embeddings across distributed storage nodes. We implement consistent hashing with virtual nodes for balanced placement and minimal reshuffling during scaling. The solution visualizes the hashing ring in real-time, allowing interactive addition or removal of nodes to observe the minimal fraction of embeddings that move. This approach connects infrastructure theory directly to practical behavior in distributed AI systems.
!pip -q install networkx ipywidgets
import hashlib
import bisect
import random
from dataclasses import dataclass
from typing import Dict, List, Optional
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import ipywidgets as widgetsWe first establish the execution environment, installing essential libraries for visualization and interactivity. All core Python, numerical, and graphing dependencies are imported into a single location to ensure the notebook operates self-containedly and runs smoothly on platforms like Google Colab without requiring external setup.
def _u64_hash(s: str) -> int:
h = hashlib.sha256(s.encode("utf-8")).digest()[:8]
return int.from_bytes(h, byteorder="big", signed=False)
@dataclass(frozen=True)
class StorageNode:
node_id: str
class ConsistentHashRing:
def __init__(self, vnodes_per_node: int = 80):
self.vnodes_per_node = int(vnodes_per_node)
self.ring_keys: List[int] = []
self.ring_map: Dict[int, str] = {}
self.nodes: Dict[str, StorageNode] = {}
def _vnode_key(self, node_id: str, v: int) -> int:
return _u64_hash(f"node:{node_id}#vnode:{v}")
def add_node(self, node: StorageNode) -> None:
if node.node_id in self.nodes:
return
self.nodes[node.node_id] = node
for v in range(self.vnodes_per_node):
k = self._vnode_key(node.node_id, v)
if k in self.ring_map:
k = _u64_hash(f"node:{node.node_id}#vnode:{v}#salt:{random.random()}")
bisect.insort(self.ring_keys, k)
self.ring_map[k] = node.node_id
def remove_node(self, node_id: str) -> None:
if node_id not in self.nodes:
return
del self.nodes[node_id]
to_remove = [k for k, nid in self.ring_map.items() if nid == node_id]
for k in to_remove:
del self.ring_map[k]
self.ring_keys = sorted(self.ring_map.keys())
def get_node(self, key: str) -> Optional[str]:
if not self.ring_keys:
return None
hk = _u64_hash(f"key:{key}")
idx = bisect.bisect_left(self.ring_keys, hk)
if idx == len(self.ring_keys):
idx = 0
return self.ring_map[self.ring_keys[idx]]
def snapshot() -> Dict[str, int]:
counts = {nid: 0 for nid in self.nodes}
for _, nid in self.ring_map.items():
counts[nid] = counts.get(nid, 0) + 1
return dict(sorted(counts.items()))We implement the core consistent hashing ring logic and define the structure for storage nodes. Virtual nodes are introduced to enhance load balancing, ensuring a more even distribution of embeddings across the system. The system maps keys to nodes using a deterministic hash function, which guarantees stability even as the system scales.
class VectorDBSimulator:
def __init__(self, ring: ConsistentHashRing, dim: int = 256, seed: int = 42):
self.ring = ring
self.dim = int(dim)
self.rng = np.random.default_rng(seed)
self.vectors: Dict[str, np.ndarray] = {}
def add_vectors(self, n: int) -> None:
start = len(self.vectors)
for i in range(start, start + int(n)):
vid = f"vec_{i:06d}"
emb = self.rng.normal(size=(self.dim,)).astype(np.float32)
emb /= (np.linalg.norm(emb) + 1e-12)
self.vectors[vid] = emb
def shard_map() -> Dict[str, str]:
out = {}
for vid in self.vectors.keys():
nid = self.ring.get_node(vid)
out[vid] = nid if nid is not None else "∅"
return out
def distribution() -> Dict[str, int]:
dist: Dict[str, int] = {}
for nid in self.shard_map().values():
dist[nid] = dist.get(nid, 0) + 1
return dict(sorted(dist.items()))
@staticmethod
def movement_fraction(before: Dict[str, str], after: Dict[str, str]) -> float:
moved = sum(1 for k in before if before[k] != after[k])
return moved / max(1, len(before))We simulate the vector database by generating and storing normalized embedding vectors. Each vector is assigned to a shard using the consistent hashing ring, and we track the distribution of data across nodes. Crucially, we quantify data movement to precisely measure the impact of any topology changes on system stability.
def draw_ring(ring: ConsistentHashRing, dist: Dict[str, int], title: str):
node_ids = sorted(ring.nodes.keys())
plt.figure(figsize=(8, 8))
ax = plt.gca()
ax.set_title(title)
if not node_ids:
plt.text(0.5, 0.5, "Ring is empty", ha="center", va="center")
plt.axis("off")
plt.show()
return
G = nx.Graph()
for nid in node_ids:
G.add_node(nid)
for i in range(len(node_ids)):
G.add_edge(node_ids[i], node_ids[(i + 1) % len(node_ids)])
pos = nx.circular_layout(G)
vnode_counts = ring.snapshot()
labels = {
nid: f"{nid}
keys={dist.get(nid,0)}
vnodes={vnode_counts.get(nid,0)}"
for nid in node_ids
}
nx.draw_networkx_edges(G, pos, alpha=0.4, width=2)
nx.draw_networkx_nodes(G, pos, node_size=2200)
nx.draw_networkx_labels(G, pos, labels=labels, font_size=9)
plt.axis("off")
plt.show()We visualize the consistent hashing ring as a clear circular graph, directly displaying live shard and virtual node statistics on each node for immediate understanding. This visualization updates dynamically, accurately reflecting any changes in the system's state.
ring = ConsistentHashRing(vnodes_per_node=80)
sim = VectorDBSimulator(ring)
sim.add_vectors(6000)
node_name = widgets.Text(value="nodeA", description="Node ID:")
add_btn = widgets.Button(description="Add node", button_style="success")
rm_btn = widgets.Button(description="Remove node", button_style="danger")
vnodes_slider = widgets.IntSlider(value=80, min=20, max=300, step=20, description="VNodes/node")
regen_btn = widgets.Button(description="Rebuild ring", button_style="warning")
status = widgets.Output()
def render(msg: str = ""):
clear_output(wait=True)
display(widgets.HBox([node_name, add_btn, rm_btn]))
display(widgets.HBox([vnodes_slider, regen_btn]))
dist = sim.distribution()
title = f"Consistent Hash Ring | nodes={len(ring.nodes)} | vectors={len(sim.vectors)}"
if msg:
title += f"\
{msg}"
draw_ring(ring, dist, title)
with status:
status.clear_output()
print("Shard distribution:", dist)
display(status)
def on_add(_):
before = sim.shard_map()
ring.add_node(StorageNode(node_name.value.strip() or f"node{len(ring.nodes)+1}"))
after = sim.shard_map()
moved = sim.movement_fraction(before, after)
render(f"Added node | moved={moved:.3f} (~{moved*100:.1f}%)")
def on_remove(_):
before = sim.shard_map()
ring.remove_node(node_name.value.strip())
after = sim.shard_map()
moved = sim.movement_fraction(before, after)
render(f"Removed node | moved={moved:.3f} (~{moved*100:.1f}%)")
def on_regen(_):
ids = list(ring.nodes.keys())
new_ring = ConsistentHashRing(vnodes_per_node=int(vnodes_slider.value))
for nid in ids:
new_ring.add_node(StorageNode(nid))
sim.ring = new_ring
globals()["ring"] = new_ring
render(f"Rebuilt ring with vnodes_per_node={vnodes_slider.value}")
add_btn.on_click(on_add)
rm_btn.on_click(on_remove)
regen_btn.on_click(on_regen)
render("Add or remove nodes to observe data movement")We enable interactive control, allowing you to add, remove, and rebalance nodes in real time. Observe how the shard distribution immediately adapts after each action. These interactions serve to empirically validate the principle of minimal data movement inherent in elastic distributed systems.
In conclusion, we successfully demonstrated how consistent hashing facilitates scalable vector storage while minimizing data movement during topology changes. Our observations confirm that adding or removing nodes impacts only a small subset of embeddings, validating the core design principle of elastic distributed databases. By employing visualization and interactive elements, we transformed abstract sharding concepts into tangible system behavior. This provides a clear mental model for how production RAG systems maintain stability while scaling dynamically.