New blog

2026-06-08 19:31:19 -04:00
parent 581f888218
commit 358e9cee4a
20 changed files with 586 additions and 102 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 temp
 # dependencies
 /node_modules
 /.pnp
--- a/app/blog/[slug]/page.tsx
+++ b/app/blog/[slug]/page.tsx
@@ -10,7 +10,6 @@ import type { ComponentPropsWithoutRef, ReactNode } from 'react';
 import { TableOfContents } from '@/components/mdx/TableOfContents';
 import { SideNote } from '@/components/mdx/SideNote';
 import { Citation, Bibliography } from '@/components/mdx/Citation';
 import { MobileTableOfContents } from '@/components/mdx/MobileTableOfContents';
 import { formatPostDate } from '@/lib/format';
 const slugify = (text: string): string => {
@@ -261,8 +260,6 @@ export default async function BlogPost({ params }: Props) {
                        </p>
                    </header>
                    <MobileTableOfContents headings={headings} />
                    <div className="essay-prose relative mt-8 max-w-none xl:max-w-[44rem]">
                        <MDXRemote
                            source={post.content}
--- a/app/blog/page.tsx
+++ b/app/blog/page.tsx
@@ -5,10 +5,10 @@ import { getAllPosts } from '@/lib/mdx';
 export const metadata: Metadata = {
    title: 'Blog',
-    description: 'Thoughts on software, deep learning, and research by Akshay Kolli.',
+    description: 'Essays and notes by Akshay Kolli on GPUs, ML hardware, deep learning, and research.',
    openGraph: {
        title: 'Blog',
-        description: 'Thoughts on software, deep learning, and research by Akshay Kolli.',
+        description: 'Essays and notes by Akshay Kolli on GPUs, ML hardware, deep learning, and research.',
    },
 };
@@ -22,10 +22,11 @@ export default function BlogIndex() {
                    <div className="space-y-4">
                        <p className="eyebrow">Writing</p>
                        <h1 className="max-w-[40rem] text-balance font-sans text-5xl font-medium leading-[0.96] text-ink sm:text-6xl lg:text-7xl">
-                            Notes on software, deep learning, and research.
+                            Notes on GPUs, models, and the details underneath.
                        </h1>
                        <p className="max-w-[34rem] text-[1rem] leading-8 text-muted">
-                            A small archive of ideas, experiments, and things worth slowing down enough to explain.
+                            Hardware quirks, deep learning systems, and research problems
                            that needed more than the product page.
                        </p>
                    </div>
                </header>
@@ -57,7 +58,7 @@ export default function BlogIndex() {
                                        </span>
                                    ))}
                                    <Link href={`/blog/${post.slug}`} className="text-ink transition-colors hover:text-accent">
-                                        Read essay
+                                        Read
                                    </Link>
                                </div>
                            </div>
@@ -68,7 +69,7 @@ export default function BlogIndex() {
                <section className="grid gap-4 border-t border-line pt-6 md:grid-cols-[8rem_minmax(0,1fr)]">
                    <p className="eyebrow md:pt-1">Archive</p>
                    <p className="max-w-[34rem] text-[0.96rem] leading-7 text-muted">
-                        {posts.length} published {posts.length === 1 ? 'essay' : 'essays'}. The archive stays selective.
+                        {posts.length} {posts.length === 1 ? 'essay' : 'essays'} published so far. I publish when I have a real question to chase down.
                    </p>
                </section>
            </div>
--- a/app/layout.tsx
+++ b/app/layout.tsx
@@ -31,10 +31,10 @@ export const metadata: Metadata = {
    default: "Akshay Kolli",
    template: "%s | Akshay Kolli",
  },
-  description: "Personal website of Akshay Kolli — CS PhD Student at UMass Lowell researching World Models, Reinforcement Learning, and Multi-Agent Systems.",
+  description: "Akshay Kolli is a CS PhD student at UMass Lowell working on world models for RL, ML hardware, GPUs, and deep learning systems.",
  openGraph: {
    title: "Akshay Kolli",
-    description: "CS PhD Student at UMass Lowell researching World Models, Reinforcement Learning, and Multi-Agent Systems.",
+    description: "CS PhD student at UMass Lowell working on world models for RL, ML hardware, GPUs, and deep learning systems.",
    siteName: "Akshay Kolli",
    type: "website",
  },
--- a/app/page.tsx
+++ b/app/page.tsx
@@ -12,16 +12,17 @@ export default function Home() {
    <div className="page-frame py-16 sm:py-20">
      <section className="mx-auto grid min-h-[calc(100vh-11rem)] max-w-[64rem] gap-6 lg:grid-cols-[minmax(0,39rem)_15rem] lg:items-center lg:gap-4">
        <div className="max-w-[39rem] space-y-6">
-          <p className="eyebrow">Akshay Kolli / Research + Writing</p>
+          <p className="eyebrow">Akshay Kolli / UMass Lowell</p>
          <div className="space-y-4">
            <h1 className="max-w-[34rem] text-balance font-sans text-5xl font-medium leading-[0.96] text-ink sm:text-6xl lg:text-7xl">
-              World models and reinforcement learning.
+              World models for RL.
            </h1>
            <div className="max-w-[32rem] space-y-3 text-[1.04rem] leading-8 text-ink-soft sm:text-[1.1rem]">
              <p>
-                I&apos;m a CS PhD student at UMass Lowell building world models for autonomous agents.
+                I&apos;m a CS PhD student working on world models for reinforcement learning,
                ML hardware, GPUs, and deep learning systems.
              </p>
            </div>
          </div>
--- a/app/projects/graph-attention-topology/page.tsx
+++ b/app/projects/graph-attention-topology/page.tsx
@@ -14,34 +14,34 @@ const paperLinks = [
 const highlights = [
  {
-    value: "No known graph",
+    value: "No graph labels",
-    label: "Learns topology without prior adjacency examples.",
+    label: "Learns from trajectories without seeing adjacency examples.",
  },
  {
    value: "Unknown dynamics",
-    label: "Uses observed states instead of hand-specified equations.",
+    label: "Works from observed state sequences, not hand-written dynamics.",
  },
  {
-    value: "Attention as edges",
+    value: "Attention becomes edges",
-    label: "Interprets learned attention scores as the graph estimate.",
+    label: "Uses attention scores as the topology estimate.",
  },
 ];
 const steps = [
-  "Embed every agent into a shared latent space.",
+  "Put each agent in a shared latent space.",
-  "Project embeddings into key/query vectors and compute pairwise attention.",
+  "Build key/query pairs and compute attention between agents.",
-  "Translate observed agent states into values.",
+  "Use observed states as the values passed through attention.",
-  "Predict the next state and read the attention matrix as topology.",
+  "Forecast the next state, then threshold attention to recover topology.",
 ];
 export const metadata: Metadata = {
  title: "Graph Attention Topology Inference",
  description:
-    "Project page for Graph Attention Inference of Network Topology in Multi-Agent Systems.",
+    "A project page for inferring hidden multi-agent network topology with graph attention.",
  openGraph: {
    title: "Graph Attention Inference of Network Topology in Multi-Agent Systems",
    description:
-      "A graph-attention approach for inferring hidden network topology from multi-agent trajectories.",
+      "Inferring hidden network topology from multi-agent trajectories with graph attention.",
    type: "article",
    publishedTime: "2024-10-26",
    images: [
@@ -117,7 +117,7 @@ export default function GraphAttentionProjectPage() {
                Graph Attention Inference of Network Topology
              </h1>
              <p className="max-w-[43rem] text-[1.12rem] leading-8 text-graph-soft sm:text-[1.25rem]">
-                Inferring the hidden graph behind a multi-agent system by training attention to predict what each agent does next.
+                We use next-state prediction to expose the interaction graph behind a multi-agent system.
              </p>
            </div>
@@ -196,14 +196,14 @@ export default function GraphAttentionProjectPage() {
              <div className="space-y-5">
                <p className="graph-kicker">Overview</p>
                <h2 className="max-w-[42rem] text-balance text-4xl font-semibold leading-tight text-graph-ink sm:text-5xl">
-                  Predict the next state, then read the graph from what the model attended to.
+                  Train on motion. Read the graph from attention.
                </h2>
                <div className="space-y-5 text-[1.02rem] leading-8 text-graph-soft">
                  <p>
-                    The paper studies a practical problem in networked multi-agent systems: the agents move, synchronize, or converge, but the interaction graph behind that behavior is not given.
+                    In many multi-agent systems, you can watch agents move, synchronize, or settle, but you are not given who influences whom.
                  </p>
                  <p>
-                    The model is trained on state trajectories from consensus dynamics and Kuramoto oscillators. During prediction, attention scores between agent embeddings become an interpretable approximation of the adjacency matrix.
+                    This project tests whether a predictor trained on trajectories can recover that missing adjacency matrix. On consensus dynamics and Kuramoto oscillators, the learned attention scores become a usable proxy for edges.
                  </p>
                </div>
              </div>
@@ -211,7 +211,7 @@ export default function GraphAttentionProjectPage() {
              <Figure
                src={simulationImage}
                alt="Consensus and Kuramoto simulation examples with adjacency matrices"
-                caption="Simulation examples: consensus dynamics and Kuramoto oscillators paired with their hidden adjacency matrices."
+                caption="Consensus and Kuramoto trajectories, each paired with the hidden adjacency matrix used for simulation."
                className="rounded-md"
              />
            </section>
@@ -221,10 +221,10 @@ export default function GraphAttentionProjectPage() {
                <div className="space-y-5">
                  <p className="graph-kicker">Method</p>
                  <h2 className="text-balance text-4xl font-semibold leading-tight text-graph-ink sm:text-5xl">
-                    A topology estimate falls out of the attention layer.
+                    The graph is never supervised directly.
                  </h2>
                  <p className="text-[1.02rem] leading-8 text-graph-soft">
-                    Instead of supervising the graph directly, the model learns to forecast the system. The attention matrix is then thresholded and compared with the true graph.
+                    The model learns the dynamics first. After training, we threshold the attention matrix and compare the inferred edges with the true graph.
                  </p>
                </div>
@@ -243,7 +243,7 @@ export default function GraphAttentionProjectPage() {
              <Figure
                src={architectureImage}
                alt="Architecture diagram showing agent embeddings, key-query attention, values, predictions, and loss"
-                caption="Architecture: embeddings produce key/query vectors; observed states produce values; the attention matrix approximates adjacency."
+                caption="Model layout: embeddings create key/query vectors, states provide values, and attention is later read as adjacency."
                className="rounded-md"
              />
            </section>
@@ -252,31 +252,31 @@ export default function GraphAttentionProjectPage() {
              <div className="max-w-[46rem] space-y-5">
                <p className="graph-kicker">Results</p>
                <h2 className="text-balance text-4xl font-semibold leading-tight text-graph-ink sm:text-5xl">
-                  Strongest graph recovery appears with smaller systems, and more simulations help larger systems.
+                  Smaller systems recover cleanly; larger ones improve with more simulations.
                </h2>
                <p className="text-[1.02rem] leading-8 text-graph-soft">
-                  F1 link-prediction scores are above a random baseline for both tested dynamics. Consensus dynamics are easier than Kuramoto oscillators, while additional simulation data improves larger-agent inference.
+                  F1 scores beat a random baseline for both dynamics. Consensus is the easier case; Kuramoto needs more data as the number of agents grows.
                </p>
              </div>
              <Figure
                src={resultsImage}
                alt="F1 score results for consensus dynamics and Kuramoto oscillators"
-                caption="F1 results across system size and number of simulations for consensus dynamics and Kuramoto oscillators."
+                caption="F1 link-prediction results across system size and simulation count."
                className="rounded-md"
              />
              <div className="grid gap-6 md:grid-cols-[minmax(0,0.78fr)_minmax(0,1fr)] md:items-start">
                <div className="rounded-md border border-graph-line bg-graph-panel p-5">
-                  <p className="font-mono text-[0.72rem] uppercase text-graph-muted">training behavior</p>
+                  <p className="font-mono text-[0.72rem] uppercase text-graph-muted">training trace</p>
                  <p className="mt-3 text-[1.02rem] leading-8 text-graph-soft">
-                    Attention first learns the obvious self-dependencies, then gradually recovers inter-agent structure as training progresses.
+                    Early training mostly finds self-dependencies. Later epochs start assigning weight to the agent-to-agent links.
                  </p>
                </div>
                <Figure
                  src={attentionImage}
                  alt="Attention matrices over training epochs and predicted graphs"
-                  caption="Attention values through training stages, from true graph to predicted graph."
+                  caption="Attention during training, from the true graph reference to the final predicted graph."
                  className="rounded-md"
                />
              </div>
--- a/app/projects/page.tsx
+++ b/app/projects/page.tsx
@@ -9,8 +9,8 @@ const projects = [
    title: "Graph Attention Inference of Network Topology",
    eyebrow: "MECC 2024 / Multi-Agent Systems",
    description:
-      "Inferring hidden interaction graphs from multi-agent trajectories by training attention to predict each agent's next state.",
+      "A graph-attention model that recovers hidden interaction graphs from trajectory data while learning to predict the next state.",
-    status: "Paper project",
+    status: "Published paper",
    year: "2024",
    image: graphAttentionImage,
    imageAlt: "Graph attention model architecture diagram",
@@ -20,10 +20,10 @@ const projects = [
 export const metadata: Metadata = {
  title: "Projects",
-  description: "Research projects and paper pages by Akshay Kolli.",
+  description: "Selected projects on world models, ML hardware, GPUs, deep learning, and the occasional multi-agent systems paper.",
  openGraph: {
    title: "Projects",
-    description: "Research projects and paper pages by Akshay Kolli.",
+    description: "Selected projects on world models, ML hardware, GPUs, deep learning, and the occasional multi-agent systems paper.",
  },
 };
@@ -35,10 +35,11 @@ export default function ProjectsPage() {
          <div className="space-y-4">
            <p className="eyebrow">Projects</p>
            <h1 className="max-w-[44rem] text-balance font-sans text-5xl font-medium leading-[0.96] text-ink sm:text-6xl lg:text-7xl">
-              Research pages and technical artifacts.
+              World models, GPUs, and deep learning systems.
            </h1>
            <p className="max-w-[36rem] text-[1rem] leading-8 text-muted">
-              A compact index of paper pages, experiments, and systems work.
+              Most of my current work sits around ML hardware, deep learning models,
              and world models for RL. This topology paper is the multi-agent exception.
            </p>
          </div>
          <p className="font-mono text-[0.72rem] uppercase leading-6 text-muted-strong lg:text-right">
@@ -77,7 +78,7 @@ export default function ProjectsPage() {
                    href={project.href}
                    className="font-medium text-ink underline decoration-line-strong underline-offset-4 transition-colors hover:text-accent hover:decoration-accent"
                  >
-                    Open project
+                    Read project
                  </Link>
                  <div className="flex flex-wrap gap-3 font-mono text-[0.68rem] uppercase text-muted-strong">
                    {project.tags.map((tag) => (
--- a/app/resume/page.tsx
+++ b/app/resume/page.tsx
@@ -2,27 +2,27 @@ import type { Metadata } from 'next';
 export const metadata: Metadata = {
    title: 'Resume',
-    description: 'Experience, education, and technical skills of Akshay Kolli.',
+    description: "Akshay Kolli's resume: research, engineering, education, and technical skills.",
    openGraph: {
        title: 'Resume',
-        description: 'Experience, education, and technical skills of Akshay Kolli.',
+        description: "Akshay Kolli's resume: research, engineering, education, and technical skills.",
    },
 };
 const experience = [
    {
        title: 'PhD in Computer Science',
-        org: 'University of Massachusetts',
+        org: 'University of Massachusetts Lowell',
        period: 'Jan 2025 – Present',
        bullets: [],
    },
    {
        title: 'Research Assistant',
-        org: 'Exalabs, University of Massachusetts',
+        org: 'Exalabs, University of Massachusetts Lowell',
        period: 'Sep 2022 – Present',
        bullets: [
-            'Designing state-of-the-art ML pipelines for graph properties and trajectory forecasting.',
+            'Building ML pipelines for graph-property prediction and trajectory forecasting.',
-            'Reduced multi-agent simulation time by 1000x.',
+            'Cut multi-agent simulation time by 1000x.',
        ],
    },
    {
@@ -72,10 +72,10 @@ export default function ResumePage() {
                    <div className="space-y-4">
                        <p className="eyebrow">Resume</p>
                        <h1 className="max-w-[40rem] text-balance font-sans text-5xl font-medium leading-[0.96] text-ink sm:text-6xl lg:text-7xl">
-                            Experience, education, and technical depth.
+                            Research and engineering work.
                        </h1>
                        <p className="max-w-[34rem] text-[1rem] leading-8 text-muted">
-                            Research, engineering, and systems work across academia and industry.
+                            My background across ML research, simulation tooling, and production software.
                        </p>
                        <div className="flex flex-wrap gap-x-5 gap-y-2 text-[0.92rem] text-ink">
                            <a href="/files/Resume_latest.pdf" className="transition-colors hover:text-accent">
--- a/components/layout/Footer.tsx
+++ b/components/layout/Footer.tsx
@@ -3,7 +3,7 @@ export function Footer() {
        <footer className="mt-auto border-t border-line">
            <div className="mx-auto flex max-w-[72rem] flex-col gap-3 px-5 py-6 sm:flex-row sm:items-center sm:justify-between sm:px-6">
                <p className="text-[0.82rem] leading-6 text-muted">
-                    Research, writing, and software.
+                    World models for RL, GPUs, and deep learning systems.
                </p>
                <div className="flex flex-wrap gap-4 text-[0.82rem] text-ink">
                    <a href="mailto:akshaykolli@hotmail.com" className="transition-colors hover:text-accent">
--- a/content/archive/hello-world.mdx
+++ b/content/archive/hello-world.mdx
@@ -1,48 +1,48 @@
 ---
-title: 'Hello World: The Art of Minimalism'
+title: 'Hello World'
 date: '2024-01-01'
-description: 'An introduction to this minimal digital garden and the philosophy behind it.'
+description: 'A short MDX smoke test for this site.'
-tags: ['minimalism', 'design', 'intro']
+tags: ['site', 'mdx', 'notes']
 ---
-Welcome to my new digital garden. This space is designed to be a reflection of my philosophy: **simplicity is the ultimate sophistication**.
+First post for the site. Mostly a smoke test: headings, lists, code blocks, sidenotes, and citations all need to survive the MDX pipeline.
-## Why Minimalism?
+## What this checks
-In a world full of noise, I find peace in clarity. This website is built with:
+The site is built with:
- **Next.js 15**: For bleeding-edge performance.
+- **Next.js** for routing and rendering.
- **Tailwind CSS**: For utility-first styling.
+- **Tailwind CSS** for the interface.
- **MDX**: For rich, interactive content.
+- **MDX** for posts that need components inline.
 ## The Detail
-Design is not just what it looks like and feels like. Design is how it works. <Citation id="jobs" index={1} />
+The goal is a writing setup that stays plain until a post needs diagrams, notes, or citations. <Citation id="jobs" index={1} />
-<SideNote title="Minimalism">
+<SideNote title="MDX note">
-  "Simplicity is the ultimate sophistication." — Leonardo da Vinci
+  This archived post is mostly here to exercise custom components.
 </SideNote>
 ### Code Snippet Test
-Here is how I configured the font:
+Here is the font setup pattern:
 ```typescript
-const inter = Inter({
+const instrumentSans = Instrument_Sans({
-  variable: "--font-inter",
+  variable: "--font-instrument-sans",
  subsets: ["latin"],
  display: "swap",
 });
 ```
-## Future Plans
+## Next
-I plan to write about:
+Things worth writing down:
-1. Software Architecture
+
-2. Design Systems
+1. GPU kernels
-3. The Zen of Coding
+2. Multi-agent systems
 3. Research code that survives contact with experiments
 <Bibliography items={[
  { id: 'jobs', content: 'Steve Jobs, The New York Times Magazine, 2003' }
 ]} />
--- a/content/posts/blackwell_datacenter_vs_geforce.mdx
+++ b/content/posts/blackwell_datacenter_vs_geforce.mdx
@@ -1,32 +1,72 @@
 ---
-title: 'Blackwell: Datacenter vs GeForce GPUs'
+title: 'The RTX 5090 Is Blackwell, But Not That Blackwell'
 date: '2026-02-27'
-description: 'GeForce Blackwell and datacenter Blackwell expose meaningfully different tensor-core capabilities.'
+description: 'Why GeForce and datacenter Blackwell GPUs can share a name but expose different tensor-core programming paths.'
 tags: ['Nvidia', 'GPU', 'GPU Kernel']
 ---
 I bought an RTX 5090 FE mostly for machine learning work. I still play games on it, but the real reason I wanted the card was FP4 support on Blackwell Tensor Cores.
-I'm a proud owner for an RTX 5090 FE. I occasionally play games on it, but it's mostly used for ML workloads.
+FP4 matters because deep learning is mostly a story about moving numbers through matrix multiplications. If you can use fewer bits per number without breaking the model, you can often run faster and fit more work into the same memory. For someone interested in low-precision compute, a consumer card with Blackwell Tensor Cores sounded like a very good deal.
 I jumped on the 50-series specifically for the FP4 support on its fifth-generation Blackwell Tensor Cores, because I'm actively working on some pretty exciting low-precision computing.
 Imagine my surprise when I was perusing the GPU Mode Discord and found people calling the GeForce Blackwell cards "fake Blackwell"!
 Looking online, I found next to no resources on the difference. I foolishly assumed that my GeForce card 
 <SideNote>The GeForce cards are `sm_120` with compute capability 12, and the datacenter cards are `sm_100` with compute capability 10.
 You'd expect a higher compute capability to mean something.</SideNote> would contain all the features from the datacenter cards, as
 it seemed to be a later arch. No, Nvidia just made it confusing, and managed to obscure the technical details extremely well. Going through the [CUDA documentation](https://docs.nvidia.com/CUDA/parallel-thread-execution/),
 you'll see that the new tensor core gen 5 instructions are only compatible with `sm_100[a-f]` (Datacenter Blackwell) and `sm_101` (Jetson Thor). What does this mean? That involved a lot more digging.
 Then I saw people in the GPU Mode Discord calling GeForce Blackwell "fake Blackwell."
-### What's in the new tensor cores?
+That sounded ridiculous at first. The RTX 5090 is a Blackwell GPU. It runs Blackwell-era FP4 kernels. It is also very fast. But after digging through Nvidia's docs and running a few benchmarks, I understand what people were reacting to.
-The Blackwell Tensor Cores now support lower precision, namely FP6 and FP4, which the previous Hopper generation didn't. This enables extremely fast low precision matrix multiplications.
+The short version: the 5090 is real Blackwell, but it is not a small B200.
 The PTX ISA also introduces `tcgen05` instructions, which make use of `TMEM` or tensor memory, which only the datacenter cards support. This additional memory sits next to the Tensor cores, and can
 be used independently of the registers used in CUDA cores. The GeForce cards get 128 KB of shared memory per SM, while the datacenter cards and the Jetson Thor get 228 KB SMEM + 256 KB TMEM. This is absolutely insane for
 any kind of work load. Why did I have to dig so hard to find this information? The 5090 is an enthusiast tier card, which I feel deserves a clear description of what you're buying. 
 ### The takeaway
-### Benchmarking NVFP4 performance
+If you only remember one thing, remember this:
-I needed to confirm this myself. <SideNote>NVFP4 is Nvidia's new low precision format. </SideNote> I downloaded the cutlass repo and ran the nvfp4 matrix multiply example. Here's what I got
+- GeForce Blackwell and datacenter Blackwell share a product-generation name.
 - They do not expose the same low-level tensor-core programming model.
 - The RTX 5090 can run very fast FP4 workloads.
 - It does not expose datacenter Blackwell's `tcgen05` path or tensor memory.
 - Compute capability is not a simple "higher number means all the features" ranking.
 That last point is what fooled me.
 The GeForce cards are `sm_120`, with compute capability 12. Datacenter Blackwell cards such as B200 are `sm_100`, with compute capability 10. A normal person would look at 12 versus 10 and assume the consumer card is newer, broader, or at least a superset.
 That is not how this works.
 Compute capability is a target for CUDA code generation. It tells the compiler what family of instructions and hardware behavior to expect. It is not a feature score. Starting with Blackwell, Nvidia is also leaning harder on family-specific feature sets, which means some low-level features exist for one Blackwell family and not another. Nvidia explains this in its writeup on [family-specific architecture features](https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/).
 ### A quick vocabulary reset
 Here is the minimum context for the rest of the post.
 - **Tensor Cores** are specialized GPU units built for the matrix multiplications that dominate deep learning.
 - **Precision** means how many bits you use to store each number. FP32 uses 32 bits. FP16 uses 16. FP4 uses 4.
 - **FP4** is interesting because it can reduce memory traffic and increase throughput, if the model can tolerate the lower precision.
 - **NVFP4** is Nvidia's FP4 format for deep learning workloads.
 - **A kernel** is a small program that runs on the GPU.
 - **CUTLASS** is Nvidia's library for writing fast matrix multiplication kernels.
 - **PTX** is Nvidia's low-level instruction language for CUDA GPUs.
 - **An SM**, or streaming multiprocessor, is one of the GPU's main compute blocks.
 - **Shared memory** is fast memory inside an SM.
 - **TMEM**, or tensor memory, is extra memory near the Tensor Cores on datacenter Blackwell.
 With that out of the way, the argument becomes much easier to follow.
 ### What datacenter Blackwell gets
 Datacenter Blackwell introduces a new Tensor Core instruction family called `tcgen05`. Nvidia's CUTLASS documentation describes Blackwell SM100 GEMMs as targeting `tcgen05.mma` instructions, including support for 4-bit, 6-bit, and 8-bit floating point data types.
 Those instructions are important because they are not just a new spelling for old matrix multiply code. They are part of a different programming path for Tensor Cores.
 The big extra piece is TMEM. You can think of shared memory as the fast staging area CUDA programmers already use to feed GPU work efficiently. TMEM adds another staging area closer to the Tensor Cores. For the kind of low-precision matrix multiplication deep learning cares about, that matters because the math units can be so fast that feeding them becomes the problem.
 This is where the GeForce and datacenter stories split.
 The RTX 5090 has Blackwell Tensor Cores and can run NVFP4 workloads. But when I looked through the [CUDA PTX documentation](https://docs.nvidia.com/CUDA/parallel-thread-execution/), the `tcgen05` features I cared about were tied to the datacenter Blackwell family, not GeForce Blackwell. The Blackwell tuning guide also lists B200 shared-memory configurations up to 228 KB per SM; GeForce does not get the same TMEM story.
 So the practical distinction is not "does this GPU support FP4 at all?" It does. The distinction is "does this GPU expose the datacenter Blackwell Tensor Core programming model?" It does not.
 ### What the 5090 actually does
 I still wanted to measure the card instead of just reading target tables. So I pulled CUTLASS and ran its NVFP4 matrix multiplication example on the RTX 5090.
 <Image
  src="/images/1_blackwell_dc_vs_gf/5090_65536_cropped.png"
@@ -35,8 +75,11 @@ I needed to confirm this myself. <SideNote>NVFP4 is Nvidia's new low precision f
  height={77}
 />
-Over a PETA FLOP of nvfp4 compute! ggs. This is already insane, and I'm very happy with it. I didn't get `wgmma` from hopper, nor the `tcgen05` instructions and the `TMEM`, but I did get a petaflop of nvfp4 compute.
+That is over a petaflop of NVFP4 compute.
-Nsight Compute tells us exactly what we would expect
+
 A petaflop means one quadrillion floating-point operations per second. For a desktop GPU, that is not a fake result in any meaningful everyday sense. The card is doing serious low-precision work.
 But the next question is whether the Tensor Cores are being fed efficiently. Nsight Compute gives the more interesting picture.
 <Image
  src="/images/1_blackwell_dc_vs_gf/geforce_ncu.png"
@@ -45,7 +88,13 @@ Nsight Compute tells us exactly what we would expect
  height={807}
 />
-Tensor cores are so fast that the memory is bottlenecking them. All of the shared memory is filling up. Huh, I guess nvidia realised this and created `tcgen05` but we don't get to see any of that.
+The short read is: memory is the bottleneck. The Tensor Cores can chew through math faster than the rest of the kernel can keep them supplied. Shared memory pressure shows up immediately.
 That is exactly the problem the datacenter path is designed to address.
 ### What happens on B200
 To compare against datacenter Blackwell, I rented a B200 instance on Vast.ai and ran the same kind of matrix multiplication with CUTLASS kernels targeting `sm_100a`.
 <Image
  src="/images/1_blackwell_dc_vs_gf/nvtop_b200.png"
@@ -54,9 +103,6 @@ Tensor cores are so fast that the memory is bottlenecking them. All of the share
  height={106}
 />
 To see how the GPU folk with datacenters live, I booted up a vast ai instance and ran the same matmul, but with cutlass kernels for `sm_100a`.
 <Image
  src="/images/1_blackwell_dc_vs_gf/b200_65536_cropped.png"
  alt="A CUTLASS benchmark result from a B200 GPU"
@@ -64,7 +110,30 @@ To see how the GPU folk with datacenters live, I booted up a vast ai instance an
  height={108}
 />
-We're getting over 2 petaflops, and I'm sure these things can go even faster with better code. Not having `tcgen05` really holds back the geforce cards. 
+That run gets past 2 petaflops, and I suspect better kernels can push it further. The point is not just the number. The point is that the B200 has access to the datacenter Blackwell path that the 5090 does not.
-This is amazing, I wish I'd be able to get a taste of this locally.
+
 This is the part that makes the "fake Blackwell" complaint emotionally understandable, even if I would not phrase it that way.
 The RTX 5090 is not fake. It is a powerful GeForce card with real NVFP4 capability. But if you heard "Blackwell" and expected the Tensor Core programming model described in SM100 docs, you bought the wrong mental model.
 ### Why this should be clearer
 I do not think the problem is that the 5090 is bad. I like the card.
 The problem is that Nvidia's naming makes it easy to assume one Blackwell label means one Blackwell feature set. It does not. The details are discoverable, but they are scattered across CUDA docs, CUTLASS docs, tuning guides, target suffixes, and benchmark behavior.
 That is fine for compiler engineers. It is not fine for people buying expensive hardware for machine learning.
 If a GPU is marketed into a world where students, researchers, indie labs, and small companies are all trying to run deep learning workloads locally, the feature split should be much easier to understand before purchase.
 ### Bottom line
 The RTX 5090 gives you a lot of low-precision compute for a desktop machine. It can run NVFP4 kernels, and in my tests it crossed a petaflop.
 But GeForce Blackwell is not datacenter Blackwell. The missing `tcgen05` and TMEM path is the real split.
 So yes, the 5090 is Blackwell.
 It is just not that Blackwell.
 Why, Jensen, why?
--- a/content/posts/qwen36_tokens_per_kwh.mdx
+++ b/content/posts/qwen36_tokens_per_kwh.mdx
@@ -0,0 +1,291 @@
 ---
 title: 'Can an RTX 5090 Be an Always On Local Agent Box?'
 date: '2026-05-26'
 description: 'Qwen3.6-27B, MTP, clock limits, and trying to make a 5090 something I would leave running all day.'
 tags: ['LLM Inference', 'GPU', 'Local Agents']
 ---
 I wanted a local agent I could leave running all day.
 Not a benchmark setup. Not a machine I turn on for one prompt, screenshot the result, then shut down. I wanted something that could sit there with a coding agent or research agent running for hours.
 A lot of people use Macs for this. That makes sense.
 The idle draw is very low. Energy Star lists the 2025 M3 Ultra Mac Studio at **7.7 W short idle** and **6.3 W long idle** for the 512 GB model. Apple lists the same class of machine at **9 W idle** and **270 W max** from the wall.
 That is a good power envelope for local agents. Agents spend a lot of time waiting on tools, tests, file reads, and network calls. Peak speed matters, but so does idle power. So does the power draw during a long decode.
 And **30 to 40 generated tokens per second** is fine for a lot of agent work. It may not feel great if you are staring at every token during interactive coding. But if the agent is reading files, running commands, and waiting on tests, the model is only one part of the loop.
 My question was simple. I wanted to see whether my RTX 5090 could fit into that kind of all day setup.
 Not at Mac idle power. That is not going to happen. But low enough that I would be comfortable leaving it on.
 ### Why decode is awkward
 LLMs generate one token at a time.
 The model reads the prompt, predicts a token, adds that token to the context, then runs again. Each new token depends on what came before it.
 That is awkward for a large GPU. A 5090 has a lot of compute, but a single decode stream often cannot use all of it. The GPU keeps moving model weights and KV cache around to produce one new token. Then it repeats.
 So raw GPU size does not always turn into efficient local inference.
 <img
  src="/images/2_qwen36_tokens_per_kwh/fig5_autoregressive_decode.svg"
  alt="Autoregressive decoding diagram showing one model pass producing one accepted token at a time"
 />
 Speculative decoding tries to fix this shape.
 A cheap drafter guesses future tokens. The main model checks those guesses. Accepted tokens stay. Bad guesses get dropped.
 The goal is simple. Get more than one useful token out of an expensive model pass.
 ### Why MTP helps
 MTP means multi token prediction.
 Instead of using a fully separate draft model, an MTP model has a lookahead path built in. It can draft future tokens, then the main model verifies them.
 That is why the Qwen3.6 MTP model caught my attention. `Qwen3.6-27B` is a dense 27B model with a long context window. The model card recommends at least 128K context for complex thinking work. That is the kind of model I want for a local agent.
 Large enough to be useful. Small enough to run locally.
 <img
  src="/images/2_qwen36_tokens_per_kwh/fig6_mtp_decode.svg"
  alt="MTP speculative decoding diagram showing a lookahead head proposing several draft tokens and the target model verifying them"
 />
 ### The 5090 settings
 I used the Qwen MTP GGUF setup. I also locked the clocks instead of letting the card boost freely.
 ```bash
 sudo nvidia-smi -lgc 1200,1200
 sudo nvidia-smi -lmc 7001,7001
 ```
 This part matters. On my card, `nvidia-smi -pl` cannot go low enough.
 ```text
 Default Power Limit: 575 W
 Min Power Limit:     400 W
 Max Power Limit:     600 W
 ```
 The normal power limit knob can stop the card from going past 400 W. It cannot ask the card to behave like a 215 W inference card.
 Clock limiting is what got me there.
 The operating point I liked was this.
 ```text
 Qwen3.6-27B MTP
 87.2 generated tok/s
 215 W GPU power
 ```
 The same power settings without MTP gave me **32.1 tok/s**.
 So MTP was not a small change. It moved the setup from usable to comfortable.
 <Image
  src="/images/2_qwen36_tokens_per_kwh/author_nvidia_mtp_x6_215w_report.png"
  alt="Author measurement screenshot showing Qwen3.6-27B MTP throughput and a 215 watt GPU power reading"
  width={1656}
  height={807}
 />
 ### The math
 The metric I care about is generated tokens per watt second.
 A watt second is a joule. So I will write it as **tokens/J**.
 ```text
 tokens/J = generated tok/s / watts
 ```
 For the 215 W MTP run, the math is this.
 ```text
 87.2 tok/s / 215 W = 0.4056 tokens/J
 1 / 0.4056          = 2.47 J/token
 ```
 For the no MTP run at the same power settings, it is this.
 ```text
 32.1 tok/s / 215 W = 0.1493 tokens/J
 1 / 0.1493         = 6.70 J/token
 ```
 So MTP improved generated token efficiency by **2.72x** at the same 215 W operating point.
 ```text
 0.4056 / 0.1493 = 2.72x
 ```
 There is a second question. Did clock limiting itself improve efficiency compared with stock clocks?
 I do not have a clean stock clock run with both throughput and power recorded. So I will not claim that yet.
 But the cutoff is easy to calculate.
 ```text
 To match 0.4056 tokens/J at 400 W, stock would need 162.2 tok/s.
 To match 0.4056 tokens/J at 450 W, stock would need 182.5 tok/s.
 ```
 My MTP x4 screenshot shows about **151.5 tok/s**, but I did not record watts for that run. If that run was near 400 W, the 215 W point is more efficient. A stock or high power run has to reach **162.2 tok/s at 400 W** just to match the 215 W point, and anything above that beats it. At **450 W**, the bar is **182.5 tok/s**.
 That is the measurement I still need.
 ### Daily power
 At 87.2 tok/s, the machine can generate this much text in a day.
 ```text
 87.2 tok/s * 86,400 sec/day = 7,534,080 tokens/day
 ```
 The GPU only number is this.
 ```text
 0.4056 tokens/J * 3,600,000 J/kWh = 1.46M tokens/kWh
 215 W * 24 h / 1000 = 5.16 kWh/day
 7,534,080 / 5.16    = 1.46M tokens/kWh
 ```
 That is clean, but it is not whole machine power.
 My CPU pulls about **60 W** by default. If I add that, I get a rough box number.
 ```text
 215 W GPU + 60 W CPU = 275 W
 275 W * 24 h / 1000 = 6.60 kWh/day
 7,534,080 / 6.60    = 1.14M tokens/kWh
 ```
 That gives two comparison points.
 ```text
 GPU only:       87.2 tok/s / 215 W = 0.4056 tokens/J
 GPU plus CPU:   87.2 tok/s / 275 W = 0.3171 tokens/J
 ```
 Adding my current CPU baseline moves the run from **2.47 J/token** to **3.15 J/token**.
 That is still fine for me. I would leave that running.
 Could I push the GPU lower? Yes. I can get the 5090 closer to **150 W**. I need a proper sweep before I say where the best point is. For now, 215 W is the point I like. It is fast enough, and it does not feel wasteful.
 <Image
  src="/images/2_qwen36_tokens_per_kwh/fig1_tokens_per_day.png"
  alt="Bar chart comparing generated tokens per day across the measured NVIDIA run and M3 Ultra benchmark rows"
  width={1800}
  height={990}
 />
 ### The Mac comparison
 The public Apple rows I found are oMLX results for M3 Ultra. They include throughput. They do not include wall power during the run.
 The strongest 80 core rows I found for `Qwen3.6-27B-oQ4-mtp` were these.
 ```text
 8k context    42.0 generated tok/s
 16k context   41.1 generated tok/s
 32k context   38.6 generated tok/s
 ```
 I also found a `Qwen3.6-27B-MXFP4-MTP` row at **25.7 tok/s** for 8k.
 Those are good numbers for agent work. The missing number is power.
 Against my GPU only 215 W point, the M3 Ultra 80 core oQ4 MTP rows would need to run at around **95 to 104 W at the wall** to match the 5090 on generated tokens per kWh. Below that range, the Mac is more efficient. Above it, the 5090 is.
 <Image
  src="/images/2_qwen36_tokens_per_kwh/fig3_m3_break_even_power.png"
  alt="Bar chart showing the M3 Ultra wall-power levels needed to match the measured NVIDIA GPU-side tokens per kWh"
  width={1800}
  height={990}
 />
 Against my rough 275 W whole box estimate, the break even numbers move up.
 ```text
 M3 Ultra 80 core oQ4 MTP at 8k    42.0 tok/s / 0.3171 tokens/J = 132.5 W
 M3 Ultra 80 core oQ4 MTP at 16k   41.1 tok/s / 0.3171 tokens/J = 129.6 W
 M3 Ultra 80 core oQ4 MTP at 32k   38.6 tok/s / 0.3171 tokens/J = 121.7 W
 M3 Ultra 80 core MXFP4 MTP at 8k  25.7 tok/s / 0.3171 tokens/J = 81.0 W
 ```
 So there are two fair comparisons.
 Against GPU only, the Mac needs about **95 to 104 W** for the oQ4 MTP rows.
 Against my rough whole box number, it needs about **122 to 132 W**.
 That is why I do not want to turn this into a clean win for either side. The Mac has very low idle power, a low peak envelope, and unified memory. The 5090 has much more decode headroom if I tune it.
 The real comparison needs outlet power during the same workload.
 <Image
  src="/images/2_qwen36_tokens_per_kwh/fig4_m3_power_sensitivity.png"
  alt="Line chart showing M3 Ultra tokens per kWh across different assumed wall-power levels"
  width={1800}
  height={990}
 />
 ### What changed for me
 Before this, I thought of the 5090 as a burst machine.
 Use it for a heavy run. Finish the job. Shut it down.
 After this, I am more willing to treat it as a local agent box.
 Not at stock settings. Not with the motherboard and GPU doing whatever they want. But with clock limits, MTP, and a power point I chose, the setup lands somewhere I can live with.
 ```text
 87.2 generated tok/s
 215 W GPU power
 about 275 W with my current CPU baseline
 7.53M generated tokens/day
 ```
 That is enough throughput for an agent to read, edit, run tools, make mistakes, and keep going.
 I do not need the maximum token rate. I need a machine that can keep working for hours without feeling wasteful.
 ### Next test
 MTP is one way to deal with the one token at a time problem.
 Next I want to test diffusion based speculative decoding.
 DFlash is the paper I am looking at. It uses a lightweight block diffusion model as the drafter. The draft side can propose a block of tokens in parallel. The target model still verifies the draft.
 The paper reports lossless acceleration. That is useful. But the question I care about is narrower.
 ```text
 Does this improve tokens/J for a local agent that runs for hours?
 ```
 That is the next article.
 ### Sources
 Apple power numbers came from [Apple Support](https://support.apple.com/en-us/102027).
 Energy Star idle numbers came from [Energy Star product 4513877](https://www.energystar.gov/productfinder/product/certified-computers/details/4513877/export/pdf).
 The Qwen context note came from the [`Qwen3.6-27B` model card](https://huggingface.co/Qwen/Qwen3.6-27B).
 The MTP explanation follows the [vLLM MTP docs](https://docs.vllm.ai/en/latest/features/speculative_decoding/mtp/) and the [vLLM speculative decoding docs](https://docs.vllm.ai/usage/speculative_decoding/).
 The diffusion speculation follow up is based on [DFlash](https://arxiv.org/abs/2602.06036).
 The Apple throughput comparison uses the oMLX [quantization table](https://omlx.ai/benchmarks?order=desc&page=7143&sort=quantization) and [memory table](https://omlx.ai/benchmarks?order=desc&sort=memory_gb).
--- a/public/images/2_qwen36_tokens_per_kwh/author_nvidia_mtp_x2_x4_report.png
+++ b/public/images/2_qwen36_tokens_per_kwh/author_nvidia_mtp_x2_x4_report.png
--- a/public/images/2_qwen36_tokens_per_kwh/author_nvidia_mtp_x6_215w_report.png
+++ b/public/images/2_qwen36_tokens_per_kwh/author_nvidia_mtp_x6_215w_report.png
--- a/public/images/2_qwen36_tokens_per_kwh/fig1_tokens_per_day.png
+++ b/public/images/2_qwen36_tokens_per_kwh/fig1_tokens_per_day.png
--- a/public/images/2_qwen36_tokens_per_kwh/fig2_tokens_per_kwh_envelope.png
+++ b/public/images/2_qwen36_tokens_per_kwh/fig2_tokens_per_kwh_envelope.png
--- a/public/images/2_qwen36_tokens_per_kwh/fig3_m3_break_even_power.png
+++ b/public/images/2_qwen36_tokens_per_kwh/fig3_m3_break_even_power.png
--- a/public/images/2_qwen36_tokens_per_kwh/fig4_m3_power_sensitivity.png
+++ b/public/images/2_qwen36_tokens_per_kwh/fig4_m3_power_sensitivity.png
--- a/public/images/2_qwen36_tokens_per_kwh/fig5_autoregressive_decode.svg
+++ b/public/images/2_qwen36_tokens_per_kwh/fig5_autoregressive_decode.svg
@@ -0,0 +1,49 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="1200" height="520" viewBox="0 0 1200 520" role="img" aria-labelledby="title desc">
  <title id="title">Autoregressive decoding</title>
  <desc id="desc">A simplified diagram showing one target model pass producing one accepted token, repeated sequentially.</desc>
  <defs>
    <style>
      .bg { fill: #f7f4ed; }
      .panel { fill: #fffdf7; stroke: #cfc8ba; stroke-width: 2; }
      .ink { fill: #181612; font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; }
      .muted { fill: #6b6254; font-family: ui-monospace, "SFMono-Regular", Consolas, monospace; }
      .box-title { fill: #181612; font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; text-anchor: middle; }
      .box-sub { fill: #6b6254; font-family: ui-monospace, "SFMono-Regular", Consolas, monospace; text-anchor: middle; }
      .accent { fill: #0f6f77; }
      .arrow-line { stroke: #5c554a; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; }
      .arrow-head { fill: #5c554a; }
      .dash { stroke: #b8ad9b; stroke-width: 2; stroke-dasharray: 8 8; fill: none; }
    </style>
  </defs>
  <rect class="bg" width="1200" height="520" rx="24"/>
  <text class="ink" x="56" y="72" font-size="34" font-weight="650">Autoregressive decode</text>
  <text class="muted" x="56" y="112" font-size="18">one target model pass gives one accepted token</text>
  <rect class="panel" x="72" y="188" width="220" height="118" rx="18"/>
  <text class="box-title" x="182" y="238" font-size="24" font-weight="650">context</text>
  <text class="box-sub" x="182" y="274" font-size="17">prompt + KV cache</text>
  <path class="arrow-line" d="M292 247 L385 247"/>
  <path class="arrow-head" d="M410 247 L385 234 L385 260 Z"/>
  <rect class="panel" x="430" y="168" width="280" height="158" rx="20"/>
  <text class="box-title" x="570" y="228" font-size="25" font-weight="650">target model</text>
  <text class="box-sub" x="570" y="265" font-size="17">full forward pass</text>
  <rect class="accent" x="485" y="287" width="170" height="8" rx="4"/>
  <path class="arrow-line" d="M710 247 L810 247"/>
  <path class="arrow-head" d="M835 247 L810 234 L810 260 Z"/>
  <rect class="panel" x="855" y="188" width="166" height="118" rx="18"/>
  <text class="box-title" x="938" y="238" font-size="24" font-weight="650">token</text>
  <text class="box-sub" x="938" y="274" font-size="17">+1</text>
  <path class="arrow-line" d="M1021 247 C1110 247 1102 397 940 417 C650 452 290 427 210 333"/>
  <path class="arrow-head" d="M188 309 L214 320 L201 346 Z"/>
  <path class="dash" d="M430 344 L710 344"/>
  <text class="muted" x="458" y="382" font-size="16">the GPU pays this cost one token at a time</text>
  <text class="muted" x="454" y="456" font-size="17">append token, then run again</text>
 </svg>
--- a/public/images/2_qwen36_tokens_per_kwh/fig6_mtp_decode.svg
+++ b/public/images/2_qwen36_tokens_per_kwh/fig6_mtp_decode.svg
@@ -0,0 +1,73 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="1200" height="560" viewBox="0 0 1200 560" role="img" aria-labelledby="title desc">
  <title id="title">MTP speculative decoding</title>
  <desc id="desc">A simplified diagram showing a multi token prediction path proposing draft tokens and the target model verifying them together.</desc>
  <defs>
    <style>
      .bg { fill: #f7f4ed; }
      .panel { fill: #fffdf7; stroke: #cfc8ba; stroke-width: 2; }
      .ink { fill: #181612; font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; }
      .muted { fill: #6b6254; font-family: ui-monospace, "SFMono-Regular", Consolas, monospace; }
      .box-title { fill: #181612; font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; text-anchor: middle; }
      .box-sub { fill: #6b6254; font-family: ui-monospace, "SFMono-Regular", Consolas, monospace; text-anchor: middle; }
      .accent { fill: #0f6f77; }
      .warm { fill: #c06922; }
      .cool { fill: #315f9f; }
      .arrow-line { stroke: #5c554a; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; }
      .arrow-head { fill: #5c554a; }
      .thin { stroke: #b8ad9b; stroke-width: 2; fill: none; }
    </style>
  </defs>
  <rect class="bg" width="1200" height="560" rx="24"/>
  <text class="ink" x="56" y="72" font-size="34" font-weight="650">MTP speculative decode</text>
  <text class="muted" x="56" y="112" font-size="18">draft several tokens, verify them, keep the accepted prefix</text>
  <rect class="panel" x="72" y="218" width="220" height="118" rx="18"/>
  <text class="box-title" x="182" y="268" font-size="24" font-weight="650">context</text>
  <text class="box-sub" x="182" y="304" font-size="17">prompt + KV cache</text>
  <path class="arrow-line" d="M292 277 C342 277 342 196 395 196"/>
  <path class="arrow-head" d="M420 196 L395 183 L395 209 Z"/>
  <path class="arrow-line" d="M292 277 C342 277 342 374 395 374"/>
  <path class="arrow-head" d="M420 374 L395 361 L395 387 Z"/>
  <rect class="panel" x="440" y="138" width="270" height="116" rx="20"/>
  <text class="box-title" x="575" y="188" font-size="24" font-weight="650">MTP lookahead</text>
  <text class="box-sub" x="575" y="224" font-size="17">propose draft tokens</text>
  <rect class="panel" x="440" y="316" width="270" height="116" rx="20"/>
  <text class="box-title" x="575" y="366" font-size="24" font-weight="650">target model</text>
  <text class="box-sub" x="575" y="402" font-size="17">verify in one pass</text>
  <path class="arrow-line" d="M710 196 L738 196"/>
  <path class="arrow-head" d="M763 196 L738 183 L738 209 Z"/>
  <g transform="translate(782 170)">
    <rect class="warm" x="0" y="0" width="54" height="54" rx="12"/>
    <rect class="warm" x="66" y="0" width="54" height="54" rx="12"/>
    <rect class="warm" x="132" y="0" width="54" height="54" rx="12"/>
    <rect class="warm" x="198" y="0" width="54" height="54" rx="12"/>
    <rect class="warm" x="264" y="0" width="54" height="54" rx="12"/>
    <rect class="warm" x="330" y="0" width="54" height="54" rx="12"/>
  </g>
  <text class="muted" x="874" y="252" font-size="17">draft block</text>
  <path class="arrow-line" d="M710 374 L748 374"/>
  <path class="arrow-head" d="M773 374 L748 361 L748 387 Z"/>
  <path class="arrow-line" d="M1030 232 L1030 340"/>
  <path class="arrow-head" d="M1030 365 L1017 340 L1043 340 Z"/>
  <rect class="panel" x="790" y="356" width="344" height="102" rx="18"/>
  <rect class="accent" x="820" y="384" width="50" height="50" rx="11"/>
  <rect class="accent" x="880" y="384" width="50" height="50" rx="11"/>
  <rect class="accent" x="940" y="384" width="50" height="50" rx="11"/>
  <rect class="cool" x="1000" y="384" width="50" height="50" rx="11"/>
  <rect class="cool" x="1060" y="384" width="50" height="50" rx="11"/>
  <path class="thin" d="M820 448 L990 448"/>
  <text class="muted" x="820" y="492" font-size="16">accepted prefix, retry on mismatch</text>
  <text class="muted" x="720" y="530" font-size="16">best case, one pass advances several tokens</text>
 </svg>