From 534142c5fc5d7f577c2b5a1346481d0d156f1a9f Mon Sep 17 00:00:00 2001 From: mmcky Date: Thu, 27 Nov 2025 15:52:08 +1100 Subject: [PATCH 1/4] DEBUG: Add hardware benchmark for GitHub Actions CPU comparison - Add benchmark-hardware.py script for CPU/GPU performance testing - Install JAX CPU version for comparison - This PR tests standard GitHub Actions runner performance --- .github/workflows/ci.yml | 5 + scripts/benchmark-hardware.py | 264 ++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 scripts/benchmark-hardware.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51755ea9..3a52c3db 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,11 @@ jobs: - name: Display Pip Versions shell: bash -l {0} run: pip list + - name: Run Hardware Benchmarks + shell: bash -l {0} + run: | + pip install jax # Install JAX for CPU + python scripts/benchmark-hardware.py - name: Download "build" folder (cache) uses: dawidd6/action-download-artifact@v11 with: diff --git a/scripts/benchmark-hardware.py b/scripts/benchmark-hardware.py new file mode 100644 index 00000000..45a1604c --- /dev/null +++ b/scripts/benchmark-hardware.py @@ -0,0 +1,264 @@ +""" +Hardware benchmark script for CI runners. +Compares CPU and GPU performance to diagnose slowdowns. +Works on both CPU-only (GitHub Actions) and GPU (RunsOn) runners. +""" +import time +import platform +import os + +def get_cpu_info(): + """Get CPU information.""" + print("=" * 60) + print("SYSTEM INFORMATION") + print("=" * 60) + print(f"Platform: {platform.platform()}") + print(f"Processor: {platform.processor()}") + print(f"Python: {platform.python_version()}") + + # Try to get CPU frequency + try: + with open('/proc/cpuinfo', 'r') as f: + for line in f: + if 'model name' in line: + print(f"CPU Model: {line.split(':')[1].strip()}") + break + except: + pass + + # Try to get CPU frequency + try: + with open('/proc/cpuinfo', 'r') as f: + for line in f: + if 'cpu MHz' in line: + print(f"CPU MHz: {line.split(':')[1].strip()}") + break + except: + pass + + # CPU count + print(f"CPU Count: {os.cpu_count()}") + + # Check for GPU + try: + import subprocess + result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'], + capture_output=True, text=True, timeout=5) + if result.returncode == 0: + print(f"GPU: {result.stdout.strip()}") + else: + print("GPU: None detected") + except: + print("GPU: None detected (nvidia-smi not available)") + + print() + +def benchmark_cpu_pure_python(): + """Pure Python CPU benchmark.""" + print("=" * 60) + print("CPU BENCHMARK: Pure Python") + print("=" * 60) + + # Integer computation + start = time.perf_counter() + total = sum(i * i for i in range(10_000_000)) + elapsed = time.perf_counter() - start + print(f"Integer sum (10M iterations): {elapsed:.3f} seconds") + + # Float computation + start = time.perf_counter() + total = 0.0 + for i in range(1_000_000): + total += (i * 0.1) ** 0.5 + elapsed = time.perf_counter() - start + print(f"Float sqrt (1M iterations): {elapsed:.3f} seconds") + print() + +def benchmark_cpu_numpy(): + """NumPy CPU benchmark.""" + import numpy as np + + print("=" * 60) + print("CPU BENCHMARK: NumPy") + print("=" * 60) + + # Matrix multiplication + n = 3000 + A = np.random.randn(n, n) + B = np.random.randn(n, n) + + start = time.perf_counter() + C = A @ B + elapsed = time.perf_counter() - start + print(f"Matrix multiply ({n}x{n}): {elapsed:.3f} seconds") + + # Element-wise operations + x = 
np.random.randn(50_000_000) + + start = time.perf_counter() + y = np.cos(x**2) + np.sin(x) + elapsed = time.perf_counter() - start + print(f"Element-wise ops (50M elements): {elapsed:.3f} seconds") + print() + +def benchmark_gpu_jax(): + """JAX benchmark (GPU if available, otherwise CPU).""" + try: + import jax + import jax.numpy as jnp + + devices = jax.devices() + default_backend = jax.default_backend() + + # Check if GPU is available + has_gpu = any('cuda' in str(d).lower() or 'gpu' in str(d).lower() for d in devices) + + print("=" * 60) + if has_gpu: + print("JAX BENCHMARK: GPU") + else: + print("JAX BENCHMARK: CPU (no GPU detected)") + print("=" * 60) + + print(f"JAX devices: {devices}") + print(f"Default backend: {default_backend}") + print(f"GPU Available: {has_gpu}") + print() + + # Warm-up JIT compilation + print("Warming up JIT compilation...") + n = 1000 + key = jax.random.PRNGKey(0) + A = jax.random.normal(key, (n, n)) + B = jax.random.normal(key, (n, n)) + + @jax.jit + def matmul(a, b): + return jnp.dot(a, b) + + # Warm-up run (includes compilation) + start = time.perf_counter() + C = matmul(A, B).block_until_ready() + warmup_time = time.perf_counter() - start + print(f"Warm-up (includes JIT compile, {n}x{n}): {warmup_time:.3f} seconds") + + # Actual benchmark (compiled) + start = time.perf_counter() + C = matmul(A, B).block_until_ready() + elapsed = time.perf_counter() - start + print(f"Matrix multiply compiled ({n}x{n}): {elapsed:.3f} seconds") + + # Larger matrix + n = 3000 + A = jax.random.normal(key, (n, n)) + B = jax.random.normal(key, (n, n)) + + # Warm-up for new size + start = time.perf_counter() + C = matmul(A, B).block_until_ready() + warmup_time = time.perf_counter() - start + print(f"Warm-up (recompile for {n}x{n}): {warmup_time:.3f} seconds") + + # Benchmark compiled + start = time.perf_counter() + C = matmul(A, B).block_until_ready() + elapsed = time.perf_counter() - start + print(f"Matrix multiply compiled ({n}x{n}): {elapsed:.3f} seconds") + + # Element-wise GPU benchmark + x = jax.random.normal(key, (50_000_000,)) + + @jax.jit + def elementwise_ops(x): + return jnp.cos(x**2) + jnp.sin(x) + + # Warm-up + start = time.perf_counter() + y = elementwise_ops(x).block_until_ready() + warmup_time = time.perf_counter() - start + print(f"Element-wise warm-up (50M): {warmup_time:.3f} seconds") + + # Compiled + start = time.perf_counter() + y = elementwise_ops(x).block_until_ready() + elapsed = time.perf_counter() - start + print(f"Element-wise compiled (50M): {elapsed:.3f} seconds") + + print() + + except ImportError as e: + print(f"JAX not available: {e}") + except Exception as e: + print(f"JAX benchmark failed: {e}") + +def benchmark_numba(): + """Numba CPU benchmark.""" + try: + import numba + import numpy as np + + print("=" * 60) + print("CPU BENCHMARK: Numba") + print("=" * 60) + + @numba.jit(nopython=True) + def numba_sum(n): + total = 0 + for i in range(n): + total += i * i + return total + + # Warm-up (compilation) + start = time.perf_counter() + result = numba_sum(10_000_000) + warmup_time = time.perf_counter() - start + print(f"Integer sum warm-up (includes compile): {warmup_time:.3f} seconds") + + # Compiled run + start = time.perf_counter() + result = numba_sum(10_000_000) + elapsed = time.perf_counter() - start + print(f"Integer sum compiled (10M): {elapsed:.3f} seconds") + + @numba.jit(nopython=True, parallel=True) + def numba_parallel_sum(arr): + total = 0.0 + for i in numba.prange(len(arr)): + total += arr[i] ** 2 + return total + + arr = 
np.random.randn(50_000_000) + + # Warm-up + start = time.perf_counter() + result = numba_parallel_sum(arr) + warmup_time = time.perf_counter() - start + print(f"Parallel sum warm-up (50M): {warmup_time:.3f} seconds") + + # Compiled + start = time.perf_counter() + result = numba_parallel_sum(arr) + elapsed = time.perf_counter() - start + print(f"Parallel sum compiled (50M): {elapsed:.3f} seconds") + + print() + + except ImportError as e: + print(f"Numba not available: {e}") + except Exception as e: + print(f"Numba benchmark failed: {e}") + +if __name__ == "__main__": + print("\n" + "=" * 60) + print("HARDWARE BENCHMARK FOR CI RUNNER") + print("=" * 60 + "\n") + + get_cpu_info() + benchmark_cpu_pure_python() + benchmark_cpu_numpy() + benchmark_numba() + benchmark_gpu_jax() + + print("=" * 60) + print("BENCHMARK COMPLETE") + print("=" * 60) From f8829a494c804ae19a45ba589e686922db16f6ed Mon Sep 17 00:00:00 2001 From: mmcky Date: Thu, 27 Nov 2025 16:21:31 +1100 Subject: [PATCH 2/4] Add multi-pathway benchmark tests (bare metal, Jupyter, jupyter-book) --- .github/workflows/ci.yml | 26 +++++- scripts/benchmark-jupyter.ipynb | 0 scripts/benchmark-jupyterbook.md | 156 +++++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 scripts/benchmark-jupyter.ipynb create mode 100644 scripts/benchmark-jupyterbook.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a52c3db..686572d6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,11 +35,35 @@ jobs: - name: Display Pip Versions shell: bash -l {0} run: pip list - - name: Run Hardware Benchmarks + - name: Run Hardware Benchmarks (Bare Metal) shell: bash -l {0} run: | pip install jax # Install JAX for CPU + echo "=== Bare Metal Python Script Execution ===" python scripts/benchmark-hardware.py + - name: Run Jupyter Notebook Benchmark (via nbconvert) + shell: bash -l {0} + run: | + echo "=== Jupyter Kernel Execution ===" + jupyter nbconvert --to notebook --execute scripts/benchmark-jupyter.ipynb --output benchmark-jupyter-executed.ipynb + echo "Notebook executed successfully" + - name: Run Jupyter-Book Benchmark + shell: bash -l {0} + run: | + echo "=== Jupyter-Book Execution ===" + # Build just the benchmark file using jupyter-book + mkdir -p benchmark_test + cp scripts/benchmark-jupyterbook.md benchmark_test/ + # Create minimal _config.yml + echo "title: Benchmark Test" > benchmark_test/_config.yml + echo "execute:" >> benchmark_test/_config.yml + echo " execute_notebooks: force" >> benchmark_test/_config.yml + # Create minimal _toc.yml + echo "format: jb-book" > benchmark_test/_toc.yml + echo "root: benchmark-jupyterbook" >> benchmark_test/_toc.yml + # Build + jb build benchmark_test --path-output benchmark_build/ + echo "Jupyter-Book build completed successfully" - name: Download "build" folder (cache) uses: dawidd6/action-download-artifact@v11 with: diff --git a/scripts/benchmark-jupyter.ipynb b/scripts/benchmark-jupyter.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/scripts/benchmark-jupyterbook.md b/scripts/benchmark-jupyterbook.md new file mode 100644 index 00000000..23434e9e --- /dev/null +++ b/scripts/benchmark-jupyterbook.md @@ -0,0 +1,156 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# JAX Performance Benchmark - Jupyter Book Execution + +This file tests JAX performance when executed through 
Jupyter Book's notebook execution. +Compare results with direct script and nbconvert execution. + +```{code-cell} ipython3 +import time +import platform +import os + +print("=" * 60) +print("JUPYTER BOOK EXECUTION BENCHMARK") +print("=" * 60) +print(f"Platform: {platform.platform()}") +print(f"Python: {platform.python_version()}") +print(f"CPU Count: {os.cpu_count()}") +``` + +```{code-cell} ipython3 +# Import JAX and check devices +import jax +import jax.numpy as jnp + +devices = jax.devices() +default_backend = jax.default_backend() +has_gpu = any('cuda' in str(d).lower() or 'gpu' in str(d).lower() for d in devices) + +print(f"JAX devices: {devices}") +print(f"Default backend: {default_backend}") +print(f"GPU Available: {has_gpu}") +``` + +```{code-cell} ipython3 +# Define JIT-compiled function +@jax.jit +def matmul(a, b): + return jnp.dot(a, b) + +print("matmul function defined with @jax.jit") +``` + +```{code-cell} ipython3 +# Benchmark 1: Small matrix (1000x1000) - includes JIT compilation +print("\n" + "=" * 60) +print("BENCHMARK 1: Small Matrix (1000x1000)") +print("=" * 60) + +n = 1000 +key = jax.random.PRNGKey(0) +A = jax.random.normal(key, (n, n)) +B = jax.random.normal(key, (n, n)) + +# Warm-up run (includes compilation) +start = time.perf_counter() +C = matmul(A, B).block_until_ready() +warmup_time = time.perf_counter() - start +print(f"Warm-up (includes JIT compile): {warmup_time:.3f} seconds") + +# Compiled run +start = time.perf_counter() +C = matmul(A, B).block_until_ready() +compiled_time = time.perf_counter() - start +print(f"Compiled execution: {compiled_time:.3f} seconds") +``` + +```{code-cell} ipython3 +# Benchmark 2: Large matrix (3000x3000) - triggers recompilation +print("\n" + "=" * 60) +print("BENCHMARK 2: Large Matrix (3000x3000)") +print("=" * 60) + +n = 3000 +A = jax.random.normal(key, (n, n)) +B = jax.random.normal(key, (n, n)) + +# Warm-up run (recompilation for new size) +start = time.perf_counter() +C = matmul(A, B).block_until_ready() +warmup_time = time.perf_counter() - start +print(f"Warm-up (recompile for new size): {warmup_time:.3f} seconds") + +# Compiled run +start = time.perf_counter() +C = matmul(A, B).block_until_ready() +compiled_time = time.perf_counter() - start +print(f"Compiled execution: {compiled_time:.3f} seconds") +``` + +```{code-cell} ipython3 +# Benchmark 3: Element-wise operations (50M elements) +print("\n" + "=" * 60) +print("BENCHMARK 3: Element-wise Operations (50M elements)") +print("=" * 60) + +@jax.jit +def elementwise_ops(x): + return jnp.cos(x**2) + jnp.sin(x) + +x = jax.random.normal(key, (50_000_000,)) + +# Warm-up +start = time.perf_counter() +y = elementwise_ops(x).block_until_ready() +warmup_time = time.perf_counter() - start +print(f"Warm-up (includes JIT compile): {warmup_time:.3f} seconds") + +# Compiled +start = time.perf_counter() +y = elementwise_ops(x).block_until_ready() +compiled_time = time.perf_counter() - start +print(f"Compiled execution: {compiled_time:.3f} seconds") +``` + +```{code-cell} ipython3 +# Benchmark 4: Multiple small operations (simulates lecture cells) +print("\n" + "=" * 60) +print("BENCHMARK 4: Multiple Small Operations (lecture simulation)") +print("=" * 60) + +total_start = time.perf_counter() + +# Simulate multiple cell executions with different operations +for i, size in enumerate([100, 500, 1000, 2000, 3000]): + @jax.jit + def compute(a, b): + return jnp.dot(a, b) + jnp.sum(a) + + A = jax.random.normal(key, (size, size)) + B = jax.random.normal(key, (size, size)) + + start = 
time.perf_counter() + result = compute(A, B).block_until_ready() + elapsed = time.perf_counter() - start + print(f" Size {size}x{size}: {elapsed:.3f} seconds") + +total_time = time.perf_counter() - total_start +print(f"\nTotal time for all operations: {total_time:.3f} seconds") +``` + +```{code-cell} ipython3 +print("\n" + "=" * 60) +print("JUPYTER BOOK EXECUTION BENCHMARK COMPLETE") +print("=" * 60) +``` From ef69a1a17c4e20231b150b1f7d8c98460a7aec8f Mon Sep 17 00:00:00 2001 From: mmcky Date: Thu, 27 Nov 2025 16:34:18 +1100 Subject: [PATCH 3/4] Fix: Add benchmark content to benchmark-jupyter.ipynb --- scripts/benchmark-jupyter.ipynb | 207 ++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) diff --git a/scripts/benchmark-jupyter.ipynb b/scripts/benchmark-jupyter.ipynb index e69de29b..e095f79b 100644 --- a/scripts/benchmark-jupyter.ipynb +++ b/scripts/benchmark-jupyter.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# JAX Performance Benchmark - Jupyter Kernel Execution\n", + "\n", + "This notebook tests JAX performance when executed through a Jupyter kernel.\n", + "Compare results with direct script and jupyter-book execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import platform\n", + "import os\n", + "\n", + "print(\"=\" * 60)\n", + "print(\"JUPYTER KERNEL EXECUTION BENCHMARK\")\n", + "print(\"=\" * 60)\n", + "print(f\"Platform: {platform.platform()}\")\n", + "print(f\"Python: {platform.python_version()}\")\n", + "print(f\"CPU Count: {os.cpu_count()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import JAX and check devices\n", + "import jax\n", + "import jax.numpy as jnp\n", + "\n", + "devices = jax.devices()\n", + "default_backend = jax.default_backend()\n", + "has_gpu = any('cuda' in str(d).lower() or 'gpu' in str(d).lower() for d in devices)\n", + "\n", + "print(f\"JAX devices: {devices}\")\n", + "print(f\"Default backend: {default_backend}\")\n", + "print(f\"GPU Available: {has_gpu}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define JIT-compiled function\n", + "@jax.jit\n", + "def matmul(a, b):\n", + " return jnp.dot(a, b)\n", + "\n", + "print(\"matmul function defined with @jax.jit\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark 1: Small matrix (1000x1000) - includes JIT compilation\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"BENCHMARK 1: Small Matrix (1000x1000)\")\n", + "print(\"=\" * 60)\n", + "\n", + "n = 1000\n", + "key = jax.random.PRNGKey(0)\n", + "A = jax.random.normal(key, (n, n))\n", + "B = jax.random.normal(key, (n, n))\n", + "\n", + "# Warm-up run (includes compilation)\n", + "start = time.perf_counter()\n", + "C = matmul(A, B).block_until_ready()\n", + "warmup_time = time.perf_counter() - start\n", + "print(f\"Warm-up (includes JIT compile): {warmup_time:.3f} seconds\")\n", + "\n", + "# Compiled run\n", + "start = time.perf_counter()\n", + "C = matmul(A, B).block_until_ready()\n", + "compiled_time = time.perf_counter() - start\n", + "print(f\"Compiled execution: {compiled_time:.3f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark 2: Large matrix (3000x3000) - triggers 
recompilation\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"BENCHMARK 2: Large Matrix (3000x3000)\")\n", + "print(\"=\" * 60)\n", + "\n", + "n = 3000\n", + "A = jax.random.normal(key, (n, n))\n", + "B = jax.random.normal(key, (n, n))\n", + "\n", + "# Warm-up run (recompilation for new size)\n", + "start = time.perf_counter()\n", + "C = matmul(A, B).block_until_ready()\n", + "warmup_time = time.perf_counter() - start\n", + "print(f\"Warm-up (recompile for new size): {warmup_time:.3f} seconds\")\n", + "\n", + "# Compiled run\n", + "start = time.perf_counter()\n", + "C = matmul(A, B).block_until_ready()\n", + "compiled_time = time.perf_counter() - start\n", + "print(f\"Compiled execution: {compiled_time:.3f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark 3: Element-wise operations (50M elements)\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"BENCHMARK 3: Element-wise Operations (50M elements)\")\n", + "print(\"=\" * 60)\n", + "\n", + "@jax.jit\n", + "def elementwise_ops(x):\n", + " return jnp.cos(x**2) + jnp.sin(x)\n", + "\n", + "x = jax.random.normal(key, (50_000_000,))\n", + "\n", + "# Warm-up\n", + "start = time.perf_counter()\n", + "y = elementwise_ops(x).block_until_ready()\n", + "warmup_time = time.perf_counter() - start\n", + "print(f\"Warm-up (includes JIT compile): {warmup_time:.3f} seconds\")\n", + "\n", + "# Compiled\n", + "start = time.perf_counter()\n", + "y = elementwise_ops(x).block_until_ready()\n", + "compiled_time = time.perf_counter() - start\n", + "print(f\"Compiled execution: {compiled_time:.3f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Benchmark 4: Multiple small operations (simulates lecture cells)\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"BENCHMARK 4: Multiple Small Operations (lecture simulation)\")\n", + "print(\"=\" * 60)\n", + "\n", + "total_start = time.perf_counter()\n", + "\n", + "# Simulate multiple cell executions with different operations\n", + "for i, size in enumerate([100, 500, 1000, 2000, 3000]):\n", + " @jax.jit\n", + " def compute(a, b):\n", + " return jnp.dot(a, b) + jnp.sum(a)\n", + " \n", + " A = jax.random.normal(key, (size, size))\n", + " B = jax.random.normal(key, (size, size))\n", + " \n", + " start = time.perf_counter()\n", + " result = compute(A, B).block_until_ready()\n", + " elapsed = time.perf_counter() - start\n", + " print(f\" Size {size}x{size}: {elapsed:.3f} seconds\")\n", + "\n", + "total_time = time.perf_counter() - total_start\n", + "print(f\"\\nTotal time for all operations: {total_time:.3f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"JUPYTER KERNEL EXECUTION BENCHMARK COMPLETE\")\n", + "print(\"=\" * 60)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 8f06e41231fc0f80e6fd4ee61b6e880247a64d67 Mon Sep 17 00:00:00 2001 From: mmcky Date: Thu, 27 Nov 2025 16:46:19 +1100 Subject: [PATCH 4/4] Add JSON output to benchmarks and upload as artifacts - Update benchmark-hardware.py to save results to JSON - Update benchmark-jupyter.ipynb to save results to JSON - Update benchmark-jupyterbook.md to save results to JSON - Add CI step to 
collect and display benchmark results - Add CI step to upload benchmark results as artifact --- .github/workflows/ci.yml | 39 ++++++++++++++-- scripts/benchmark-hardware.py | 80 ++++++++++++++++++++++++++++++-- scripts/benchmark-jupyter.ipynb | 52 ++++++++++++++++++--- scripts/benchmark-jupyterbook.md | 40 ++++++++++++++++ 4 files changed, 198 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 686572d6..18d09454 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,8 +45,10 @@ jobs: shell: bash -l {0} run: | echo "=== Jupyter Kernel Execution ===" - jupyter nbconvert --to notebook --execute scripts/benchmark-jupyter.ipynb --output benchmark-jupyter-executed.ipynb + cd scripts + jupyter nbconvert --to notebook --execute benchmark-jupyter.ipynb --output benchmark-jupyter-executed.ipynb echo "Notebook executed successfully" + cd .. - name: Run Jupyter-Book Benchmark shell: bash -l {0} run: | @@ -61,9 +63,40 @@ jobs: # Create minimal _toc.yml echo "format: jb-book" > benchmark_test/_toc.yml echo "root: benchmark-jupyterbook" >> benchmark_test/_toc.yml - # Build - jb build benchmark_test --path-output benchmark_build/ + # Build (run from benchmark_test so JSON is written there) + cd benchmark_test + jb build . --path-output ../benchmark_build/ + cd .. echo "Jupyter-Book build completed successfully" + - name: Collect and Display Benchmark Results + shell: bash -l {0} + run: | + echo "=== Collecting Benchmark Results ===" + mkdir -p benchmark_results + + # Copy results from each pathway + cp benchmark_results_bare_metal.json benchmark_results/ 2>/dev/null || echo "No bare metal results" + cp scripts/benchmark_results_jupyter.json benchmark_results/ 2>/dev/null || echo "No jupyter results" + cp benchmark_test/benchmark_results_jupyterbook.json benchmark_results/ 2>/dev/null || echo "No jupyterbook results" + + # Display summary + echo "" + echo "============================================================" + echo "BENCHMARK RESULTS SUMMARY" + echo "============================================================" + for f in benchmark_results/*.json; do + if [ -f "$f" ]; then + echo "" + echo "--- $(basename $f) ---" + cat "$f" + fi + done + - name: Upload Benchmark Results + uses: actions/upload-artifact@v5 + with: + name: benchmark-results + path: benchmark_results/ + if-no-files-found: warn - name: Download "build" folder (cache) uses: dawidd6/action-download-artifact@v11 with: diff --git a/scripts/benchmark-hardware.py b/scripts/benchmark-hardware.py index 45a1604c..12443855 100644 --- a/scripts/benchmark-hardware.py +++ b/scripts/benchmark-hardware.py @@ -6,6 +6,16 @@ import time import platform import os +import json +from datetime import datetime + +# Global results dictionary +RESULTS = { + "pathway": "bare_metal", + "timestamp": datetime.now().isoformat(), + "system": {}, + "benchmarks": {} +} def get_cpu_info(): """Get CPU information.""" @@ -16,12 +26,20 @@ def get_cpu_info(): print(f"Processor: {platform.processor()}") print(f"Python: {platform.python_version()}") - # Try to get CPU frequency + RESULTS["system"]["platform"] = platform.platform() + RESULTS["system"]["processor"] = platform.processor() + RESULTS["system"]["python"] = platform.python_version() + RESULTS["system"]["cpu_count"] = os.cpu_count() + + # Try to get CPU model + cpu_model = None + cpu_mhz = None try: with open('/proc/cpuinfo', 'r') as f: for line in f: if 'model name' in line: - print(f"CPU Model: {line.split(':')[1].strip()}") + cpu_model = 
line.split(':')[1].strip() + print(f"CPU Model: {cpu_model}") break except: pass @@ -31,26 +49,33 @@ def get_cpu_info(): with open('/proc/cpuinfo', 'r') as f: for line in f: if 'cpu MHz' in line: - print(f"CPU MHz: {line.split(':')[1].strip()}") + cpu_mhz = line.split(':')[1].strip() + print(f"CPU MHz: {cpu_mhz}") break except: pass + RESULTS["system"]["cpu_model"] = cpu_model + RESULTS["system"]["cpu_mhz"] = cpu_mhz + # CPU count print(f"CPU Count: {os.cpu_count()}") # Check for GPU + gpu_info = None try: import subprocess result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'], capture_output=True, text=True, timeout=5) if result.returncode == 0: - print(f"GPU: {result.stdout.strip()}") + gpu_info = result.stdout.strip() + print(f"GPU: {gpu_info}") else: print("GPU: None detected") except: print("GPU: None detected (nvidia-smi not available)") + RESULTS["system"]["gpu"] = gpu_info print() def benchmark_cpu_pure_python(): @@ -59,11 +84,14 @@ def benchmark_cpu_pure_python(): print("CPU BENCHMARK: Pure Python") print("=" * 60) + results = {} + # Integer computation start = time.perf_counter() total = sum(i * i for i in range(10_000_000)) elapsed = time.perf_counter() - start print(f"Integer sum (10M iterations): {elapsed:.3f} seconds") + results["integer_sum_10m"] = elapsed # Float computation start = time.perf_counter() @@ -72,7 +100,10 @@ def benchmark_cpu_pure_python(): total += (i * 0.1) ** 0.5 elapsed = time.perf_counter() - start print(f"Float sqrt (1M iterations): {elapsed:.3f} seconds") + results["float_sqrt_1m"] = elapsed print() + + RESULTS["benchmarks"]["pure_python"] = results def benchmark_cpu_numpy(): """NumPy CPU benchmark.""" @@ -82,6 +113,8 @@ def benchmark_cpu_numpy(): print("CPU BENCHMARK: NumPy") print("=" * 60) + results = {} + # Matrix multiplication n = 3000 A = np.random.randn(n, n) @@ -91,6 +124,7 @@ def benchmark_cpu_numpy(): C = A @ B elapsed = time.perf_counter() - start print(f"Matrix multiply ({n}x{n}): {elapsed:.3f} seconds") + results["matmul_3000x3000"] = elapsed # Element-wise operations x = np.random.randn(50_000_000) @@ -99,7 +133,10 @@ def benchmark_cpu_numpy(): y = np.cos(x**2) + np.sin(x) elapsed = time.perf_counter() - start print(f"Element-wise ops (50M elements): {elapsed:.3f} seconds") + results["elementwise_50m"] = elapsed print() + + RESULTS["benchmarks"]["numpy"] = results def benchmark_gpu_jax(): """JAX benchmark (GPU if available, otherwise CPU).""" @@ -125,6 +162,12 @@ def benchmark_gpu_jax(): print(f"GPU Available: {has_gpu}") print() + results = { + "backend": default_backend, + "has_gpu": has_gpu, + "devices": str(devices) + } + # Warm-up JIT compilation print("Warming up JIT compilation...") n = 1000 @@ -141,12 +184,14 @@ def matmul(a, b): C = matmul(A, B).block_until_ready() warmup_time = time.perf_counter() - start print(f"Warm-up (includes JIT compile, {n}x{n}): {warmup_time:.3f} seconds") + results["matmul_1000x1000_warmup"] = warmup_time # Actual benchmark (compiled) start = time.perf_counter() C = matmul(A, B).block_until_ready() elapsed = time.perf_counter() - start print(f"Matrix multiply compiled ({n}x{n}): {elapsed:.3f} seconds") + results["matmul_1000x1000_compiled"] = elapsed # Larger matrix n = 3000 @@ -158,12 +203,14 @@ def matmul(a, b): C = matmul(A, B).block_until_ready() warmup_time = time.perf_counter() - start print(f"Warm-up (recompile for {n}x{n}): {warmup_time:.3f} seconds") + results["matmul_3000x3000_warmup"] = warmup_time # Benchmark compiled start = time.perf_counter() C = 
matmul(A, B).block_until_ready() elapsed = time.perf_counter() - start print(f"Matrix multiply compiled ({n}x{n}): {elapsed:.3f} seconds") + results["matmul_3000x3000_compiled"] = elapsed # Element-wise GPU benchmark x = jax.random.normal(key, (50_000_000,)) @@ -177,19 +224,24 @@ def elementwise_ops(x): y = elementwise_ops(x).block_until_ready() warmup_time = time.perf_counter() - start print(f"Element-wise warm-up (50M): {warmup_time:.3f} seconds") + results["elementwise_50m_warmup"] = warmup_time # Compiled start = time.perf_counter() y = elementwise_ops(x).block_until_ready() elapsed = time.perf_counter() - start print(f"Element-wise compiled (50M): {elapsed:.3f} seconds") + results["elementwise_50m_compiled"] = elapsed print() + RESULTS["benchmarks"]["jax"] = results except ImportError as e: print(f"JAX not available: {e}") + RESULTS["benchmarks"]["jax"] = {"error": str(e)} except Exception as e: print(f"JAX benchmark failed: {e}") + RESULTS["benchmarks"]["jax"] = {"error": str(e)} def benchmark_numba(): """Numba CPU benchmark.""" @@ -201,6 +253,8 @@ def benchmark_numba(): print("CPU BENCHMARK: Numba") print("=" * 60) + results = {} + @numba.jit(nopython=True) def numba_sum(n): total = 0 @@ -213,12 +267,14 @@ def numba_sum(n): result = numba_sum(10_000_000) warmup_time = time.perf_counter() - start print(f"Integer sum warm-up (includes compile): {warmup_time:.3f} seconds") + results["integer_sum_10m_warmup"] = warmup_time # Compiled run start = time.perf_counter() result = numba_sum(10_000_000) elapsed = time.perf_counter() - start print(f"Integer sum compiled (10M): {elapsed:.3f} seconds") + results["integer_sum_10m_compiled"] = elapsed @numba.jit(nopython=True, parallel=True) def numba_parallel_sum(arr): @@ -234,19 +290,32 @@ def numba_parallel_sum(arr): result = numba_parallel_sum(arr) warmup_time = time.perf_counter() - start print(f"Parallel sum warm-up (50M): {warmup_time:.3f} seconds") + results["parallel_sum_50m_warmup"] = warmup_time # Compiled start = time.perf_counter() result = numba_parallel_sum(arr) elapsed = time.perf_counter() - start print(f"Parallel sum compiled (50M): {elapsed:.3f} seconds") + results["parallel_sum_50m_compiled"] = elapsed print() + RESULTS["benchmarks"]["numba"] = results except ImportError as e: print(f"Numba not available: {e}") + RESULTS["benchmarks"]["numba"] = {"error": str(e)} except Exception as e: print(f"Numba benchmark failed: {e}") + RESULTS["benchmarks"]["numba"] = {"error": str(e)} + + +def save_results(output_path="benchmark_results_bare_metal.json"): + """Save benchmark results to JSON file.""" + with open(output_path, 'w') as f: + json.dump(RESULTS, f, indent=2) + print(f"\nResults saved to: {output_path}") + if __name__ == "__main__": print("\n" + "=" * 60) @@ -259,6 +328,9 @@ def numba_parallel_sum(arr): benchmark_numba() benchmark_gpu_jax() + # Save results to JSON + save_results("benchmark_results_bare_metal.json") + print("=" * 60) print("BENCHMARK COMPLETE") print("=" * 60) diff --git a/scripts/benchmark-jupyter.ipynb b/scripts/benchmark-jupyter.ipynb index e095f79b..909b8fe5 100644 --- a/scripts/benchmark-jupyter.ipynb +++ b/scripts/benchmark-jupyter.ipynb @@ -19,6 +19,20 @@ "import time\n", "import platform\n", "import os\n", + "import json\n", + "from datetime import datetime\n", + "\n", + "# Initialize results dictionary\n", + "RESULTS = {\n", + " \"pathway\": \"jupyter_kernel\",\n", + " \"timestamp\": datetime.now().isoformat(),\n", + " \"system\": {\n", + " \"platform\": platform.platform(),\n", + " \"python\": 
platform.python_version(),\n", + " \"cpu_count\": os.cpu_count()\n", + " },\n", + " \"benchmarks\": {}\n", + "}\n", "\n", "print(\"=\" * 60)\n", "print(\"JUPYTER KERNEL EXECUTION BENCHMARK\")\n", @@ -44,7 +58,11 @@ "\n", "print(f\"JAX devices: {devices}\")\n", "print(f\"Default backend: {default_backend}\")\n", - "print(f\"GPU Available: {has_gpu}\")" + "print(f\"GPU Available: {has_gpu}\")\n", + "\n", + "RESULTS[\"system\"][\"jax_backend\"] = default_backend\n", + "RESULTS[\"system\"][\"has_gpu\"] = has_gpu\n", + "RESULTS[\"system\"][\"jax_devices\"] = str(devices)" ] }, { @@ -87,7 +105,10 @@ "start = time.perf_counter()\n", "C = matmul(A, B).block_until_ready()\n", "compiled_time = time.perf_counter() - start\n", - "print(f\"Compiled execution: {compiled_time:.3f} seconds\")" + "print(f\"Compiled execution: {compiled_time:.3f} seconds\")\n", + "\n", + "RESULTS[\"benchmarks\"][\"matmul_1000x1000_warmup\"] = warmup_time\n", + "RESULTS[\"benchmarks\"][\"matmul_1000x1000_compiled\"] = compiled_time" ] }, { @@ -115,7 +136,10 @@ "start = time.perf_counter()\n", "C = matmul(A, B).block_until_ready()\n", "compiled_time = time.perf_counter() - start\n", - "print(f\"Compiled execution: {compiled_time:.3f} seconds\")" + "print(f\"Compiled execution: {compiled_time:.3f} seconds\")\n", + "\n", + "RESULTS[\"benchmarks\"][\"matmul_3000x3000_warmup\"] = warmup_time\n", + "RESULTS[\"benchmarks\"][\"matmul_3000x3000_compiled\"] = compiled_time" ] }, { @@ -145,7 +169,10 @@ "start = time.perf_counter()\n", "y = elementwise_ops(x).block_until_ready()\n", "compiled_time = time.perf_counter() - start\n", - "print(f\"Compiled execution: {compiled_time:.3f} seconds\")" + "print(f\"Compiled execution: {compiled_time:.3f} seconds\")\n", + "\n", + "RESULTS[\"benchmarks\"][\"elementwise_50m_warmup\"] = warmup_time\n", + "RESULTS[\"benchmarks\"][\"elementwise_50m_compiled\"] = compiled_time" ] }, { @@ -160,6 +187,7 @@ "print(\"=\" * 60)\n", "\n", "total_start = time.perf_counter()\n", + "multi_results = {}\n", "\n", "# Simulate multiple cell executions with different operations\n", "for i, size in enumerate([100, 500, 1000, 2000, 3000]):\n", @@ -174,9 +202,13 @@ " result = compute(A, B).block_until_ready()\n", " elapsed = time.perf_counter() - start\n", " print(f\" Size {size}x{size}: {elapsed:.3f} seconds\")\n", + " multi_results[f\"size_{size}x{size}\"] = elapsed\n", "\n", "total_time = time.perf_counter() - total_start\n", - "print(f\"\\nTotal time for all operations: {total_time:.3f} seconds\")" + "print(f\"\\nTotal time for all operations: {total_time:.3f} seconds\")\n", + "\n", + "RESULTS[\"benchmarks\"][\"multi_ops\"] = multi_results\n", + "RESULTS[\"benchmarks\"][\"multi_ops_total\"] = total_time" ] }, { @@ -185,9 +217,17 @@ "metadata": {}, "outputs": [], "source": [ + "# Save results to JSON file\n", + "output_path = \"benchmark_results_jupyter.json\"\n", + "with open(output_path, 'w') as f:\n", + " json.dump(RESULTS, f, indent=2)\n", + "\n", "print(\"\\n\" + \"=\" * 60)\n", "print(\"JUPYTER KERNEL EXECUTION BENCHMARK COMPLETE\")\n", - "print(\"=\" * 60)" + "print(\"=\" * 60)\n", + "print(f\"\\nResults saved to: {output_path}\")\n", + "print(\"\\nJSON Results:\")\n", + "print(json.dumps(RESULTS, indent=2))" ] } ], diff --git a/scripts/benchmark-jupyterbook.md b/scripts/benchmark-jupyterbook.md index 23434e9e..162613c8 100644 --- a/scripts/benchmark-jupyterbook.md +++ b/scripts/benchmark-jupyterbook.md @@ -19,6 +19,20 @@ Compare results with direct script and nbconvert execution. 
import time import platform import os +import json +from datetime import datetime + +# Initialize results dictionary +RESULTS = { + "pathway": "jupyter_book", + "timestamp": datetime.now().isoformat(), + "system": { + "platform": platform.platform(), + "python": platform.python_version(), + "cpu_count": os.cpu_count() + }, + "benchmarks": {} +} print("=" * 60) print("JUPYTER BOOK EXECUTION BENCHMARK") @@ -40,6 +54,10 @@ has_gpu = any('cuda' in str(d).lower() or 'gpu' in str(d).lower() for d in devic print(f"JAX devices: {devices}") print(f"Default backend: {default_backend}") print(f"GPU Available: {has_gpu}") + +RESULTS["system"]["jax_backend"] = default_backend +RESULTS["system"]["has_gpu"] = has_gpu +RESULTS["system"]["jax_devices"] = str(devices) ``` ```{code-cell} ipython3 @@ -73,6 +91,9 @@ start = time.perf_counter() C = matmul(A, B).block_until_ready() compiled_time = time.perf_counter() - start print(f"Compiled execution: {compiled_time:.3f} seconds") + +RESULTS["benchmarks"]["matmul_1000x1000_warmup"] = warmup_time +RESULTS["benchmarks"]["matmul_1000x1000_compiled"] = compiled_time ``` ```{code-cell} ipython3 @@ -96,6 +117,9 @@ start = time.perf_counter() C = matmul(A, B).block_until_ready() compiled_time = time.perf_counter() - start print(f"Compiled execution: {compiled_time:.3f} seconds") + +RESULTS["benchmarks"]["matmul_3000x3000_warmup"] = warmup_time +RESULTS["benchmarks"]["matmul_3000x3000_compiled"] = compiled_time ``` ```{code-cell} ipython3 @@ -121,6 +145,9 @@ start = time.perf_counter() y = elementwise_ops(x).block_until_ready() compiled_time = time.perf_counter() - start print(f"Compiled execution: {compiled_time:.3f} seconds") + +RESULTS["benchmarks"]["elementwise_50m_warmup"] = warmup_time +RESULTS["benchmarks"]["elementwise_50m_compiled"] = compiled_time ``` ```{code-cell} ipython3 @@ -130,6 +157,7 @@ print("BENCHMARK 4: Multiple Small Operations (lecture simulation)") print("=" * 60) total_start = time.perf_counter() +multi_results = {} # Simulate multiple cell executions with different operations for i, size in enumerate([100, 500, 1000, 2000, 3000]): @@ -144,13 +172,25 @@ for i, size in enumerate([100, 500, 1000, 2000, 3000]): result = compute(A, B).block_until_ready() elapsed = time.perf_counter() - start print(f" Size {size}x{size}: {elapsed:.3f} seconds") + multi_results[f"size_{size}x{size}"] = elapsed total_time = time.perf_counter() - total_start print(f"\nTotal time for all operations: {total_time:.3f} seconds") + +RESULTS["benchmarks"]["multi_ops"] = multi_results +RESULTS["benchmarks"]["multi_ops_total"] = total_time ``` ```{code-cell} ipython3 +# Save results to JSON file +output_path = "benchmark_results_jupyterbook.json" +with open(output_path, 'w') as f: + json.dump(RESULTS, f, indent=2) + print("\n" + "=" * 60) print("JUPYTER BOOK EXECUTION BENCHMARK COMPLETE") print("=" * 60) +print(f"\nResults saved to: {output_path}") +print("\nJSON Results:") +print(json.dumps(RESULTS, indent=2)) ```
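
All three pathways write their timings in a common JSON shape (`pathway`, `timestamp`, `system`, `benchmarks`), so the `benchmark-results` artifact can be post-processed locally. As a minimal sketch — not part of the patch series above, and assuming the three result files have been downloaded from the artifact into the working directory — a comparison helper could look like:

```python
"""
Compare JAX benchmark timings across the three execution pathways.

Assumption (not part of the patches above): benchmark_results_bare_metal.json,
benchmark_results_jupyter.json and benchmark_results_jupyterbook.json have been
downloaded from the `benchmark-results` artifact into the current directory.
"""
import json
from pathlib import Path

FILES = {
    "bare_metal": "benchmark_results_bare_metal.json",
    "jupyter_kernel": "benchmark_results_jupyter.json",
    "jupyter_book": "benchmark_results_jupyterbook.json",
}

# JAX timings recorded by all three pathways.
KEYS = [
    "matmul_1000x1000_warmup", "matmul_1000x1000_compiled",
    "matmul_3000x3000_warmup", "matmul_3000x3000_compiled",
    "elementwise_50m_warmup", "elementwise_50m_compiled",
]


def jax_timings(results: dict) -> dict:
    """Return the JAX timing dict for one pathway.

    The bare-metal script nests its JAX numbers under benchmarks["jax"],
    while the notebook and jupyter-book runs store them flat under "benchmarks".
    """
    bench = results.get("benchmarks", {})
    return bench.get("jax", bench)


def main() -> None:
    loaded = {}
    for pathway, fname in FILES.items():
        path = Path(fname)
        if not path.exists():
            print(f"{pathway}: {fname} not found, skipping")
            continue
        loaded[pathway] = jax_timings(json.loads(path.read_text()))

    # Print a simple side-by-side table of timings (seconds).
    print(f"{'benchmark':32}" + "".join(f"{p:>16}" for p in loaded))
    for key in KEYS:
        row = f"{key:32}"
        for timings in loaded.values():
            value = timings.get(key)
            row += f"{value:16.3f}" if isinstance(value, (int, float)) else f"{'n/a':>16}"
        print(row)


if __name__ == "__main__":
    main()
```

Reading the warm-up column against the compiled column per pathway separates JIT-compilation overhead from raw execution speed, which is the distinction the four patches are trying to surface when comparing GitHub Actions runners with the GPU-backed RunsOn runners.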