Skip to content

Commit 223fc03

Browse files
authored
ci: fix 3 CI failures on self-hosted runner (#62)
1. Exclude tests/integration/saas/ from the full test suite — it requires a private SaaS worker (localhost:8787) that is not available in CI. This eliminates 45 errors and prevents timeouts on Python 3.9/3.10/3.12/3.13.
2. Pin the Python version in the quick-check and post-merge jobs via --python $DEFAULT_PYTHON_VERSION. The self-hosted runner has 3.14 installed, causing uv to select it over 3.12 and triggering stale Rust target-cache collisions with maturin.
3. Fix test_l1_cache_speedup_ratio_validation — it asserted that hits are faster than misses, but L1 hits include LRU bookkeeping (move_to_end) that misses skip. It now validates that both paths are sub-microsecond and that hit overhead is bounded (< 2x miss).
1 parent cf4fa00 commit 223fc03

2 files changed

Lines changed: 29 additions & 20 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
             venv-${{ runner.os }}-py${{ env.DEFAULT_PYTHON_VERSION }}-

       - name: Install dependencies (if not cached)
-        run: uv sync --group dev
+        run: uv sync --python ${{ env.DEFAULT_PYTHON_VERSION }} --group dev

       - name: Check Python formatting
         run: uv run ruff format --check .
@@ -131,6 +131,7 @@ jobs:
         run: |
           uv run pytest tests/ -m "not slow" \
             --ignore=tests/fuzzing \
+            --ignore=tests/integration/saas \
             --cov=src/cachekit \
             --cov-report=xml \
             --cov-report=term \
@@ -180,7 +181,7 @@ jobs:
           workspaces: rust

       - name: Install dependencies (if not cached)
-        run: uv sync --group dev
+        run: uv sync --python ${{ env.DEFAULT_PYTHON_VERSION }} --group dev

       - name: Scan Python dependencies for CVEs
         run: uv run pip-audit --desc

tests/performance/test_statistical_rigor.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@

 from cachekit.l1_cache import L1Cache

-from .stats_utils import benchmark_with_gc_handling, speedup_ratio
+from .stats_utils import benchmark_with_gc_handling


 @pytest.fixture(scope="session", autouse=True)
@pytest.fixture(scope="session", autouse=True)
@@ -251,10 +251,15 @@ def test_confidence_intervals_matter() -> None:

 @pytest.mark.performance
 def test_l1_cache_speedup_ratio_validation() -> None:
-    """Validate that L1 cache hits are significantly faster than misses.
+    """Validate that L1 cache hit and miss are both sub-microsecond.

-    This demonstrates the value of L1 caching - hits should be 2x+ faster
-    than misses because hits avoid the L2 backend roundtrip.
+    In L1-only mode (no L2 backend), hits and misses are both fast dict
+    operations. Hits include LRU bookkeeping (move_to_end), so they may
+    be slightly slower than misses — this is expected.
+
+    Real speedup (10-20x) only manifests when L2 backend (Redis/SaaS)
+    adds network latency to misses. This test validates absolute
+    performance bounds for both paths.
     """
     cache = L1Cache(max_memory_mb=100)
@@ -288,22 +293,25 @@ def miss_fn() -> None:
         unit="ns",
     )

-    # Calculate speedup ratio
-    hit_samples = [hit_result.mean] * 100  # Synthetic for ratio calculation
-    miss_samples = [miss_result.mean] * 100
-    ratio, interpretation = speedup_ratio(miss_samples, hit_samples)
-
-    print("\nSpeedup Ratio Analysis:")
+    print("\nL1 Performance Analysis:")
     print(f"  Hit mean: {hit_result.mean:.0f}ns")
     print(f"  Miss mean: {miss_result.mean:.0f}ns")
-    print(f"  Speedup: {ratio:.1f}x {interpretation}")
-
-    # Validate speedup is meaningful (hits should be faster than misses)
-    # Note: In this L1-only test, both hit and miss are fast dict operations (~450-500ns).
-    # With a real L2 Redis backend, hits would be 10-20x faster (avoiding network).
-    # We validate >1.0x (measurable) rather than 2.0x (more visible with L2 overhead).
-    assert ratio > 1.0, f"L1 hits should be faster than misses, got {ratio:.1f}x"
-    print(f"✅ Speedup ratio validated: {ratio:.1f}x (hits are measurably faster)")
+
+    # Both operations must be sub-microsecond (1000ns)
+    assert hit_result.p95 < 1000, f"L1 hit p95 {hit_result.p95:.0f}ns exceeds 1000ns target"
+    assert miss_result.p95 < 1000, f"L1 miss p95 {miss_result.p95:.0f}ns exceeds 1000ns target"
+
+    # Hit overhead from LRU bookkeeping should be bounded (< 2x miss time)
+    # Hits do more work (move_to_end), so they can be slower — but not drastically
+    if miss_result.mean > 0:
+        overhead_ratio = hit_result.mean / miss_result.mean
+        print(f"  Hit/Miss ratio: {overhead_ratio:.2f}x")
+        assert overhead_ratio < 2.0, (
+            f"L1 hit overhead too high: {overhead_ratio:.1f}x miss time "
+            f"(hit={hit_result.mean:.0f}ns, miss={miss_result.mean:.0f}ns)"
+        )
+
+    print("✅ L1 cache performance validated (both paths sub-microsecond)")


 @pytest.mark.performance

0 commit comments

Comments (0)