fix(lib): address Codex review feedback on model capabilities

SinhSinhAn · SinhSinhAn · commit 2171fb7cc575 · 2026-04-25T11:42:38.000-05:00
Two P1 corrections from the automated Codex review on PR #3124: 1. Enforce segment boundary in prefix matching. `startswith()` was matching unknown identifiers like `gpt-5.10` as `gpt-5.1` and `o1-previewed` as `o1-preview`, causing the documented `None` fallback to be skipped. The lookup now requires the family prefix to either equal the model exactly or be followed by a `-`. 2. Align gpt-5 family effort options with the canonical SDK docs in `src/openai/types/shared/reasoning.py`: - gpt-5.1 (and -codex / -codex-max / -mini): `none/low/medium/high` -- removed `minimal`, which would have surfaced an unsupported value and triggered avoidable 400 responses. - gpt-5.2 / gpt-5.3 / gpt-5.4: `none/low/medium/high/xhigh` (`xhigh` is supported for models *after* gpt-5.1-codex-max). - gpt-5-pro: split into its own family with the high-only effort scale per the SDK docstring ("defaults to and only supports high"). Other improvements: - Chat / search variant detection now matches dated snapshots like `gpt-4o-search-preview-2025-03-11` via a regex instead of a strict `endswith` check. - New `TestBoundaryMatching` class with parametrized regression tests for prefix-collision cases (`gpt-5.10`, `gpt-5.1foo`, `o1-previewed`) and dated chat / search variants. - New `TestGpt5ProFamily` class covering the high-only effort scale. 49 of 49 new tests pass; 154 of 154 lib tests pass overall.
diff --git a/src/openai/lib/_models.py b/src/openai/lib/_models.py
@@ -4,19 +4,23 @@
 capability matrix is documented behaviour rather than schema. When OpenAI ships
 a new model family, the registry in this file should be updated to match.
 
+The canonical source for reasoning-effort behaviour is the docstring on the
+``Reasoning.effort`` parameter in ``src/openai/types/shared/reasoning.py``.
+
 Example:
     >>> from openai import get_model_capabilities
     >>> caps = get_model_capabilities("gpt-5.4-mini")
     >>> caps.supports_reasoning
     True
     >>> caps.reasoning_effort_options
-    ('none', 'minimal', 'low', 'medium', 'high', 'xhigh')
+    ('none', 'low', 'medium', 'high', 'xhigh')
     >>> get_model_capabilities("gpt-4.1").supports_reasoning
     False
 """
 
 from __future__ import annotations
 
+import re
 from typing import Any, Tuple, Optional
 from dataclasses import dataclass
 
@@ -81,28 +85,43 @@ def _caps(
 # ---------------------------------------------------------------------------
 # Family registry.
 #
-# Entries are matched by longest-prefix against the model string, with chat /
-# search variants checked via the suffix test in `get_model_capabilities`.
+# Entries are matched by longest *segment* prefix against the model string
+# (i.e. the registered prefix must either equal the model exactly or be
+# followed by a `-`), with chat / search variants checked via the suffix
+# pattern applied in `get_model_capabilities`.
 #
 # When OpenAI ships a new family, add an entry here. Order within this tuple
 # does not matter; the lookup picks the longest matching prefix.
 # ---------------------------------------------------------------------------
 
-# Effort scales reused across families.
+# Effort scales reused across families. These mirror the prose in
+# `src/openai/types/shared/reasoning.py`:
+#
+#   - All models *before* gpt-5.1 default to medium and do NOT support `none`.
+#     gpt-5 base accepts `minimal/low/medium/high`.
+#   - gpt-5.1 supports `none/low/medium/high` (no `minimal`).
+#   - `xhigh` is supported for models *after* gpt-5.1-codex-max, i.e.
+#     gpt-5.2 onward, on top of the gpt-5.1 effort scale.
+#   - gpt-5-pro defaults to and only supports `high`.
 _EFFORT_O_SERIES: Tuple[ReasoningEffort, ...] = ("low", "medium", "high")
-_EFFORT_GPT5: Tuple[ReasoningEffort, ...] = ("minimal", "low", "medium", "high")
-_EFFORT_GPT5_1: Tuple[ReasoningEffort, ...] = ("none", "minimal", "low", "medium", "high")
-_EFFORT_GPT5_4: Tuple[ReasoningEffort, ...] = ("none", "minimal", "low", "medium", "high", "xhigh")
+_EFFORT_GPT5_BASE: Tuple[ReasoningEffort, ...] = ("minimal", "low", "medium", "high")
+_EFFORT_GPT5_1: Tuple[ReasoningEffort, ...] = ("none", "low", "medium", "high")
+_EFFORT_GPT5_2_PLUS: Tuple[ReasoningEffort, ...] = ("none", "low", "medium", "high", "xhigh")
+_EFFORT_GPT5_PRO: Tuple[ReasoningEffort, ...] = ("high",)
 
 
 _FAMILIES: Tuple[ModelCapabilities, ...] = (
     # gpt-5.x reasoning models. Temperature is rejected unless you use a
     # `-chat-latest` variant or set `reasoning_effort="none"`.
-    _caps("gpt-5.4", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_4),
-    _caps("gpt-5.3", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_1),
-    _caps("gpt-5.2", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_1),
+    #
+    # gpt-5-pro is a high-only reasoning model and must be registered as its
+    # own family so longest-prefix matching beats the generic `gpt-5`.
+    _caps("gpt-5-pro", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_PRO),
+    _caps("gpt-5.4", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_2_PLUS),
+    _caps("gpt-5.3", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_2_PLUS),
+    _caps("gpt-5.2", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_2_PLUS),
     _caps("gpt-5.1", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_1),
-    _caps("gpt-5", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5),
+    _caps("gpt-5", supports_temperature=False, supports_reasoning=True, reasoning_effort_options=_EFFORT_GPT5_BASE),
     # Classic chat families.
     _caps("gpt-4.1", supports_temperature=True, supports_reasoning=False, reasoning_effort_options=None),
     _caps("gpt-4o", supports_temperature=True, supports_reasoning=False, reasoning_effort_options=None),
@@ -142,9 +161,22 @@ def _caps(
 )
 
 
-# Suffixes that override family defaults. A model ending in one of these is
-# treated as a non-reasoning chat variant regardless of its family.
-_CHAT_VARIANT_SUFFIXES: Tuple[str, ...] = ("-chat-latest", "-search-preview")
+# Matches `*-chat-latest` and `*-search-preview` (with an optional trailing
+# `-YYYY-MM-DD` snapshot date), e.g. `gpt-4o-search-preview-2025-03-11`.
+# These variants behave like classic chat models regardless of family.
+_CHAT_VARIANT_RE = re.compile(r"-(?:chat-latest|search-preview)(?:-\d{4}-\d{2}-\d{2})?$")
+
+
+def _matches_family(model: str, family: str) -> bool:
+    """Match ``model`` against a family prefix at a segment boundary.
+
+    A model matches when it equals the family exactly or extends it with a
+    ``-`` separator. This prevents collisions like ``gpt-5.10`` being
+    misclassified as ``gpt-5.1``.
+    """
+    if model == family:
+        return True
+    return model.startswith(family + "-")
 
 
 def get_model_capabilities(model: str) -> Optional[ModelCapabilities]:
@@ -155,11 +187,17 @@ def get_model_capabilities(model: str) -> Optional[ModelCapabilities]:
     decide which controls to render) but is only as fresh as this module's
     registry. New model families need a corresponding entry here.
 
+    Matching is segment-aware: the registered prefix must either equal the
+    model exactly or be followed by a ``-`` separator. ``"gpt-5.10"`` will
+    therefore *not* match the ``gpt-5.1`` family and falls through to
+    ``None``. ``"o1-previewed"`` will not match ``o1-preview`` either, though
+    it can still resolve to a shorter registered family such as ``o1``.
+
     Args:
         model: A model identifier such as ``"gpt-5.4-mini"`` or
             ``"gpt-4o-2024-08-06"``. Date suffixes and size variants
-            (``-mini``, ``-nano``) are handled automatically by longest-prefix
-            matching.
+            (``-mini``, ``-nano``, ``-pro``) are handled automatically by
+            longest-prefix matching.
 
     Returns:
         A :class:`ModelCapabilities` describing the model, or ``None`` if no
@@ -169,11 +207,15 @@ def get_model_capabilities(model: str) -> Optional[ModelCapabilities]:
 
     Example:
         >>> get_model_capabilities("gpt-5.4-mini").reasoning_effort_options
-        ('none', 'minimal', 'low', 'medium', 'high', 'xhigh')
+        ('none', 'low', 'medium', 'high', 'xhigh')
         >>> get_model_capabilities("gpt-5-chat-latest").supports_temperature
         True
         >>> get_model_capabilities("gpt-5").supports_temperature
         False
+        >>> get_model_capabilities("gpt-5-pro").reasoning_effort_options
+        ('high',)
+        >>> get_model_capabilities("gpt-5.10") is None
+        True
         >>> get_model_capabilities("nonexistent-model") is None
         True
     """
@@ -184,11 +226,12 @@ def get_model_capabilities(model: str) -> Optional[ModelCapabilities]:
     if not isinstance(candidate, str) or not candidate:
         return None
 
-    # Longest matching prefix wins so that "gpt-5.4" beats "gpt-5", and "o1-pro"
-    # beats "o1".
+    # Longest matching family wins so that "gpt-5.4" beats "gpt-5", and
+    # "o1-pro" beats "o1". Segment boundary check rejects things like
+    # "gpt-5.10" claiming to be "gpt-5.1".
     best: Optional[ModelCapabilities] = None
     for entry in _FAMILIES:
-        if not candidate.startswith(entry.family):
+        if not _matches_family(candidate, entry.family):
             continue
         if best is None or len(entry.family) > len(best.family):
             best = entry
@@ -199,8 +242,10 @@ def get_model_capabilities(model: str) -> Optional[ModelCapabilities]:
     # Chat / search variants override family defaults: gpt-5-chat-latest is a
     # non-reasoning model even though gpt-5* normally is one. We still report
     # the family so callers can group e.g. "gpt-5.2-chat-latest" with
-    # "gpt-5.2".
-    if any(candidate.endswith(suffix) for suffix in _CHAT_VARIANT_SUFFIXES):
+    # "gpt-5.2". The regex tolerates a trailing date snapshot like
+    # `-2025-03-11` so dated variants like `gpt-4o-search-preview-2025-03-11`
+    # are recognized too.
+    if _CHAT_VARIANT_RE.search(candidate):
         return ModelCapabilities(
             family=best.family,
             supports_temperature=True,
diff --git a/tests/lib/test_model_capabilities.py b/tests/lib/test_model_capabilities.py
@@ -39,49 +39,69 @@ def test_gpt_5_chat_latest_is_non_reasoning(self) -> None:
 
 
 class TestGpt51Family:
-    def test_gpt_5_1_adds_none_to_effort(self) -> None:
+    def test_gpt_5_1_drops_minimal_adds_none(self) -> None:
+        # Per src/openai/types/shared/reasoning.py the supported gpt-5.1
+        # effort values are none/low/medium/high. `minimal` was removed and
+        # would trigger 400 responses if surfaced in a UI.
         caps = get_model_capabilities("gpt-5.1")
         assert caps is not None
         assert caps.family == "gpt-5.1"
-        assert caps.reasoning_effort_options == (
-            "none",
-            "minimal",
-            "low",
-            "medium",
-            "high",
-        )
+        assert caps.reasoning_effort_options == ("none", "low", "medium", "high")
+        assert "minimal" not in (caps.reasoning_effort_options or ())
 
     def test_gpt_5_1_codex(self) -> None:
         caps = get_model_capabilities("gpt-5.1-codex")
         assert caps is not None
         assert caps.family == "gpt-5.1"
 
-    def test_gpt_5_2_uses_same_effort_scale(self) -> None:
+    def test_gpt_5_1_codex_max(self) -> None:
+        # gpt-5.1-codex-max is the boundary referenced in the SDK docs:
+        # `xhigh` is supported for models *after* this one. So it should
+        # still match the gpt-5.1 family (no xhigh).
+        caps = get_model_capabilities("gpt-5.1-codex-max")
+        assert caps is not None
+        assert caps.family == "gpt-5.1"
+        assert "xhigh" not in (caps.reasoning_effort_options or ())
+
+
+class TestGpt52PlusFamilies:
+    def test_gpt_5_2_supports_xhigh(self) -> None:
+        # Per the SDK docstring, `xhigh` is supported for all models *after*
+        # gpt-5.1-codex-max, which means gpt-5.2 onward.
+        caps = get_model_capabilities("gpt-5.2")
+        assert caps is not None
+        assert caps.family == "gpt-5.2"
+        assert caps.reasoning_effort_options == ("none", "low", "medium", "high", "xhigh")
+
+    def test_gpt_5_2_pro(self) -> None:
         caps = get_model_capabilities("gpt-5.2-pro")
         assert caps is not None
         assert caps.family == "gpt-5.2"
-        assert caps.reasoning_effort_options == (
-            "none",
-            "minimal",
-            "low",
-            "medium",
-            "high",
-        )
+
+
+class TestGpt5ProFamily:
+    def test_gpt_5_pro_only_supports_high(self) -> None:
+        # Per the SDK docs, gpt-5-pro defaults to (and only supports) `high`.
+        caps = get_model_capabilities("gpt-5-pro")
+        assert caps is not None
+        assert caps.family == "gpt-5-pro"
+        assert caps.supports_reasoning is True
+        assert caps.reasoning_effort_options == ("high",)
+
+    def test_gpt_5_pro_dated_snapshot(self) -> None:
+        caps = get_model_capabilities("gpt-5-pro-2025-10-06")
+        assert caps is not None
+        assert caps.family == "gpt-5-pro"
+        assert caps.reasoning_effort_options == ("high",)
 
 
 class TestGpt54Family:
-    def test_gpt_5_4_adds_xhigh(self) -> None:
+    def test_gpt_5_4_full_effort_scale(self) -> None:
         caps = get_model_capabilities("gpt-5.4")
         assert caps is not None
         assert caps.family == "gpt-5.4"
-        assert caps.reasoning_effort_options == (
-            "none",
-            "minimal",
-            "low",
-            "medium",
-            "high",
-            "xhigh",
-        )
+        assert caps.reasoning_effort_options == ("none", "low", "medium", "high", "xhigh")
+        assert "minimal" not in (caps.reasoning_effort_options or ())
 
     def test_gpt_5_4_size_variants(self) -> None:
         for size in ("gpt-5.4", "gpt-5.4-mini", "gpt-5.4-nano"):
@@ -181,6 +201,71 @@ def test_o4_mini(self) -> None:
         assert caps.supports_reasoning is True
 
 
+class TestBoundaryMatching:
+    """Family lookup must respect segment boundaries.
+
+    A registered prefix should only match the model when the prefix either
+    equals the model exactly or is followed by ``-``. Without this, a future
+    model name like ``gpt-5.10`` would silently impersonate ``gpt-5.1``.
+    """
+
+    @pytest.mark.parametrize(
+        "model",
+        [
+            "gpt-5.10",  # version typo / future model: must NOT match gpt-5.1
+            "gpt-5.10-mini",
+            "gpt-5.1foo",
+            "o1-previewed",  # near-typo of o1-preview
+            "o1-previews",
+        ],
+    )
+    def test_boundary_collisions_do_not_misclassify(self, model: str) -> None:
+        # Either we return None (no match) or we match a strictly different
+        # family (e.g. `o1-previewed` can match `o1` segment-wise -- in that
+        # case we want it to NOT claim to be o1-preview).
+        caps = get_model_capabilities(model)
+        if caps is not None:
+            assert caps.family != "gpt-5.1", model
+            assert caps.family != "o1-preview", model
+
+    def test_gpt_5_10_does_not_match_gpt_5_1(self) -> None:
+        # `gpt-5.10` is hypothetical, but the lookup must not classify it as
+        # gpt-5.1 just because of `startswith`. Today we'd expect None until
+        # a 5.10 family is registered.
+        assert get_model_capabilities("gpt-5.10") is None
+
+    def test_o1_previewed_does_not_match_o1_preview(self) -> None:
+        # Without the boundary check, `o1-previewed` would be tagged as
+        # o1-preview (which has supports_reasoning=False), masking what is
+        # really an unknown model.
+        caps = get_model_capabilities("o1-previewed")
+        if caps is not None:
+            assert caps.family != "o1-preview"
+
+    def test_exact_match_still_works(self) -> None:
+        # Sanity: removing collisions must not break the exact-match path.
+        for model in ("gpt-5.1", "o1-preview", "chatgpt-4o-latest", "gpt-5-pro"):
+            assert get_model_capabilities(model) is not None, model
+
+    def test_dated_search_preview_still_recognized_as_chat_variant(self) -> None:
+        # `gpt-4o-search-preview-2025-03-11` should still count as a chat
+        # variant despite the trailing date snapshot. The previous suffix
+        # check (`endswith("-search-preview")`) silently missed dated
+        # snapshots.
+        caps = get_model_capabilities("gpt-4o-search-preview-2025-03-11")
+        assert caps is not None
+        assert caps.supports_temperature is True
+        assert caps.supports_reasoning is False
+
+    def test_dated_chat_latest_still_recognized_as_chat_variant(self) -> None:
+        caps = get_model_capabilities("gpt-5.2-chat-latest-2025-12-11")
+        assert caps is not None
+        assert caps.supports_temperature is True
+        assert caps.supports_reasoning is False
+        # Family should still be reported as the underlying gpt-5.2
+        assert caps.family == "gpt-5.2"
+
+
 class TestUnknownAndEdgeCases:
     def test_unknown_model_returns_none(self) -> None:
         assert get_model_capabilities("nonexistent-model") is None
@@ -247,13 +332,14 @@ def effort_options(model: str) -> tuple[str, ...]:
             return tuple(opt for opt in caps.reasoning_effort_options if opt is not None)
 
         assert effort_options("gpt-5") == ("minimal", "low", "medium", "high")
+        assert effort_options("gpt-5.1") == ("none", "low", "medium", "high")
         assert effort_options("gpt-5.4") == (
             "none",
-            "minimal",
             "low",
             "medium",
             "high",
             "xhigh",
         )
+        assert effort_options("gpt-5-pro") == ("high",)
         assert effort_options("gpt-4.1") == ()
         assert effort_options("o3") == ("low", "medium", "high")