Skip to content

Commit b4cf340

Browse files
refactor(telemetry): extract gen_ai_span context manager — drop try/except + import time from client
User pushed back on `_predict` wrapping its entire body in try/except just to record `gen_ai.client.operation.duration` on the error path. The right home for that is a context manager inside `telemetry.py`. `gen_ai_span(model=, provider=, protocol=, modality=)` opens the span via `tracer.start_as_current_span`, captures the start time on enter, and in `finally` records the operation duration with `error.type` populated when the body raised. It yields `(span, request_attrs)` so `_predict` can call `add_input_event` and `record_output` against them.

Result:
- `_predict` is straight-line code: no try/except, no manual time math.
- `client.py` no longer imports `time`.
- `record_output` no longer takes `duration_seconds` / `error` params — duration is the span's lifecycle concern, not the output recorder's.
- Streaming side (`_TracedStream._finalize`) updates symmetrically: records duration directly, then calls the simplified `record_output`.

Real-call validated against gemini-3.1-flash-lite-preview text non-streaming + async streaming: same span attributes, same metric histograms, same content events as before.
1 parent fcc48af commit b4cf340

2 files changed

Lines changed: 76 additions & 58 deletions

File tree

src/celeste/client.py

Lines changed: 32 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Base client for modality-specific AI operations."""
22

3-
import time
43
import warnings
54
from abc import ABC, abstractmethod
65
from collections.abc import AsyncIterator
@@ -214,55 +213,42 @@ async def _predict(
214213
Returns:
215214
Output of the parameterized type.
216215
"""
217-
request_attrs = telemetry.request_attributes(
216+
with telemetry.gen_ai_span(
218217
model=self.model,
219218
provider=self.provider,
220219
protocol=self.protocol,
221220
modality=self.modality,
222-
)
223-
started = time.monotonic()
224-
with telemetry.tracer.start_as_current_span(
225-
telemetry.span_name(self.modality, self.model),
226-
attributes=request_attrs,
227-
) as span:
228-
try:
229-
inputs, parameters = self._validate_artifacts(inputs, **parameters)
230-
telemetry.add_input_event(span, inputs)
231-
request_body = self._build_request(
232-
inputs, extra_body=extra_body, **parameters
233-
)
234-
response_data = await self._make_request(
235-
request_body,
236-
endpoint=endpoint,
237-
extra_headers=extra_headers,
238-
**parameters,
239-
)
240-
content = self._parse_content(response_data)
241-
content = self._transform_output(content, **parameters)
242-
tool_calls = self._parse_tool_calls(response_data)
243-
reasoning, signature = self._parse_reasoning(response_data)
244-
kwargs: dict[str, Any] = {}
245-
if reasoning is not None:
246-
kwargs["reasoning"] = reasoning
247-
if signature:
248-
kwargs["signature"] = signature
249-
output = self._output_class()(
250-
content=content,
251-
usage=self._get_usage(response_data),
252-
finish_reason=self._get_finish_reason(response_data),
253-
metadata=self._build_metadata(response_data),
254-
tool_calls=tool_calls,
255-
**kwargs,
256-
)
257-
telemetry.record_output(
258-
span, output, request_attrs, time.monotonic() - started
259-
)
260-
return output
261-
except BaseException as exc:
262-
telemetry.record_operation_duration(
263-
time.monotonic() - started, request_attrs, error=exc
264-
)
265-
raise
221+
) as (span, request_attrs):
222+
inputs, parameters = self._validate_artifacts(inputs, **parameters)
223+
telemetry.add_input_event(span, inputs)
224+
request_body = self._build_request(
225+
inputs, extra_body=extra_body, **parameters
226+
)
227+
response_data = await self._make_request(
228+
request_body,
229+
endpoint=endpoint,
230+
extra_headers=extra_headers,
231+
**parameters,
232+
)
233+
content = self._parse_content(response_data)
234+
content = self._transform_output(content, **parameters)
235+
tool_calls = self._parse_tool_calls(response_data)
236+
reasoning, signature = self._parse_reasoning(response_data)
237+
kwargs: dict[str, Any] = {}
238+
if reasoning is not None:
239+
kwargs["reasoning"] = reasoning
240+
if signature:
241+
kwargs["signature"] = signature
242+
output = self._output_class()(
243+
content=content,
244+
usage=self._get_usage(response_data),
245+
finish_reason=self._get_finish_reason(response_data),
246+
metadata=self._build_metadata(response_data),
247+
tool_calls=tool_calls,
248+
**kwargs,
249+
)
250+
telemetry.record_output(span, output, request_attrs)
251+
return output
266252

267253
def _parse_tool_calls(self, response_data: dict[str, Any]) -> list[ToolCall]:
268254
"""Parse tool calls from response. Override in providers that support tools."""

src/celeste/telemetry.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -365,16 +365,49 @@ def record_output(
365365
span: Any,
366366
output: Output[Any],
367367
metric_attributes: dict[str, Any],
368-
duration_seconds: float,
369-
error: BaseException | None = None,
370368
) -> None:
371-
"""Emit span attrs, content event, and metrics for a successful Output."""
369+
"""Emit span attrs, content event, and token usage for a successful Output."""
372370
span.set_attributes(output_attributes(output))
373371
output_event = _output_messages_event(output)
374372
if output_event is not None:
375373
span.add_event("gen_ai.output.messages", attributes=output_event)
376374
record_token_usage(output.usage, metric_attributes)
377-
record_operation_duration(duration_seconds, metric_attributes, error=error)
375+
376+
377+
@contextmanager
def gen_ai_span(
    *,
    model: Model,
    provider: Provider | None,
    protocol: Protocol | None,
    modality: Modality,
    extra_attributes: dict[str, Any] | None = None,
) -> Iterator[tuple[Any, dict[str, Any]]]:
    """Open a GenAI span and record operation duration on exit.

    Yields ``(span, request_attrs)``. On any exception, the duration is recorded
    with ``error.type`` set; on success it's recorded plain.
    """
    request_attrs = request_attributes(
        model=model, provider=provider, protocol=protocol, modality=modality
    )
    # extra_attributes only widen what goes on the *span*; the duration metric
    # below is always recorded against the plain request_attrs.
    span_attrs = (
        {**request_attrs, **extra_attributes} if extra_attributes else request_attrs
    )
    started = time.monotonic()
    error: BaseException | None = None
    with tracer.start_as_current_span(
        span_name(modality, model), attributes=span_attrs
    ) as span:
        try:
            yield span, request_attrs
        except BaseException as exc:
            # Capture the exception so the duration metric carries error.type,
            # then re-raise so the span (and the caller) still see the failure.
            error = exc
            raise
        finally:
            # Runs on both paths; error stays None on the success path.
            record_operation_duration(
                time.monotonic() - started, request_attrs, error=error
            )
378411

379412

380413
class _TracedStream:
@@ -484,19 +517,17 @@ def _finalize(self) -> None:
484517
if self._ended:
485518
return
486519
self._ended = True
487-
duration = time.monotonic() - self._started
520+
record_operation_duration(
521+
time.monotonic() - self._started,
522+
self._metric_attributes,
523+
error=self._error,
524+
)
488525
try:
489526
output = self._inner.output
490527
except StreamNotExhaustedError:
491528
output = None
492529
if output is not None:
493-
record_output(
494-
self._span, output, self._metric_attributes, duration, error=self._error
495-
)
496-
else:
497-
record_operation_duration(
498-
duration, self._metric_attributes, error=self._error
499-
)
530+
record_output(self._span, output, self._metric_attributes)
500531
self._span.end()
501532

502533

@@ -513,6 +544,7 @@ def trace_stream(
513544
"Status",
514545
"StatusCode",
515546
"add_input_event",
547+
"gen_ai_span",
516548
"meter",
517549
"output_attributes",
518550
"record_operation_duration",

0 commit comments

Comments
 (0)