Skip to content
12 changes: 12 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,18 @@ Deprecations
~~~~~~~~~~~~


Performance
~~~~~~~~~~~

- Skip data reads during CF datetime dtype inference when the output dtype is
determinable from metadata alone (``use_cftime`` explicitly set, non-standard
calendar, non-nanosecond ``time_unit``, lazy backend arrays, or when
``valid_max`` / ``time_coverage_end`` / a narrow storage dtype bounds the
possible values). This can significantly speed up
:py:func:`open_dataset` / :py:func:`open_datatree` on remote-backed stores
(zarr on S3, icechunk) with many time-encoded variables (:issue:`11303`).
By `Alfonso Ladino <https://github.com/aladinor>`_.

Bug Fixes
~~~~~~~~~

Expand Down
216 changes: 196 additions & 20 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,39 +336,210 @@ def _unpack_time_units_and_ref_date_cftime(units: str, calendar: str):
return time_units, ref_date


def _decode_cf_datetime_dtype(
class _CannotInferFromMetadata(Exception):
"""Raised by _infer_dtype_from_metadata when data reads are required."""


# Edges of the datetime64[ns] representable range (the value range of a
# pandas.Timestamp at nanosecond resolution).  Decoded timestamps must fall
# between these bounds to be representable as datetime64[ns].
_DATETIME64_NS_LOWER = pd.Timestamp("1677-09-21 00:12:43.145224193")
_DATETIME64_NS_UPPER = pd.Timestamp("2262-04-11 23:47:16.854775807")


def _max_for_storage_dtype(dtype: np.dtype) -> int | None:
"""Maximum absolute value representable in ``dtype``, or None if the
bound is too loose to be useful (int64, floats)."""
if dtype.kind in "iu" and dtype.itemsize <= 4:
info = np.iinfo(dtype)
return int(max(abs(info.min), info.max))
return None


def _bound_fits_datetime64_ns(max_value: float | int, units: str) -> bool:
    """Return True if ``ref_date ± max_value`` (in the units' time unit)
    stays inside the datetime64[ns] representable range."""
    unit, epoch = _unpack_time_unit_and_ref_date(units)
    try:
        offset = pd.Timedelta(int(max_value), unit=unit)
        lowest = epoch - offset
        highest = epoch + offset
    except (OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError, ValueError):
        # Building the delta or shifting the epoch already overflowed, so
        # the bound certainly does not fit.
        return False
    return _DATETIME64_NS_LOWER <= lowest and highest <= _DATETIME64_NS_UPPER


def _is_backed_by_lazy_backend(data) -> bool:
    """Return True when ``data`` wraps a backend-lazy array.

    Unwraps nested ``.array`` attributes (intermediate coder wrappers)
    looking for a ``LazilyIndexedArray`` (zarr/netcdf/DAP backends).
    In-memory data — plain numpy, ``PandasIndexingAdapter`` — yields
    False, since reading it is essentially free and we want to keep the
    fail-fast semantics there.

    Unwrapping depth is capped at 16 to guard against reference cycles.
    """
    current = data
    depth = 0
    while depth < 16:
        if isinstance(current, indexing.LazilyIndexedArray):
            return True
        nested = getattr(current, "array", None)
        if nested is None or nested is current:
            return False
        current = nested
        depth += 1
    return False


def _bound_from_attrs(attrs: dict, units: str) -> bool:
    """Return True if attributes prove the decoded values fit datetime64[ns].

    ``valid_max`` is treated as a bound on the raw offsets from the units'
    reference date; ``time_coverage_end`` as an absolute end timestamp.
    """
    valid_max = attrs.get("valid_max")
    if valid_max is not None and _bound_fits_datetime64_ns(valid_max, units):
        return True

    coverage_end = attrs.get("time_coverage_end")
    if coverage_end is None:
        return False
    try:
        stamp = pd.Timestamp(coverage_end)
    except (ValueError, TypeError):
        return False
    # The ns-range bounds are tz-naive, so drop any timezone before comparing.
    if stamp.tzinfo is not None:
        if stamp.tz is not None:
            stamp = stamp.tz_convert(None)
        else:
            stamp = stamp.tz_localize(None)
    return bool(_DATETIME64_NS_LOWER <= stamp <= _DATETIME64_NS_UPPER)


def _is_overflow_safe_from_metadata(
    data,
    units: str,
    calendar: str | None,
    use_cftime: bool | None,
    time_unit: PDDatetimeUnitOptions,
    attrs: dict | None,
) -> bool:
    """Whether decode_cf_datetime's output dtype is knowable from metadata.

    Output dtype depends on data values only for ``use_cftime=None`` +
    standard calendar + ``time_unit="ns"`` with a backend-lazy source
    (where reads are expensive) and no attr/dtype bound that proves
    values fit the datetime64[ns] range.

    NOTE(review): the pasted diff left the superseded parameter line
    ``time_unit: PDDatetimeUnitOptions = "ns",`` stacked above the new
    one, which is a duplicate-argument SyntaxError; only the new,
    default-less parameter is kept here.
    """
    # An explicit use_cftime fixes the output dtype regardless of values.
    if use_cftime is not None:
        return True
    # Non-standard calendars always decode to cftime (object dtype).
    if calendar is not None and not _is_standard_calendar(calendar):
        return True
    # Non-"ns" datetime64 resolutions have ranges ~±10¹¹ years — overflow
    # is unreachable in practice.
    if time_unit != "ns":
        return True
    # Backend-lazy arrays: reads go through the backend's indexing
    # pipeline (zarr sync/async, DAP, etc.) and can be expensive even
    # when values are cached. We walk through intermediate coder wrappers
    # (_ElementwiseFunctionArray) to find the backing array. In-memory
    # wrappers like PandasIndexingAdapter are not covered here — reading
    # them is free and preserves fail-fast on invalid values.
    if _is_backed_by_lazy_backend(data):
        return True
    # Chunked + ns + auto: need a proven bound from attrs...
    if attrs is not None and _bound_from_attrs(attrs, units):
        return True
    # ...or from a narrow storage dtype whose extreme values still fit.
    storage_dtype = getattr(data, "dtype", None)
    if storage_dtype is not None:
        max_val = _max_for_storage_dtype(storage_dtype)
        if max_val is not None and _bound_fits_datetime64_ns(max_val, units):
            return True
    return False


def _infer_dtype_from_metadata(
    data,
    units: str,
    calendar: str | None,
    use_cftime: bool | None,
    time_unit: PDDatetimeUnitOptions,
    attrs: dict | None,
) -> np.dtype:
    """Infer the decoded dtype from metadata alone, without reading data.

    Raises ``_CannotInferFromMetadata`` when the dtype depends on values
    that must be read.  Deterministic cases (``use_cftime`` explicitly
    True, non-standard calendar) return the object dtype directly so
    ``cftime`` does not need to be importable.  Otherwise
    ``decode_cf_datetime`` is run on a synthetic ``[0, 1]`` array so the
    resolution-selection logic matches real-data output.
    """
    # Malformed unit strings raise ValueError here, before anything else.
    _unpack_time_unit_and_ref_date(units)

    object_dtype = np.dtype("object")
    if use_cftime is True:
        return object_dtype
    if calendar is not None and not _is_standard_calendar(calendar):
        return object_dtype

    metadata_safe = _is_overflow_safe_from_metadata(
        data, units, calendar, use_cftime, time_unit, attrs
    )
    if not metadata_safe:
        raise _CannotInferFromMetadata

    sample = decode_cf_datetime(
        np.array([0, 1]), units, calendar, use_cftime, time_unit
    )
    return getattr(sample, "dtype", object_dtype)


def _infer_dtype_from_first_last(
    data,
    units: str,
    calendar: str | None,
    use_cftime: bool | None,
    time_unit: PDDatetimeUnitOptions,
) -> np.dtype:
    """Infer the decoded dtype by decoding the first and last data values.

    Fallback used only when metadata cannot prove the dtype (chunked
    int64/float64 arrays with ``use_cftime=None`` and no value bounds).
    Reads are expensive on remote-backed stores but unavoidable here.
    Decoding the endpoints also verifies that they decode at all, so
    failures surface now rather than being swallowed by
    ``Dataset.__repr__`` when users view their lazily decoded array.
    """
    indexable = indexing.ImplicitToExplicitIndexingAdapter(
        indexing.as_indexable(data)
    )
    endpoints = np.concatenate(
        [to_numpy(first_n_items(indexable, 1)), to_numpy(last_item(indexable))]
    )
    decoded = decode_cf_datetime(endpoints, units, calendar, use_cftime, time_unit)
    return getattr(decoded, "dtype", np.dtype("object"))


def _wrap_decode_error(err: Exception, units: str, calendar: str | None) -> ValueError:
calendar_msg = (
"the default calendar" if calendar is None else f"calendar {calendar!r}"
)
msg = (
f"unable to decode time units {units!r} with {calendar_msg!r}. Try "
"opening your dataset with decode_times=False or installing cftime "
"if it is not installed."
)
return ValueError(msg)


def _decode_cf_datetime_dtype(
    data,
    units: str,
    calendar: str | None,
    use_cftime: bool | None,
    time_unit: PDDatetimeUnitOptions = "ns",
    attrs: dict | None = None,
) -> np.dtype:
    """Determine the dtype that decoding ``data`` as CF datetimes yields.

    First tries metadata-only inference (no data reads); when that raises
    ``_CannotInferFromMetadata``, falls back to decoding the first and
    last data values.  Decode failures in either path are re-raised as a
    user-facing ValueError chained to the original error.

    NOTE(review): the pasted diff interleaved lines of the pre-refactor
    implementation (``example_value``/``result``/inline message build)
    with the new two-stage control flow, leaving the function
    syntactically invalid; this is the reconstructed new version.
    """
    try:
        return _infer_dtype_from_metadata(
            data, units, calendar, use_cftime, time_unit, attrs
        )
    except _CannotInferFromMetadata:
        # Metadata was inconclusive — fall through to reading data values.
        pass
    except Exception as err:
        raise _wrap_decode_error(err, units, calendar) from err

    try:
        return _infer_dtype_from_first_last(
            data, units, calendar, use_cftime, time_unit
        )
    except Exception as err:
        raise _wrap_decode_error(err, units, calendar) from err


def _decode_datetime_with_cftime(
Expand Down Expand Up @@ -1412,7 +1583,12 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
units = pop_to(attrs, encoding, "units")
calendar = pop_to(attrs, encoding, "calendar")
dtype = _decode_cf_datetime_dtype(
data, units, calendar, self.use_cftime, self.time_unit
data,
units,
calendar,
self.use_cftime,
self.time_unit,
attrs=attrs,
)
transform = partial(
decode_cf_datetime,
Expand Down
Loading
Loading