huggingface · Cyrilvallez · Jan 9, 2026 · Jan 8, 2026 · Jan 8, 2026
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md
@@ -258,20 +258,6 @@ A [`StoppingCriteria`] can be used to change when to stop generation (other than
 
 [[autodoc]] EncoderDecoderCache
 
-[[autodoc]] QuantoQuantizedCache
-
-[[autodoc]] HQQQuantizedCache
-
-[[autodoc]] OffloadedCache
-
-[[autodoc]] OffloadedStaticCache
-
-[[autodoc]] HybridCache
-
-[[autodoc]] HybridChunkedCache
-
-[[autodoc]] SlidingWindowCache
-
 ## Watermark Utils
 
 [[autodoc]] WatermarkingConfig

diff --git a/docs/source/ko/internal/generation_utils.md b/docs/source/ko/internal/generation_utils.md
@@ -235,22 +235,8 @@ generation_output[:2]
 
 [[autodoc]] QuantizedCache
 
-[[autodoc]] QuantoQuantizedCache
-
-[[autodoc]] HQQQuantizedCache
-
-[[autodoc]] OffloadedCache
-
 [[autodoc]] StaticCache
 
-[[autodoc]] OffloadedStaticCache
-
-[[autodoc]] HybridCache
-
-[[autodoc]] HybridChunkedCache
-
-[[autodoc]] SlidingWindowCache
-
 [[autodoc]] EncoderDecoderCache
 
 ## 워터마크 유틸리티 (Watermark Utils) [[transformers.WatermarkDetector]]

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -363,15 +363,6 @@
         "StaticSlidingWindowLayer",
         "QuantoQuantizedLayer",
         "HQQQuantizedLayer",
-        "SlidingWindowLayer",
-        "ChunkedSlidingLayer",
-        "HQQQuantizedCache",
-        "HybridCache",
-        "HybridChunkedCache",
-        "OffloadedCache",
-        "OffloadedStaticCache",
-        "QuantoQuantizedCache",
-        "SlidingWindowCache",
         "Cache",
         "DynamicCache",
         "EncoderDecoderCache",
@@ -482,20 +473,12 @@
 if TYPE_CHECKING:
     # All modeling imports
     from .cache_utils import Cache as Cache
-    from .cache_utils import ChunkedSlidingLayer as ChunkedSlidingLayer
     from .cache_utils import DynamicCache as DynamicCache
     from .cache_utils import DynamicLayer as DynamicLayer
     from .cache_utils import EncoderDecoderCache as EncoderDecoderCache
-    from .cache_utils import HQQQuantizedCache as HQQQuantizedCache
     from .cache_utils import HQQQuantizedLayer as HQQQuantizedLayer
-    from .cache_utils import HybridCache as HybridCache
-    from .cache_utils import OffloadedCache as OffloadedCache
-    from .cache_utils import OffloadedStaticCache as OffloadedStaticCache
     from .cache_utils import QuantizedCache as QuantizedCache
-    from .cache_utils import QuantoQuantizedCache as QuantoQuantizedCache
     from .cache_utils import QuantoQuantizedLayer as QuantoQuantizedLayer
-    from .cache_utils import SlidingWindowCache as SlidingWindowCache
-    from .cache_utils import SlidingWindowLayer as SlidingWindowLayer
     from .cache_utils import StaticCache as StaticCache
     from .cache_utils import StaticLayer as StaticLayer
     from .cache_utils import StaticSlidingWindowLayer as StaticSlidingWindowLayer

diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -588,7 +588,7 @@ def __init__(
             from optimum.quanto import MaxOptimizer, qint2, qint4
         else:
             raise ImportError(
-                "You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. "
+                "You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedLayer`. "
             )
 
         if self.nbits not in [2, 4]:
@@ -1293,112 +1293,3 @@ def is_sliding(self):
     @property
     def is_compileable(self) -> bool:
         return self.self_attention_cache.is_compileable
-
-
-### Deprecated classes
-
-
-class SlidingWindowLayer(StaticSlidingWindowLayer):
-    def __init__(self, max_cache_len: int, sliding_window: int):
-        logger.warning_once(
-            "`SlidingWindowLayer` is deprecated and will be removed in version v4.59 "
-            "Use `StaticSlidingWindowLayer` instead, which is a better name for it."
-        )
-        super().__init__(max_cache_len, sliding_window)
-
-
-class ChunkedSlidingLayer(StaticSlidingWindowLayer):
-    def __init__(self, max_cache_len: int, sliding_window: int):
-        logger.warning_once(
-            "`ChunkedSlidingLayer` is deprecated and will be removed in version v4.59 "
-            "Use `StaticSlidingWindowLayer` instead, which has the exact same functionalities."
-        )
-        super().__init__(max_cache_len, sliding_window)
-
-
-class OffloadedCache(DynamicCache):
-    def __init__(self) -> None:
-        logger.warning_once(
-            "`OffloadedCache` is deprecated and will be removed in version v4.59 "
-            "Use `DynamicCache(offloading=True)` instead"
-        )
-        super().__init__(offloading=True)
-
-
-class OffloadedStaticCache(StaticCache):
-    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
-        logger.warning_once(
-            "`OffloadedStaticCache` is deprecated and will be removed in version v4.59 "
-            "Use `StaticCache(..., offloading=True)` instead"
-        )
-        super().__init__(config=config, max_cache_len=max_cache_len, offloading=True)
-
-
-class SlidingWindowCache(StaticCache):
-    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
-        logger.warning_once(
-            "`SlidingWindowCache` is deprecated and will be removed in version v4.59 "
-            "Use `StaticCache(...)` instead which will correctly infer the type of each layer."
-        )
-        super().__init__(config=config, max_cache_len=max_cache_len)
-
-
-class HybridCache(StaticCache):
-    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
-        logger.warning_once(
-            "`HybridCache` is deprecated and will be removed in version v4.59 "
-            "Use `StaticCache(...)` instead which will correctly infer the type of each layer."
-        )
-        super().__init__(config=config, max_cache_len=max_cache_len)
-
-
-class HybridChunkedCache(StaticCache):
-    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
-        logger.warning_once(
-            "`HybridChunkedCache` is deprecated and will be removed in version v4.59 "
-            "Use `StaticCache(...)` instead which will correctly infer the type of each layer."
-        )
-        super().__init__(config=config, max_cache_len=max_cache_len)
-
-
-class OffloadedHybridCache(StaticCache):
-    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
-        logger.warning_once(
-            "`OffloadedHybridCache` is deprecated and will be removed in version v4.59 "
-            "Use `StaticCache(..., offload=True)` instead which will correctly infer the type of each layer."
-        )
-        super().__init__(config=config, max_cache_len=max_cache_len, offloading=True)
-
-
-class QuantoQuantizedCache(QuantizedCache):
-    def __init__(
-        self,
-        config: PreTrainedConfig,
-        nbits: int = 4,
-        axis_key: int = 0,
-        axis_value: int = 0,
-        q_group_size: int = 64,
-        residual_length: int = 128,
-    ):
-        logger.warning_once(
-            "`QuantoQuantizedCache` is deprecated and will be removed in version v4.59 "
-            "Use `QuantizedCache(backend='quanto', ...)` instead."
-        )
-        super().__init__("quanto", config, nbits, axis_key, axis_value, q_group_size, residual_length)
-
-
-class HQQQuantizedCache(QuantizedCache):
-    def __init__(
-        self,
-        config: PreTrainedConfig,
-        nbits: int = 4,
-        axis_key: int = 0,
-        axis_value: int = 0,
-        q_group_size: int = 64,
-        residual_length: int = 128,
-    ):
-        logger.warning_once(
-            "`HQQQuantizedCache` is deprecated and will be removed in version v4.59 "
-            "Use `QuantizedCache(backend='hqq', ...)` instead."
-        )
-        super().__init__("hqq", config, nbits, axis_key, axis_value, q_group_size, residual_length)
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
@@ -23,55 +23,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class HQQQuantizedCache(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class HybridCache(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class OffloadedCache(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class OffloadedStaticCache(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 class QuantizedCache(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class QuantoQuantizedCache(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class SlidingWindowCache(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 class StaticCache(metaclass=DummyObject):
     _backends = ["torch"]
 

diff --git a/tests/models/ministral/test_modeling_ministral.py b/tests/models/ministral/test_modeling_ministral.py
@@ -174,7 +174,7 @@ def test_export_text_with_hybrid_cache(self):
             ),
         )
 
-        # Export + HybridCache
+        # Export
         model.eval()
         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(

diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py
@@ -585,18 +585,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se
     def test_generate_continue_from_past_key_values(self):
         pass
 
-    @unittest.skip(
-        "Moshi either needs default generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop."
-    )
-    def test_greedy_generate_dict_outputs_use_cache(self):
-        pass
-
-    @unittest.skip(
-        "Moshi either needs default generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop."
-    )
-    def test_beam_search_generate_dict_outputs_use_cache(self):
-        pass
-
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
     @unittest.skip(reason="Unimplemented. Relies on `test_eager_matches_sdpa_generate` to check correctness.")
     def test_eager_matches_sdpa_inference(

diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
@@ -177,7 +177,7 @@ def test_cache_beam_search(self, cache_implementation):
         """
         _skip_on_failed_cache_prerequisites(self, cache_implementation)
         if cache_implementation == "offloaded_hybrid_chunked":
-            # TODO (joao, cyril): something is off with `offloaded_hybrid_chunked` aka `OffloadedHybridCache`: the
+            # TODO (joao, cyril): something is off with `offloaded_hybrid_chunked`: the
             # output sequence (and the corresponding beam scores, if we add `output_scores=True`) are significantly
             # different from the other caches.
             self.skipTest("`offloaded_hybrid_chunked` fails this test")