Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions docs/source/en/internal/generation_utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,20 +258,6 @@ A [`StoppingCriteria`] can be used to change when to stop generation (other than

[[autodoc]] EncoderDecoderCache

[[autodoc]] QuantoQuantizedCache

[[autodoc]] HQQQuantizedCache

[[autodoc]] OffloadedCache

[[autodoc]] OffloadedStaticCache

[[autodoc]] HybridCache

[[autodoc]] HybridChunkedCache

[[autodoc]] SlidingWindowCache

## Watermark Utils

[[autodoc]] WatermarkingConfig
Expand Down
14 changes: 0 additions & 14 deletions docs/source/ko/internal/generation_utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,22 +235,8 @@ generation_output[:2]

[[autodoc]] QuantizedCache

[[autodoc]] QuantoQuantizedCache

[[autodoc]] HQQQuantizedCache

[[autodoc]] OffloadedCache

[[autodoc]] StaticCache

[[autodoc]] OffloadedStaticCache

[[autodoc]] HybridCache

[[autodoc]] HybridChunkedCache

[[autodoc]] SlidingWindowCache

[[autodoc]] EncoderDecoderCache

## 워터마크 유틸리티 (Watermark Utils) [[transformers.WatermarkDetector]]
Expand Down
17 changes: 0 additions & 17 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,15 +363,6 @@
"StaticSlidingWindowLayer",
"QuantoQuantizedLayer",
"HQQQuantizedLayer",
"SlidingWindowLayer",
"ChunkedSlidingLayer",
"HQQQuantizedCache",
"HybridCache",
"HybridChunkedCache",
"OffloadedCache",
"OffloadedStaticCache",
"QuantoQuantizedCache",
"SlidingWindowCache",
"Cache",
"DynamicCache",
"EncoderDecoderCache",
Expand Down Expand Up @@ -482,20 +473,12 @@
if TYPE_CHECKING:
# All modeling imports
from .cache_utils import Cache as Cache
from .cache_utils import ChunkedSlidingLayer as ChunkedSlidingLayer
from .cache_utils import DynamicCache as DynamicCache
from .cache_utils import DynamicLayer as DynamicLayer
from .cache_utils import EncoderDecoderCache as EncoderDecoderCache
from .cache_utils import HQQQuantizedCache as HQQQuantizedCache
from .cache_utils import HQQQuantizedLayer as HQQQuantizedLayer
from .cache_utils import HybridCache as HybridCache
from .cache_utils import OffloadedCache as OffloadedCache
from .cache_utils import OffloadedStaticCache as OffloadedStaticCache
from .cache_utils import QuantizedCache as QuantizedCache
from .cache_utils import QuantoQuantizedCache as QuantoQuantizedCache
from .cache_utils import QuantoQuantizedLayer as QuantoQuantizedLayer
from .cache_utils import SlidingWindowCache as SlidingWindowCache
from .cache_utils import SlidingWindowLayer as SlidingWindowLayer
from .cache_utils import StaticCache as StaticCache
from .cache_utils import StaticLayer as StaticLayer
from .cache_utils import StaticSlidingWindowLayer as StaticSlidingWindowLayer
Expand Down
111 changes: 1 addition & 110 deletions src/transformers/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ def __init__(
from optimum.quanto import MaxOptimizer, qint2, qint4
else:
raise ImportError(
"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. "
"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedLayer`. "
)

if self.nbits not in [2, 4]:
Expand Down Expand Up @@ -1293,112 +1293,3 @@ def is_sliding(self):
@property
def is_compileable(self) -> bool:
return self.self_attention_cache.is_compileable


### Deprecated classes


class SlidingWindowLayer(StaticSlidingWindowLayer):
    """
    Deprecated subclass of `StaticSlidingWindowLayer`.

    Adds nothing beyond a deprecation notice sent through `logger.warning_once`; both arguments are then
    handed to the parent constructor unchanged.

    Args:
        max_cache_len (`int`): Forwarded positionally to `StaticSlidingWindowLayer.__init__`.
        sliding_window (`int`): Forwarded positionally to `StaticSlidingWindowLayer.__init__`.
    """

    def __init__(self, max_cache_len: int, sliding_window: int):
        deprecation_notice = (
            "`SlidingWindowLayer` is deprecated and will be removed in version v4.59 "
            "Use `StaticSlidingWindowLayer` instead, which is a better name for it."
        )
        logger.warning_once(deprecation_notice)
        super().__init__(max_cache_len, sliding_window)


class ChunkedSlidingLayer(StaticSlidingWindowLayer):
    """
    Deprecated subclass of `StaticSlidingWindowLayer`.

    Sends a deprecation notice through `logger.warning_once`, then delegates construction to the parent
    with the same two arguments.

    Args:
        max_cache_len (`int`): Forwarded positionally to `StaticSlidingWindowLayer.__init__`.
        sliding_window (`int`): Forwarded positionally to `StaticSlidingWindowLayer.__init__`.
    """

    def __init__(self, max_cache_len: int, sliding_window: int):
        deprecation_notice = (
            "`ChunkedSlidingLayer` is deprecated and will be removed in version v4.59 "
            "Use `StaticSlidingWindowLayer` instead, which has the exact same functionalities."
        )
        logger.warning_once(deprecation_notice)
        super().__init__(max_cache_len, sliding_window)


class OffloadedCache(DynamicCache):
    """
    Deprecated shim equivalent to `DynamicCache(offloading=True)`.

    Takes no arguments. A deprecation notice is sent through `logger.warning_once` before the parent
    constructor is called with `offloading=True`.
    """

    def __init__(self) -> None:
        deprecation_notice = (
            "`OffloadedCache` is deprecated and will be removed in version v4.59 "
            "Use `DynamicCache(offloading=True)` instead"
        )
        logger.warning_once(deprecation_notice)
        super().__init__(offloading=True)


class OffloadedStaticCache(StaticCache):
    """
    Deprecated shim equivalent to `StaticCache(config=..., max_cache_len=..., offloading=True)`.

    Args:
        config (`PreTrainedConfig`): Passed to `StaticCache.__init__` as `config`.
        max_cache_len (`int`): Passed to `StaticCache.__init__` as `max_cache_len`.
        *args, **kwargs: Accepted so that older call sites keep working, but NOT forwarded to the parent.
    """

    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
        deprecation_notice = (
            "`OffloadedStaticCache` is deprecated and will be removed in version v4.59 "
            "Use `StaticCache(..., offloading=True)` instead"
        )
        logger.warning_once(deprecation_notice)
        # Only the two named arguments reach the parent; extra args/kwargs are dropped on purpose.
        parent_kwargs = {"config": config, "max_cache_len": max_cache_len, "offloading": True}
        super().__init__(**parent_kwargs)


class SlidingWindowCache(StaticCache):
    """
    Deprecated shim equivalent to `StaticCache(config=..., max_cache_len=...)`.

    Args:
        config (`PreTrainedConfig`): Passed to `StaticCache.__init__` as `config`.
        max_cache_len (`int`): Passed to `StaticCache.__init__` as `max_cache_len`.
        *args, **kwargs: Accepted so that older call sites keep working, but NOT forwarded to the parent.
    """

    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
        deprecation_notice = (
            "`SlidingWindowCache` is deprecated and will be removed in version v4.59 "
            "Use `StaticCache(...)` instead which will correctly infer the type of each layer."
        )
        logger.warning_once(deprecation_notice)
        # Extra args/kwargs are intentionally not passed on.
        parent_kwargs = {"config": config, "max_cache_len": max_cache_len}
        super().__init__(**parent_kwargs)


class HybridCache(StaticCache):
    """
    Deprecated shim equivalent to `StaticCache(config=..., max_cache_len=...)`.

    Args:
        config (`PreTrainedConfig`): Passed to `StaticCache.__init__` as `config`.
        max_cache_len (`int`): Passed to `StaticCache.__init__` as `max_cache_len`.
        *args, **kwargs: Accepted so that older call sites keep working, but NOT forwarded to the parent.
    """

    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
        deprecation_notice = (
            "`HybridCache` is deprecated and will be removed in version v4.59 "
            "Use `StaticCache(...)` instead which will correctly infer the type of each layer."
        )
        logger.warning_once(deprecation_notice)
        # Extra args/kwargs are intentionally not passed on.
        parent_kwargs = {"config": config, "max_cache_len": max_cache_len}
        super().__init__(**parent_kwargs)


class HybridChunkedCache(StaticCache):
    """
    Deprecated shim equivalent to `StaticCache(config=..., max_cache_len=...)`.

    Args:
        config (`PreTrainedConfig`): Passed to `StaticCache.__init__` as `config`.
        max_cache_len (`int`): Passed to `StaticCache.__init__` as `max_cache_len`.
        *args, **kwargs: Accepted so that older call sites keep working, but NOT forwarded to the parent.
    """

    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
        deprecation_notice = (
            "`HybridChunkedCache` is deprecated and will be removed in version v4.59 "
            "Use `StaticCache(...)` instead which will correctly infer the type of each layer."
        )
        logger.warning_once(deprecation_notice)
        # Extra args/kwargs are intentionally not passed on.
        parent_kwargs = {"config": config, "max_cache_len": max_cache_len}
        super().__init__(**parent_kwargs)


class OffloadedHybridCache(StaticCache):
    """
    Deprecated shim equivalent to `StaticCache(config=..., max_cache_len=..., offloading=True)`.

    Args:
        config (`PreTrainedConfig`): Passed to `StaticCache.__init__` as `config`.
        max_cache_len (`int`): Passed to `StaticCache.__init__` as `max_cache_len`.
        *args, **kwargs: Accepted so that older call sites keep working, but NOT forwarded to the parent.
    """

    def __init__(self, config: PreTrainedConfig, max_cache_len: int, *args, **kwargs):
        # The suggested replacement must name the keyword this class actually passes below
        # (`offloading`), not `offload`, so users can copy it verbatim.
        logger.warning_once(
            "`OffloadedHybridCache` is deprecated and will be removed in version v4.59 "
            "Use `StaticCache(..., offloading=True)` instead which will correctly infer the type of each layer."
        )
        super().__init__(config=config, max_cache_len=max_cache_len, offloading=True)


class QuantoQuantizedCache(QuantizedCache):
    """
    Deprecated shim that builds a `QuantizedCache` with the `"quanto"` backend.

    Every argument is forwarded positionally to `QuantizedCache.__init__`, right after the backend name
    and in the order they are declared here.

    Args:
        config (`PreTrainedConfig`): Forwarded to the parent constructor.
        nbits (`int`, defaults to 4): Forwarded to the parent constructor.
        axis_key (`int`, defaults to 0): Forwarded to the parent constructor.
        axis_value (`int`, defaults to 0): Forwarded to the parent constructor.
        q_group_size (`int`, defaults to 64): Forwarded to the parent constructor.
        residual_length (`int`, defaults to 128): Forwarded to the parent constructor.
    """

    def __init__(
        self,
        config: PreTrainedConfig,
        nbits: int = 4,
        axis_key: int = 0,
        axis_value: int = 0,
        q_group_size: int = 64,
        residual_length: int = 128,
    ):
        deprecation_notice = (
            "`QuantoQuantizedCache` is deprecated and will be removed in version v4.59 "
            "Use `QuantizedCache(backend='quanto', ...)` instead."
        )
        logger.warning_once(deprecation_notice)
        # Positional order matters: backend first, then config, then the quantization settings.
        quantization_args = (nbits, axis_key, axis_value, q_group_size, residual_length)
        super().__init__("quanto", config, *quantization_args)


class HQQQuantizedCache(QuantizedCache):
    """
    Deprecated shim that builds a `QuantizedCache` with the `"hqq"` backend.

    Every argument is forwarded positionally to `QuantizedCache.__init__`, right after the backend name
    and in the order they are declared here.

    Args:
        config (`PreTrainedConfig`): Forwarded to the parent constructor.
        nbits (`int`, defaults to 4): Forwarded to the parent constructor.
        axis_key (`int`, defaults to 0): Forwarded to the parent constructor.
        axis_value (`int`, defaults to 0): Forwarded to the parent constructor.
        q_group_size (`int`, defaults to 64): Forwarded to the parent constructor.
        residual_length (`int`, defaults to 128): Forwarded to the parent constructor.
    """

    def __init__(
        self,
        config: PreTrainedConfig,
        nbits: int = 4,
        axis_key: int = 0,
        axis_value: int = 0,
        q_group_size: int = 64,
        residual_length: int = 128,
    ):
        deprecation_notice = (
            "`HQQQuantizedCache` is deprecated and will be removed in version v4.59 "
            "Use `QuantizedCache(backend='hqq', ...)` instead."
        )
        logger.warning_once(deprecation_notice)
        # Positional order matters: backend first, then config, then the quantization settings.
        quantization_args = (nbits, axis_key, axis_value, q_group_size, residual_length)
        super().__init__("hqq", config, *quantization_args)
42 changes: 0 additions & 42 deletions src/transformers/utils/dummy_pt_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,55 +23,13 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])


# Placeholder for `HQQQuantizedCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class HQQQuantizedCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


# Placeholder for `HybridCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class HybridCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


# Placeholder for `OffloadedCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class OffloadedCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


# Placeholder for `OffloadedStaticCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class OffloadedStaticCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


# Placeholder for `QuantizedCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class QuantizedCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


# Placeholder for `QuantoQuantizedCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class QuantoQuantizedCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


# Placeholder for `SlidingWindowCache`, declared with the `DummyObject` metaclass and tied to the "torch" backend.
class SlidingWindowCache(metaclass=DummyObject):
    # Backend names associated with this placeholder.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and defers to `requires_backends`
        # (presumably raises when torch is unavailable — confirm against its definition).
        requires_backends(self, ["torch"])


class StaticCache(metaclass=DummyObject):
_backends = ["torch"]

Expand Down
2 changes: 1 addition & 1 deletion tests/models/ministral/test_modeling_ministral.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def test_export_text_with_hybrid_cache(self):
),
)

# Export + HybridCache
# Export
model.eval()
exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
exported_program = exportable_module.export(
Expand Down
12 changes: 0 additions & 12 deletions tests/models/moshi/test_modeling_moshi.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,18 +585,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se
def test_generate_continue_from_past_key_values(self):
pass

# Unconditionally skipped; the reason string below is what the test runner reports.
@unittest.skip(
    "Moshi either needs default generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop."
)
def test_greedy_generate_dict_outputs_use_cache(self):
    # No-op body: the decorator above prevents this test from running.
    pass

# Unconditionally skipped; the reason string below is what the test runner reports.
@unittest.skip(
    "Moshi either needs default generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop."
)
def test_beam_search_generate_dict_outputs_use_cache(self):
    # No-op body: the decorator above prevents this test from running.
    pass

@parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
@unittest.skip(reason="Unimplemented. Relies on `test_eager_matches_sdpa_generate` to check correctness.")
def test_eager_matches_sdpa_inference(
Expand Down
2 changes: 1 addition & 1 deletion tests/utils/test_cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def test_cache_beam_search(self, cache_implementation):
"""
_skip_on_failed_cache_prerequisites(self, cache_implementation)
if cache_implementation == "offloaded_hybrid_chunked":
# TODO (joao, cyril): something is off with `offloaded_hybrid_chunked` aka `OffloadedHybridCache`: the
# TODO (joao, cyril): something is off with `offloaded_hybrid_chunked`: the
# output sequence (and the corresponding beam scores, if we add `output_scores=True`) are significantly
# different from the other caches.
self.skipTest("`offloaded_hybrid_chunked` fails this test")
Expand Down