diff --git a/conftest.py b/conftest.py index 670ceba2f272..0fab7ce3db27 100644 --- a/conftest.py +++ b/conftest.py @@ -54,7 +54,6 @@ "test_gradient_checkpointing_backward_compatibility", "test_gradient_checkpointing_enable_disable", "test_torch_save_load", - "test_initialization", "test_forward_signature", "test_model_get_set_embeddings", "test_model_main_input_name", diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 4e1bbf75c507..4f0f98f359e7 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -39,7 +39,6 @@ from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, - _config_zero_init, _test_eager_matches_sdpa_inference, floats_tensor, ids_tensor, @@ -427,30 +426,6 @@ def test_model_get_set_embeddings(self): def test_multi_gpu_data_parallel_forward(self): pass - # Override as the `logit_scale` parameter initialization is different for Aimv2 - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 1e41e5fc9c1d..e64ced9131a5 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -466,7 +466,7 @@ def test_batching_equivalence(self, atol=3e-4, rtol=3e-4): @unittest.skip(reason="Start to fail after using torch `cu118`.") def test_multi_gpu_data_parallel_forward(self): - super().test_multi_gpu_data_parallel_forward() + pass @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): @@ -488,35 +488,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `temperature` parameter initialization is different for ALIGN - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `temperature` is initialized as per the original implementation - if name == "temperature": - self.assertAlmostEqual( - param.data.item(), - 1.0, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif name == "text_projection.weight": - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 9ed9a8331617..86ea1e6da431 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -464,29 +464,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for AltCLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py index 6ac9b216c24c..42186970e459 100644 --- a/tests/models/aria/test_modeling_aria.py +++ b/tests/models/aria/test_modeling_aria.py @@ -198,10 +198,6 @@ def setUp(self): self.model_tester = AriaVisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=AriaConfig, has_text_modality=False) - @unittest.skip(reason="Unstable test") - def test_initialization(self): - pass - SKIP = False torch_accelerator_module = getattr(torch, torch_device) diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py index 499c4a268e56..aa71b32d5447 100644 --- a/tests/models/aya_vision/test_modeling_aya_vision.py +++ b/tests/models/aya_vision/test_modeling_aya_vision.py @@ -197,10 +197,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - def test_initialization(self): - pass - @unittest.skip(reason="Compile not yet supported because in LLava models") @pytest.mark.torch_compile_test def test_sdpa_can_compile_dynamic(self): diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py index a1c832948209..0092218bc609 100644 --- a/tests/models/bamba/test_modeling_bamba.py +++ b/tests/models/bamba/test_modeling_bamba.py @@ -41,7 +41,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor +from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -335,30 +335,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_initialization(self): - r""" - Overriding the test_initialization test as the A_log and D params of the Bamba mixer are initialized differently - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "A_log" in name: - A = torch.arange(1, config.mamba_n_heads + 1, dtype=torch.float32) - torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) - elif "D" in name: - D = torch.ones(config.mamba_n_heads, dtype=torch.float32) - torch.testing.assert_close(param.data, D, rtol=1e-5, atol=1e-5) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_attention_outputs(self): r""" Overriding the test_attention_outputs test as the Bamba model outputs attention only for its attention layers diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index f78ed65c3d93..ab6c28dd4b2f 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -34,7 +34,7 @@ from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -382,24 +382,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # we skip lambda parameters as these require special initial values - # determined by config.layer_scale_init_value - if "lambda" in name: - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @slow def test_model_from_pretrained(self): model_name = "microsoft/beit-base-patch16-224" diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 06477f320477..3f9269ca17a2 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -28,7 +28,6 @@ if is_torch_available(): import torch - from torch import nn from transformers import BitBackbone, BitForImageClassification, BitImageProcessor, BitModel @@ -200,22 +199,6 @@ def test_backbone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_backbone(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.named_modules(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - torch.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - torch.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 27b1b1202c11..cfe2d7dd0696 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -456,41 +456,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for Blip - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - # See PR #38607 (to avoid flakiness) - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[ - n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side - ] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") @@ -981,30 +946,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - # override as the `logit_scale` parameter initialization is different for Blip - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") @@ -1194,30 +1135,6 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - # override as the `logit_scale` parameter initialization is different for Blip - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 6523fddb47cc..f2efb12a4a39 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -13,7 +13,6 @@ # limitations under the License. """Testing suite for the PyTorch BLIP-2 model.""" -import copy import inspect import tempfile import unittest @@ -42,7 +41,6 @@ from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -988,23 +986,6 @@ def test_get_qformer_features(self): (self.model_tester.vision_model_tester.batch_size, 10, config.vision_config.hidden_size), ) - # override from common to deal with nested configurations (`vision_config`, `text_config` and `qformer_config`) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for key in ["vision_config", "qformer_config", "text_config"]: - setattr(configs_no_init, key, _config_zero_init(getattr(configs_no_init, key))) - for model_class in self.all_model_classes: - model = model_class(config=copy.deepcopy(configs_no_init)) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip("T5 backbone deepcopies the configs, and fixing it would be more involved") def test_internal_model_config_and_subconfig_are_same(self): pass @@ -1511,36 +1492,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif name == "temp": - self.assertAlmostEqual( - param.data.item(), - 0.07, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index f1a3d94c20e4..ba9bf5aca93f 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -28,7 +28,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -437,29 +436,6 @@ def test_retain_grad_hidden_states_attentions(self): if self.has_attentions: self.assertIsNotNone(attentions.grad) - # override as the `logit_scale` parameter initialization is different for BRIDGE TOWER - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - config.logit_scale_init_value, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. So this test is not applicable.""") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 337e31c67dbb..140823b076d7 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -18,7 +18,6 @@ import tempfile import unittest -import numpy as np import requests from transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig @@ -578,33 +577,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for CHINESE_CLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for sub_config_key in ("vision_config", "text_config"): - sub_config = getattr(configs_no_init, sub_config_key, {}) - setattr(configs_no_init, sub_config_key, _config_zero_init(sub_config)) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index b3914ee07fd5..5790c880e7a6 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -525,30 +525,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for CLAP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if "logit_scale" in name: - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 60104222c096..e0721c531b2b 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -562,30 +562,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for CLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") @@ -750,10 +726,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="CLIP uses a non-standard initialization scheme") - def test_initialization(self): - pass - @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @slow @is_flaky() diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 753815c5989a..0410fc91fd34 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -490,33 +490,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - # override as the some parameters require custom initialization - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if "logit_scale" in name: - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif "film" in name or "transposed_conv" in name or "reduce" in name: - # those parameters use PyTorch' default nn.Linear initialization scheme - pass - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index b8d559943c4e..e1c756df82c5 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -32,7 +32,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, ids_tensor, random_attention_mask, ) @@ -499,36 +498,6 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for Clvp - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - expected_value = np.log(1 / 0.07) - returned_value = param.data.item() - - self.assertAlmostEqual( - returned_value, - expected_value, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - expected_range = [0.0, 1.0] - returned_range = ((param.data.mean() * 1e9).round() / 1e9).item() - - self.assertIn( - returned_range, - expected_range, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_load_speech_text_decoder_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py index 93169a34ca5e..53bded107a3b 100644 --- a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py +++ b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py @@ -169,10 +169,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="Siglip backbone uses a non-standard initialization scheme") - def test_initialization(self): - pass - @require_read_token @require_torch diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py index c8571b1a5d74..761946891ead 100644 --- a/tests/models/colpali/test_modeling_colpali.py +++ b/tests/models/colpali/test_modeling_colpali.py @@ -273,10 +273,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_model_parallelism(self): pass - @unittest.skip(reason="PaliGemma's SigLip encoder uses a non-standard initialization scheme") - def test_initialization(self): - pass - # TODO extend valid outputs to include this test @Molbap @unittest.skip(reason="PaliGemma has currently one output format.") def test_model_outputs_equivalence(self): diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index c4d83e1b7bda..d186ce1d9a47 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -22,7 +22,7 @@ from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -506,29 +506,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.init_xavier_std = 1e9 - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "bbox_attention" in name and "bias" not in name: - self.assertLess( - 100000, - abs(param.data.max().item()), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - TOLERANCE = 1e-4 diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index 341407795567..cc09a9fe646a 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -39,7 +39,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, ids_tensor, ) @@ -189,25 +188,6 @@ def _get_logits_processor_kwargs(self, do_sample=False, config=None): return logits_processor_kwargs - def test_initialization(self): - """ - Overrides [ModelTesterMixin.test_initialization] because of specificities of Mimi codec model. - See https://github.com/huggingface/transformers/blob/1077603410cd73ba71d64a522033574d66d64b55/tests/models/mimi/test_modeling_mimi.py#L384-L397 - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv", "input_proj", "output_proj"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5): """ Overrides [GenerationTesterMixin._check_similar_generate_outputs] to handle third input_ids dimension. diff --git a/tests/models/d_fine/test_modeling_d_fine.py b/tests/models/d_fine/test_modeling_d_fine.py index 1348226857e8..7b6eddcfaca3 100644 --- a/tests/models/d_fine/test_modeling_d_fine.py +++ b/tests/models/d_fine/test_modeling_d_fine.py @@ -48,7 +48,7 @@ from transformers import RTDetrImageProcessor from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -628,58 +628,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.initializer_bias_prior_prob = 0.2 - bias_value = -1.3863 # log_e ((1 - 0.2) / 0.2) - - failed_cases = [] - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - for name, module in model.named_modules(): - if module.__class__.__name__ == "DFineConvEncoder": - backbone_params = [f"{name}.{key}" for key in module.state_dict()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name: - bias_tensor = torch.full_like(param.data, bias_value) - try: - torch.testing.assert_close(param.data, bias_tensor, atol=1e-4, rtol=1e-4) - except AssertionError: - failed_cases.append( - f"Parameter {name} of model {model_class} seems not properly initialized. " - f"Biases should be initialized to {bias_value}, got {param.data}" - ) - elif ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - or "enc_score_head.weight" in name - or ("class_embed" in name and "weight" in name) - or name in backbone_params - ): - continue - else: - mean = param.data.mean() - round_mean = (mean * 1e9).round() / 1e9 - round_mean = round_mean.item() - if round_mean not in [0.0, 1.0]: - failed_cases.append( - f"Parameter {name} of model {model_class} seems not properly initialized. " - f"Mean is {round_mean}, but should be in [0, 1]" - ) - - message = "\n" + "\n".join(failed_cases) - self.assertTrue(not failed_cases, message) - @parameterized.expand(["float32", "float16", "bfloat16"]) @require_torch_accelerator @slow diff --git a/tests/models/dab_detr/test_modeling_dab_detr.py b/tests/models/dab_detr/test_modeling_dab_detr.py index f9446e2eaee2..9df7038fdccc 100644 --- a/tests/models/dab_detr/test_modeling_dab_detr.py +++ b/tests/models/dab_detr/test_modeling_dab_detr.py @@ -22,7 +22,7 @@ from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -700,55 +700,6 @@ def test_different_timm_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.init_xavier_std = 1e9 - # Copied from RT-DETR - configs_no_init.initializer_bias_prior_prob = 0.2 - bias_value = -1.3863 # log_e ((1 - 0.2) / 0.2) - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "bbox_attention" in name and "bias" not in name: - self.assertLess( - 100000, - abs(param.data.max().item()), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # Modified from RT-DETR - elif "class_embed" in name and "bias" in name: - bias_tensor = torch.full_like(param.data, bias_value) - torch.testing.assert_close( - param.data, - bias_tensor, - atol=1e-4, - rtol=1e-4, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif "activation_fn" in name and config.activation_function == "prelu": - self.assertTrue( - param.data.mean() == 0.25, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif "backbone.conv_encoder.model" in name: - continue - elif "self_attn.in_proj_weight" in name: - self.assertIn( - ((param.data.mean() * 1e2).round() / 1e2).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - TOLERANCE = 1e-4 CHECKPOINT = "IDEA-Research/dab-detr-resnet-50" diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py index eb5ad7225a8a..cb286a14e965 100644 --- a/tests/models/dac/test_modeling_dac.py +++ b/tests/models/dac/test_modeling_dac.py @@ -353,22 +353,6 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs) - # Ignore copy - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv", "in_proj", "out_proj", "codebook"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_identity_shortcut(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() config.use_conv_shortcut = False diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index be5f9d6a0c57..34785ccd3a74 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -24,7 +24,7 @@ from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init +from ...test_modeling_common import ModelTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin @@ -457,39 +457,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index c39baea90bc4..b540caecafe5 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -32,7 +32,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -291,24 +291,6 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # we skip lambda parameters as these require special initial values - # determined by config.layer_scale_init_value - if "lambda" in name: - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) diff --git a/tests/models/deepseek_vl/test_modeling_deepseek_vl.py b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py index 7941695c6583..33c5e0c4a465 100644 --- a/tests/models/deepseek_vl/test_modeling_deepseek_vl.py +++ b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py @@ -186,11 +186,6 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] torch.testing.assert_close(out_embeds, out_ids) - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization - def test_initialization(self): - pass - # Copied from tests.models.janus.test_modeling_janus.JanusVisionText2TextModelTest.test_sdpa_can_dispatch_composite_models def test_sdpa_can_dispatch_composite_models(self): for model_class in self.all_model_classes: diff --git a/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py index 5ce1679f5997..99dba2542ea3 100644 --- a/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py +++ b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py @@ -217,11 +217,6 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] torch.testing.assert_close(out_embeds, out_ids) - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization - def test_initialization(self): - pass - def test_sdpa_can_dispatch_composite_models(self): for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index dcd95a009ead..130c04df10ac 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -30,7 +30,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -575,29 +575,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - print("Model class:", model_class) - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_two_stage_training(self): model_class = DeformableDetrForObjectDetection config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 6574923592a9..7cb57b843b7a 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -22,7 +22,7 @@ from transformers.testing_utils import require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -300,45 +300,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - non_uniform_init_parms = [ - # these encoders are vision transformers - # any layer outside these encoders is either Conv2d or ConvTranspose2d - # which use kaiming initialization - "patch_encoder", - "image_encoder", - "fov_model.encoder", - ] - if param.requires_grad: - if any(x in name for x in non_uniform_init_parms): - # See PR #38607 (to avoid flakiness) - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[ - n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side - ] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # this started when switched from normal initialization to kaiming_normal initialization # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index b17d94bb031a..e6f8e07f8d16 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -22,7 +22,7 @@ from transformers.testing_utils import Expectations, require_timm, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -526,29 +526,6 @@ def test_greyscale_images(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.init_xavier_std = 1e9 - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "bbox_attention" in name and "bias" not in name: - self.assertLess( - 100000, - abs(param.data.max().item()), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - TOLERANCE = 1e-4 diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index 721af9e8cab7..099b285581cb 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -23,7 +23,7 @@ from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -326,20 +326,6 @@ def test_model_from_pretrained(self): model = DinatModel.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_natten @require_vision diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index 87a799df3709..85e70f99fca6 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -18,7 +18,6 @@ from transformers import Dinov2Config from transformers.testing_utils import ( - is_flaky, require_torch, require_vision, slow, @@ -237,10 +236,6 @@ def setUp(self): self.model_tester = Dinov2ModelTester(self) self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37) - @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") - def test_initialization(self): - super().test_initialization() - def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py index c9696dedb2ae..bbeae1888a51 100644 --- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py +++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py @@ -27,7 +27,7 @@ from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -244,29 +244,6 @@ def setUp(self): self, config_class=Dinov2WithRegistersConfig, has_text_modality=False, hidden_size=37 ) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad and "register_tokens" not in name: - # See PR #38607 (to avoid flakiness) - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py index 93af786e4c3b..1de2e6ec5ceb 100644 --- a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py +++ b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py @@ -21,7 +21,7 @@ from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -160,29 +160,6 @@ def setUp(self): self.model_tester = DINOv3ViTModelTester(self) self.config_tester = ConfigTester(self, config_class=DINOv3ViTConfig, has_text_modality=False, hidden_size=37) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad and "register_tokens" not in name: - # See PR #38607 (to avoid flakiness) - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index 1f26e3514843..720a11ca85f0 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -21,7 +21,7 @@ from transformers.utils import is_torch_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -358,17 +358,3 @@ def test_model_from_pretrained(self): model_name = "naver-clova-ix/donut-base" model = DonutSwinModel.from_pretrained(model_name) self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 10453e59c906..501139310b30 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -23,7 +23,7 @@ from transformers.testing_utils import Expectations, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -260,29 +260,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_sdpa_can_compile_dynamic(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.named_modules(): - if module.__class__.__name__ == "DPTViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_backbone_selection(self): def _validate_backbone_init(): for model_class in self.all_model_classes: diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py index c8fd288bfbf5..237045464a87 100644 --- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -21,7 +21,7 @@ from transformers.utils.import_utils import get_torch_major_and_minor_version from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -194,29 +194,6 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.named_modules(): - if module.__class__.__name__ == "DPTViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index 3e10d0014ada..c28260320bf1 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -20,7 +20,7 @@ from transformers.testing_utils import require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -270,29 +270,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.named_modules(): - if module.__class__.__name__ == "DPTViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @slow def test_model_from_pretrained(self): model_name = "Intel/dpt-hybrid-midas" diff --git a/tests/models/edgetam/test_modeling_edgetam.py b/tests/models/edgetam/test_modeling_edgetam.py index 687a895cc939..3bbc350a0841 100644 --- a/tests/models/edgetam/test_modeling_edgetam.py +++ b/tests/models/edgetam/test_modeling_edgetam.py @@ -367,10 +367,6 @@ def test_hidden_states_output(self): def test_can_init_all_missing_weights(self): pass - @unittest.skip(reason="Timm weights cannot be fully constructed in _init_weights") - def test_initialization(self): - pass - @unittest.skip( reason="TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation." ) diff --git a/tests/models/emu3/test_modeling_emu3.py b/tests/models/emu3/test_modeling_emu3.py index 946226eaf7f6..71d3c9f48b52 100644 --- a/tests/models/emu3/test_modeling_emu3.py +++ b/tests/models/emu3/test_modeling_emu3.py @@ -348,10 +348,6 @@ def test_disk_offload_bin(self): def test_cpu_offload(self): pass - @unittest.skip("VQ-VAE module doesn't initialize weights properly") - def test_initialization(self): - pass - @pytest.mark.generate @unittest.skip("Emu3 has dynamic control flow in vision backbone") def test_generate_with_static_cache(self): diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 21091c164c9d..2dfbbcaee41e 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -410,28 +410,6 @@ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv"] - ignore_init = ["lstm"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif not any(x in name for x in ignore_init): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_identity_shortcut(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() config.use_conv_shortcut = False diff --git a/tests/models/eomt/test_modeling_eomt.py b/tests/models/eomt/test_modeling_eomt.py index 367b29651f0d..7c8163266479 100644 --- a/tests/models/eomt/test_modeling_eomt.py +++ b/tests/models/eomt/test_modeling_eomt.py @@ -22,7 +22,7 @@ from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -232,40 +232,6 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_initialization(self): - # Apart from the below params, all other parameters are initialized using kaiming uniform. - non_uniform_init_parms = [ - "layernorm.bias", - "layernorm.weight", - "norm1.bias", - "norm1.weight", - "norm2.bias", - "norm2.weight", - "layer_scale1.lambda1", - "layer_scale2.lambda1", - "register_tokens", - "cls_token", - ] - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if any(x in name for x in non_uniform_init_parms): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_torch class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase): diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 27f27105cc37..ff28ad50baec 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -240,12 +240,6 @@ def test_model_outputs_equivalence(self): def test_feed_forward_chunking(self): pass - @unittest.skip( - reason="ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments." - ) - def test_initialization(self): - pass - @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_output_attentions(self): pass diff --git a/tests/models/evolla/test_modeling_evolla.py b/tests/models/evolla/test_modeling_evolla.py index 78716cf36a33..6e399ef0ffde 100644 --- a/tests/models/evolla/test_modeling_evolla.py +++ b/tests/models/evolla/test_modeling_evolla.py @@ -32,7 +32,6 @@ from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, - _config_zero_init, ids_tensor, random_attention_mask, ) @@ -300,25 +299,6 @@ def test_single_forward(self): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) print(outputs) - def test_initialization(self): - # we skip the latents initialization test - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # skip latents - if name.endswith("latents"): - print(f"Skipping latents {name}") - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @unittest.skip("Evolla requires both text and protein inputs which is currently not done in this test.") def test_eager_matches_sdpa_inference(self): diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index 7c0b6cf19aa9..919130175b7e 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -13,7 +13,6 @@ # limitations under the License. -import math import unittest from unittest.util import safe_repr @@ -34,7 +33,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor +from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -331,45 +330,6 @@ def test_falcon_mamba_lm_head_forward_and_backwards(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_falcon_mamba_lm_head_forward_and_backwards(*config_and_inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.rescale_prenorm_residual = True - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "dt_proj.bias" in name: - dt = torch.exp( - torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) - + math.log(config.time_step_min) - ).clamp(min=config.time_step_floor) - inv_dt = dt + torch.log(-torch.expm1(-dt)) - if param.requires_grad: - self.assertTrue(param.data.max().item() <= inv_dt[1]) - self.assertTrue(param.data.min().item() >= inv_dt[0]) - elif "A_log" in name: - A = torch.arange(1, config.state_size + 1, dtype=torch.float32)[None, :] - A = A.expand(config.intermediate_size, -1).contiguous() - torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) - elif "D" in name: - if param.requires_grad: - # check if it's a ones like - torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) - else: - if param.requires_grad: - if ( - "mixer.conv1d.weight" in name - or "mixer.dt_proj.weight" in name - or "mixer.out_proj.weight" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @slow # Ignore copy def test_model_from_pretrained(self): diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py index 6e7b567c1d91..33889a7a9404 100644 --- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py @@ -33,7 +33,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor +from ...test_modeling_common import ModelTesterMixin, ids_tensor if is_torch_available(): @@ -141,22 +141,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - msg = f"Parameter {name} of model {model_class} seems not properly initialized" - if "norm" in name: - if "bias" in name: - self.assertEqual(param.data.mean().item(), 0.0, msg=msg) - if "weight" in name: - self.assertEqual(param.data.mean().item(), 1.0, msg=msg) - elif "conv" in name or "embed" in name: - self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg) - def test_duration_energy_pitch_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -573,22 +557,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - msg = f"Parameter {name} of model {model_class} seems not properly initialized" - if "norm" in name: - if "bias" in name: - self.assertEqual(param.data.mean().item(), 0.0, msg=msg) - if "weight" in name: - self.assertEqual(param.data.mean().item(), 1.0, msg=msg) - elif "conv" in name or "embed" in name: - self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg) - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 5915d25ee39b..5333e6ef9242 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -920,30 +920,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for FLAVA - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale" or name == "flava.logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py index f3ea480ed91d..f09d8e8a8e51 100644 --- a/tests/models/focalnet/test_modeling_focalnet.py +++ b/tests/models/focalnet/test_modeling_focalnet.py @@ -23,7 +23,7 @@ from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -387,20 +387,6 @@ def test_model_from_pretrained(self): model = FocalNetModel.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_vision @require_torch diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index 27cfa83f7b33..bcceea0e7379 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -335,12 +335,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="Siglip (vision backbone) uses the same initialization scheme as the Flax original implementation" - ) - def test_initialization(self): - pass - @unittest.skip("Loading nested configs with overwritten `kwargs` isn't supported yet, FIXME @raushan.") def test_load_with_mismatched_shapes(self): pass diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py index 7f7a1d38915a..06c118955aff 100644 --- a/tests/models/gemma3n/test_modeling_gemma3n.py +++ b/tests/models/gemma3n/test_modeling_gemma3n.py @@ -147,7 +147,6 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase): is_generative = False _is_stateful = True main_input_name = "audio_mel" - test_initialization = False def setUp(self): self.model_tester = Gemma3nAudioModelTester(self) @@ -700,10 +699,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip (vision backbone) uses a non-standard initialization scheme") - def test_initialization(self): - pass - @unittest.skip( reason="Siglip has no FLEX attention, and we don't have a proper way to set/test attn in VLMs. TODO @raushan" ) diff --git a/tests/models/got_ocr2/test_modeling_got_ocr2.py b/tests/models/got_ocr2/test_modeling_got_ocr2.py index bfc1d6e26f41..db9f9f97d417 100644 --- a/tests/models/got_ocr2/test_modeling_got_ocr2.py +++ b/tests/models/got_ocr2/test_modeling_got_ocr2.py @@ -25,7 +25,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -162,20 +162,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_torch class GotOcr2IntegrationTest(unittest.TestCase): diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 87e23a81915e..516d44896c2e 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -40,7 +40,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -251,22 +250,6 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if name == "projector.query": - continue - elif param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_sdpa_can_dispatch_composite_models(self): # overwrite because Granite Speech is audio+text model (not vision+text) if not self.has_attentions: diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index e5684462e449..6de95751f165 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -40,7 +40,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -569,32 +569,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "text_param" in name - or "vision_param" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - or "vision_proj" in name - or "text_proj" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # Copied from tests.models.deformable_detr.test_modeling_deformable_detr.DeformableDetrModelTest.test_two_stage_training with DeformableDetr->GroundingDino def test_two_stage_training(self): model_class = GroundingDinoForObjectDetection diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index dc5238a9275a..7dcc17070a10 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -566,30 +566,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for GROUPVIT - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/hgnet_v2/test_modeling_hgnet_v2.py b/tests/models/hgnet_v2/test_modeling_hgnet_v2.py index d6e783677d49..1900f94657a8 100644 --- a/tests/models/hgnet_v2/test_modeling_hgnet_v2.py +++ b/tests/models/hgnet_v2/test_modeling_hgnet_v2.py @@ -16,7 +16,6 @@ import unittest import torch -from torch import nn from transformers import HGNetV2Config from transformers.testing_utils import require_torch, torch_device @@ -208,22 +207,6 @@ def test_backbone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_backbone(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.named_modules(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - torch.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - torch.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 79c4b4e02118..dfa75fbb89c6 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -407,32 +407,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # Hubert cannot be TorchScripted because of torch.nn.utils.weight_norm def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): # TODO: fix it @@ -671,32 +645,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/internvl/test_modeling_internvl.py b/tests/models/internvl/test_modeling_internvl.py index 69ed30495524..584680d57d58 100644 --- a/tests/models/internvl/test_modeling_internvl.py +++ b/tests/models/internvl/test_modeling_internvl.py @@ -41,7 +41,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -207,20 +207,6 @@ def test_flex_attention_with_grads(self): def test_config(self): self.config_tester.run_common_tests() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="Compile not yet supported because in LLava models") @pytest.mark.torch_compile_test def test_sdpa_can_compile_dynamic(self): diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index e00528c74cf0..bcf1a3b491f5 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -34,7 +34,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -427,31 +427,6 @@ def test_load_balancing_loss(self): # After #40617, we still have 0.003 % of failure rate here. self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item()) - def test_initialization(self): - r""" - Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "A_log" in name: - A = torch.arange(1, config.mamba_d_state + 1, dtype=torch.float32)[None, :] - A = A.expand(config.mamba_expand * config.hidden_size, -1).contiguous() - torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) - elif "D" in name: - # check if it's a ones like - torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_attention_outputs(self): r""" Overriding the test_attention_outputs test as the Jamba model outputs attention only for its attention layers diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 48dae2ec820b..d578337df886 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -315,24 +315,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - # overwrite from common to skip `image_to_text_projection.latent_query` - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if name == "image_to_text_projection.latent_query": - # The original code use ` nn.Parameter(torch.randn(...))` for which this test won't pass. - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py index 8a7811c766c1..fc7dac8f02b5 100644 --- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py @@ -43,7 +43,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -378,24 +377,6 @@ def test_assisted_decoding_sample(self): def test_prompt_lookup_decoding_matches_greedy_search(self): pass - # overwrite from common to skip `image_to_text_projection.latent_query` - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if name == "image_to_text_projection.latent_query": - # The original code use ` nn.Parameter(torch.randn(...))` for which this test won't pass. - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index 1bc92b9afac5..c0ea557a33da 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -41,7 +41,6 @@ from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -317,25 +316,6 @@ def test_tied_weights_keys(self): def test_generate_without_input_ids(self): pass - def test_initialization(self): - """ - Overrides [ModelTesterMixin.test_initialization] because of specificities of Mimi codec model. - See https://github.com/huggingface/transformers/blob/1077603410cd73ba71d64a522033574d66d64b55/tests/models/mimi/test_modeling_mimi.py#L384-L397 - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv", "input_proj", "output_proj"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) def test_eager_matches_sdpa_inference( self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 2c1b157e3a90..4faf6aa61b4a 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -26,7 +26,7 @@ from transformers.utils import is_detectron2_available, is_torch_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -432,23 +432,6 @@ def test_model_from_pretrained(self): model = LayoutLMv2Model.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "backbone" in name or "visual_segment_embedding" in name: - continue - - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_batching_equivalence(self): def equivalence(tensor1, tensor2): return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0) diff --git a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py index c9d39c3ef9ee..d097d7e9e629 100644 --- a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py +++ b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py @@ -204,12 +204,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="Siglip2 backbone has a non-standard initialization scheme, that this test cannot handle easily" - ) - def test_initialization(self): - pass - @require_torch_accelerator @require_read_token diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index bd119be91be7..ef2b1609ba99 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -41,7 +41,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -205,22 +204,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "image_newline" in name: - continue - elif param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_mismatching_num_image_tokens(self): """ Tests that VLMs through an error with explicit message saying what is wrong diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index ed9cdb12ab01..d8980f35fdcf 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -41,7 +41,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -218,22 +217,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "image_newline" in name: - continue - elif param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_mismatching_num_image_tokens(self): """ Tests that VLMs through an error with explicit message saying what is wrong diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index c8ce5c364daf..c2280478a35b 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -41,7 +41,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -215,23 +214,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # LLaVa Onevision has SigLIP backbone which init weights differently from CLIP - if "image_newline" in name or "vision_tower" in name: - continue - elif param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_odd_sized_image(self): # prepare model configuration config = self.model_tester.get_config() diff --git a/tests/models/mamba/test_modeling_mamba.py b/tests/models/mamba/test_modeling_mamba.py index a78a89868229..f63b3cf012a0 100644 --- a/tests/models/mamba/test_modeling_mamba.py +++ b/tests/models/mamba/test_modeling_mamba.py @@ -13,7 +13,6 @@ # limitations under the License. -import math import unittest from unittest.util import safe_repr @@ -25,7 +24,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor +from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -298,45 +297,6 @@ def test_mamba_lm_head_forward_and_backwards(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_mamba_lm_head_forward_and_backwards(*config_and_inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.rescale_prenorm_residual = True - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "dt_proj.bias" in name: - dt = torch.exp( - torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) - + math.log(config.time_step_min) - ).clamp(min=config.time_step_floor) - inv_dt = dt + torch.log(-torch.expm1(-dt)) - if param.requires_grad: - self.assertTrue(param.data.max().item() <= inv_dt[1]) - self.assertTrue(param.data.min().item() >= inv_dt[0]) - elif "A_log" in name: - A = torch.arange(1, config.state_size + 1, dtype=torch.float32)[None, :] - A = A.expand(config.intermediate_size, -1).contiguous() - torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) - elif "D" in name: - if param.requires_grad: - # check if it's a ones like - torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) - else: - if param.requires_grad: - if ( - "mixer.conv1d.weight" in name - or "mixer.dt_proj.weight" in name - or "mixer.out_proj.weight" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @slow def test_model_from_pretrained(self): model = MambaModel.from_pretrained("hf-internal-testing/mamba-130m") diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index 03a91dcbe559..0589b4778b8d 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -13,7 +13,6 @@ # limitations under the License. -import math import unittest from transformers import AutoTokenizer, Mamba2Config, is_torch_available @@ -29,7 +28,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor +from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -273,40 +272,6 @@ def test_mamba2_slow_vs_fast_forward_grouped(self): config_and_inputs[0].n_groups //= 2 self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.rescale_prenorm_residual = True - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "dt_proj.bias" in name: - dt = torch.exp( - torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) - + math.log(config.time_step_min) - ).clamp(min=config.time_step_floor) - inv_dt = dt + torch.log(-torch.expm1(-dt)) - if param.requires_grad: - self.assertTrue(param.data.max().item() <= inv_dt[1]) - self.assertTrue(param.data.min().item() >= inv_dt[0]) - elif "A_log" in name: - A = torch.arange(1, config.num_heads + 1) - torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) - elif "D" in name: - if param.requires_grad: - # check if it's a ones like - torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) - else: - if param.requires_grad: - if "mixer.conv1d.weight" in name or "mixer.dt_bias" in name or "mixer.out_proj.weight" in name: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="A large mamba2 would be necessary (and costly) for that") def test_multi_gpu_data_parallel_forward(self): pass diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index 4f78399a2865..2b28936a0dc1 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -35,7 +35,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init +from ...test_modeling_common import ModelTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin @@ -351,26 +351,6 @@ def test_backbone_selection(self): elif model.__class__.__name__ == "Mask2FormerForUniversalSegmentation": self.assertEqual(model.model.pixel_level_module.encoder.out_indices, [1, 2, 3]) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if ( - "self_attn.sampling_offsets.bias" in name - or "self_attn.value_proj.weight" in name - or "self_attn.output_proj.weight" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_initialization_pretrained_backbone(self): backbone_name = "microsoft/resnet-18" diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index f1b1c9747438..0a944c49d9c9 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -311,10 +311,6 @@ def test_hidden_states_output_with_padding(self): def test_model_from_pretrained(self): pass - @unittest.skip(reason="This will be fixed once MaskFormerSwin is replaced by native Swin") - def test_initialization(self): - pass - @unittest.skip(reason="This will be fixed once MaskFormerSwin is replaced by native Swin") def test_gradient_checkpointing_backward_compatibility(self): pass diff --git a/tests/models/metaclip_2/test_modeling_metaclip_2.py b/tests/models/metaclip_2/test_modeling_metaclip_2.py index b6be33fa8e8b..b13cb0855923 100644 --- a/tests/models/metaclip_2/test_modeling_metaclip_2.py +++ b/tests/models/metaclip_2/test_modeling_metaclip_2.py @@ -572,30 +572,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for MetaClip2 - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") @@ -761,10 +737,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="MetaClip2 uses a non-standard initialization scheme") - def test_initialization(self): - pass - @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @slow @is_flaky() diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py index ecb84f144932..865705481e3a 100644 --- a/tests/models/mgp_str/test_modeling_mgp_str.py +++ b/tests/models/mgp_str/test_modeling_mgp_str.py @@ -22,7 +22,7 @@ from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -203,22 +203,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # override as the `logit_scale` parameter initialization is different for MgpstrModel - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if isinstance(param, (nn.Linear, nn.Conv2d, nn.LayerNorm)): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="Retain_grad is tested in individual model tests") def test_retain_grad_hidden_states_attentions(self): pass diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 54ce4763af5f..9b49c903baf5 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -385,21 +385,6 @@ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv", "input_proj", "output_proj"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_identity_shortcut def test_identity_shortcut(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/mistral3/test_modeling_mistral3.py b/tests/models/mistral3/test_modeling_mistral3.py index 5cd0c023e165..694cf3a267bb 100644 --- a/tests/models/mistral3/test_modeling_mistral3.py +++ b/tests/models/mistral3/test_modeling_mistral3.py @@ -36,7 +36,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -192,20 +192,6 @@ def check_config_can_be_init_without_params(): self.config_tester.check_config_can_be_init_without_params = check_config_can_be_init_without_params self.config_tester.run_common_tests() - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="Compile not yet supported because in LLava models") @pytest.mark.torch_compile_test def test_sdpa_can_compile_dynamic(self): diff --git a/tests/models/mlcd/test_modeling_mlcd.py b/tests/models/mlcd/test_modeling_mlcd.py index 81fe4df9051b..bca94e9bf679 100644 --- a/tests/models/mlcd/test_modeling_mlcd.py +++ b/tests/models/mlcd/test_modeling_mlcd.py @@ -32,7 +32,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor if is_torch_available(): @@ -141,20 +141,6 @@ def test_model_get_set_embeddings(self): x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad and "class_pos_emb" not in name: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_torch class MLCDVisionModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py b/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py index fed96e23384b..f1e63787fc57 100644 --- a/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py +++ b/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py @@ -39,7 +39,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -574,34 +574,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - # Ignore copy - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "text_param" in name - or "vision_param" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - or "vision_proj" in name - or "text_proj" in name - or ("class_embed" in name and "bias" in name) - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # Copied from tests.models.deformable_detr.test_modeling_deformable_detr.DeformableDetrModelTest.test_two_stage_training with DeformableDetr->MMGroundingDino def test_two_stage_training(self): model_class = MMGroundingDinoForObjectDetection diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py index 2e2fbe77417e..db4180d6d97c 100644 --- a/tests/models/modernbert/test_modeling_modernbert.py +++ b/tests/models/modernbert/test_modeling_modernbert.py @@ -32,7 +32,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -300,31 +300,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # The classifier.weight from ModernBertForSequenceClassification and ModernBertForTokenClassification - # are initialized without `initializer_range`, so they're not set to ~0 via the _config_zero_init - if param.requires_grad and not ( - name == "classifier.weight" - and model_class - in [ - ModernBertForSequenceClassification, - ModernBertForTokenClassification, - ModernBertForQuestionAnswering, - ModernBertForMultipleChoice, - ] - ): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py b/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py index fcfb8f28e796..d6b1a13105d2 100644 --- a/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py +++ b/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py @@ -22,7 +22,6 @@ ) from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester -from ...test_modeling_common import _config_zero_init if is_torch_available(): @@ -53,32 +52,6 @@ class ModernBertDecoderModelTest(CausalLMModelTest, unittest.TestCase): ) model_tester_class = ModernBertDecoderModelTester - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # The classifier.weight from ModernBertDecoderForSequenceClassification - # is initialized without `initializer_range`, so it's not set to ~0 via the _config_zero_init - if param.requires_grad and not ( - name == "classifier.weight" and model_class == ModernBertDecoderForSequenceClassification - ): - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @slow @require_torch diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index a724711e1c22..06f81f3ef79c 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -582,21 +582,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams ) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv", "input_proj", "output_proj"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs") def test_generate_continue_from_past_key_values(self): pass diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index e5b876b42604..e5f41ca358d3 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -874,29 +874,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # override since the conv layers and lstm's in encodec are exceptions - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv"] - ignore_init = ["lstm"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif not any(x in name for x in ignore_init): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # override since we have embeddings / LM heads over multiple codebooks def test_model_get_set_embeddings(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 4d487546c741..6f5f8728dbab 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -877,29 +877,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # override since the conv layers and lstm's in encodec are exceptions - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv"] - ignore_init = ["lstm"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif not any(x in name for x in ignore_init): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # override since we have embeddings / LM heads over multiple codebooks def test_model_get_set_embeddings(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py index a69ef2b3d6fe..bdb6a7477b66 100644 --- a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py +++ b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py @@ -32,7 +32,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -614,29 +614,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(encoder_attentions.grad) self.assertIsNotNone(cross_attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if ( - "embeddings" in name - or ".fc" in name - or "decoder.channel_projection_layers" in name - or "query_position_head" in name - or "decoder.encoder_vision_features" in name - or "language_backbone.text_projection" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} seems not properly initialized", - ) - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index 4ae6bba12033..0fc2ae46952c 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -35,7 +35,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init +from ...test_modeling_common import ModelTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin @@ -364,34 +364,6 @@ def test_attention_outputs(self): outputs = model(**inputs, output_attentions=True) self.assertTrue(outputs.attentions is not None) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.is_training = True - config.contrastive_temperature = 1 - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if ( - "self_attn.sampling_offsets.bias" in name - or "self_attn.value_proj.weight" in name - or "self_attn.output_proj.weight" in name - or "self_attn.in_proj_weight" in name - or "self_attn.out_proj.weight" in name - or "mlp.fc1.weight" in name - or "mlp.fc2.weight" in name - or "text_mapper.text_encoder.positional_embedding" in name - or "text_mapper.text_encoder.token_embedding.weight" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_initialization_pretrained_backbone(self): backbone_name = "microsoft/resnet-18" diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index ba0d041801de..d8afcbb45836 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -459,30 +459,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for OWLV2 - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") @@ -668,10 +644,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="Test_initialization is tested in individual model tests") - def test_initialization(self): - pass - @unittest.skip(reason="Test_forward_signature is tested in individual model tests") def test_forward_signature(self): pass diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index fa87b57072c6..185d95244759 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -454,30 +454,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for OWLVIT - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") @@ -661,10 +637,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="Test_initialization is tested in individual model tests") - def test_initialization(self): - pass - @unittest.skip(reason="Test_forward_signature is tested in individual model tests") def test_forward_signature(self): pass diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index d052bfb97f81..5797f75b2fb5 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -265,10 +265,6 @@ def test_disk_offload_safetensors(self): def test_model_parallelism(self): pass - @unittest.skip(reason="PaliGemma's SigLip encoder uses a non-standard initialization scheme") - def test_initialization(self): - pass - # TODO extend valid outputs to include this test @Molbap @unittest.skip(reason="PaliGemma has currently one output format.") def test_model_outputs_equivalence(self): diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py index 536eb95bef24..8cb7feabe8a1 100644 --- a/tests/models/paligemma2/test_modeling_paligemma2.py +++ b/tests/models/paligemma2/test_modeling_paligemma2.py @@ -244,10 +244,6 @@ def test_disk_offload_safetensors(self): def test_model_parallelism(self): pass - @unittest.skip(reason="PaliGemma's SigLip encoder uses a non-standard initialization scheme") - def test_initialization(self): - pass - # TODO extend valid outputs to include this test @Molbap @unittest.skip(reason="PaliGemma has currently one output format.") def test_model_outputs_equivalence(self): diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 15738c55a3ea..ba2bf1c582c3 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -293,10 +293,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_can_init_all_missing_weights(self): pass - @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights") - def test_initialization(self): - pass - @unittest.skip( reason="PE/TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation." ) diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index 0363476e0fc6..61c72fe857c1 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -208,10 +208,6 @@ def setUp(self): self.model_tester = Phi4MultimodalModelTester(self) self.config_tester = ConfigTester(self, config_class=Phi4MultimodalConfig) - @unittest.skip(reason="Unstable test") - def test_initialization(self): - pass - @unittest.skip(reason="Depending on input modalities, some params may not have gradients") def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index efeefa6bc155..813334106ee4 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -524,41 +524,6 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - # override as the `logit_scale` parameter initialization is different for Pix2Struct - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - # See PR #38607 (to avoid flakiness) - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[ - n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side - ] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite because `vocab_size` is not an attribute of `Pix2StructConfig` but rather `Pix2StructTextConfig` def test_resize_tokens_embeddings(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/pvt/test_modeling_pvt.py b/tests/models/pvt/test_modeling_pvt.py index bea696a36568..ff1264e79888 100644 --- a/tests/models/pvt/test_modeling_pvt.py +++ b/tests/models/pvt/test_modeling_pvt.py @@ -171,17 +171,6 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, param in model.named_parameters(): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/pvt_v2/test_modeling_pvt_v2.py b/tests/models/pvt_v2/test_modeling_pvt_v2.py index 7cfa0d7bbad5..cfeef2d4d8d6 100644 --- a/tests/models/pvt_v2/test_modeling_pvt_v2.py +++ b/tests/models/pvt_v2/test_modeling_pvt_v2.py @@ -188,17 +188,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): # torch.utils.checkpoint.checkpoint self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True}) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, param in model.named_parameters(): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index f3f4fa2ffe4f..1ef2da1af3cf 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -44,7 +44,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -236,20 +235,6 @@ def test_text_config(self): self.assertEqual(base_config.patch_size, 8) self.assertNotEqual(base_config.vision_config.patch_size, 8) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_mismatching_num_image_tokens(self): """ Tests that VLMs through an error with explicit message saying what is wrong diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 9cb028eb3a73..e455c092ba0b 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -42,7 +42,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, ) @@ -216,20 +215,6 @@ def test_text_config(self): self.assertEqual(base_config.patch_size, 8) self.assertNotEqual(base_config.vision_config.patch_size, 8) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_mismatching_num_image_tokens(self): """ Tests that VLMs through an error with explicit message saying what is wrong diff --git a/tests/models/qwen3_next/test_modeling_qwen3_next.py b/tests/models/qwen3_next/test_modeling_qwen3_next.py index 3589f32584a9..f0dcdf5ddd4a 100644 --- a/tests/models/qwen3_next/test_modeling_qwen3_next.py +++ b/tests/models/qwen3_next/test_modeling_qwen3_next.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import tempfile import unittest @@ -40,7 +39,6 @@ from ...generation.test_utils import has_similar_generate_outputs from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, - _config_zero_init, _test_eager_matches_sdpa_inference, ) @@ -279,24 +277,6 @@ def test_attention_outputs(self): self.assertEqual(len(self_attentions), sum(layer == "full_attention" for layer in config.layer_types)) self.assertListEqual(list(self_attentions[0].shape[-3:]), [config.num_attention_heads, seq_len, seq_len]) - def test_initialization(self): - "Some parameters need to be skipped." - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=copy.deepcopy(configs_no_init)) - for name, param in model.named_parameters(): - if param.requires_grad: - # this one need to be skipped, it's initialized as log(uniform(0, 16)) - if "A_log" in name: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) def test_eager_matches_sdpa_inference( self, diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index dcd57782b749..db9d2cf42655 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -91,10 +91,6 @@ def test_left_padding_compatibility(self): def test_assisted_decoding_sample(self): pass - @unittest.skip(reason="TODO @arthurzucker not super important and failing.") - def test_initialization(self): - pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") @pytest.mark.generate def test_beam_sample_generate_dict_output(self): diff --git a/tests/models/regnet/test_modeling_regnet.py b/tests/models/regnet/test_modeling_regnet.py index 97702a108dba..73579fa7d9e8 100644 --- a/tests/models/regnet/test_modeling_regnet.py +++ b/tests/models/regnet/test_modeling_regnet.py @@ -27,7 +27,6 @@ if is_torch_available(): import torch - from torch import nn from transformers import RegNetForImageClassification, RegNetModel @@ -162,22 +161,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.named_modules(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - torch.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - torch.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/resnet/test_modeling_resnet.py b/tests/models/resnet/test_modeling_resnet.py index c764ff6b2e25..48f250f9ec62 100644 --- a/tests/models/resnet/test_modeling_resnet.py +++ b/tests/models/resnet/test_modeling_resnet.py @@ -28,7 +28,6 @@ if is_torch_available(): import torch - from torch import nn from transformers import ResNetBackbone, ResNetForImageClassification, ResNetModel @@ -207,22 +206,6 @@ def test_backbone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_backbone(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.named_modules(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - torch.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - torch.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py index d1969c3c0b6e..eddf6d9a6f91 100644 --- a/tests/models/rt_detr/test_modeling_rt_detr.py +++ b/tests/models/rt_detr/test_modeling_rt_detr.py @@ -39,7 +39,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -588,56 +588,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.initializer_bias_prior_prob = 0.2 - bias_value = -1.3863 # log_e ((1 - 0.2) / 0.2) - - failed_cases = [] - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - for name, module in model.named_modules(): - if module.__class__.__name__ == "RTDetrConvEncoder": - backbone_params = [f"{name}.{key}" for key in module.state_dict()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name: - bias_tensor = torch.full_like(param.data, bias_value) - if not torch.allclose(param.data, bias_tensor, atol=1e-4): - failed_cases.append( - f"Parameter {name} of model {model_class} seems not properly initialized. " - f"Biases should be initialized to {bias_value}, got {param.data}" - ) - elif ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - or "enc_score_head.weight" in name - or ("class_embed" in name and "weight" in name) - or name in backbone_params - ): - continue - else: - mean = param.data.mean() - round_mean = (mean * 1e9).round() / 1e9 - round_mean = round_mean.item() - if round_mean not in [0.0, 1.0]: - failed_cases.append( - f"Parameter {name} of model {model_class} seems not properly initialized. " - f"Mean is {round_mean}, but should be in [0, 1]" - ) - - message = "\n" + "\n".join(failed_cases) - self.assertTrue(not failed_cases, message) - @parameterized.expand(["float32", "float16", "bfloat16"]) @require_torch_accelerator @slow diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py index 80c40195bfd5..2b7713af6e84 100644 --- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py +++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py @@ -38,7 +38,7 @@ ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -592,56 +592,6 @@ def test_hf_backbone(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.initializer_bias_prior_prob = 0.2 - bias_value = -1.3863 # log_e ((1 - 0.2) / 0.2) - - failed_cases = [] - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - for name, module in model.named_modules(): - if module.__class__.__name__ == "RTDetrV2ConvEncoder": - backbone_params = [f"{name}.{key}" for key in module.state_dict()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name: - bias_tensor = torch.full_like(param.data, bias_value) - if not torch.allclose(param.data, bias_tensor, atol=1e-4): - failed_cases.append( - f"Parameter {name} of model {model_class} seems not properly initialized. " - f"Biases should be initialized to {bias_value}, got {param.data}" - ) - elif ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - or "enc_score_head.weight" in name - or ("class_embed" in name and "weight" in name) - or name in backbone_params - ): - continue - else: - mean = param.data.mean() - round_mean = (mean * 1e9).round() / 1e9 - round_mean = round_mean.item() - if round_mean not in [0.0, 1.0]: - failed_cases.append( - f"Parameter {name} of model {model_class} seems not properly initialized. " - f"Mean is {round_mean}, but should be in [0, 1]" - ) - - message = "\n" + "\n".join(failed_cases) - self.assertTrue(not failed_cases, message) - @parameterized.expand(["float32", "float16", "bfloat16"]) @require_torch_accelerator @slow diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 9a61160a275f..692b5ab5d9bd 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -265,35 +265,6 @@ def test_state_equivalency(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_state_equivalency(*config_and_inputs) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, param in model.named_parameters(): - if "time_decay" in name: - if param.requires_grad: - self.assertTrue(param.data.max().item() == 3.0) - self.assertTrue(param.data.min().item() == -5.0) - elif "time_first" in name: - if param.requires_grad: - # check if it's a ones like - torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) - elif any(x in name for x in ["time_mix_key", "time_mix_receptance"]): - if param.requires_grad: - self.assertInterval( - param.data, - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif "time_mix_value" in name: - if param.requires_grad: - self.assertInterval( - param.data, - [0.0, 1.3], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_attention_outputs(self): r""" Overriding the test_attention_outputs test as the attention outputs of Rwkv are different from other models diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 596df6daeda2..ab026334c099 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -25,7 +25,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -374,44 +373,6 @@ def test_model_from_pretrained(self): model = SeamlessM4TModel.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="SeamlessM4TSpeechEncoder doesn't have an embedding layer") def test_inputs_embeds(self): pass @@ -618,44 +579,6 @@ def test_model_from_pretrained(self): model = SeamlessM4TModel.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip( reason="Expected missing keys serve when using SeamlessM4TForXXX.from_pretrained from a checkpoint saved by SeamlessM4TModel.save_pretrained." ) diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 521e5f864e98..66f3353d7482 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -25,7 +25,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -400,44 +399,6 @@ def test_model_from_pretrained(self): model = SeamlessM4Tv2Model.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="SeamlessM4Tv2SpeechEncoder doesn't have an embedding layer") def test_inputs_embeds(self): pass @@ -631,44 +592,6 @@ def test_model_from_pretrained(self): model = SeamlessM4Tv2Model.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip( reason="Expected missing keys serve when using SeamlessM4Tv2ForXXX.from_pretrained from a checkpoint saved by SeamlessM4Tv2Model.save_pretrained." ) diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 70eb4e25095d..be8b27c17712 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -24,7 +24,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -375,32 +374,6 @@ def test_seq_classifier_train(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_seq_classifier_training(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.parametrizations.weight", - "conv.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index c4d97b43a09d..84c0cc3c56ff 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -24,7 +24,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -386,32 +385,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.parametrizations.weight", - "conv.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 047f96bbdf1c..95a88f28c2e8 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -239,10 +239,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - def test_initialization(self): - pass - @slow def test_model_from_pretrained(self): model_name = "google/siglip-base-patch16-224" @@ -384,10 +380,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - def test_initialization(self): - pass - @slow def test_model_from_pretrained(self): model_name = "google/siglip-base-patch16-224" @@ -495,10 +487,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - def test_initialization(self): - pass - # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest._create_and_check_torchscript with CLIP->Siglip def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: @@ -654,10 +642,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip uses a non-standard initialization scheme") - def test_initialization(self): - pass - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/siglip2/test_modeling_siglip2.py b/tests/models/siglip2/test_modeling_siglip2.py index 9524ea1d6221..f417b512004b 100644 --- a/tests/models/siglip2/test_modeling_siglip2.py +++ b/tests/models/siglip2/test_modeling_siglip2.py @@ -331,10 +331,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme") - def test_initialization(self): - pass - @slow def test_model_from_pretrained(self): model_name = "google/siglip2-base-patch16-naflex" @@ -472,10 +468,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme") - def test_initialization(self): - pass - @slow def test_model_from_pretrained(self): model_name = "google/siglip2-base-patch16-naflex" @@ -588,10 +580,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme") - def test_initialization(self): - pass - def test_load_vision_text_config(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -685,10 +673,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme") - def test_initialization(self): - pass - # Draw a circle on an images with different aspect ratios def prepare_images(): diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index bfb305c38d17..103cc29e92d3 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -35,7 +35,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -556,33 +555,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # this model has no inputs_embeds @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): @@ -957,29 +929,6 @@ def test_forward_signature(self): expected_arg_names.extend(["encoder_outputs"]) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass @@ -1649,33 +1598,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass @@ -1863,10 +1785,6 @@ def test_forward_signature(self): def test_hidden_states_output(self): pass - @unittest.skip - def test_initialization(self): - pass - @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py index 91a508b97559..6e84a3a5c42f 100644 --- a/tests/models/swiftformer/test_modeling_swiftformer.py +++ b/tests/models/swiftformer/test_modeling_swiftformer.py @@ -26,7 +26,7 @@ from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -232,22 +232,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if name.endswith(".w_g"): - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9) / 1e9).round().item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index 9f602438bcd3..e6d23a8019c4 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -23,7 +23,7 @@ from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -447,20 +447,6 @@ def test_model_from_pretrained(self): model = SwinModel.from_pretrained(model_name) self.assertIsNotNone(model) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_vision @require_torch diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py index 0099f3cdc644..e4bf84c68283 100644 --- a/tests/models/swin2sr/test_modeling_swin2sr.py +++ b/tests/models/swin2sr/test_modeling_swin2sr.py @@ -20,7 +20,7 @@ from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -237,32 +237,6 @@ def test_model_from_pretrained(self): model = Swin2SRModel.from_pretrained(model_name) self.assertIsNotNone(model) - # overwriting because of `logit_scale` parameter - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "logit_scale" in name: - continue - if param.requires_grad: - # See PR #38607 (to avoid flakiness) - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py index fefe7bb7f841..28c1f8671f0c 100644 --- a/tests/models/swinv2/test_modeling_swinv2.py +++ b/tests/models/swinv2/test_modeling_swinv2.py @@ -24,7 +24,7 @@ from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -442,20 +442,6 @@ def test_model_from_pretrained(self): def test_feed_forward_chunking(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and "logit_scale" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_vision @require_torch diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 460f41495d73..10985988920e 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -23,7 +23,7 @@ from transformers.testing_utils import Expectations, require_timm, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -532,29 +532,6 @@ def test_greyscale_images(self): self.assertTrue(outputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.init_xavier_std = 1e9 - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "bbox_attention" in name and "bias" not in name: - self.assertLess( - 100000, - abs(param.data.max().item()), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - TOLERANCE = 1e-4 diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index b1a324c7a660..b839e583b990 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -36,7 +36,6 @@ if is_torch_available(): import torch - from torch import nn from transformers import TextNetBackbone, TextNetForImageClassification, TextNetModel @@ -246,22 +245,6 @@ def test_backbone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_backbone(*config_and_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.named_modules(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - torch.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - torch.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py index f274fd65c956..aafcb141db7c 100644 --- a/tests/models/timm_backbone/test_modeling_timm_backbone.py +++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py @@ -139,10 +139,6 @@ def test_hidden_states_output(self): def test_can_init_all_missing_weights(self): pass - @unittest.skip(reason="TimmBackbone initialization is managed on the timm side") - def test_initialization(self): - pass - @unittest.skip(reason="TimmBackbone models doesn't have inputs_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py index 33592276a640..69e2f5a134db 100644 --- a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py +++ b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py @@ -157,10 +157,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_can_init_all_missing_weights(self): pass - @unittest.skip(reason="TimmWrapper initialization is managed on the timm side") - def test_initialization(self): - pass - def test_gradient_checkpointing(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() model = TimmWrapperModel._from_config(config) diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py index c002127cbd9a..9672f74d4eee 100644 --- a/tests/models/tvp/test_modeling_tvp.py +++ b/tests/models/tvp/test_modeling_tvp.py @@ -22,7 +22,6 @@ from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -194,23 +193,6 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for TVP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # params are randomly initialized. - self.assertAlmostEqual( - param.data.mean().item(), - 0.0, - delta=1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_timm def test_backbone_selection(self): def _validate_backbone_init(): diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 65e108df0dbb..7aa4cdf1c1f4 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -26,7 +26,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -420,39 +419,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index cc0c8772969d..02564dea72f8 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -26,7 +26,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -460,41 +459,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "label_embeddings_concat", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: @@ -671,41 +635,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "label_embeddings_concat", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py index 4bc68977fff2..f816341a3abe 100644 --- a/tests/models/upernet/test_modeling_upernet.py +++ b/tests/models/upernet/test_modeling_upernet.py @@ -30,7 +30,7 @@ from transformers.utils.import_utils import get_torch_major_and_minor_version from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -219,21 +219,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.backbone_config = _config_zero_init(configs_no_init.backbone_config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @require_timm def test_backbone_selection(self): config, inputs = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index d06524305a4e..d0571e4a0299 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -34,7 +34,7 @@ from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -320,23 +320,6 @@ def test_flash_attn_2_inference_equivalence(self): def test_flash_attn_2_inference_equivalence_right_padding(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # This is an exception in the module, it's initialized with xavier_uniform without using initializer_range - if name.endswith("patch_embeddings.projection.weight"): - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py index e0d7c9344dc9..e099f44d074c 100644 --- a/tests/models/vitdet/test_modeling_vitdet.py +++ b/tests/models/vitdet/test_modeling_vitdet.py @@ -16,7 +16,7 @@ import unittest from transformers import VitDetConfig -from transformers.testing_utils import is_flaky, require_torch, torch_device +from transformers.testing_utils import require_torch, torch_device from transformers.utils import is_torch_available from ...test_backbone_common import BackboneTesterMixin @@ -173,28 +173,24 @@ def setUp(self): self.model_tester = VitDetModelTester(self) self.config_tester = ConfigTester(self, config_class=VitDetConfig, has_text_modality=False, hidden_size=37) - @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") - def test_initialization(self): - super().test_initialization() - # TODO: Fix me (once this model gets more usage) @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_cpu_offload(self): - super().test_cpu_offload() + pass # TODO: Fix me (once this model gets more usage) @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_disk_offload_bin(self): - super().test_disk_offload() + pass @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_disk_offload_safetensors(self): - super().test_disk_offload() + pass # TODO: Fix me (once this model gets more usage) @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_model_parallelism(self): - super().test_model_parallelism() + pass def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py index 2876d95e1f3e..654503c0bac0 100644 --- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py +++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py @@ -141,11 +141,6 @@ def test_config(self): def test_batching_equivalence(self, atol=3e-4, rtol=3e-4): super().test_batching_equivalence(atol=atol, rtol=rtol) - # TODO: @Pavel - @unittest.skip(reason="currently failing") - def test_initialization(self): - pass - @unittest.skip(reason="VitPoseBackbone does not support input and output embeddings") def test_model_common_attributes(self): pass diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py index a6661d6946e6..f45382e7ce95 100644 --- a/tests/models/vits/test_modeling_vits.py +++ b/tests/models/vits/test_modeling_vits.py @@ -220,46 +220,6 @@ def test_determinism(self): def test_batching_equivalence(self): pass - @is_flaky( - max_attempts=3, - description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range", - ) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - uniform_init_parms = [ - "emb_rel_k", - "emb_rel_v", - "conv_1", - "conv_2", - "conv_pre", - "conv_post", - "conv_proj", - "conv_dds", - "project", - "wavenet.in_layers", - "wavenet.res_skip_layers", - "upsampler", - "resblocks", - ] - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - @unittest.skip(reason="VITS has no inputs_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/vjepa2/test_modeling_vjepa2.py b/tests/models/vjepa2/test_modeling_vjepa2.py index 191dbc54848e..3a2a24c54901 100644 --- a/tests/models/vjepa2/test_modeling_vjepa2.py +++ b/tests/models/vjepa2/test_modeling_vjepa2.py @@ -21,7 +21,6 @@ from transformers import VJEPA2Config from transformers.testing_utils import ( - is_flaky, require_torch, require_vision, slow, @@ -167,10 +166,6 @@ def setUp(self): self.model_tester = VJEPA2ModelTester(self) self.config_tester = ConfigTester(self, config_class=VJEPA2Config, has_text_modality=False, hidden_size=37) - @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") - def test_initialization(self): - super().test_initialization() - def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index e41d37aa86a6..204eb797ee41 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -606,40 +606,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: @@ -949,40 +915,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 2ada8472b09e..7388f177d602 100644 --- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -31,7 +31,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -575,44 +574,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 4135f2a0e1ac..0304faaad3f7 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -33,7 +33,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -546,44 +545,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index 82edabc0bf17..787d3b738669 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -25,7 +25,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -397,42 +396,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "label_embeddings_concat", - "rel_attn_embed", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 78c089af4957..6d1827b9073b 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -561,32 +561,6 @@ def test_model_get_set_embeddings(self): def test_feed_forward_chunking(self): pass - # override as the `logit_scale`, `prompts_generator.alpha` parameters require special treatment - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif name == "prompts_generator.alpha": - self.assertAlmostEqual(param.data.mean().item(), model.config.prompt_alpha) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") diff --git a/tests/models/xcodec/test_modeling_xcodec.py b/tests/models/xcodec/test_modeling_xcodec.py index 6e00bf19062e..41f71c409ba6 100644 --- a/tests/models/xcodec/test_modeling_xcodec.py +++ b/tests/models/xcodec/test_modeling_xcodec.py @@ -345,25 +345,6 @@ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): dict_inputs = self._prepare_for_class(inputs_dict, model_class) check_equivalence(model, tuple_inputs, dict_inputs) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # skipping the parametrizations original0 tensor - if name == "semantic_model.encoder.pos_conv_embed.conv.parametrizations.weight.original0": - continue - - uniform_init_parms = ["conv"] - - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of {model_class.__name__} seems not properly initialized", - ) - @unittest.skip(reason="The XcodecModel does not have support dynamic compile yet") def test_sdpa_can_compile_dynamic(self): pass diff --git a/tests/models/xlstm/test_modeling_xlstm.py b/tests/models/xlstm/test_modeling_xlstm.py index c25c4f12e3e5..f1fb652e8b93 100644 --- a/tests/models/xlstm/test_modeling_xlstm.py +++ b/tests/models/xlstm/test_modeling_xlstm.py @@ -168,17 +168,6 @@ def setUp(self): self, config_class=xLSTMConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] ) - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, param in model.named_parameters(): - if "D" in name: - if param.requires_grad: - # check if it's a ones like - self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5)) - @unittest.skip(reason="xLSTM cache slicing test case is an edge case") def test_generate_without_input_ids(self): pass diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index 30c36e78dce5..74ca658b9e09 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -21,7 +21,6 @@ from transformers import AutoTokenizer, ZambaConfig, is_torch_available from transformers.testing_utils import ( - is_flaky, require_bitsandbytes, require_flash_attn, require_torch, @@ -32,7 +31,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -346,50 +345,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - @is_flaky(description="TODO: ydshieh") - def test_initialization(self): - r""" - Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "A_log" in name: - A = torch.arange(1, config.mamba_d_state + 1, dtype=torch.float32)[None, :] - intermediate_dim = config.mamba_expand * config.hidden_size - A = A.expand(intermediate_dim, -1).reshape( - config.n_mamba_heads, intermediate_dim // config.n_mamba_heads, -1 - ) - torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) - elif "D" in name: - # check if it's a ones like - torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) - elif "x_proj" in name or "dt_proj_weight" in name: - self.assertIn( - ((param.data.mean() * 1e2).round() / 1e2).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized (raw value {param.data.mean()})", - ) - elif "dt_proj_bias" in name: - dt = torch.exp( - torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) - + math.log(config.time_step_min) - ).clamp(min=config.time_step_floor) - inv_dt = dt + torch.log(-torch.expm1(-dt)) - if param.requires_grad: - self.assertTrue(param.data.max().item() <= inv_dt[1]) - self.assertTrue(param.data.min().item() >= inv_dt[0]) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_attention_outputs(self): r""" Overriding the test_attention_outputs test as the Zamba model outputs attention only for its attention layers diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py index 2b667dc89b40..665c03b83837 100644 --- a/tests/models/zamba2/test_modeling_zamba2.py +++ b/tests/models/zamba2/test_modeling_zamba2.py @@ -13,7 +13,6 @@ # limitations under the License. """Testing suite for the PyTorch Zamba model.""" -import math import tempfile import unittest @@ -34,7 +33,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -387,39 +386,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_initialization(self): - r""" - Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "A_log" in name: - A = torch.arange(1, config.n_mamba_heads + 1, dtype=torch.float32)[None, :] - self.assertTrue(torch.allclose(param.data, torch.log(A), atol=1e-5, rtol=1e-5)) - elif "D" in name: - # check if it's a ones like - self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5)) - elif "dt_bias" in name: - dt = torch.exp( - torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) - + math.log(config.time_step_min) - ).clamp(min=config.time_step_floor) - inv_dt = dt + torch.log(-torch.expm1(-dt)) - if param.requires_grad: - self.assertTrue(param.data.max().item() <= inv_dt[1]) - self.assertTrue(param.data.min().item() >= inv_dt[0]) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_attention_outputs(self): r""" Overriding the test_attention_outputs test as the Zamba2 model outputs attention only for its attention layers diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 15d221e7a48d..0896c1899329 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1021,28 +1021,6 @@ def check_equal(loaded): torch.save(state_dict, pt_checkpoint_path, _use_new_zipfile_serialization=False) check_equal(load_state_dict(pt_checkpoint_path)) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=copy.deepcopy(configs_no_init)) - for name, param in model.named_parameters(): - if param.requires_grad: - data = torch.flatten(param.data) - n_elements = torch.numel(data) - # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in - # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 - n_elements_to_skip_on_each_side = int(n_elements * 0.025) - data_to_check = torch.sort(data).values - if n_elements_to_skip_on_each_side > 0: - data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side] - self.assertIn( - ((data_to_check.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()