diff --git a/conftest.py b/conftest.py
index 670ceba2f272..0fab7ce3db27 100644
--- a/conftest.py
+++ b/conftest.py
@@ -54,7 +54,6 @@
     "test_gradient_checkpointing_backward_compatibility",
     "test_gradient_checkpointing_enable_disable",
     "test_torch_save_load",
-    "test_initialization",
     "test_forward_signature",
     "test_model_get_set_embeddings",
     "test_model_main_input_name",
diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py
index 4e1bbf75c507..4f0f98f359e7 100644
--- a/tests/models/aimv2/test_modeling_aimv2.py
+++ b/tests/models/aimv2/test_modeling_aimv2.py
@@ -39,7 +39,6 @@
 from ...test_modeling_common import (
     TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
     ModelTesterMixin,
-    _config_zero_init,
     _test_eager_matches_sdpa_inference,
     floats_tensor,
     ids_tensor,
@@ -427,30 +426,6 @@ def test_model_get_set_embeddings(self):
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    # Override as the `logit_scale` parameter initialization is different for Aimv2
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_load_vision_text_config(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py
index 1e41e5fc9c1d..e64ced9131a5 100644
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -466,7 +466,7 @@ def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
 
     @unittest.skip(reason="Start to fail after using torch `cu118`.")
     def test_multi_gpu_data_parallel_forward(self):
-        super().test_multi_gpu_data_parallel_forward()
+        pass
 
     @unittest.skip(reason="Hidden_states is tested in individual model tests")
     def test_hidden_states_output(self):
@@ -488,35 +488,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `temperature` parameter initialization is different for ALIGN
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `temperature` is initialized as per the original implementation
-                    if name == "temperature":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            1.0,
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif name == "text_projection.weight":
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py
index 9ed9a8331617..86ea1e6da431 100755
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -464,29 +464,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for AltCLIP
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py
index 6ac9b216c24c..42186970e459 100644
--- a/tests/models/aria/test_modeling_aria.py
+++ b/tests/models/aria/test_modeling_aria.py
@@ -198,10 +198,6 @@ def setUp(self):
         self.model_tester = AriaVisionText2TextModelTester(self)
         self.config_tester = ConfigTester(self, config_class=AriaConfig, has_text_modality=False)
 
-    @unittest.skip(reason="Unstable test")
-    def test_initialization(self):
-        pass
-
 
 SKIP = False
 torch_accelerator_module = getattr(torch, torch_device)
diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index 499c4a268e56..aa71b32d5447 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -197,10 +197,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="Compile not yet supported because in LLava models")
     @pytest.mark.torch_compile_test
     def test_sdpa_can_compile_dynamic(self):
diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py
index a1c832948209..0092218bc609 100644
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -41,7 +41,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -335,30 +335,6 @@ def test_decoder_model_past_with_large_inputs(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
 
-    def test_initialization(self):
-        r"""
-        Overriding the test_initialization test as the A_log and D params of the Bamba mixer are initialized differently
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "A_log" in name:
-                        A = torch.arange(1, config.mamba_n_heads + 1, dtype=torch.float32)
-                        torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
-                    elif "D" in name:
-                        D = torch.ones(config.mamba_n_heads, dtype=torch.float32)
-                        torch.testing.assert_close(param.data, D, rtol=1e-5, atol=1e-5)
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_attention_outputs(self):
         r"""
         Overriding the test_attention_outputs test as the Bamba model outputs attention only for its attention layers
diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py
index f78ed65c3d93..ab6c28dd4b2f 100644
--- a/tests/models/beit/test_modeling_beit.py
+++ b/tests/models/beit/test_modeling_beit.py
@@ -34,7 +34,7 @@
 
 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -382,24 +382,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # we skip lambda parameters as these require special initial values
-                # determined by config.layer_scale_init_value
-                if "lambda" in name:
-                    continue
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "microsoft/beit-base-patch16-224"
diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py
index 06477f320477..3f9269ca17a2 100644
--- a/tests/models/bit/test_modeling_bit.py
+++ b/tests/models/bit/test_modeling_bit.py
@@ -28,7 +28,6 @@
 
 if is_torch_available():
     import torch
-    from torch import nn
 
     from transformers import BitBackbone, BitForImageClassification, BitImageProcessor, BitModel
 
@@ -200,22 +199,6 @@ def test_backbone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_backbone(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, module in model.named_modules():
-                if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
-                    self.assertTrue(
-                        torch.all(module.weight == 1),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                    self.assertTrue(
-                        torch.all(module.bias == 0),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py
index 27b1b1202c11..cfe2d7dd0696 100644
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -456,41 +456,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for Blip
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        # See PR #38607 (to avoid flakiness)
-                        data = torch.flatten(param.data)
-                        n_elements = torch.numel(data)
-                        # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                        # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                        n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                        data_to_check = torch.sort(data).values
-                        if n_elements_to_skip_on_each_side > 0:
-                            data_to_check = data_to_check[
-                                n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
-                            ]
-                        self.assertIn(
-                            ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -981,30 +946,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for Blip
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -1194,30 +1135,6 @@ def test_training_gradient_checkpointing(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    # override as the `logit_scale` parameter initialization is different for Blip
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index 6523fddb47cc..f2efb12a4a39 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Testing suite for the PyTorch BLIP-2 model."""
 
-import copy
 import inspect
 import tempfile
 import unittest
@@ -42,7 +41,6 @@
 from ...test_modeling_common import (
     TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -988,23 +986,6 @@ def test_get_qformer_features(self):
             (self.model_tester.vision_model_tester.batch_size, 10, config.vision_config.hidden_size),
         )
 
-    # override from common to deal with nested configurations (`vision_config`, `text_config` and `qformer_config`)
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for key in ["vision_config", "qformer_config", "text_config"]:
-            setattr(configs_no_init, key, _config_zero_init(getattr(configs_no_init, key)))
-        for model_class in self.all_model_classes:
-            model = model_class(config=copy.deepcopy(configs_no_init))
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @unittest.skip("T5 backbone deepcopies the configs, and fixing it would be more involved")
     def test_internal_model_config_and_subconfig_are_same(self):
         pass
@@ -1511,36 +1492,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif name == "temp":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            0.07,
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py
index f1a3d94c20e4..ba9bf5aca93f 100644
--- a/tests/models/bridgetower/test_modeling_bridgetower.py
+++ b/tests/models/bridgetower/test_modeling_bridgetower.py
@@ -28,7 +28,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -437,29 +436,6 @@ def test_retain_grad_hidden_states_attentions(self):
         if self.has_attentions:
             self.assertIsNotNone(attentions.grad)
 
-    # override as the `logit_scale` parameter initialization is different for BRIDGE TOWER
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            config.logit_scale_init_value,
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. So this test is not applicable.""")
     def test_model_get_set_embeddings(self):
         pass
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 337e31c67dbb..140823b076d7 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -18,7 +18,6 @@
 import tempfile
 import unittest
 
-import numpy as np
 import requests
 
 from transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig
@@ -578,33 +577,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for CHINESE_CLIP
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for sub_config_key in ("vision_config", "text_config"):
-            sub_config = getattr(configs_no_init, sub_config_key, {})
-            setattr(configs_no_init, sub_config_key, _config_zero_init(sub_config))
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py
index b3914ee07fd5..5790c880e7a6 100644
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -525,30 +525,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for CLAP
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if "logit_scale" in name:
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 60104222c096..e0721c531b2b 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -562,30 +562,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for CLIP
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -750,10 +726,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="CLIP uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
     @slow
     @is_flaky()
diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py
index 753815c5989a..0410fc91fd34 100644
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -490,33 +490,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    # override as the some parameters require custom initialization
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if "logit_scale" in name:
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif "film" in name or "transposed_conv" in name or "reduce" in name:
-                        # those parameters use PyTorch' default nn.Linear initialization scheme
-                        pass
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index b8d559943c4e..e1c756df82c5 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -32,7 +32,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     ids_tensor,
     random_attention_mask,
 )
@@ -499,36 +498,6 @@ def test_inputs_embeds(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for Clvp
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        expected_value = np.log(1 / 0.07)
-                        returned_value = param.data.item()
-
-                        self.assertAlmostEqual(
-                            returned_value,
-                            expected_value,
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        expected_range = [0.0, 1.0]
-                        returned_range = ((param.data.mean() * 1e9).round() / 1e9).item()
-
-                        self.assertIn(
-                            returned_range,
-                            expected_range,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_load_speech_text_decoder_config(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
index 93169a34ca5e..53bded107a3b 100644
--- a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
+++ b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
@@ -169,10 +169,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(reason="Siglip backbone uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
 
 @require_read_token
 @require_torch
diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py
index c8571b1a5d74..761946891ead 100644
--- a/tests/models/colpali/test_modeling_colpali.py
+++ b/tests/models/colpali/test_modeling_colpali.py
@@ -273,10 +273,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_model_parallelism(self):
         pass
 
-    @unittest.skip(reason="PaliGemma's SigLip encoder uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     # TODO extend valid outputs to include this test @Molbap
     @unittest.skip(reason="PaliGemma has currently one output format.")
     def test_model_outputs_equivalence(self):
diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py
index c4d83e1b7bda..d186ce1d9a47 100644
--- a/tests/models/conditional_detr/test_modeling_conditional_detr.py
+++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py
@@ -22,7 +22,7 @@
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -506,29 +506,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.init_xavier_std = 1e9
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "bbox_attention" in name and "bias" not in name:
-                        self.assertLess(
-                            100000,
-                            abs(param.data.max().item()),
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
 
 TOLERANCE = 1e-4
 
diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py
index 341407795567..cc09a9fe646a 100644
--- a/tests/models/csm/test_modeling_csm.py
+++ b/tests/models/csm/test_modeling_csm.py
@@ -39,7 +39,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     ids_tensor,
 )
 
@@ -189,25 +188,6 @@ def _get_logits_processor_kwargs(self, do_sample=False, config=None):
 
         return logits_processor_kwargs
 
-    def test_initialization(self):
-        """
-        Overrides [ModelTesterMixin.test_initialization] because of specificities of Mimi codec model.
-        See https://github.com/huggingface/transformers/blob/1077603410cd73ba71d64a522033574d66d64b55/tests/models/mimi/test_modeling_mimi.py#L384-L397
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv", "input_proj", "output_proj"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5):
         """
         Overrides [GenerationTesterMixin._check_similar_generate_outputs] to handle third input_ids dimension.
diff --git a/tests/models/d_fine/test_modeling_d_fine.py b/tests/models/d_fine/test_modeling_d_fine.py
index 1348226857e8..7b6eddcfaca3 100644
--- a/tests/models/d_fine/test_modeling_d_fine.py
+++ b/tests/models/d_fine/test_modeling_d_fine.py
@@ -48,7 +48,7 @@
 from transformers import RTDetrImageProcessor
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -628,58 +628,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.initializer_bias_prior_prob = 0.2
-        bias_value = -1.3863  # log_e ((1 - 0.2) / 0.2)
-
-        failed_cases = []
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "DFineConvEncoder":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name:
-                        bias_tensor = torch.full_like(param.data, bias_value)
-                        try:
-                            torch.testing.assert_close(param.data, bias_tensor, atol=1e-4, rtol=1e-4)
-                        except AssertionError:
-                            failed_cases.append(
-                                f"Parameter {name} of model {model_class} seems not properly initialized. "
-                                f"Biases should be initialized to {bias_value}, got {param.data}"
-                            )
-                    elif (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                        or "enc_score_head.weight" in name
-                        or ("class_embed" in name and "weight" in name)
-                        or name in backbone_params
-                    ):
-                        continue
-                    else:
-                        mean = param.data.mean()
-                        round_mean = (mean * 1e9).round() / 1e9
-                        round_mean = round_mean.item()
-                        if round_mean not in [0.0, 1.0]:
-                            failed_cases.append(
-                                f"Parameter {name} of model {model_class} seems not properly initialized. "
-                                f"Mean is {round_mean}, but should be in [0, 1]"
-                            )
-
-        message = "\n" + "\n".join(failed_cases)
-        self.assertTrue(not failed_cases, message)
-
     @parameterized.expand(["float32", "float16", "bfloat16"])
     @require_torch_accelerator
     @slow
diff --git a/tests/models/dab_detr/test_modeling_dab_detr.py b/tests/models/dab_detr/test_modeling_dab_detr.py
index f9446e2eaee2..9df7038fdccc 100644
--- a/tests/models/dab_detr/test_modeling_dab_detr.py
+++ b/tests/models/dab_detr/test_modeling_dab_detr.py
@@ -22,7 +22,7 @@
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -700,55 +700,6 @@ def test_different_timm_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.init_xavier_std = 1e9
-        # Copied from RT-DETR
-        configs_no_init.initializer_bias_prior_prob = 0.2
-        bias_value = -1.3863  # log_e ((1 - 0.2) / 0.2)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "bbox_attention" in name and "bias" not in name:
-                        self.assertLess(
-                            100000,
-                            abs(param.data.max().item()),
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    # Modified from RT-DETR
-                    elif "class_embed" in name and "bias" in name:
-                        bias_tensor = torch.full_like(param.data, bias_value)
-                        torch.testing.assert_close(
-                            param.data,
-                            bias_tensor,
-                            atol=1e-4,
-                            rtol=1e-4,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif "activation_fn" in name and config.activation_function == "prelu":
-                        self.assertTrue(
-                            param.data.mean() == 0.25,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif "backbone.conv_encoder.model" in name:
-                        continue
-                    elif "self_attn.in_proj_weight" in name:
-                        self.assertIn(
-                            ((param.data.mean() * 1e2).round() / 1e2).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
 
 TOLERANCE = 1e-4
 CHECKPOINT = "IDEA-Research/dab-detr-resnet-50"
diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py
index eb5ad7225a8a..cb286a14e965 100644
--- a/tests/models/dac/test_modeling_dac.py
+++ b/tests/models/dac/test_modeling_dac.py
@@ -353,22 +353,6 @@ def recursive_check(tuple_object, dict_object):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs)
 
-    # Ignore copy
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv", "in_proj", "out_proj", "codebook"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_identity_shortcut(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs()
         config.use_conv_shortcut = False
diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py
index be5f9d6a0c57..34785ccd3a74 100644
--- a/tests/models/data2vec/test_modeling_data2vec_audio.py
+++ b/tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -24,7 +24,7 @@
 from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init
+from ...test_modeling_common import ModelTesterMixin
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -457,39 +457,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py
index c39baea90bc4..b540caecafe5 100644
--- a/tests/models/data2vec/test_modeling_data2vec_vision.py
+++ b/tests/models/data2vec/test_modeling_data2vec_vision.py
@@ -32,7 +32,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -291,24 +291,6 @@ def test_training_gradient_checkpointing(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # we skip lambda parameters as these require special initial values
-                # determined by config.layer_scale_init_value
-                if "lambda" in name:
-                    continue
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_for_image_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
diff --git a/tests/models/deepseek_vl/test_modeling_deepseek_vl.py b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py
index 7941695c6583..33c5e0c4a465 100644
--- a/tests/models/deepseek_vl/test_modeling_deepseek_vl.py
+++ b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py
@@ -186,11 +186,6 @@ def test_inputs_embeds_matches_input_ids(self):
                 out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
             torch.testing.assert_close(out_embeds, out_ids)
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
-    def test_initialization(self):
-        pass
-
     # Copied from tests.models.janus.test_modeling_janus.JanusVisionText2TextModelTest.test_sdpa_can_dispatch_composite_models
     def test_sdpa_can_dispatch_composite_models(self):
         for model_class in self.all_model_classes:
diff --git a/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py
index 5ce1679f5997..99dba2542ea3 100644
--- a/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py
+++ b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py
@@ -217,11 +217,6 @@ def test_inputs_embeds_matches_input_ids(self):
                 out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
             torch.testing.assert_close(out_embeds, out_ids)
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
-    def test_initialization(self):
-        pass
-
     def test_sdpa_can_dispatch_composite_models(self):
         for model_class in self.all_model_classes:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index dcd95a009ead..130c04df10ac 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -30,7 +30,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -575,29 +575,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            print("Model class:", model_class)
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_two_stage_training(self):
         model_class = DeformableDetrForObjectDetection
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py
index 6574923592a9..7cb57b843b7a 100644
--- a/tests/models/depth_pro/test_modeling_depth_pro.py
+++ b/tests/models/depth_pro/test_modeling_depth_pro.py
@@ -22,7 +22,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -300,45 +300,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                non_uniform_init_parms = [
-                    # these encoders are vision transformers
-                    # any layer outside these encoders is either Conv2d or ConvTranspose2d
-                    # which use kaiming initialization
-                    "patch_encoder",
-                    "image_encoder",
-                    "fov_model.encoder",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in non_uniform_init_parms):
-                        # See PR #38607 (to avoid flakiness)
-                        data = torch.flatten(param.data)
-                        n_elements = torch.numel(data)
-                        # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                        # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                        n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                        data_to_check = torch.sort(data).values
-                        if n_elements_to_skip_on_each_side > 0:
-                            data_to_check = data_to_check[
-                                n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
-                            ]
-                        self.assertIn(
-                            ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # this started when switched from normal initialization to kaiming_normal initialization
     # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers
     def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):
diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py
index b17d94bb031a..e6f8e07f8d16 100644
--- a/tests/models/detr/test_modeling_detr.py
+++ b/tests/models/detr/test_modeling_detr.py
@@ -22,7 +22,7 @@
 from transformers.testing_utils import Expectations, require_timm, require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -526,29 +526,6 @@ def test_greyscale_images(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.init_xavier_std = 1e9
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "bbox_attention" in name and "bias" not in name:
-                        self.assertLess(
-                            100000,
-                            abs(param.data.max().item()),
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
 
 TOLERANCE = 1e-4
 
diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py
index 721af9e8cab7..099b285581cb 100644
--- a/tests/models/dinat/test_modeling_dinat.py
+++ b/tests/models/dinat/test_modeling_dinat.py
@@ -23,7 +23,7 @@
 
 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -326,20 +326,6 @@ def test_model_from_pretrained(self):
         model = DinatModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @require_natten
 @require_vision
diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py
index 87a799df3709..85e70f99fca6 100644
--- a/tests/models/dinov2/test_modeling_dinov2.py
+++ b/tests/models/dinov2/test_modeling_dinov2.py
@@ -18,7 +18,6 @@
 
 from transformers import Dinov2Config
 from transformers.testing_utils import (
-    is_flaky,
     require_torch,
     require_vision,
     slow,
@@ -237,10 +236,6 @@ def setUp(self):
         self.model_tester = Dinov2ModelTester(self)
         self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37)
 
-    @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.")
-    def test_initialization(self):
-        super().test_initialization()
-
     def test_config(self):
         self.config_tester.run_common_tests()
 
diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
index c9696dedb2ae..bbeae1888a51 100644
--- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
+++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
@@ -27,7 +27,7 @@
 
 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -244,29 +244,6 @@ def setUp(self):
             self, config_class=Dinov2WithRegistersConfig, has_text_modality=False, hidden_size=37
         )
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad and "register_tokens" not in name:
-                    # See PR #38607 (to avoid flakiness)
-                    data = torch.flatten(param.data)
-                    n_elements = torch.numel(data)
-                    # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                    # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                    n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                    data_to_check = torch.sort(data).values
-                    if n_elements_to_skip_on_each_side > 0:
-                        data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
-                    self.assertIn(
-                        ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_config(self):
         self.config_tester.run_common_tests()
 
diff --git a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py
index 93af786e4c3b..1de2e6ec5ceb 100644
--- a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py
+++ b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py
@@ -21,7 +21,7 @@
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -160,29 +160,6 @@ def setUp(self):
         self.model_tester = DINOv3ViTModelTester(self)
         self.config_tester = ConfigTester(self, config_class=DINOv3ViTConfig, has_text_modality=False, hidden_size=37)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad and "register_tokens" not in name:
-                    # See PR #38607 (to avoid flakiness)
-                    data = torch.flatten(param.data)
-                    n_elements = torch.numel(data)
-                    # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                    # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                    n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                    data_to_check = torch.sort(data).values
-                    if n_elements_to_skip_on_each_side > 0:
-                        data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
-                    self.assertIn(
-                        ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_config(self):
         self.config_tester.run_common_tests()
 
diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py
index 1f26e3514843..720a11ca85f0 100644
--- a/tests/models/donut/test_modeling_donut_swin.py
+++ b/tests/models/donut/test_modeling_donut_swin.py
@@ -21,7 +21,7 @@
 from transformers.utils import is_torch_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -358,17 +358,3 @@ def test_model_from_pretrained(self):
         model_name = "naver-clova-ix/donut-base"
         model = DonutSwinModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py
index 10453e59c906..501139310b30 100644
--- a/tests/models/dpt/test_modeling_dpt.py
+++ b/tests/models/dpt/test_modeling_dpt.py
@@ -23,7 +23,7 @@
 from transformers.testing_utils import Expectations, require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -260,29 +260,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_sdpa_can_compile_dynamic(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            backbone_params = []
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "DPTViTHybridEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name in backbone_params:
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_backbone_selection(self):
         def _validate_backbone_init():
             for model_class in self.all_model_classes:
diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
index c8fd288bfbf5..237045464a87 100644
--- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py
+++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
@@ -21,7 +21,7 @@
 from transformers.utils.import_utils import get_torch_major_and_minor_version
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -194,29 +194,6 @@ def test_training_gradient_checkpointing(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            backbone_params = []
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "DPTViTHybridEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name in backbone_params:
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings")
     def test_model_get_set_embeddings(self):
         pass
diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py
index 3e10d0014ada..c28260320bf1 100644
--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -20,7 +20,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -270,29 +270,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            backbone_params = []
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "DPTViTHybridEmbeddings":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name in backbone_params:
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "Intel/dpt-hybrid-midas"
diff --git a/tests/models/edgetam/test_modeling_edgetam.py b/tests/models/edgetam/test_modeling_edgetam.py
index 687a895cc939..3bbc350a0841 100644
--- a/tests/models/edgetam/test_modeling_edgetam.py
+++ b/tests/models/edgetam/test_modeling_edgetam.py
@@ -367,10 +367,6 @@ def test_hidden_states_output(self):
     def test_can_init_all_missing_weights(self):
         pass
 
-    @unittest.skip(reason="Timm weights cannot be fully constructed in _init_weights")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(
         reason="TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation."
     )
diff --git a/tests/models/emu3/test_modeling_emu3.py b/tests/models/emu3/test_modeling_emu3.py
index 946226eaf7f6..71d3c9f48b52 100644
--- a/tests/models/emu3/test_modeling_emu3.py
+++ b/tests/models/emu3/test_modeling_emu3.py
@@ -348,10 +348,6 @@ def test_disk_offload_bin(self):
     def test_cpu_offload(self):
         pass
 
-    @unittest.skip("VQ-VAE module doesn't initialize weights properly")
-    def test_initialization(self):
-        pass
-
     @pytest.mark.generate
     @unittest.skip("Emu3 has dynamic control flow in vision backbone")
     def test_generate_with_static_cache(self):
diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py
index 21091c164c9d..2dfbbcaee41e 100644
--- a/tests/models/encodec/test_modeling_encodec.py
+++ b/tests/models/encodec/test_modeling_encodec.py
@@ -410,28 +410,6 @@ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv"]
-                ignore_init = ["lstm"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif not any(x in name for x in ignore_init):
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_identity_shortcut(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs()
         config.use_conv_shortcut = False
diff --git a/tests/models/eomt/test_modeling_eomt.py b/tests/models/eomt/test_modeling_eomt.py
index 367b29651f0d..7c8163266479 100644
--- a/tests/models/eomt/test_modeling_eomt.py
+++ b/tests/models/eomt/test_modeling_eomt.py
@@ -22,7 +22,7 @@
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -232,40 +232,6 @@ def test_training(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    def test_initialization(self):
-        # Apart from the below params, all other parameters are initialized using kaiming uniform.
-        non_uniform_init_parms = [
-            "layernorm.bias",
-            "layernorm.weight",
-            "norm1.bias",
-            "norm1.weight",
-            "norm2.bias",
-            "norm2.weight",
-            "layer_scale1.lambda1",
-            "layer_scale2.lambda1",
-            "register_tokens",
-            "cls_token",
-        ]
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if any(x in name for x in non_uniform_init_parms):
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
 
 @require_torch
 class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py
index 27f27105cc37..ff28ad50baec 100644
--- a/tests/models/esm/test_modeling_esmfold.py
+++ b/tests/models/esm/test_modeling_esmfold.py
@@ -240,12 +240,6 @@ def test_model_outputs_equivalence(self):
     def test_feed_forward_chunking(self):
         pass
 
-    @unittest.skip(
-        reason="ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments."
-    )
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="ESMFold doesn't support torchscript compilation.")
     def test_torchscript_output_attentions(self):
         pass
diff --git a/tests/models/evolla/test_modeling_evolla.py b/tests/models/evolla/test_modeling_evolla.py
index 78716cf36a33..6e399ef0ffde 100644
--- a/tests/models/evolla/test_modeling_evolla.py
+++ b/tests/models/evolla/test_modeling_evolla.py
@@ -32,7 +32,6 @@
 from ...test_modeling_common import (
     TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
     ModelTesterMixin,
-    _config_zero_init,
     ids_tensor,
     random_attention_mask,
 )
@@ -300,25 +299,6 @@ def test_single_forward(self):
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             print(outputs)
 
-    def test_initialization(self):
-        # we skip the latents initialization test
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # skip latents
-                    if name.endswith("latents"):
-                        print(f"Skipping latents {name}")
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
     @unittest.skip("Evolla requires both text and protein inputs which is currently not done in this test.")
     def test_eager_matches_sdpa_inference(self):
diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
index 7c0b6cf19aa9..919130175b7e 100644
--- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
+++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 
-import math
 import unittest
 from unittest.util import safe_repr
 
@@ -34,7 +33,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -331,45 +330,6 @@ def test_falcon_mamba_lm_head_forward_and_backwards(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_falcon_mamba_lm_head_forward_and_backwards(*config_and_inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        config.rescale_prenorm_residual = True
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "dt_proj.bias" in name:
-                    dt = torch.exp(
-                        torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
-                        + math.log(config.time_step_min)
-                    ).clamp(min=config.time_step_floor)
-                    inv_dt = dt + torch.log(-torch.expm1(-dt))
-                    if param.requires_grad:
-                        self.assertTrue(param.data.max().item() <= inv_dt[1])
-                        self.assertTrue(param.data.min().item() >= inv_dt[0])
-                elif "A_log" in name:
-                    A = torch.arange(1, config.state_size + 1, dtype=torch.float32)[None, :]
-                    A = A.expand(config.intermediate_size, -1).contiguous()
-                    torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
-                elif "D" in name:
-                    if param.requires_grad:
-                        # check if it's a ones like
-                        torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
-                else:
-                    if param.requires_grad:
-                        if (
-                            "mixer.conv1d.weight" in name
-                            or "mixer.dt_proj.weight" in name
-                            or "mixer.out_proj.weight" in name
-                        ):
-                            continue
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @slow
     # Ignore copy
     def test_model_from_pretrained(self):
diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
index 6e7b567c1d91..33889a7a9404 100644
--- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
+++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
@@ -33,7 +33,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 
 
 if is_torch_available():
@@ -141,22 +141,6 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    msg = f"Parameter {name} of model {model_class} seems not properly initialized"
-                    if "norm" in name:
-                        if "bias" in name:
-                            self.assertEqual(param.data.mean().item(), 0.0, msg=msg)
-                        if "weight" in name:
-                            self.assertEqual(param.data.mean().item(), 1.0, msg=msg)
-                    elif "conv" in name or "embed" in name:
-                        self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg)
-
     def test_duration_energy_pitch_output(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
@@ -573,22 +557,6 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    msg = f"Parameter {name} of model {model_class} seems not properly initialized"
-                    if "norm" in name:
-                        if "bias" in name:
-                            self.assertEqual(param.data.mean().item(), 0.0, msg=msg)
-                        if "weight" in name:
-                            self.assertEqual(param.data.mean().item(), 1.0, msg=msg)
-                    elif "conv" in name or "embed" in name:
-                        self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg)
-
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         return inputs_dict
 
diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py
index 5915d25ee39b..5333e6ef9242 100644
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -920,30 +920,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for FLAVA
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale" or name == "flava.logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py
index f3ea480ed91d..f09d8e8a8e51 100644
--- a/tests/models/focalnet/test_modeling_focalnet.py
+++ b/tests/models/focalnet/test_modeling_focalnet.py
@@ -23,7 +23,7 @@
 
 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -387,20 +387,6 @@ def test_model_from_pretrained(self):
         model = FocalNetModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @require_vision
 @require_torch
diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py
index 27cfa83f7b33..bcceea0e7379 100644
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -335,12 +335,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(
-        reason="Siglip (vision backbone) uses the same initialization scheme as the Flax original implementation"
-    )
-    def test_initialization(self):
-        pass
-
     @unittest.skip("Loading nested configs with overwritten `kwargs` isn't supported yet, FIXME @raushan.")
     def test_load_with_mismatched_shapes(self):
         pass
diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py
index 7f7a1d38915a..06c118955aff 100644
--- a/tests/models/gemma3n/test_modeling_gemma3n.py
+++ b/tests/models/gemma3n/test_modeling_gemma3n.py
@@ -147,7 +147,6 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase):
     is_generative = False
     _is_stateful = True
     main_input_name = "audio_mel"
-    test_initialization = False
 
     def setUp(self):
         self.model_tester = Gemma3nAudioModelTester(self)
@@ -700,10 +699,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="Siglip (vision backbone) uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(
         reason="Siglip has no FLEX attention, and we don't have a proper way to set/test attn in VLMs. TODO @raushan"
     )
diff --git a/tests/models/got_ocr2/test_modeling_got_ocr2.py b/tests/models/got_ocr2/test_modeling_got_ocr2.py
index bfc1d6e26f41..db9f9f97d417 100644
--- a/tests/models/got_ocr2/test_modeling_got_ocr2.py
+++ b/tests/models/got_ocr2/test_modeling_got_ocr2.py
@@ -25,7 +25,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -162,20 +162,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @require_torch
 class GotOcr2IntegrationTest(unittest.TestCase):
diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 87e23a81915e..516d44896c2e 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -40,7 +40,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -251,22 +250,6 @@ def test_inputs_embeds(self):
             with torch.no_grad():
                 model(**inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if name == "projector.query":
-                    continue
-                elif param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_sdpa_can_dispatch_composite_models(self):
         # overwrite because Granite Speech is audio+text model (not vision+text)
         if not self.has_attentions:
diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py
index e5684462e449..6de95751f165 100644
--- a/tests/models/grounding_dino/test_modeling_grounding_dino.py
+++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py
@@ -40,7 +40,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -569,32 +569,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "text_param" in name
-                        or "vision_param" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                        or "vision_proj" in name
-                        or "text_proj" in name
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     # Copied from tests.models.deformable_detr.test_modeling_deformable_detr.DeformableDetrModelTest.test_two_stage_training with DeformableDetr->GroundingDino
     def test_two_stage_training(self):
         model_class = GroundingDinoForObjectDetection
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index dc5238a9275a..7dcc17070a10 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -566,30 +566,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for GROUPVIT
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/hgnet_v2/test_modeling_hgnet_v2.py b/tests/models/hgnet_v2/test_modeling_hgnet_v2.py
index d6e783677d49..1900f94657a8 100644
--- a/tests/models/hgnet_v2/test_modeling_hgnet_v2.py
+++ b/tests/models/hgnet_v2/test_modeling_hgnet_v2.py
@@ -16,7 +16,6 @@
 import unittest
 
 import torch
-from torch import nn
 
 from transformers import HGNetV2Config
 from transformers.testing_utils import require_torch, torch_device
@@ -208,22 +207,6 @@ def test_backbone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_backbone(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, module in model.named_modules():
-                if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
-                    self.assertTrue(
-                        torch.all(module.weight == 1),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                    self.assertTrue(
-                        torch.all(module.bias == 0),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py
index 79c4b4e02118..dfa75fbb89c6 100644
--- a/tests/models/hubert/test_modeling_hubert.py
+++ b/tests/models/hubert/test_modeling_hubert.py
@@ -407,32 +407,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "quantizer.weight_proj.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # Hubert cannot be TorchScripted because of torch.nn.utils.weight_norm
     def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
         # TODO: fix it
@@ -671,32 +645,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "quantizer.weight_proj.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/internvl/test_modeling_internvl.py b/tests/models/internvl/test_modeling_internvl.py
index 69ed30495524..584680d57d58 100644
--- a/tests/models/internvl/test_modeling_internvl.py
+++ b/tests/models/internvl/test_modeling_internvl.py
@@ -41,7 +41,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -207,20 +207,6 @@ def test_flex_attention_with_grads(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @unittest.skip(reason="Compile not yet supported because in LLava models")
     @pytest.mark.torch_compile_test
     def test_sdpa_can_compile_dynamic(self):
diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py
index e00528c74cf0..bcf1a3b491f5 100644
--- a/tests/models/jamba/test_modeling_jamba.py
+++ b/tests/models/jamba/test_modeling_jamba.py
@@ -34,7 +34,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -427,31 +427,6 @@ def test_load_balancing_loss(self):
         # After #40617, we still have 0.003 % of failure rate here.
         self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item())
 
-    def test_initialization(self):
-        r"""
-        Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "A_log" in name:
-                        A = torch.arange(1, config.mamba_d_state + 1, dtype=torch.float32)[None, :]
-                        A = A.expand(config.mamba_expand * config.hidden_size, -1).contiguous()
-                        torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
-                    elif "D" in name:
-                        # check if it's a ones like
-                        torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_attention_outputs(self):
         r"""
         Overriding the test_attention_outputs test as the Jamba model outputs attention only for its attention layers
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 48dae2ec820b..d578337df886 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -315,24 +315,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    # overwrite from common to skip `image_to_text_projection.latent_query`
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name == "image_to_text_projection.latent_query":
-                        # The original code use ` nn.Parameter(torch.randn(...))` for which this test won't pass.
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
index 8a7811c766c1..fc7dac8f02b5 100644
--- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
+++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
@@ -43,7 +43,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -378,24 +377,6 @@ def test_assisted_decoding_sample(self):
     def test_prompt_lookup_decoding_matches_greedy_search(self):
         pass
 
-    # overwrite from common to skip `image_to_text_projection.latent_query`
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if name == "image_to_text_projection.latent_query":
-                        # The original code use ` nn.Parameter(torch.randn(...))` for which this test won't pass.
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
index 1bc92b9afac5..c0ea557a33da 100644
--- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
+++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
@@ -41,7 +41,6 @@
 from ...test_modeling_common import (
     TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -317,25 +316,6 @@ def test_tied_weights_keys(self):
     def test_generate_without_input_ids(self):
         pass
 
-    def test_initialization(self):
-        """
-        Overrides [ModelTesterMixin.test_initialization] because of specificities of Mimi codec model.
-        See https://github.com/huggingface/transformers/blob/1077603410cd73ba71d64a522033574d66d64b55/tests/models/mimi/test_modeling_mimi.py#L384-L397
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv", "input_proj", "output_proj"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
     def test_eager_matches_sdpa_inference(
         self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index 2c1b157e3a90..4faf6aa61b4a 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -26,7 +26,7 @@
 from transformers.utils import is_detectron2_available, is_torch_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -432,23 +432,6 @@ def test_model_from_pretrained(self):
         model = LayoutLMv2Model.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "backbone" in name or "visual_segment_embedding" in name:
-                    continue
-
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_batching_equivalence(self):
         def equivalence(tensor1, tensor2):
             return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0)
diff --git a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py
index c9d39c3ef9ee..d097d7e9e629 100644
--- a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py
+++ b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py
@@ -204,12 +204,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(
-        reason="Siglip2 backbone has a non-standard initialization scheme, that this test cannot handle easily"
-    )
-    def test_initialization(self):
-        pass
-
 
 @require_torch_accelerator
 @require_read_token
diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py
index bd119be91be7..ef2b1609ba99 100644
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -41,7 +41,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -205,22 +204,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "image_newline" in name:
-                    continue
-                elif param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_mismatching_num_image_tokens(self):
         """
         Tests that VLMs through an error with explicit message saying what is wrong
diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py
index ed9cdb12ab01..d8980f35fdcf 100644
--- a/tests/models/llava_next_video/test_modeling_llava_next_video.py
+++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -41,7 +41,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -218,22 +217,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "image_newline" in name:
-                    continue
-                elif param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_mismatching_num_image_tokens(self):
         """
         Tests that VLMs through an error with explicit message saying what is wrong
diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py
index c8ce5c364daf..c2280478a35b 100644
--- a/tests/models/llava_onevision/test_modeling_llava_onevision.py
+++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -41,7 +41,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -215,23 +214,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # LLaVa Onevision has SigLIP backbone which init weights differently from CLIP
-                if "image_newline" in name or "vision_tower" in name:
-                    continue
-                elif param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_odd_sized_image(self):
         # prepare model configuration
         config = self.model_tester.get_config()
diff --git a/tests/models/mamba/test_modeling_mamba.py b/tests/models/mamba/test_modeling_mamba.py
index a78a89868229..f63b3cf012a0 100644
--- a/tests/models/mamba/test_modeling_mamba.py
+++ b/tests/models/mamba/test_modeling_mamba.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 
-import math
 import unittest
 from unittest.util import safe_repr
 
@@ -25,7 +24,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -298,45 +297,6 @@ def test_mamba_lm_head_forward_and_backwards(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_mamba_lm_head_forward_and_backwards(*config_and_inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        config.rescale_prenorm_residual = True
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "dt_proj.bias" in name:
-                    dt = torch.exp(
-                        torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
-                        + math.log(config.time_step_min)
-                    ).clamp(min=config.time_step_floor)
-                    inv_dt = dt + torch.log(-torch.expm1(-dt))
-                    if param.requires_grad:
-                        self.assertTrue(param.data.max().item() <= inv_dt[1])
-                        self.assertTrue(param.data.min().item() >= inv_dt[0])
-                elif "A_log" in name:
-                    A = torch.arange(1, config.state_size + 1, dtype=torch.float32)[None, :]
-                    A = A.expand(config.intermediate_size, -1).contiguous()
-                    torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
-                elif "D" in name:
-                    if param.requires_grad:
-                        # check if it's a ones like
-                        torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
-                else:
-                    if param.requires_grad:
-                        if (
-                            "mixer.conv1d.weight" in name
-                            or "mixer.dt_proj.weight" in name
-                            or "mixer.out_proj.weight" in name
-                        ):
-                            continue
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @slow
     def test_model_from_pretrained(self):
         model = MambaModel.from_pretrained("hf-internal-testing/mamba-130m")
diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py
index 03a91dcbe559..0589b4778b8d 100644
--- a/tests/models/mamba2/test_modeling_mamba2.py
+++ b/tests/models/mamba2/test_modeling_mamba2.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 
-import math
 import unittest
 
 from transformers import AutoTokenizer, Mamba2Config, is_torch_available
@@ -29,7 +28,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -273,40 +272,6 @@ def test_mamba2_slow_vs_fast_forward_grouped(self):
         config_and_inputs[0].n_groups //= 2
         self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        config.rescale_prenorm_residual = True
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "dt_proj.bias" in name:
-                    dt = torch.exp(
-                        torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
-                        + math.log(config.time_step_min)
-                    ).clamp(min=config.time_step_floor)
-                    inv_dt = dt + torch.log(-torch.expm1(-dt))
-                    if param.requires_grad:
-                        self.assertTrue(param.data.max().item() <= inv_dt[1])
-                        self.assertTrue(param.data.min().item() >= inv_dt[0])
-                elif "A_log" in name:
-                    A = torch.arange(1, config.num_heads + 1)
-                    torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
-                elif "D" in name:
-                    if param.requires_grad:
-                        # check if it's a ones like
-                        torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
-                else:
-                    if param.requires_grad:
-                        if "mixer.conv1d.weight" in name or "mixer.dt_bias" in name or "mixer.out_proj.weight" in name:
-                            continue
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="A large mamba2 would be necessary (and costly) for that")
     def test_multi_gpu_data_parallel_forward(self):
         pass
diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py
index 4f78399a2865..2b28936a0dc1 100644
--- a/tests/models/mask2former/test_modeling_mask2former.py
+++ b/tests/models/mask2former/test_modeling_mask2former.py
@@ -35,7 +35,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init
+from ...test_modeling_common import ModelTesterMixin
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -351,26 +351,6 @@ def test_backbone_selection(self):
             elif model.__class__.__name__ == "Mask2FormerForUniversalSegmentation":
                 self.assertEqual(model.model.pixel_level_module.encoder.out_indices, [1, 2, 3])
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "self_attn.sampling_offsets.bias" in name
-                        or "self_attn.value_proj.weight" in name
-                        or "self_attn.output_proj.weight" in name
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_initialization_pretrained_backbone(self):
         backbone_name = "microsoft/resnet-18"
 
diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py
index f1b1c9747438..0a944c49d9c9 100644
--- a/tests/models/maskformer/test_modeling_maskformer_swin.py
+++ b/tests/models/maskformer/test_modeling_maskformer_swin.py
@@ -311,10 +311,6 @@ def test_hidden_states_output_with_padding(self):
     def test_model_from_pretrained(self):
         pass
 
-    @unittest.skip(reason="This will be fixed once MaskFormerSwin is replaced by native Swin")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="This will be fixed once MaskFormerSwin is replaced by native Swin")
     def test_gradient_checkpointing_backward_compatibility(self):
         pass
diff --git a/tests/models/metaclip_2/test_modeling_metaclip_2.py b/tests/models/metaclip_2/test_modeling_metaclip_2.py
index b6be33fa8e8b..b13cb0855923 100644
--- a/tests/models/metaclip_2/test_modeling_metaclip_2.py
+++ b/tests/models/metaclip_2/test_modeling_metaclip_2.py
@@ -572,30 +572,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for MetaClip2
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -761,10 +737,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="MetaClip2 uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
     @slow
     @is_flaky()
diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py
index ecb84f144932..865705481e3a 100644
--- a/tests/models/mgp_str/test_modeling_mgp_str.py
+++ b/tests/models/mgp_str/test_modeling_mgp_str.py
@@ -22,7 +22,7 @@
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -203,22 +203,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    # override as the `logit_scale` parameter initialization is different for MgpstrModel
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if isinstance(param, (nn.Linear, nn.Conv2d, nn.LayerNorm)):
-                    if param.requires_grad:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="Retain_grad is tested in individual model tests")
     def test_retain_grad_hidden_states_attentions(self):
         pass
diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py
index 54ce4763af5f..9b49c903baf5 100644
--- a/tests/models/mimi/test_modeling_mimi.py
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -385,21 +385,6 @@ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv", "input_proj", "output_proj"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_identity_shortcut
     def test_identity_shortcut(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs()
diff --git a/tests/models/mistral3/test_modeling_mistral3.py b/tests/models/mistral3/test_modeling_mistral3.py
index 5cd0c023e165..694cf3a267bb 100644
--- a/tests/models/mistral3/test_modeling_mistral3.py
+++ b/tests/models/mistral3/test_modeling_mistral3.py
@@ -36,7 +36,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -192,20 +192,6 @@ def check_config_can_be_init_without_params():
         self.config_tester.check_config_can_be_init_without_params = check_config_can_be_init_without_params
         self.config_tester.run_common_tests()
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @unittest.skip(reason="Compile not yet supported because in LLava models")
     @pytest.mark.torch_compile_test
     def test_sdpa_can_compile_dynamic(self):
diff --git a/tests/models/mlcd/test_modeling_mlcd.py b/tests/models/mlcd/test_modeling_mlcd.py
index 81fe4df9051b..bca94e9bf679 100644
--- a/tests/models/mlcd/test_modeling_mlcd.py
+++ b/tests/models/mlcd/test_modeling_mlcd.py
@@ -32,7 +32,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 
 
 if is_torch_available():
@@ -141,20 +141,6 @@ def test_model_get_set_embeddings(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad and "class_pos_emb" not in name:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @require_torch
 class MLCDVisionModelIntegrationTest(unittest.TestCase):
diff --git a/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py b/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py
index fed96e23384b..f1e63787fc57 100644
--- a/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py
+++ b/tests/models/mm_grounding_dino/test_modeling_mm_grounding_dino.py
@@ -39,7 +39,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -574,34 +574,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    # Ignore copy
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "text_param" in name
-                        or "vision_param" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                        or "vision_proj" in name
-                        or "text_proj" in name
-                        or ("class_embed" in name and "bias" in name)
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     # Copied from tests.models.deformable_detr.test_modeling_deformable_detr.DeformableDetrModelTest.test_two_stage_training with DeformableDetr->MMGroundingDino
     def test_two_stage_training(self):
         model_class = MMGroundingDinoForObjectDetection
diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py
index 2e2fbe77417e..db4180d6d97c 100644
--- a/tests/models/modernbert/test_modeling_modernbert.py
+++ b/tests/models/modernbert/test_modeling_modernbert.py
@@ -32,7 +32,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -300,31 +300,6 @@ def test_model_various_embeddings(self):
             config_and_inputs[0].position_embedding_type = type
             self.model_tester.create_and_check_model(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # The classifier.weight from ModernBertForSequenceClassification and ModernBertForTokenClassification
-                # are initialized without `initializer_range`, so they're not set to ~0 via the _config_zero_init
-                if param.requires_grad and not (
-                    name == "classifier.weight"
-                    and model_class
-                    in [
-                        ModernBertForSequenceClassification,
-                        ModernBertForTokenClassification,
-                        ModernBertForQuestionAnswering,
-                        ModernBertForMultipleChoice,
-                    ]
-                ):
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
diff --git a/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py b/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py
index fcfb8f28e796..d6b1a13105d2 100644
--- a/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py
+++ b/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py
@@ -22,7 +22,6 @@
 )
 
 from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
-from ...test_modeling_common import _config_zero_init
 
 
 if is_torch_available():
@@ -53,32 +52,6 @@ class ModernBertDecoderModelTest(CausalLMModelTest, unittest.TestCase):
     )
     model_tester_class = ModernBertDecoderModelTester
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # The classifier.weight from ModernBertDecoderForSequenceClassification
-                # is initialized without `initializer_range`, so it's not set to ~0 via the _config_zero_init
-                if param.requires_grad and not (
-                    name == "classifier.weight" and model_class == ModernBertDecoderForSequenceClassification
-                ):
-                    data = torch.flatten(param.data)
-                    n_elements = torch.numel(data)
-                    # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                    # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                    n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                    data_to_check = torch.sort(data).values
-                    if n_elements_to_skip_on_each_side > 0:
-                        data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
-                    self.assertIn(
-                        ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @slow
 @require_torch
diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py
index a724711e1c22..06f81f3ef79c 100644
--- a/tests/models/moshi/test_modeling_moshi.py
+++ b/tests/models/moshi/test_modeling_moshi.py
@@ -582,21 +582,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se
             output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams
         )
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv", "input_proj", "output_proj"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs")
     def test_generate_continue_from_past_key_values(self):
         pass
diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index e5b876b42604..e5f41ca358d3 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -874,29 +874,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    # override since the conv layers and lstm's in encodec are exceptions
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv"]
-                ignore_init = ["lstm"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif not any(x in name for x in ignore_init):
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # override since we have embeddings / LM heads over multiple codebooks
     def test_model_get_set_embeddings(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index 4d487546c741..6f5f8728dbab 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -877,29 +877,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    # override since the conv layers and lstm's in encodec are exceptions
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = ["conv"]
-                ignore_init = ["lstm"]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif not any(x in name for x in ignore_init):
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # override since we have embeddings / LM heads over multiple codebooks
     def test_model_get_set_embeddings(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py
index a69ef2b3d6fe..bdb6a7477b66 100644
--- a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py
+++ b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py
@@ -32,7 +32,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -614,29 +614,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(encoder_attentions.grad)
         self.assertIsNotNone(cross_attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "embeddings" in name
-                        or ".fc" in name
-                        or "decoder.channel_projection_layers" in name
-                        or "query_position_head" in name
-                        or "decoder.encoder_vision_features" in name
-                        or "language_backbone.text_projection" in name
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} seems not properly initialized",
-                    )
-
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py
index 4ae6bba12033..0fc2ae46952c 100644
--- a/tests/models/oneformer/test_modeling_oneformer.py
+++ b/tests/models/oneformer/test_modeling_oneformer.py
@@ -35,7 +35,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init
+from ...test_modeling_common import ModelTesterMixin
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -364,34 +364,6 @@ def test_attention_outputs(self):
             outputs = model(**inputs, output_attentions=True)
             self.assertTrue(outputs.attentions is not None)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.is_training = True
-        config.contrastive_temperature = 1
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if (
-                        "self_attn.sampling_offsets.bias" in name
-                        or "self_attn.value_proj.weight" in name
-                        or "self_attn.output_proj.weight" in name
-                        or "self_attn.in_proj_weight" in name
-                        or "self_attn.out_proj.weight" in name
-                        or "mlp.fc1.weight" in name
-                        or "mlp.fc2.weight" in name
-                        or "text_mapper.text_encoder.positional_embedding" in name
-                        or "text_mapper.text_encoder.token_embedding.weight" in name
-                    ):
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_initialization_pretrained_backbone(self):
         backbone_name = "microsoft/resnet-18"
 
diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py
index ba0d041801de..d8afcbb45836 100644
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -459,30 +459,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for OWLV2
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -668,10 +644,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    @unittest.skip(reason="Test_initialization is tested in individual model tests")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="Test_forward_signature is tested in individual model tests")
     def test_forward_signature(self):
         pass
diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py
index fa87b57072c6..185d95244759 100644
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -454,30 +454,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for OWLVIT
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -661,10 +637,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    @unittest.skip(reason="Test_initialization is tested in individual model tests")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="Test_forward_signature is tested in individual model tests")
     def test_forward_signature(self):
         pass
diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py
index d052bfb97f81..5797f75b2fb5 100644
--- a/tests/models/paligemma/test_modeling_paligemma.py
+++ b/tests/models/paligemma/test_modeling_paligemma.py
@@ -265,10 +265,6 @@ def test_disk_offload_safetensors(self):
     def test_model_parallelism(self):
         pass
 
-    @unittest.skip(reason="PaliGemma's SigLip encoder uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     # TODO extend valid outputs to include this test @Molbap
     @unittest.skip(reason="PaliGemma has currently one output format.")
     def test_model_outputs_equivalence(self):
diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py
index 536eb95bef24..8cb7feabe8a1 100644
--- a/tests/models/paligemma2/test_modeling_paligemma2.py
+++ b/tests/models/paligemma2/test_modeling_paligemma2.py
@@ -244,10 +244,6 @@ def test_disk_offload_safetensors(self):
     def test_model_parallelism(self):
         pass
 
-    @unittest.skip(reason="PaliGemma's SigLip encoder uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     # TODO extend valid outputs to include this test @Molbap
     @unittest.skip(reason="PaliGemma has currently one output format.")
     def test_model_outputs_equivalence(self):
diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py
index 15738c55a3ea..ba2bf1c582c3 100644
--- a/tests/models/perception_lm/test_modeling_perception_lm.py
+++ b/tests/models/perception_lm/test_modeling_perception_lm.py
@@ -293,10 +293,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_can_init_all_missing_weights(self):
         pass
 
-    @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(
         reason="PE/TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation."
     )
diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
index 0363476e0fc6..61c72fe857c1 100644
--- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
@@ -208,10 +208,6 @@ def setUp(self):
         self.model_tester = Phi4MultimodalModelTester(self)
         self.config_tester = ConfigTester(self, config_class=Phi4MultimodalConfig)
 
-    @unittest.skip(reason="Unstable test")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="Depending on input modalities, some params may not have gradients")
     def test_training_gradient_checkpointing(self):
         pass
diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py
index efeefa6bc155..813334106ee4 100644
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -524,41 +524,6 @@ def test_training_gradient_checkpointing(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    # override as the `logit_scale` parameter initialization is different for Pix2Struct
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        # See PR #38607 (to avoid flakiness)
-                        data = torch.flatten(param.data)
-                        n_elements = torch.numel(data)
-                        # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                        # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                        n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                        data_to_check = torch.sort(data).values
-                        if n_elements_to_skip_on_each_side > 0:
-                            data_to_check = data_to_check[
-                                n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
-                            ]
-                        self.assertIn(
-                            ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite because `vocab_size` is not an attribute of `Pix2StructConfig` but rather `Pix2StructTextConfig`
     def test_resize_tokens_embeddings(self):
         original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/pvt/test_modeling_pvt.py b/tests/models/pvt/test_modeling_pvt.py
index bea696a36568..ff1264e79888 100644
--- a/tests/models/pvt/test_modeling_pvt.py
+++ b/tests/models/pvt/test_modeling_pvt.py
@@ -171,17 +171,6 @@ def test_inputs_embeds(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, param in model.named_parameters():
-                self.assertTrue(
-                    -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                    msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/pvt_v2/test_modeling_pvt_v2.py b/tests/models/pvt_v2/test_modeling_pvt_v2.py
index 7cfa0d7bbad5..cfeef2d4d8d6 100644
--- a/tests/models/pvt_v2/test_modeling_pvt_v2.py
+++ b/tests/models/pvt_v2/test_modeling_pvt_v2.py
@@ -188,17 +188,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
         # torch.utils.checkpoint.checkpoint
         self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True})
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, param in model.named_parameters():
-                self.assertTrue(
-                    -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                    msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
index f3f4fa2ffe4f..1ef2da1af3cf 100644
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -44,7 +44,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -236,20 +235,6 @@ def test_text_config(self):
         self.assertEqual(base_config.patch_size, 8)
         self.assertNotEqual(base_config.vision_config.patch_size, 8)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_mismatching_num_image_tokens(self):
         """
         Tests that VLMs through an error with explicit message saying what is wrong
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 9cb028eb3a73..e455c092ba0b 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -42,7 +42,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
 )
@@ -216,20 +215,6 @@ def test_text_config(self):
         self.assertEqual(base_config.patch_size, 8)
         self.assertNotEqual(base_config.vision_config.patch_size, 8)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_mismatching_num_image_tokens(self):
         """
         Tests that VLMs through an error with explicit message saying what is wrong
diff --git a/tests/models/qwen3_next/test_modeling_qwen3_next.py b/tests/models/qwen3_next/test_modeling_qwen3_next.py
index 3589f32584a9..f0dcdf5ddd4a 100644
--- a/tests/models/qwen3_next/test_modeling_qwen3_next.py
+++ b/tests/models/qwen3_next/test_modeling_qwen3_next.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import tempfile
 import unittest
 
@@ -40,7 +39,6 @@
 from ...generation.test_utils import has_similar_generate_outputs
 from ...test_modeling_common import (
     TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
-    _config_zero_init,
     _test_eager_matches_sdpa_inference,
 )
 
@@ -279,24 +277,6 @@ def test_attention_outputs(self):
             self.assertEqual(len(self_attentions), sum(layer == "full_attention" for layer in config.layer_types))
             self.assertListEqual(list(self_attentions[0].shape[-3:]), [config.num_attention_heads, seq_len, seq_len])
 
-    def test_initialization(self):
-        "Some parameters need to be skipped."
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=copy.deepcopy(configs_no_init))
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # this one need to be skipped, it's initialized as log(uniform(0, 16))
-                    if "A_log" in name:
-                        continue
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
     def test_eager_matches_sdpa_inference(
         self,
diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
index dcd57782b749..db9d2cf42655 100644
--- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
+++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
@@ -91,10 +91,6 @@ def test_left_padding_compatibility(self):
     def test_assisted_decoding_sample(self):
         pass
 
-    @unittest.skip(reason="TODO @arthurzucker not super important and failing.")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests")
     @pytest.mark.generate
     def test_beam_sample_generate_dict_output(self):
diff --git a/tests/models/regnet/test_modeling_regnet.py b/tests/models/regnet/test_modeling_regnet.py
index 97702a108dba..73579fa7d9e8 100644
--- a/tests/models/regnet/test_modeling_regnet.py
+++ b/tests/models/regnet/test_modeling_regnet.py
@@ -27,7 +27,6 @@
 
 if is_torch_available():
     import torch
-    from torch import nn
 
     from transformers import RegNetForImageClassification, RegNetModel
 
@@ -162,22 +161,6 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, module in model.named_modules():
-                if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
-                    self.assertTrue(
-                        torch.all(module.weight == 1),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                    self.assertTrue(
-                        torch.all(module.bias == 0),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/resnet/test_modeling_resnet.py b/tests/models/resnet/test_modeling_resnet.py
index c764ff6b2e25..48f250f9ec62 100644
--- a/tests/models/resnet/test_modeling_resnet.py
+++ b/tests/models/resnet/test_modeling_resnet.py
@@ -28,7 +28,6 @@
 
 if is_torch_available():
     import torch
-    from torch import nn
 
     from transformers import ResNetBackbone, ResNetForImageClassification, ResNetModel
 
@@ -207,22 +206,6 @@ def test_backbone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_backbone(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, module in model.named_modules():
-                if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
-                    self.assertTrue(
-                        torch.all(module.weight == 1),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                    self.assertTrue(
-                        torch.all(module.bias == 0),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py
index d1969c3c0b6e..eddf6d9a6f91 100644
--- a/tests/models/rt_detr/test_modeling_rt_detr.py
+++ b/tests/models/rt_detr/test_modeling_rt_detr.py
@@ -39,7 +39,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -588,56 +588,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.initializer_bias_prior_prob = 0.2
-        bias_value = -1.3863  # log_e ((1 - 0.2) / 0.2)
-
-        failed_cases = []
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "RTDetrConvEncoder":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name:
-                        bias_tensor = torch.full_like(param.data, bias_value)
-                        if not torch.allclose(param.data, bias_tensor, atol=1e-4):
-                            failed_cases.append(
-                                f"Parameter {name} of model {model_class} seems not properly initialized. "
-                                f"Biases should be initialized to {bias_value}, got {param.data}"
-                            )
-                    elif (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                        or "enc_score_head.weight" in name
-                        or ("class_embed" in name and "weight" in name)
-                        or name in backbone_params
-                    ):
-                        continue
-                    else:
-                        mean = param.data.mean()
-                        round_mean = (mean * 1e9).round() / 1e9
-                        round_mean = round_mean.item()
-                        if round_mean not in [0.0, 1.0]:
-                            failed_cases.append(
-                                f"Parameter {name} of model {model_class} seems not properly initialized. "
-                                f"Mean is {round_mean}, but should be in [0, 1]"
-                            )
-
-        message = "\n" + "\n".join(failed_cases)
-        self.assertTrue(not failed_cases, message)
-
     @parameterized.expand(["float32", "float16", "bfloat16"])
     @require_torch_accelerator
     @slow
diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
index 80c40195bfd5..2b7713af6e84 100644
--- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
+++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
@@ -38,7 +38,7 @@
 )
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -592,56 +592,6 @@ def test_hf_backbone(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.initializer_bias_prior_prob = 0.2
-        bias_value = -1.3863  # log_e ((1 - 0.2) / 0.2)
-
-        failed_cases = []
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            # Skip the check for the backbone
-            for name, module in model.named_modules():
-                if module.__class__.__name__ == "RTDetrV2ConvEncoder":
-                    backbone_params = [f"{name}.{key}" for key in module.state_dict()]
-                    break
-
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if ("class_embed" in name and "bias" in name) or "enc_score_head.bias" in name:
-                        bias_tensor = torch.full_like(param.data, bias_value)
-                        if not torch.allclose(param.data, bias_tensor, atol=1e-4):
-                            failed_cases.append(
-                                f"Parameter {name} of model {model_class} seems not properly initialized. "
-                                f"Biases should be initialized to {bias_value}, got {param.data}"
-                            )
-                    elif (
-                        "level_embed" in name
-                        or "sampling_offsets.bias" in name
-                        or "value_proj" in name
-                        or "output_proj" in name
-                        or "reference_points" in name
-                        or "enc_score_head.weight" in name
-                        or ("class_embed" in name and "weight" in name)
-                        or name in backbone_params
-                    ):
-                        continue
-                    else:
-                        mean = param.data.mean()
-                        round_mean = (mean * 1e9).round() / 1e9
-                        round_mean = round_mean.item()
-                        if round_mean not in [0.0, 1.0]:
-                            failed_cases.append(
-                                f"Parameter {name} of model {model_class} seems not properly initialized. "
-                                f"Mean is {round_mean}, but should be in [0, 1]"
-                            )
-
-        message = "\n" + "\n".join(failed_cases)
-        self.assertTrue(not failed_cases, message)
-
     @parameterized.expand(["float32", "float16", "bfloat16"])
     @require_torch_accelerator
     @slow
diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py
index 9a61160a275f..692b5ab5d9bd 100644
--- a/tests/models/rwkv/test_modeling_rwkv.py
+++ b/tests/models/rwkv/test_modeling_rwkv.py
@@ -265,35 +265,6 @@ def test_state_equivalency(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_state_equivalency(*config_and_inputs)
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, param in model.named_parameters():
-                if "time_decay" in name:
-                    if param.requires_grad:
-                        self.assertTrue(param.data.max().item() == 3.0)
-                        self.assertTrue(param.data.min().item() == -5.0)
-                elif "time_first" in name:
-                    if param.requires_grad:
-                        # check if it's a ones like
-                        torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
-                elif any(x in name for x in ["time_mix_key", "time_mix_receptance"]):
-                    if param.requires_grad:
-                        self.assertInterval(
-                            param.data,
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                elif "time_mix_value" in name:
-                    if param.requires_grad:
-                        self.assertInterval(
-                            param.data,
-                            [0.0, 1.3],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_attention_outputs(self):
         r"""
         Overriding the test_attention_outputs test as the attention outputs of Rwkv are different from other models
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index 596df6daeda2..ab026334c099 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -25,7 +25,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -374,44 +373,6 @@ def test_model_from_pretrained(self):
         model = SeamlessM4TModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "pos_bias_v",
-                    "pos_bias_u",
-                    "pointwise_conv1",
-                    "pointwise_conv2",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                    "adapter",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="SeamlessM4TSpeechEncoder doesn't have an embedding layer")
     def test_inputs_embeds(self):
         pass
@@ -618,44 +579,6 @@ def test_model_from_pretrained(self):
         model = SeamlessM4TModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "pos_bias_v",
-                    "pos_bias_u",
-                    "pointwise_conv1",
-                    "pointwise_conv2",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                    "adapter",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(
         reason="Expected missing keys serve when using SeamlessM4TForXXX.from_pretrained from a checkpoint saved by SeamlessM4TModel.save_pretrained."
     )
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 521e5f864e98..66f3353d7482 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -25,7 +25,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -400,44 +399,6 @@ def test_model_from_pretrained(self):
         model = SeamlessM4Tv2Model.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "pos_bias_v",
-                    "pos_bias_u",
-                    "pointwise_conv1",
-                    "pointwise_conv2",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                    "adapter",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="SeamlessM4Tv2SpeechEncoder doesn't have an embedding layer")
     def test_inputs_embeds(self):
         pass
@@ -631,44 +592,6 @@ def test_model_from_pretrained(self):
         model = SeamlessM4Tv2Model.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "pos_bias_v",
-                    "pos_bias_u",
-                    "pointwise_conv1",
-                    "pointwise_conv2",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                    "adapter",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(
         reason="Expected missing keys serve when using SeamlessM4Tv2ForXXX.from_pretrained from a checkpoint saved by SeamlessM4Tv2Model.save_pretrained."
     )
diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py
index 70eb4e25095d..be8b27c17712 100644
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -24,7 +24,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -375,32 +374,6 @@ def test_seq_classifier_train(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.check_seq_classifier_training(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.parametrizations.weight",
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "quantizer.weight_proj.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py
index c4d97b43a09d..84c0cc3c56ff 100644
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -24,7 +24,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -386,32 +385,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.parametrizations.weight",
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "quantizer.weight_proj.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py
index 047f96bbdf1c..95a88f28c2e8 100644
--- a/tests/models/siglip/test_modeling_siglip.py
+++ b/tests/models/siglip/test_modeling_siglip.py
@@ -239,10 +239,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "google/siglip-base-patch16-224"
@@ -384,10 +380,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "google/siglip-base-patch16-224"
@@ -495,10 +487,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest._create_and_check_torchscript with CLIP->Siglip
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
@@ -654,10 +642,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/siglip2/test_modeling_siglip2.py b/tests/models/siglip2/test_modeling_siglip2.py
index 9524ea1d6221..f417b512004b 100644
--- a/tests/models/siglip2/test_modeling_siglip2.py
+++ b/tests/models/siglip2/test_modeling_siglip2.py
@@ -331,10 +331,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "google/siglip2-base-patch16-naflex"
@@ -472,10 +468,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "google/siglip2-base-patch16-naflex"
@@ -588,10 +580,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
     def test_load_vision_text_config(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -685,10 +673,6 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
-    @unittest.skip(reason="Siglip2 uses a non-standard initialization scheme")
-    def test_initialization(self):
-        pass
-
 
 # Draw a circle on an images with different aspect ratios
 def prepare_images():
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index bfb305c38d17..103cc29e92d3 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -35,7 +35,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -556,33 +555,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # this model has no inputs_embeds
     @unittest.skip(reason="Model has no input_embeds")
     def test_inputs_embeds(self):
@@ -957,29 +929,6 @@ def test_forward_signature(self):
             expected_arg_names.extend(["encoder_outputs"])
             self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="Model has no inputs_embeds")
     def test_inputs_embeds(self):
         pass
@@ -1649,33 +1598,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="Model has no input_embeds")
     def test_inputs_embeds(self):
         pass
@@ -1863,10 +1785,6 @@ def test_forward_signature(self):
     def test_hidden_states_output(self):
         pass
 
-    @unittest.skip
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="Model has no input_embeds")
     def test_inputs_embeds(self):
         pass
diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py
index 91a508b97559..6e84a3a5c42f 100644
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -26,7 +26,7 @@
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -232,22 +232,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if name.endswith(".w_g"):
-                    continue
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9) / 1e9).round().item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py
index 9f602438bcd3..e6d23a8019c4 100644
--- a/tests/models/swin/test_modeling_swin.py
+++ b/tests/models/swin/test_modeling_swin.py
@@ -23,7 +23,7 @@
 
 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -447,20 +447,6 @@ def test_model_from_pretrained(self):
         model = SwinModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @require_vision
 @require_torch
diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py
index 0099f3cdc644..e4bf84c68283 100644
--- a/tests/models/swin2sr/test_modeling_swin2sr.py
+++ b/tests/models/swin2sr/test_modeling_swin2sr.py
@@ -20,7 +20,7 @@
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -237,32 +237,6 @@ def test_model_from_pretrained(self):
         model = Swin2SRModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    # overwriting because of `logit_scale` parameter
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "logit_scale" in name:
-                    continue
-                if param.requires_grad:
-                    # See PR #38607 (to avoid flakiness)
-                    data = torch.flatten(param.data)
-                    n_elements = torch.numel(data)
-                    # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                    # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                    n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                    data_to_check = torch.sort(data).values
-                    if n_elements_to_skip_on_each_side > 0:
-                        data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
-                    self.assertIn(
-                        ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py
index fefe7bb7f841..28c1f8671f0c 100644
--- a/tests/models/swinv2/test_modeling_swinv2.py
+++ b/tests/models/swinv2/test_modeling_swinv2.py
@@ -24,7 +24,7 @@
 
 from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -442,20 +442,6 @@ def test_model_from_pretrained(self):
     def test_feed_forward_chunking(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and "logit_scale" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 @require_vision
 @require_torch
diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py
index 460f41495d73..10985988920e 100644
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -23,7 +23,7 @@
 from transformers.testing_utils import Expectations, require_timm, require_torch, require_vision, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -532,29 +532,6 @@ def test_greyscale_images(self):
 
             self.assertTrue(outputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.init_xavier_std = 1e9
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "bbox_attention" in name and "bias" not in name:
-                        self.assertLess(
-                            100000,
-                            abs(param.data.max().item()),
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
 
 TOLERANCE = 1e-4
 
diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py
index b1a324c7a660..b839e583b990 100644
--- a/tests/models/textnet/test_modeling_textnet.py
+++ b/tests/models/textnet/test_modeling_textnet.py
@@ -36,7 +36,6 @@
 
 if is_torch_available():
     import torch
-    from torch import nn
 
     from transformers import TextNetBackbone, TextNetForImageClassification, TextNetModel
 
@@ -246,22 +245,6 @@ def test_backbone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_backbone(*config_and_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, module in model.named_modules():
-                if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
-                    self.assertTrue(
-                        torch.all(module.weight == 1),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                    self.assertTrue(
-                        torch.all(module.bias == 0),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py
index f274fd65c956..aafcb141db7c 100644
--- a/tests/models/timm_backbone/test_modeling_timm_backbone.py
+++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -139,10 +139,6 @@ def test_hidden_states_output(self):
     def test_can_init_all_missing_weights(self):
         pass
 
-    @unittest.skip(reason="TimmBackbone initialization is managed on the timm side")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="TimmBackbone models doesn't have inputs_embeds")
     def test_inputs_embeds(self):
         pass
diff --git a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py
index 33592276a640..69e2f5a134db 100644
--- a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py
+++ b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py
@@ -157,10 +157,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_can_init_all_missing_weights(self):
         pass
 
-    @unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
-    def test_initialization(self):
-        pass
-
     def test_gradient_checkpointing(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
         model = TimmWrapperModel._from_config(config)
diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py
index c002127cbd9a..9672f74d4eee 100644
--- a/tests/models/tvp/test_modeling_tvp.py
+++ b/tests/models/tvp/test_modeling_tvp.py
@@ -22,7 +22,6 @@
 
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -194,23 +193,6 @@ def test_inputs_embeds(self):
     def test_model_get_set_embeddings(self):
         pass
 
-    # override as the `logit_scale` parameter initialization is different for TVP
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # params are randomly initialized.
-                    self.assertAlmostEqual(
-                        param.data.mean().item(),
-                        0.0,
-                        delta=1.0,
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @require_timm
     def test_backbone_selection(self):
         def _validate_backbone_init():
diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py
index 65e108df0dbb..7aa4cdf1c1f4 100644
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -26,7 +26,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -420,39 +419,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
index cc0c8772969d..02564dea72f8 100644
--- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
+++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
@@ -26,7 +26,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -460,41 +459,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "label_embeddings_concat",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
@@ -671,41 +635,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "label_embeddings_concat",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index 4bc68977fff2..f816341a3abe 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -30,7 +30,7 @@
 from transformers.utils.import_utils import get_torch_major_and_minor_version
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -219,21 +219,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        configs_no_init.backbone_config = _config_zero_init(configs_no_init.backbone_config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     @require_timm
     def test_backbone_selection(self):
         config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index d06524305a4e..d0571e4a0299 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -34,7 +34,7 @@
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -320,23 +320,6 @@ def test_flash_attn_2_inference_equivalence(self):
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         pass
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # This is an exception in the module, it's initialized with xavier_uniform without using initializer_range
-                if name.endswith("patch_embeddings.projection.weight"):
-                    continue
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py
index e0d7c9344dc9..e099f44d074c 100644
--- a/tests/models/vitdet/test_modeling_vitdet.py
+++ b/tests/models/vitdet/test_modeling_vitdet.py
@@ -16,7 +16,7 @@
 import unittest
 
 from transformers import VitDetConfig
-from transformers.testing_utils import is_flaky, require_torch, torch_device
+from transformers.testing_utils import require_torch, torch_device
 from transformers.utils import is_torch_available
 
 from ...test_backbone_common import BackboneTesterMixin
@@ -173,28 +173,24 @@ def setUp(self):
         self.model_tester = VitDetModelTester(self)
         self.config_tester = ConfigTester(self, config_class=VitDetConfig, has_text_modality=False, hidden_size=37)
 
-    @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.")
-    def test_initialization(self):
-        super().test_initialization()
-
     # TODO: Fix me (once this model gets more usage)
     @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
     def test_cpu_offload(self):
-        super().test_cpu_offload()
+        pass
 
     # TODO: Fix me (once this model gets more usage)
     @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
     def test_disk_offload_bin(self):
-        super().test_disk_offload()
+        pass
 
     @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
     def test_disk_offload_safetensors(self):
-        super().test_disk_offload()
+        pass
 
     # TODO: Fix me (once this model gets more usage)
     @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
     def test_model_parallelism(self):
-        super().test_model_parallelism()
+        pass
 
     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
index 2876d95e1f3e..654503c0bac0 100644
--- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
+++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
@@ -141,11 +141,6 @@ def test_config(self):
     def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
         super().test_batching_equivalence(atol=atol, rtol=rtol)
 
-    # TODO: @Pavel
-    @unittest.skip(reason="currently failing")
-    def test_initialization(self):
-        pass
-
     @unittest.skip(reason="VitPoseBackbone does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py
index a6661d6946e6..f45382e7ce95 100644
--- a/tests/models/vits/test_modeling_vits.py
+++ b/tests/models/vits/test_modeling_vits.py
@@ -220,46 +220,6 @@ def test_determinism(self):
     def test_batching_equivalence(self):
         pass
 
-    @is_flaky(
-        max_attempts=3,
-        description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range",
-    )
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        uniform_init_parms = [
-            "emb_rel_k",
-            "emb_rel_v",
-            "conv_1",
-            "conv_2",
-            "conv_pre",
-            "conv_post",
-            "conv_proj",
-            "conv_dds",
-            "project",
-            "wavenet.in_layers",
-            "wavenet.res_skip_layers",
-            "upsampler",
-            "resblocks",
-        ]
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="VITS has no inputs_embeds")
     def test_inputs_embeds(self):
         pass
diff --git a/tests/models/vjepa2/test_modeling_vjepa2.py b/tests/models/vjepa2/test_modeling_vjepa2.py
index 191dbc54848e..3a2a24c54901 100644
--- a/tests/models/vjepa2/test_modeling_vjepa2.py
+++ b/tests/models/vjepa2/test_modeling_vjepa2.py
@@ -21,7 +21,6 @@
 
 from transformers import VJEPA2Config
 from transformers.testing_utils import (
-    is_flaky,
     require_torch,
     require_vision,
     slow,
@@ -167,10 +166,6 @@ def setUp(self):
         self.model_tester = VJEPA2ModelTester(self)
         self.config_tester = ConfigTester(self, config_class=VJEPA2Config, has_text_modality=False, hidden_size=37)
 
-    @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.")
-    def test_initialization(self):
-        super().test_initialization()
-
     def test_config(self):
         self.config_tester.run_common_tests()
 
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index e41d37aa86a6..204eb797ee41 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -606,40 +606,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
@@ -949,40 +915,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
index 2ada8472b09e..7388f177d602 100644
--- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
@@ -31,7 +31,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -575,44 +574,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "pos_bias_v",
-                    "pos_bias_u",
-                    "pointwise_conv1",
-                    "pointwise_conv2",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
index 4135f2a0e1ac..0304faaad3f7 100644
--- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
+++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
@@ -33,7 +33,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -546,44 +545,6 @@ def test_retain_grad_hidden_states_attentions(self):
         self.assertIsNotNone(hidden_states.grad)
         self.assertIsNotNone(attentions.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "pos_bias_v",
-                    "pos_bias_u",
-                    "pointwise_conv1",
-                    "pointwise_conv2",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py
index 82edabc0bf17..787d3b738669 100644
--- a/tests/models/wavlm/test_modeling_wavlm.py
+++ b/tests/models/wavlm/test_modeling_wavlm.py
@@ -25,7 +25,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -397,42 +396,6 @@ def test_retain_grad_hidden_states_attentions(self):
 
         self.assertIsNotNone(hidden_states.grad)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "conv.parametrizations.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "label_embeddings_concat",
-                    "rel_attn_embed",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index 78c089af4957..6d1827b9073b 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -561,32 +561,6 @@ def test_model_get_set_embeddings(self):
     def test_feed_forward_chunking(self):
         pass
 
-    # override as the `logit_scale`, `prompts_generator.alpha` parameters require special treatment
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    # check if `logit_scale` is initialized as per the original implementation
-                    if name == "logit_scale":
-                        self.assertAlmostEqual(
-                            param.data.item(),
-                            np.log(1 / 0.07),
-                            delta=1e-3,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    elif name == "prompts_generator.alpha":
-                        self.assertAlmostEqual(param.data.mean().item(), model.config.prompt_alpha)
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
diff --git a/tests/models/xcodec/test_modeling_xcodec.py b/tests/models/xcodec/test_modeling_xcodec.py
index 6e00bf19062e..41f71c409ba6 100644
--- a/tests/models/xcodec/test_modeling_xcodec.py
+++ b/tests/models/xcodec/test_modeling_xcodec.py
@@ -345,25 +345,6 @@ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
             dict_inputs = self._prepare_for_class(inputs_dict, model_class)
             check_equivalence(model, tuple_inputs, dict_inputs)
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                # skipping the parametrizations original0 tensor
-                if name == "semantic_model.encoder.pos_conv_embed.conv.parametrizations.weight.original0":
-                    continue
-
-                uniform_init_parms = ["conv"]
-
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of {model_class.__name__} seems not properly initialized",
-                        )
-
     @unittest.skip(reason="The XcodecModel does not have support dynamic compile yet")
     def test_sdpa_can_compile_dynamic(self):
         pass
diff --git a/tests/models/xlstm/test_modeling_xlstm.py b/tests/models/xlstm/test_modeling_xlstm.py
index c25c4f12e3e5..f1fb652e8b93 100644
--- a/tests/models/xlstm/test_modeling_xlstm.py
+++ b/tests/models/xlstm/test_modeling_xlstm.py
@@ -168,17 +168,6 @@ def setUp(self):
             self, config_class=xLSTMConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"]
         )
 
-    def test_initialization(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=config)
-            for name, param in model.named_parameters():
-                if "D" in name:
-                    if param.requires_grad:
-                        # check if it's a ones like
-                        self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5))
-
     @unittest.skip(reason="xLSTM cache slicing test case is an edge case")
     def test_generate_without_input_ids(self):
         pass
diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py
index 30c36e78dce5..74ca658b9e09 100644
--- a/tests/models/zamba/test_modeling_zamba.py
+++ b/tests/models/zamba/test_modeling_zamba.py
@@ -21,7 +21,6 @@
 
 from transformers import AutoTokenizer, ZambaConfig, is_torch_available
 from transformers.testing_utils import (
-    is_flaky,
     require_bitsandbytes,
     require_flash_attn,
     require_torch,
@@ -32,7 +31,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -346,50 +345,6 @@ def test_decoder_model_past_with_large_inputs(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
 
-    @is_flaky(description="TODO: ydshieh")
-    def test_initialization(self):
-        r"""
-        Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "A_log" in name:
-                        A = torch.arange(1, config.mamba_d_state + 1, dtype=torch.float32)[None, :]
-                        intermediate_dim = config.mamba_expand * config.hidden_size
-                        A = A.expand(intermediate_dim, -1).reshape(
-                            config.n_mamba_heads, intermediate_dim // config.n_mamba_heads, -1
-                        )
-                        torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
-                    elif "D" in name:
-                        # check if it's a ones like
-                        torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
-                    elif "x_proj" in name or "dt_proj_weight" in name:
-                        self.assertIn(
-                            ((param.data.mean() * 1e2).round() / 1e2).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized (raw value {param.data.mean()})",
-                        )
-                    elif "dt_proj_bias" in name:
-                        dt = torch.exp(
-                            torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
-                            + math.log(config.time_step_min)
-                        ).clamp(min=config.time_step_floor)
-                        inv_dt = dt + torch.log(-torch.expm1(-dt))
-                        if param.requires_grad:
-                            self.assertTrue(param.data.max().item() <= inv_dt[1])
-                            self.assertTrue(param.data.min().item() >= inv_dt[0])
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_attention_outputs(self):
         r"""
         Overriding the test_attention_outputs test as the Zamba model outputs attention only for its attention layers
diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py
index 2b667dc89b40..665c03b83837 100644
--- a/tests/models/zamba2/test_modeling_zamba2.py
+++ b/tests/models/zamba2/test_modeling_zamba2.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Testing suite for the PyTorch Zamba model."""
 
-import math
 import tempfile
 import unittest
 
@@ -34,7 +33,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -387,39 +386,6 @@ def test_decoder_model_past_with_large_inputs(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
 
-    def test_initialization(self):
-        r"""
-        Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    if "A_log" in name:
-                        A = torch.arange(1, config.n_mamba_heads + 1, dtype=torch.float32)[None, :]
-                        self.assertTrue(torch.allclose(param.data, torch.log(A), atol=1e-5, rtol=1e-5))
-                    elif "D" in name:
-                        # check if it's a ones like
-                        self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5))
-                    elif "dt_bias" in name:
-                        dt = torch.exp(
-                            torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
-                            + math.log(config.time_step_min)
-                        ).clamp(min=config.time_step_floor)
-                        inv_dt = dt + torch.log(-torch.expm1(-dt))
-                        if param.requires_grad:
-                            self.assertTrue(param.data.max().item() <= inv_dt[1])
-                            self.assertTrue(param.data.min().item() >= inv_dt[0])
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
     def test_attention_outputs(self):
         r"""
         Overriding the test_attention_outputs test as the Zamba2 model outputs attention only for its attention layers
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 15d221e7a48d..0896c1899329 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1021,28 +1021,6 @@ def check_equal(loaded):
                 torch.save(state_dict, pt_checkpoint_path, _use_new_zipfile_serialization=False)
                 check_equal(load_state_dict(pt_checkpoint_path))
 
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=copy.deepcopy(configs_no_init))
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    data = torch.flatten(param.data)
-                    n_elements = torch.numel(data)
-                    # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
-                    # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
-                    n_elements_to_skip_on_each_side = int(n_elements * 0.025)
-                    data_to_check = torch.sort(data).values
-                    if n_elements_to_skip_on_each_side > 0:
-                        data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
-                    self.assertIn(
-                        ((data_to_check.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
     def test_determinism(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()