Refit latest update based on Ahmad's refactor #1993
The PR adds a new markdown document (161 lines):
# NCCL Timeout During CUDA Graph Warmup in MoE RL Training

## Symptom

After several successful GRPO training steps (anywhere from step 5 to step 20+), the job crashes with NCCL collective operation timeouts during the generation phase. The errors look like:

```
(MegatronPolicyWorker[rank=1] pid=151407) [rank1]:[E228 00:10:27.202494521 ProcessGroupNCCL.cpp:2057] [PG ID 12 PG GUID 99(EXPERT_MODEL_PARALLEL_GROUP) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1145527, OpType=ALLTOALL_BASE, NumelIn=206438400, NumelOut=206438400, Timeout(ms)=600000) ran for 600016 milliseconds before timing out.
(MegatronPolicyWorker[rank=1] pid=151407)
(MegatronPolicyWorker[rank=1] pid=151407) [2026-02-28 00:10:27,258 E 151407 153013] logging.cc:118: Unhandled exception: N3c1016DistBackendErrorE. what(): [PG ID 12 PG GUID 99(EXPERT_MODEL_PARALLEL_GROUP) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1145527, OpType=ALLTOALL_BASE, NumelIn=206438400, NumelOut=206438400, Timeout(ms)=600000) ran for 600016 milliseconds before timing out.
(MegatronPolicyWorker[rank=7] pid=151429) [rank7]:[E228 00:10:27.168785312 ProcessGroupNCCL.cpp:2057] [PG ID 5 PG GUID 36(TENSOR_MODEL_PARALLEL_GROUP) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2036074, OpType=_REDUCE_SCATTER_BASE, NumelIn=3225600, NumelOut=1612800, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
(MegatronPolicyWorker[rank=7] pid=151429)
(MegatronPolicyWorker[rank=7] pid=151429) [2026-02-28 00:10:27,224 E 151429 152962] logging.cc:118: Unhandled exception: N3c1016DistBackendErrorE. what(): [PG ID 5 PG GUID 36(TENSOR_MODEL_PARALLEL_GROUP) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2036074, OpType=_REDUCE_SCATTER_BASE, NumelIn=3225600, NumelOut=1612800, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
```

Key signatures:

- Different ranks report **different NCCL operation types** (ALLTOALL_BASE, REDUCE_SCATTER_BASE, ALLREDUCE, COALESCED) -- a collective mismatch
- The crash always happens during `cuda graph warmup` at the start of generation
- A new NCCL communicator is lazily initialized at the failing step (`NCCL version 2.27.5+cuda12.9` printed mid-warmup)
- Steps 1 through N-1 complete normally; the crash is non-deterministic
| ## Background: The Training-Inference Cycle | ||
|
|
||
| In the RL training loop, each step does: | ||
|
|
||
| ``` | ||
| generate() { | ||
| _wake() // resume inference engine (realloc KV cache, rebuild CUDA graphs) | ||
| <submit requests, run generation> | ||
| _sleep() // suspend inference engine (dealloc KV cache, delete CUDA graphs) | ||
| } | ||
| ``` | ||
|
|
||
| With `static_kv_memory_pointers=false` and `kv_cache_management_mode=recompute`, every suspend/resume cycle **destroys and recreates CUDA graphs**. Graph warmup runs forward passes through the model, which for MoE models includes NCCL alltoall collectives across expert-parallel (EP) ranks. All EP/TP ranks must execute the same sequence of NCCL operations in lockstep during this warmup. | ||
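
To make "lockstep" concrete, here is a minimal sketch (illustrative, not code from this PR; `moe_dispatch` and `ep_group` are hypothetical names) of the kind of collective each warmup forward pass issues. If any EP rank never reaches the call, the remaining ranks block inside NCCL until the watchdog fires:

```python
import torch
import torch.distributed as dist

def moe_dispatch(tokens: torch.Tensor, ep_group: dist.ProcessGroup) -> torch.Tensor:
    """Hypothetical stand-in for MoE token dispatch inside graph warmup."""
    out = torch.empty_like(tokens)
    # Collective call: every rank in ep_group must execute it, in the same
    # order, or the ranks that did call it hang until the NCCL watchdog
    # (Timeout(ms)=600000 in the logs above) kills the process.
    dist.all_to_all_single(out, tokens, group=ep_group)
    return out
```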

## Architecture: Two Communication Systems on One Event Loop

The `DynamicInferenceEngine` runs an async engine loop on a dedicated event loop thread. This single event loop handles two different communication systems:

| System | Purpose | Mechanism |
|--------|---------|-----------|
| **EP consensus** (`_ep_group_has_work`) | Coordinate EP ranks on work availability | Async ZMQ all-reduce |
| **CUDA graph warmup** (inside `resume()`) | Capture model forward passes into graphs | Blocking NCCL collectives |

Both run on the **same event loop thread**. This is the root of the problem.

## Root Cause

The engine loop has this structure (simplified from `run_engine_with_coordinator`):

```python
while True:
    self.schedule_requests()                                # read ZMQ messages
    ep_group_has_work = await self._ep_group_has_work(...)  # ZMQ all-reduce across EP ranks
    if not ep_group_has_work:
        if self.suspend_signal:
            self.suspend()  # no-op when already suspended
        else:
            self.resume()   # CUDA graph warmup -- blocks with NCCL!
    await asyncio.sleep(0.02)
```

When the coordinator sends `RESUME + UNPAUSE` to all engines, the signals arrive asynchronously. EP ranks process them at different times depending on ZMQ delivery and event loop scheduling. This leads to a **divergence**:

```
Rank A (received RESUME):  suspend_signal=False --> calls resume() --> NCCL alltoall BLOCKS event loop
Rank B (not yet received): suspend_signal=True  --> calls suspend() (no-op) --> sleeps 20ms
```

On the next iteration, Rank B calls `_ep_group_has_work()`, which does an async ZMQ all-reduce. This requires Rank A to respond. But Rank A's event loop is **blocked inside NCCL** (graph warmup forward pass). Rank A can never respond to ZMQ while NCCL is blocking its event loop.

**Deadlock: Rank A waits for Rank B in NCCL. Rank B waits for Rank A in ZMQ.**

After 10 minutes, the NCCL watchdog times out and kills the process.
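
The starvation half of the deadlock can be reproduced without NCCL or ZMQ at all. A minimal standalone sketch (all names hypothetical): any synchronous blocking call made on the event loop thread prevents every coroutine, including a would-be ZMQ responder, from being scheduled:

```python
import asyncio
import time

async def ep_consensus_responder():
    # Stands in for the async ZMQ all-reduce handler: it only runs when
    # the event loop is free to schedule coroutines.
    while True:
        print("answering EP consensus")
        await asyncio.sleep(0.02)

def blocking_warmup():
    # Stands in for engine.resume(): a synchronous call that holds the
    # event loop thread for its entire duration.
    time.sleep(3)  # no coroutine runs during these 3 seconds

async def engine_loop():
    asyncio.ensure_future(ep_consensus_responder())
    await asyncio.sleep(0.1)  # responder prints a few times here
    blocking_warmup()         # responder is starved: no output for 3 s
    await asyncio.sleep(0.1)  # responder only resumes now

asyncio.run(engine_loop())
```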

### Why it's non-deterministic

The deadlock only occurs when at least one EP rank enters `resume()` before all other EP ranks have received the `RESUME` signal. When all ranks happen to process the signals within the same ~20 ms engine loop cycle, they all enter `resume()` together and the warmup succeeds. This timing depends on ZMQ delivery, event loop scheduling, and OS thread scheduling -- hence the non-determinism.

### Implementation 1 (causes a ~25% slowdown)

```python
def _wake(self):
    # Phase 1: Unpause the engine loop (async, event loop stays free for ZMQ)
    asyncio.run_coroutine_threadsafe(self._unpause_engine(), self._inference_loop).result()

    # Phase 2: Synchronized resume on the main thread
    self._synchronized_resume()

async def _unpause_engine(self):
    # Send only UNPAUSE (not RESUME) -- keeps suspend_signal=True so the engine
    # loop never calls resume() on its own
    if torch.distributed.get_rank() == 0:
        self.inference_client.unpause_engines()
    await self.dynamic_inference_engine.running.wait()

def _synchronized_resume(self):
    engine = self.dynamic_inference_engine

    # Guard: replace suspend() with a no-op while we resume
    original_suspend = engine.suspend
    engine.suspend = lambda: None

    try:
        torch.distributed.barrier()    # all ranks ready
        engine.resume()                # CUDA graph warmup (NCCL collectives)
        engine.suspend_signal = False  # let engine loop transition to normal mode
        torch.distributed.barrier()    # all ranks done
    finally:
        engine.suspend = original_suspend
```

### Why this works

**No event-loop blocking.** The NCCL barriers and `resume()` run on the main thread. The event loop thread continues running the engine loop, freely handling ZMQ communication for EP consensus. No rank's ZMQ is ever starved.

**No RESUME signal divergence.** We never send the `RESUME` header to the coordinator. Instead, we send only `UNPAUSE` (which restarts the engine loop) and keep `suspend_signal=True`. The engine loop sees `suspend_signal=True`, calls `suspend()` (a no-op since it is already suspended), and idles. It never calls `resume()` on its own. We control exactly when `resume()` happens -- after the barrier on the main thread.

**No thread-safety race.** When `resume()` runs on the main thread, it sets `is_suspended=False`. Without the guard, the engine loop's next `suspend()` call (on the event loop thread) would read `is_suspended=False`, enter the suspend body, and **deallocate buffers while the main thread is still creating CUDA graphs**. The `suspend()` guard (replacing it with `lambda: None`) prevents this. The guard is removed only after `suspend_signal=False` is set, so the engine loop transitions to calling `resume()` (a no-op since we already resumed) instead of `suspend()`.
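
For illustration, a standalone sketch (hypothetical `Engine` class, not PR code) of the race and the guard: the poller thread plays the engine loop, and monkey-patching `suspend` keeps it from deallocating while `resume()` is mid-capture:

```python
import threading
import time

class Engine:
    """Hypothetical stand-in for the engine's suspend/resume state."""

    def __init__(self):
        self.is_suspended = True

    def suspend(self):
        # Race window: resume() on the main thread flips is_suspended
        # before warmup finishes, so an unguarded call here would tear
        # down buffers the other thread is still using.
        if not self.is_suspended:
            print("BUG: deallocating buffers while warmup is running!")
            self.is_suspended = True

    def resume(self):
        self.is_suspended = False
        time.sleep(0.1)  # stands in for CUDA graph capture
        print("resume(): graphs created")

engine = Engine()

def engine_loop_poller():
    # Stands in for the event loop thread calling suspend() every cycle.
    for _ in range(10):
        engine.suspend()
        time.sleep(0.02)

original_suspend = engine.suspend
engine.suspend = lambda: None  # install the guard: poller hits a no-op

poller = threading.Thread(target=engine_loop_poller)
poller.start()
try:
    engine.resume()  # safe: concurrent suspend() calls cannot dealloc
finally:
    engine.suspend = original_suspend
poller.join()
```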

### Thread interaction timeline

```
Main Thread                            Event Loop Thread (engine loop)
-----------                            --------------------------------
_unpause_engine() --sends UNPAUSE-->
                                       schedule_requests(): reads UNPAUSE
                                       _ep_group_has_work(): ZMQ all-reduce
                                       suspend_signal=True -> suspend() [no-op]
                                       asyncio.sleep(0.02)

barrier() ......all ranks sync......   (ZMQ continues running freely)

engine.suspend = no-op                 suspend() -> no-op [guarded]
engine.resume()                        (ZMQ continues, no GPU conflict)
  -> reinitialize buffers
  -> create_cuda_graphs() [NCCL]
engine.suspend_signal = False          (ZMQ continues)

barrier() ......all ranks done......

engine.suspend = original              suspend_signal=False -> resume() [no-op]
                                       (engine is ready for requests)
```

## Affected Configuration

This bug affects MoE models using the megatron generation backend with:

- `moe_token_dispatcher_type=alltoall` (NCCL alltoall inside CUDA graph warmup)
- `static_kv_memory_pointers=false` (CUDA graphs deleted/recreated each cycle)
- `kv_cache_management_mode=recompute` (full dealloc on suspend)
- `num_cuda_graphs > 0`
- Expert parallelism (EP) > 1

Dense models, or configs with `static_kv_memory_pointers=true`, are not affected because CUDA graphs are not recreated on resume.
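
For illustration, the affected combination collected in one place. The key names come from the list above, but the example values, the flat layout, and the `expert_model_parallel_size` key name are assumptions, not the actual NeMo RL config schema:

```python
# Hypothetical flat view of an affected configuration; real configs nest
# these keys under their respective generation/parallelism sections.
affected_config = {
    "moe_token_dispatcher_type": "alltoall",  # NCCL alltoall inside warmup
    "static_kv_memory_pointers": False,       # graphs destroyed/recreated each cycle
    "kv_cache_management_mode": "recompute",  # full dealloc on suspend
    "num_cuda_graphs": 8,                     # any value > 0
    "expert_model_parallel_size": 2,          # any EP > 1 (key name assumed)
}
```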

### Implementation 2

In `dynamic_engine.py`, set the engine loop's `asyncio.sleep(0.02)` to `asyncio.sleep(0)`. This also works, presumably because without the 20 ms idle window all EP ranks observe the coordinator's signals within the same (now very short) loop cycle and enter `resume()` together, closing the divergence window described above.
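
A standalone sketch (not PR code) of the difference: `await asyncio.sleep(0)` still yields control, so other coroutines and pending callbacks run every cycle, but the loop notices new signals immediately instead of up to 20 ms later:

```python
import asyncio
import time

async def poll_signals(delay: float) -> float:
    # Measures how long the polling loop takes to notice a flag that
    # another task sets (a stand-in for ZMQ delivering RESUME).
    state = {"resume": False}

    async def deliver_resume():
        await asyncio.sleep(0.001)
        state["resume"] = True

    asyncio.ensure_future(deliver_resume())
    start = time.perf_counter()
    while not state["resume"]:
        await asyncio.sleep(delay)  # 0.02 -> up to a full cycle of latency; 0 -> next pass
    return time.perf_counter() - start

async def main():
    slow = await poll_signals(0.02)
    fast = await poll_signals(0)
    print(f"sleep(0.02): signal noticed after {slow * 1e3:.2f} ms")
    print(f"sleep(0):    signal noticed after {fast * 1e3:.2f} ms")

asyncio.run(main())
```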
The PR also adds a new Python file (19 lines):
```python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_rl.models.generation.megatron.megatron_generation import (
    MegatronGeneration,
)

__all__ = ["MegatronGeneration"]
```

**Review comment (Contributor), on lines +1 to +13:** Update the copyright year to 2026. This new file should use the current year per repo rules.

Proposed fix:

```diff
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
```

As per coding guidelines: `{src/,examples/,nemo_rl/**}/*.{py,sh}: Add the NVIDIA copyright header (with current year) to all Python files and shell scripts, excluding tests`.
**Review comment:** Non-colocated Megatron path is blocked by an earlier assert. This PR adds a non-colocated Megatron initialization path here, but the earlier guard in the non-colocated cluster setup still asserts `backend != "megatron"`, so this branch is unreachable. Remove or relax that guard to enable the new feature. (A proposed fix removing the legacy guard was attached.)