
Commit d1d1f7c

maanug-nv and ananthsub authored
LLM Forward Step (#12673)
* pretrain loss func
* get batch and forward
* add rerun functionality to loss
* formatting
* injection of state
* remove globalstate singleton functionality
* update example
* missing copyright
* fix for latest mcore
* syntax
* move assertion
* refactor for eval
* move to avoid circular import
* fix
* unused
* cache num fw args in train and eval
* docstring fix
* remove duplicate

---------

Signed-off-by: Maanu Grover <[email protected]>
Signed-off-by: Maanu Grover <[email protected]>
Co-authored-by: Ananth Subramaniam <[email protected]>
1 parent 752edac · commit d1d1f7c

File tree

11 files changed: +345 -88 lines changed


nemo/tron/checkpointing.py

Lines changed: 1 addition & 1 deletion
@@ -40,9 +40,9 @@
     FullyParallelLoadStrategyWrapper,
     FullyParallelSaveStrategyWrapper,
 )
+from megatron.core.fp8_utils import is_float8tensor
 from megatron.core.num_microbatches_calculator import update_num_microbatches
 from megatron.core.rerun_state_machine import get_rerun_state_machine
-from megatron.core.utils import is_float8tensor

 from nemo.tron import fault_tolerance
 from nemo.tron.config import ConfigContainer
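
The only change here tracks the upstream move of is_float8tensor from megatron.core.utils to megatron.core.fp8_utils (the "fix for latest mcore" item in the commit message). If a codebase had to straddle Megatron-Core versions from before and after that move, a small import fallback is one way to bridge it; this is an illustrative sketch, not part of the commit:

# Hypothetical compatibility shim (not in this commit): prefer the new
# fp8_utils location, fall back to the old one on older Megatron-Core.
try:
    from megatron.core.fp8_utils import is_float8tensor
except ImportError:
    from megatron.core.utils import is_float8tensor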

nemo/tron/eval.py

Lines changed: 7 additions & 3 deletions
@@ -25,6 +25,7 @@
 from nemo.tron import fault_tolerance
 from nemo.tron.state import GlobalState
 from nemo.tron.utils.common_utils import is_last_rank, print_rank_0, print_rank_last
+from nemo.tron.utils.train_utils import check_forward_step_func_num_args, maybe_inject_state


 def evaluate(
@@ -38,8 +39,10 @@ def evaluate(
     non_loss_data_func=None,
 ):
     """Evaluation."""
-    timers = state.timers
+    # Check num args to forward_step_func
+    num_fw_args = check_forward_step_func_num_args(forward_step_func)

+    timers = state.timers
     timers("evaluate", log_level=0).start(barrier=True)

     # Turn on evaluation mode which disables dropout.
@@ -66,12 +69,13 @@
         if verbose:
             print_rank_0(f"Evaluating iter {iteration}/{state.cfg.train_config.eval_iters}")

+        wrapped_forward_step = maybe_inject_state(forward_step_func, state, num_fw_args=num_fw_args)
         forward_backward_func = get_forward_backward_func()
         # Don't care about timing during evaluation
         config.timers = None
         fault_tolerance.on_eval_step_start(state)
         loss_dicts = forward_backward_func(
-            forward_step_func=forward_step_func,
+            forward_step_func=wrapped_forward_step,
             data_iterator=data_iterator,
             model=model,
             num_microbatches=eval_num_microbatches,
@@ -119,7 +123,7 @@
             collected_non_loss_data = non_loss_data_func(model)
         elif process_non_loss_data_func is not None and is_last_rank():
             collected_non_loss_data = forward_backward_func(
-                forward_step_func=forward_step_func,
+                forward_step_func=wrapped_forward_step,
                 data_iterator=data_iterator,
                 model=model,
                 num_microbatches=get_num_microbatches(),
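
The evaluation path now mirrors training: it inspects the user-supplied forward_step_func once up front and, if that function expects an extra state argument, wraps it so the GlobalState is bound before Megatron's forward_backward_func calls it with (data_iterator, model). The helpers check_forward_step_func_num_args and maybe_inject_state live in nemo.tron.utils.train_utils and are not shown in this diff; the sketch below is an assumption of how such a wrapper could behave, not the actual implementation:

import inspect
from functools import partial


def check_forward_step_func_num_args(forward_step_func):
    # Assumed behavior: count the parameters of the user-supplied forward step.
    return len(inspect.signature(forward_step_func).parameters)


def maybe_inject_state(forward_step_func, state, num_fw_args=None):
    # Assumed behavior: a three-argument forward step gets GlobalState bound as its
    # first argument; a two-argument one is returned unchanged, so the pipeline
    # schedule can always call it as f(data_iterator, model).
    if num_fw_args is None:
        num_fw_args = check_forward_step_func_num_args(forward_step_func)
    if num_fw_args == 3:
        return partial(forward_step_func, state)
    return forward_step_func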

nemo/tron/examples/lingua-1b_dclm.py

Lines changed: 1 addition & 71 deletions
@@ -13,16 +13,13 @@
 # limitations under the License.

 import math
-from functools import partial

 import torch
 import torch.distributed
-from megatron.core import mpu
 from megatron.core.distributed import DistributedDataParallelConfig
 from megatron.core.optimizer import OptimizerConfig

 from nemo.collections import llm
-from nemo.collections.llm.gpt.model.base import gpt_data_step
 from nemo.tron.api import megatron_pretrain
 from nemo.tron.config import (
     CheckpointConfig,
@@ -35,74 +32,7 @@
     TrainingConfig,
 )
 from nemo.tron.data.dataset import get_blend_and_blend_per_split
-from nemo.tron.state import GlobalState
-
-# define spiky loss as a variation of 20% or more
-SPIKY_LOSS_PERC = 0.2
-
-
-def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
-    """Loss function.
-
-    Args:
-        loss_mask (torch.Tensor): Used to mask out some portions of the loss
-        output_tensor (torch.Tensor): The tensor with the losses
-
-    Returns:
-        the loss scalar for this micro-batch
-        the number of non-padded tokens in this microbatch
-        a dict containing reporting metrics on the loss and number of tokens across
-            the data parallel ranks
-    """
-    state = GlobalState()
-    losses = output_tensor.float()
-    loss_mask = loss_mask.view(-1).float()
-    total_tokens = loss_mask.sum()
-    loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])
-
-    if state.cfg.model_config.context_parallel_size > 1:
-        torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())
-
-    # Reduce loss for logging.
-    reporting_loss = loss.clone().detach()
-    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())
-
-    local_num_tokens = loss[1].clone().detach().to(torch.int)
-    return (
-        loss[0] * state.cfg.model_config.context_parallel_size,
-        local_num_tokens,
-        {"lm loss": (reporting_loss[0], reporting_loss[1])},
-    )
-
-
-def forward_step(data_iterator, model):
-    """Forward training step.
-
-    Args:
-        data_iterator : Input data iterator
-        model (GPTModel): The GPT Model
-    """
-    timers = GlobalState().timers
-
-    # Get the batch.
-    timers("batch-generator", log_level=2).start()
-    batch = gpt_data_step(data_iterator)
-    if "attention_mask" not in batch:
-        batch["attention_mask"] = None
-
-    tokens, labels, loss_mask, attention_mask, position_ids = (
-        batch["tokens"],
-        batch["labels"],
-        batch["loss_mask"],
-        batch["attention_mask"],
-        batch["position_ids"],
-    )
-    timers("batch-generator").stop()
-
-    output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
-
-    return output_tensor, partial(loss_func, loss_mask)
-
+from nemo.tron.llm.gpt import forward_step

 if __name__ == "__main__":
     global_batch_size = 256
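
With the local loss and forward-step code gone, the example only builds configuration and hands the shared forward_step to the pretraining entry point; the spiky-loss constant and the loss itself now live with the library loss (presumably nemo.tron.losses, not shown in this section). megatron_pretrain's exact signature is not shown in this diff, so the wiring below is a hedged sketch and the keyword name forward_step_func is an assumption:

from nemo.tron.api import megatron_pretrain
from nemo.tron.llm.gpt import forward_step

# cfg = ConfigContainer(...)  # assembled from the Training/Optimizer/Checkpoint configs above
# megatron_pretrain(cfg, forward_step_func=forward_step)  # hypothetical call shape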

nemo/tron/llm/__init__.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

nemo/tron/llm/gpt.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Iterable
+
+from megatron.core import parallel_state
+from megatron.core.models.gpt import GPTModel
+from megatron.core.utils import get_batch_on_this_cp_rank
+
+from nemo.tron.config import ConfigContainer
+from nemo.tron.llm.utils import get_batch_on_this_tp_rank
+from nemo.tron.losses import masked_next_token_loss
+from nemo.tron.state import GlobalState
+
+
+def get_batch(data_iterator, cfg: ConfigContainer):
+    """Generate a batch."""
+
+    if (not parallel_state.is_pipeline_first_stage()) and (not parallel_state.is_pipeline_last_stage()):
+        return None, None, None, None, None
+
+    # get batches based on the TP rank you are on
+    batch = get_batch_on_this_tp_rank(data_iterator, cfg)
+
+    # slice batch along sequence dimension for context parallelism
+    batch = get_batch_on_this_cp_rank(batch)
+
+    return batch.values()
+
+
+def forward_step(state: GlobalState, data_iterator: Iterable, model: GPTModel):
+    """Forward training step.
+
+    Args:
+        state (GlobalState): Global state for the run
+        data_iterator : Input data iterator
+        model (GPTModel): The GPT Model
+    """
+
+    timers = state.timers
+    straggler_timer = state.straggler_timer
+
+    timers("batch-generator", log_level=2).start()
+    with straggler_timer(bdata=True):
+        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator, state.cfg)
+    timers("batch-generator").stop()
+
+    with straggler_timer:
+        output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
+
+    return output_tensor, partial(masked_next_token_loss, loss_mask)
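
Compared with the per-example version it replaces, this shared forward_step takes the run's GlobalState as an explicit first argument instead of reaching for a singleton, and delegates the loss to masked_next_token_loss from nemo.tron.losses (not shown in this section). Below is a hedged sketch of how the extra argument is expected to be satisfied before Megatron's schedule calls the function with (data_iterator, model); the names state, data_iterator, and model stand in for objects built elsewhere in the run:

from functools import partial

from nemo.tron.llm.gpt import forward_step


def bind_state(state):
    # Same idea as maybe_inject_state(forward_step, state): pre-bind GlobalState so the
    # pipeline schedule sees an ordinary two-argument forward step.
    return partial(forward_step, state)

# two_arg_step = bind_state(state)
# output_tensor, loss_fn = two_arg_step(data_iterator, model)
# loss_fn is partial(masked_next_token_loss, loss_mask), applied later to output_tensor.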

nemo/tron/llm/utils.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Iterable
+import torch
+from megatron.core import parallel_state
+from nemo.tron.config import ConfigContainer
+
+
+def get_batch_on_this_tp_rank(data_iterator: Iterable, cfg: ConfigContainer) -> Dict[str, torch.Tensor]:
+    def _broadcast(item):
+        if item is not None:
+            torch.distributed.broadcast(
+                item,
+                parallel_state.get_tensor_model_parallel_src_rank(),
+                group=parallel_state.get_tensor_model_parallel_group(),
+            )
+
+    if parallel_state.get_tensor_model_parallel_rank() == 0:
+        if data_iterator is not None:
+            data = next(data_iterator)
+        else:
+            data = None
+
+        batch = {
+            "tokens": data["tokens"].cuda(non_blocking=True),
+            "labels": data["labels"].cuda(non_blocking=True),
+            "loss_mask": data["loss_mask"].cuda(non_blocking=True),
+            "attention_mask": None if "attention_mask" not in data else data["attention_mask"].cuda(non_blocking=True),
+            "position_ids": data["position_ids"].cuda(non_blocking=True),
+        }
+
+        if cfg.model_config.pipeline_model_parallel_size == 1:
+            _broadcast(batch["tokens"])
+            _broadcast(batch["labels"])
+            _broadcast(batch["loss_mask"])
+            _broadcast(batch["attention_mask"])
+            _broadcast(batch["position_ids"])
+
+        elif parallel_state.is_pipeline_first_stage():
+            _broadcast(batch["tokens"])
+            _broadcast(batch["attention_mask"])
+            _broadcast(batch["position_ids"])
+
+        elif parallel_state.is_pipeline_last_stage():
+            _broadcast(batch["labels"])
+            _broadcast(batch["loss_mask"])
+            _broadcast(batch["attention_mask"])
+
+    else:
+        mbs = cfg.train_config.micro_batch_size
+        seq_length = cfg.model_config.seq_length
+        tokens = torch.empty(
+            (mbs, seq_length),
+            dtype=torch.int64,
+            device=torch.cuda.current_device(),
+        )
+        labels = torch.empty(
+            (mbs, seq_length),
+            dtype=torch.int64,
+            device=torch.cuda.current_device(),
+        )
+        loss_mask = torch.empty(
+            (mbs, seq_length),
+            dtype=torch.float32,
+            device=torch.cuda.current_device(),
+        )
+        if cfg.dataset_config.create_attention_mask:
+            attention_mask = torch.empty(
+                (
+                    mbs,
+                    1,
+                    seq_length,
+                    seq_length,
+                ),
+                dtype=torch.bool,
+                device=torch.cuda.current_device(),
+            )
+        else:
+            attention_mask = None
+        position_ids = torch.empty(
+            (mbs, seq_length),
+            dtype=torch.int64,
+            device=torch.cuda.current_device(),
+        )
+
+        if cfg.model_config.pipeline_model_parallel_size == 1:
+            _broadcast(tokens)
+            _broadcast(labels)
+            _broadcast(loss_mask)
+            _broadcast(attention_mask)
+            _broadcast(position_ids)
+
+        elif parallel_state.is_pipeline_first_stage():
+            labels = None
+            loss_mask = None
+
+            _broadcast(tokens)
+            _broadcast(attention_mask)
+            _broadcast(position_ids)
+
+        elif parallel_state.is_pipeline_last_stage():
+            tokens = None
+            position_ids = None
+
+            _broadcast(labels)
+            _broadcast(loss_mask)
+            _broadcast(attention_mask)
+
+        batch = {
+            "tokens": tokens,
+            "labels": labels,
+            "loss_mask": loss_mask,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+        }
+
+    return batch
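
get_batch_on_this_tp_rank follows the usual Megatron recipe: only tensor-parallel rank 0 pulls a sample from the data iterator, every other rank in the TP group pre-allocates tensors of the agreed shape and dtype, and torch.distributed.broadcast from the TP source rank fills them in; the pipeline-stage branches simply skip tensors a given stage never needs. The snippet below is a stripped-down, single-group illustration of that allocate-then-broadcast pattern using plain torch.distributed; it assumes an initialized process group (e.g. via torchrun) with one CUDA device per rank, and is not code from this commit:

import torch
import torch.distributed as dist


def broadcast_tokens(mbs, seq_len, src=0):
    # Rank `src` plays the role of TP rank 0 and "reads" the data (random stand-in here);
    # every other rank allocates an empty buffer with matching shape and dtype.
    if dist.get_rank() == src:
        tokens = torch.randint(0, 32000, (mbs, seq_len), device="cuda")
    else:
        tokens = torch.empty((mbs, seq_len), dtype=torch.int64, device="cuda")
    # After the collective, all ranks hold identical token tensors.
    dist.broadcast(tokens, src=src)
    return tokens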
