62 commits
b2554cf
Support chunking for parakeet
nune-tadevosyan Aug 28, 2025
b1c9a42
Bringing back char timestamps
nune-tadevosyan Aug 28, 2025
5f12945
initial changes
nune-tadevosyan Dec 13, 2025
3711eb7
Chunking for RNNT and TDT
nune-tadevosyan Dec 14, 2025
ce96d7c
Remove accidental test/utility files
nune-tadevosyan Dec 14, 2025
151c733
small clean ups
nune-tadevosyan Dec 14, 2025
d9e0706
small clean ups
nune-tadevosyan Dec 14, 2025
95c1309
Remove accidental test/utility files
nune-tadevosyan Dec 14, 2025
a16893f
Chunking tests update
nune-tadevosyan Dec 14, 2025
6c776ae
Parakeet range update
nune-tadevosyan Dec 14, 2025
680bf8d
small clean ups
nune-tadevosyan Dec 14, 2025
163bd5d
Transcription update
nune-tadevosyan Dec 14, 2025
d60e459
Updates
nune-tadevosyan Dec 14, 2025
558f977
Updates
nune-tadevosyan Dec 14, 2025
86b97b4
Linting fix
nune-tadevosyan Dec 14, 2025
616f461
Tensor dataloader fix
nune-tadevosyan Dec 14, 2025
7da4c39
Warnings logs
nune-tadevosyan Dec 14, 2025
f935851
remove unnecessary checks
nune-tadevosyan Dec 14, 2025
891b018
Timestamps update
nune-tadevosyan Dec 15, 2025
64bbeb6
aed changes
nune-tadevosyan Feb 3, 2026
fabb55b
Update AED tests
nune-tadevosyan Feb 4, 2026
025d367
test updates
nune-tadevosyan Feb 6, 2026
cc28e59
Adding support for all types of models
nune-tadevosyan Feb 6, 2026
ee4c14c
Support for all models
nune-tadevosyan Feb 6, 2026
a535df3
Updates
nune-tadevosyan Feb 6, 2026
840c88d
clean up
nune-tadevosyan Feb 6, 2026
ad5a8ac
clean up
nune-tadevosyan Feb 6, 2026
0a9c637
clean up
nune-tadevosyan Feb 6, 2026
4b187be
Test fixes
nune-tadevosyan Feb 7, 2026
a932dfe
Test fixes
nune-tadevosyan Feb 8, 2026
a064938
CTC decoding fix
nune-tadevosyan Feb 8, 2026
b930536
CTC decoding
nune-tadevosyan Feb 8, 2026
d8a930c
Test fixes
nune-tadevosyan Feb 8, 2026
6d87fb3
Test fixes
nune-tadevosyan Feb 8, 2026
cd2b680
Test fixes
nune-tadevosyan Feb 8, 2026
7b94192
Salm fix
nune-tadevosyan Feb 9, 2026
962840a
Code formatting fix
nune-tadevosyan Feb 9, 2026
dabd1ee
Forcing cuda graph back to True
nune-tadevosyan Feb 10, 2026
36527e4
Updates
nune-tadevosyan Feb 10, 2026
ed07b66
Clean up
nune-tadevosyan Feb 10, 2026
d8cd5f4
Updates
nune-tadevosyan Feb 10, 2026
a59a33d
Chunking set to false
nune-tadevosyan Feb 13, 2026
a221cb5
Reverse code-formatting
nune-tadevosyan Feb 13, 2026
530d3c8
Updates
nune-tadevosyan Feb 13, 2026
9f4de59
Reverse SALM FIX
nune-tadevosyan Feb 15, 2026
3c38f36
Fixes
nune-tadevosyan Feb 15, 2026
8eff81d
Test fix
nune-tadevosyan Feb 15, 2026
c71f77a
Update transcribe_speech
nune-tadevosyan Feb 18, 2026
be73d36
lhotsee changes
nune-tadevosyan Mar 10, 2026
cb85e1c
working version
nune-tadevosyan Mar 11, 2026
8f479fd
Works with batches
nune-tadevosyan Mar 12, 2026
623caa5
fixes
nune-tadevosyan Mar 12, 2026
80c283f
All changes that allows for bs>1
nune-tadevosyan Mar 13, 2026
b2a4523
Clean up
nune-tadevosyan Mar 13, 2026
831ed6f
Clean up
nune-tadevosyan Mar 13, 2026
df89305
Clean up
nune-tadevosyan Mar 13, 2026
9ef244c
Clean up
nune-tadevosyan Mar 13, 2026
1b08ccc
Clean up
nune-tadevosyan Mar 13, 2026
29cde56
Clean up
nune-tadevosyan Mar 13, 2026
eb25e5d
fixes
nune-tadevosyan Mar 13, 2026
a713ed2
Merge branch 'main' into parakeet_chunking
nune-tadevosyan Mar 13, 2026
0764857
Apply isort and black reformatting
nune-tadevosyan Mar 13, 2026
23 changes: 23 additions & 0 deletions docs/source/asr/intro.rst
@@ -180,6 +180,29 @@ Customization is available during both greedy and beam decoding. After :ref:`tra

See detailed documentation here: :ref:`asr_language_modeling_and_customization`.

Transcribe long audio files (chunking mode)
-------------------------------------------
You can transcribe long audio files in **chunking mode** by setting ``enable_chunking=True`` in the ``transcribe`` method.
Chunking is available only when you pass a single audio file or set ``batch_size=1``; it is not used when the input is a pre-built DataLoader.

.. code-block:: python

import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
transcript = asr_model.transcribe(["path/to/audio_file.wav"], enable_chunking=True)[0].text

**Workflow**

Long audio is split into overlapping segments (chunks) of configurable duration. For each chunk the model runs normally; the per-chunk hypotheses are then merged into a single transcript. This keeps memory and compute manageable while still producing one continuous result. Consecutive chunks overlap by about 1 second so that words spanning chunk boundaries can be merged correctly in the final text.
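
The splitting step described above can be sketched in a few lines of plain Python (an illustrative sketch only — ``split_with_overlap`` is a hypothetical helper name, not part of NeMo):

```python
def split_with_overlap(samples, chunk_size, overlap):
    """Split a 1-D sample sequence into chunks whose starts advance
    by (chunk_size - overlap), so consecutive chunks share `overlap`
    samples at their boundary."""
    step = chunk_size - overlap
    assert step > 0, "overlap must be smaller than chunk size"
    chunks = []
    start = 0
    while start + overlap < len(samples):
        chunks.append(samples[start:start + chunk_size])
        start += step
    return chunks

# 10 samples, chunks of 4 with overlap 1: starts at 0, 3, 6
print(split_with_overlap(list(range(10)), 4, 1))
# → [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]
```

Because neighbouring chunks share their boundary samples, a word cut at a chunk boundary appears in both chunks and can be recovered when the hypotheses are merged.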

**chunk_range**

``chunk_range`` is a pair ``[min_seconds, max_seconds]`` that defines the allowed duration of each chunk.

* **Defaults:** Parakeet-style models use ``[240, 300]`` seconds; Canary-style (e.g., multi-task AED) models use ``[30, 40]`` seconds.
* **Lhotse dataloaders:** For file input, the chunk duration is fixed inside the Lhotse-based dataloaders ``LhotseSpeechToTextBpeDataset`` and ``PromptedAudioToTextLhotseDataset``.
* **Tensors or NumPy arrays:** When audio is passed as a tensor or NumPy array, ``TranscriptionTensorDataset`` reads ``chunk_range`` from :class:`TranscribeConfig` (e.g., set via ``override_config``; defaults to ``[240, 300]``).
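
How a concrete duration might be chosen from within ``chunk_range`` — maximizing the amount of real audio in the final chunk, i.e. minimizing padding — can be sketched as follows (a hypothetical sketch of the selection logic, not the exact NeMo code; ``pick_chunk_size`` is an assumed name):

```python
def pick_chunk_size(total_len, min_sec, max_sec, sample_rate=16000, overlap_sec=1.0):
    """Pick a chunk size (in samples) from [min_sec, max_sec] seconds
    that leaves the last chunk as full as possible."""
    if total_len < max_sec * sample_rate:
        return total_len  # short audio: a single chunk, no splitting
    best_size, best_last = min_sec * sample_rate, 0
    overlap = int(overlap_sec * sample_rate)
    for sec in range(min_sec, max_sec + 1):
        size = sec * sample_rate
        step = size - overlap
        if step <= 0 or size > total_len:
            continue
        n_chunks = -(-total_len // step)          # ceiling division
        last = total_len - step * (n_chunks - 1)  # samples in the final chunk
        if last > best_last:
            best_last, best_size = last, size
    return best_size

# 10 minutes of 16 kHz audio with the Parakeet default range [240, 300]
print(pick_chunk_size(600 * 16000, 240, 300) / 16000)  # chunk length in seconds
```

For ten minutes of audio the search settles on 240-second chunks, since the shortest allowed chunk leaves the most real audio in the final chunk here.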

Use real-time transcription
---------------------------

4 changes: 4 additions & 0 deletions examples/asr/transcribe_speech.py
@@ -52,6 +52,7 @@
dataset_manifest: path to dataset JSON manifest file (in NeMo format)
compute_langs: Bool to request language ID information (if the model supports it)
timestamps: Bool to request greedy timestamp information (if the model supports it); default None
enable_chunking: Bool to enable chunking for long audio files (default: False)

(Optionally: You can limit the type of timestamp computations using below overrides)
ctc_decoding.ctc_timestamp_type="all" # (default all, can be [all, char, word, segment])
@@ -140,6 +141,7 @@ class TranscriptionConfig:

# Set to True to output greedy timestamp information (only for supported models) and return full alignment hypotheses
timestamps: Optional[bool] = None
enable_chunking: Optional[bool] = False

# Set to True to return hypotheses instead of text from the transcribe function
return_hypotheses: bool = False
@@ -409,6 +411,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
override_cfg.text_field = cfg.gt_text_attr_name
override_cfg.lang_field = cfg.gt_lang_attr_name
override_cfg.timestamps = cfg.timestamps
override_cfg.enable_chunking = cfg.enable_chunking
if hasattr(override_cfg, "prompt"):
override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))

@@ -424,6 +427,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
audio=filepaths,
override_config=override_cfg,
timestamps=cfg.timestamps,
enable_chunking=cfg.enable_chunking,
)
# stop timer, log time
timer.stop(device=device)
15 changes: 10 additions & 5 deletions nemo/collections/asr/data/audio_to_text_lhotse.py
@@ -14,6 +14,7 @@

from typing import Dict, Optional, Tuple

import torch
import torch.utils.data
from lhotse.dataset import AudioSamples
from lhotse.dataset.collation import collate_vectors
@@ -51,16 +52,20 @@ def __init__(self, tokenizer: TokenizerSpec, return_cuts: bool = False):

def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]:
audio, audio_lens, cuts = self.load_audio(cuts)
tokens = [
torch.cat(

def _tokens_from_cut(cut):
return torch.cat(
[
torch.as_tensor(s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text or "", s.language))
for s in c.supervisions
for s in cut.supervisions
],
dim=0,
)
for c in cuts
]

base_tokens = [_tokens_from_cut(cut) for cut in cuts]

tokens = base_tokens

token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long)
tokens = collate_vectors(tokens, padding_value=0)
if self.return_cuts:
112 changes: 0 additions & 112 deletions nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -63,11 +63,6 @@ class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset):
We support cuts with multiple supervision segments -- their tokenized texts will be concatenated before we add the prompt tokens.
This is useful, for example, in code-switched scenarios where each segment is spoken in a different language.

Chunking:
If `enable_chunking` is True, each audio sample is split into optimally sized chunks
(see `find_optimal_chunk_size` and `chunk_waveform`). This is useful for long audio inputs,
allowing the model to process them in manageable segments.

NOTE:
If the environment variable `USE_AIS_GET_BATCH` is set to `true` (case-insensitive),
then batch audio loading from AIStore will be enabled for this dataset. This will use the
@@ -78,7 +73,6 @@ def __init__(
self,
tokenizer: TokenizerSpec,
prompt: PromptFormatter,
enable_chunking: bool = False,
):
super().__init__()
self.tokenizer = tokenizer
@@ -100,30 +94,11 @@ def __init__(

self.padding_value = self.tokenizer.pad_id
self.prompt = prompt
self.enable_chunking = enable_chunking

def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch:
# Load the audio from AIS and add it to the CutSet
audio, audio_lens, cuts = self.load_audio(cuts)

# Will work if batch_size is set to 1.
if self.enable_chunking:
# If dynamic chunking is enabled, split each audio sample into chunks.
new_audio = []
new_audio_lens = []
for i in range(audio.shape[0]):
waveform = audio[i, : audio_lens[i]]
# Split the waveform into chunks and get their lengths.
chunks, chunk_lens = self._chunk_waveform(waveform)
new_audio.extend(chunks)
new_audio_lens.extend(chunk_lens)
# Stack all chunks into a batch.
audio = torch.stack(new_audio)
audio_lens = torch.tensor(new_audio_lens, dtype=torch.long)
# Adding this to allow gathering results of the same audio from different batches
if cuts[0].start != 0:
cuts[0].id = cuts[0].id + '_cut_segmented'
# Fast-path: the tokenization and prompt formatting were already done before sampling.
attrs = ("input_ids", "context_ids", "answer_ids")
pre_formatted = all(hasattr(c, a) for c in cuts for a in attrs)
if pre_formatted:
@@ -156,93 +131,6 @@ def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple
tokens = collate_vectors(tokens, padding_value=self.padding_value)
return tokens, token_lens

def _find_optimal_chunk_size(
self, total_len: int, min_sec: int = 30, max_sec: int = 40, sample_rate: int = 16000, overlap_sec: float = 1.0
) -> int:
"""
Find the optimal chunk size for audio processing that minimizes paddings to the last chunk.

Args:
total_len (int): Total length of the audio waveform in samples
min_sec (int, optional): Minimum chunk size in seconds. Defaults to 30.
max_sec (int, optional): Maximum chunk size in seconds. Defaults to 40.
sample_rate (int, optional): Audio sample rate in Hz. Defaults to 16000.
overlap_sec (float, optional): Overlap duration between consecutive chunks in seconds.
Defaults to 1.0.

Returns:
int: Optimal chunk size in samples that maximizes the last chunk length
"""
best_chunk_size = min_sec * sample_rate
best_last_chunk_len = 0
if total_len < max_sec * sample_rate:
return total_len
# Try each possible chunk duration in the range
for sec in range(min_sec, max_sec + 1):
chunk_size = sec * sample_rate
overlap_size = int(overlap_sec * sample_rate)
step_size = chunk_size - overlap_size

if step_size <= 0: # Invalid overlap
continue
if chunk_size > total_len:
continue

# Calculate how many chunks we'd need and the last chunk's length
n_chunks = (total_len + step_size - 1) // step_size
last_chunk_len = total_len - step_size * (n_chunks - 1)

if last_chunk_len > best_last_chunk_len:
best_last_chunk_len = last_chunk_len
best_chunk_size = chunk_size

return best_chunk_size

def _chunk_waveform(
self, waveform: torch.Tensor, chunk_size: int = None, overlap_sec: float = 1.0, sample_rate: int = 16000
) -> tuple[list[torch.Tensor], list[int]]:
"""
Split a waveform tensor into overlapping chunks.

Args:
waveform (torch.Tensor): Input audio waveform tensor of shape (time_samples,)
chunk_size (int, optional): Size of each chunk in samples. If None, automatically
determines optimal chunk size using find_optimal_chunk_size().
Defaults to None.
sample_rate (int, optional): Audio sample rate in Hz. Defaults to 16000.
overlap_sec (float, optional): Overlap duration between consecutive chunks in seconds.
Used to calculate step size. Defaults to 2.

Returns:
tuple[list[torch.Tensor], list[int]]: A tuple containing:
- List of chunk tensors, each of shape (chunk_size,)
- List of original lengths for each chunk before padding (useful for masking
padded regions during processing.
"""
# If chunk_size is None, find the optimal chunk size for this waveform
total_len = waveform.shape[0]
if chunk_size is None:
chunk_size = self._find_optimal_chunk_size(total_len, overlap_sec=overlap_sec)
if chunk_size >= total_len:
return [waveform], [total_len]
overlap_size = int(overlap_sec * sample_rate)
step_size = chunk_size - overlap_size
chunks = []
chunk_lens = []
start = 0
while start + overlap_size < total_len:
end = min(start + chunk_size, total_len)
chunk = waveform[start:end]
length = chunk.shape[0]
if length < chunk_size:
pad = torch.zeros(chunk_size - length, dtype=chunk.dtype, device=chunk.device)
chunk = torch.cat([chunk, pad], dim=0)
chunks.append(chunk)
chunk_lens.append(length)
start += step_size

return chunks, chunk_lens


class ProbablyIncorrectLanguageKeyError(RuntimeError):
pass
@@ -30,7 +30,7 @@
from nemo.collections.asr.inference.streaming.framing.request_options import ASRRequestOptions
from nemo.collections.asr.inference.streaming.state.salm_state import SALMStreamingState
from nemo.collections.asr.inference.utils.enums import ASROutputGranularity, MergingStrategy, RequestType
from nemo.collections.asr.inference.utils.lcs_merge import lcs_merge
from nemo.collections.asr.parts.utils.streaming_utils import lcs_merge
from nemo.utils.decorators import experimental

if TYPE_CHECKING:
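
The relocated ``lcs_merge`` utility stitches together hypotheses from consecutive overlapping chunks. As a much-simplified illustration of overlap-based merging — suffix/prefix matching rather than the actual longest-common-subsequence alignment — one might write (``merge_on_overlap`` is a hypothetical name, not the NeMo function):

```python
def merge_on_overlap(left, right):
    """Merge two overlapping sequences by finding the longest suffix of
    `left` that is also a prefix of `right`, then dropping the duplicate."""
    max_k = min(len(left), len(right))
    for k in range(max_k, 0, -1):
        if left[-k:] == right[:k]:
            return left + right[k:]
    return left + right  # no overlap found: simple concatenation

print(merge_on_overlap("hello wor", "world again"))  # → "hello world again"
```

The real ``lcs_merge`` is more robust: an LCS alignment tolerates small transcription differences inside the overlap region, whereas exact suffix/prefix matching requires both chunks to agree verbatim.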
113 changes: 0 additions & 113 deletions nemo/collections/asr/inference/utils/lcs_merge.py

This file was deleted.
