Commit 648ff93

Adding prefix option for transcribe to allow multiple parallel runs
Signed-off-by: Nune <[email protected]>
Parent commit: 87dcb8d

2 files changed: 6 additions, 6 deletions

examples/asr/run_write_transcribed_files.py

Lines changed: 4 additions & 4 deletions
@@ -44,7 +44,7 @@ def create_transcribed_shard_manifests(prediction_filepaths: List[str], prefix)
     for prediction_filepath in prediction_filepaths:
         max_shard_id = 0
         shard_data = {}
-        full_path = os.path.join(prediction_filepath, "predictions_all.json")
+        full_path = os.path.join(prediction_filepath, f"{prefix}_predictions_all.json")
         with open(full_path, 'r') as f:
             for line in f.readlines():
                 data_entry = json.loads(line)
@@ -90,7 +90,7 @@ def create_transcribed_manifests(prediction_filepaths: List[str], prefix) -> Lis
     """
     all_manifest_filepaths = []
     for prediction_filepath in prediction_filepaths:
-        prediction_name = os.path.join(prediction_filepath, "predictions_all.json")
+        prediction_name = os.path.join(prediction_filepath, f"{prefix}_predictions_all.json")
         transcripted_name = os.path.join(prediction_filepath, f"{prefix}_transcribed_manifest.json")

         # Open and read the original predictions_all.json file
@@ -129,7 +129,7 @@ def write_sampled_shard_transcriptions(manifest_filepaths: List[str], prefix) ->
     for prediction_filepath in manifest_filepaths:
         predicted_shard_data = {}
         # Collect entries from prediction files based on shard id
-        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        prediction_path = os.path.join(prediction_filepath, f"{prefix}_predictions_all.json")
         with open(prediction_path, 'r') as f:
             for line in f:
                 data_entry = json.loads(line)
@@ -190,7 +190,7 @@ def write_sampled_transcriptions(manifest_filepaths: List[str], prefix) -> List[
     all_manifest_filepaths = []
     for prediction_filepath in manifest_filepaths:
         predicted_data = {}
-        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        prediction_path = os.path.join(prediction_filepath, f"{prefix}_predictions_all.json")
         with open(prediction_path, 'r') as f:
             for line in f:
                 data_entry = json.loads(line)
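
With these hunks, every aggregation helper in this file reads a run-specific predictions file rather than a shared one. A minimal sketch of that read path, assuming a file already written with the same prefix (the helper name read_prefixed_predictions is illustrative, not part of the commit):

import json
import os
from typing import Dict, List


def read_prefixed_predictions(prediction_dir: str, prefix: str) -> List[Dict]:
    # Mirrors the changed lines above: each run reads "<prefix>_predictions_all.json"
    # instead of a shared "predictions_all.json", so concurrent runs do not clash.
    full_path = os.path.join(prediction_dir, f"{prefix}_predictions_all.json")
    with open(full_path, 'r') as f:
        return [json.loads(line) for line in f]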

examples/asr/transcribe_speech_parallel.py

Lines changed: 2 additions & 2 deletions
@@ -191,7 +191,7 @@ def main(cfg: ParallelTranscriptionConfig):
     os.makedirs(cfg.output_path, exist_ok=True)
     # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank.
     global_rank = trainer.node_rank * trainer.num_devices + int(os.environ.get("LOCAL_RANK", 0))
-    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
+    output_file = os.path.join(cfg.output_path, f"{cfg.predict_ds.prefix}_predictions_{global_rank}.json")
     predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset, output_file=output_file)
     trainer.callbacks.extend([predictor_writer])

@@ -211,7 +211,7 @@ def main(cfg: ParallelTranscriptionConfig):
     pred_text_list = []
     text_list = []
     if is_global_rank_zero():
-        output_file = os.path.join(cfg.output_path, f"predictions_all.json")
+        output_file = os.path.join(cfg.output_path, f"{cfg.predict_ds.prefix}_predictions_all.json")
         logging.info(f"Prediction files are being aggregated in {output_file}.")
         with open(output_file, 'w') as outf:
             for rank in range(trainer.world_size):
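
A minimal sketch of the resulting file layout when two transcription runs share one output directory (the prediction_files helper, the directory, and the prefixes below are illustrative, not part of the script):

import os
from typing import List


def prediction_files(output_path: str, prefix: str, world_size: int) -> List[str]:
    # Per-rank prediction files plus the rank-zero aggregate, following the
    # "<prefix>_predictions_*.json" naming introduced by this commit.
    per_rank = [os.path.join(output_path, f"{prefix}_predictions_{rank}.json") for rank in range(world_size)]
    return per_rank + [os.path.join(output_path, f"{prefix}_predictions_all.json")]


# Two parallel runs no longer overwrite each other's outputs:
print(prediction_files("/results", "run_a", 2))
print(prediction_files("/results", "run_b", 2))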
