Skip to content

Commit 27aad7a

Browse files
authored
Merge branch 'main' into nemo-ux/tokenizer-import
2 parents: 89687d4 + 8a0d1f7 · commit: 27aad7a

File tree

3 files changed

+22
-12
lines changed

3 files changed

+22
-12
lines changed

nemo/lightning/nemo_logger.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,14 +9,9 @@
99
import pytorch_lightning as pl
1010
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint
1111

12-
from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION
1312
from nemo.lightning.pytorch.callbacks import ModelCheckpoint
1413
from nemo.utils import logging
1514
from nemo.utils.app_state import AppState
16-
from nemo.utils.env_var_parsing import get_envbool
17-
from nemo.utils.exp_manager import check_explicit_log_dir
18-
from nemo.utils.get_rank import is_global_rank_zero
19-
from nemo.utils.mcore_logger import add_handlers_to_mcore_logger
2015

2116

2217
@dataclass
@@ -67,6 +62,12 @@ def setup(
6762
Returns:
6863
AppState: The application state with updated log directory and other settings.
6964
"""
65+
from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION
66+
from nemo.utils.env_var_parsing import get_envbool
67+
from nemo.utils.exp_manager import check_explicit_log_dir
68+
from nemo.utils.get_rank import is_global_rank_zero
69+
from nemo.utils.mcore_logger import add_handlers_to_mcore_logger
70+
7071
local_rank = int(os.environ.get("LOCAL_RANK", 0))
7172
global_rank = trainer.node_rank * trainer.world_size + local_rank
7273
logging.rank = global_rank

nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,6 @@
1515
import os
1616
import re
1717
import shutil
18-
from dataclasses import dataclass
1918
from datetime import timedelta
2019
from pathlib import Path
2120
from typing import Any, Dict, Iterable, Optional, Union
@@ -27,12 +26,8 @@
2726
from pytorch_lightning.callbacks.model_checkpoint import _is_local_file_protocol
2827
from pytorch_lightning.utilities import rank_zero_info
2928

30-
from nemo.collections.common.callbacks import EMA
3129
from nemo.utils import logging
3230
from nemo.utils.app_state import AppState
33-
from nemo.utils.exp_manager import get_git_diff, get_git_hash
34-
from nemo.utils.get_rank import is_global_rank_zero
35-
from nemo.utils.lightning_logger_patch import add_filehandlers_to_pl_logger
3631
from nemo.utils.model_utils import ckpt_to_dir
3732

3833

@@ -74,6 +69,10 @@ def __init__(
7469
)
7570

7671
def on_train_start(self, trainer, pl_module):
72+
from nemo.utils.exp_manager import get_git_diff, get_git_hash
73+
from nemo.utils.get_rank import is_global_rank_zero
74+
from nemo.utils.lightning_logger_patch import add_filehandlers_to_pl_logger
75+
7776
app_state = AppState()
7877
if self.save_top_k != -1 and app_state.restore:
7978
logging.debug("Checking previous runs")
@@ -205,6 +204,8 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
205204
self._remove_invalid_entries_from_topk()
206205

207206
def setup(self, *args, **kwargs) -> None:
207+
from nemo.utils.get_rank import is_global_rank_zero
208+
208209
if is_global_rank_zero():
209210
logging.debug("Removing unfinished checkpoints if any...")
210211
ModelCheckpoint._remove_unfinished_checkpoints(self.dirpath)
@@ -260,6 +261,7 @@ def on_train_end(self, trainer, pl_module):
260261
trainer._checkpoint_connector.restore(self.best_model_path)
261262

262263
def _del_model_without_trainer(self, filepath: str) -> None:
264+
from nemo.utils.get_rank import is_global_rank_zero
263265

264266
filepath = Path(filepath)
265267

@@ -273,7 +275,9 @@ def _del_model_without_trainer(self, filepath: str) -> None:
273275
if torch.distributed.is_initialized():
274276
torch.distributed.barrier()
275277

276-
def _ema_callback(self, trainer: 'pytorch_lightning.Trainer') -> Optional[EMA]:
278+
def _ema_callback(self, trainer: 'pytorch_lightning.Trainer'):
279+
from nemo.collections.common.callbacks import EMA
280+
277281
ema_callback = None
278282
for callback in trainer.callbacks:
279283
if isinstance(callback, EMA):
@@ -321,6 +325,8 @@ def set_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barrier_
321325
barrier_after: Synchronize ranks after writing the marker file.
322326
Defaults to False.
323327
"""
328+
from nemo.utils.get_rank import is_global_rank_zero
329+
324330
if is_global_rank_zero():
325331
marker_path = ModelCheckpoint.format_checkpoint_unfinished_marker_path(checkpoint_path)
326332
marker_path.parent.mkdir(parents=True, exist_ok=True)
@@ -338,6 +344,8 @@ def remove_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barri
338344
barrier_before: Synchronize ranks before removing the marker file.
339345
Defaults to False.
340346
"""
347+
from nemo.utils.get_rank import is_global_rank_zero
348+
341349
try:
342350
if barrier_before and torch.distributed.is_initialized():
343351
torch.distributed.barrier()
@@ -434,6 +442,7 @@ def _saved_checkpoint_paths(self) -> Iterable[Path]:
434442

435443
@staticmethod
436444
def _remove_unfinished_checkpoints(checkpoint_dir: Union[Path, str]) -> None:
445+
from nemo.utils.get_rank import is_global_rank_zero
437446

438447
# Delete unfinished checkpoints from the filesystems.
439448
# "Unfinished marker" files are removed as well.

nemo/lightning/resume.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66

77
from nemo.utils import logging
88
from nemo.utils.app_state import AppState
9-
from nemo.utils.exp_manager import NotFoundError, _filter_out_unfinished_checkpoints
109

1110

1211
class Resume:
@@ -70,6 +69,7 @@ def __init__(
7069
self.resume_ignore_no_checkpoint = resume_ignore_no_checkpoint
7170

7271
def nemo_path(self, model=None) -> Optional[Path]:
72+
from nemo.utils.exp_manager import NotFoundError, _filter_out_unfinished_checkpoints
7373

7474
if self.import_path:
7575
if model is None:

0 commit comments

Comments
 (0)