diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py index a9fb7a98ff..84d445562a 100644 --- a/deepmd/pd/train/training.py +++ b/deepmd/pd/train/training.py @@ -120,6 +120,7 @@ def __init__( self.restart_training = restart_model is not None model_params = config["model"] training_params = config["training"] + optimizer_params = config.get("optimizer", {}) self.multi_task = "model_dict" in model_params self.finetune_links = finetune_links self.finetune_update_stat = False @@ -157,14 +158,17 @@ def __init__( self.lcurve_should_print_header = True def get_opt_param(params: dict[str, Any]) -> tuple[str, dict[str, Any]]: - opt_type = params.get("opt_type", "Adam") - opt_param = { - "kf_blocksize": params.get("kf_blocksize", 5120), - "kf_start_pref_e": params.get("kf_start_pref_e", 1), - "kf_limit_pref_e": params.get("kf_limit_pref_e", 1), - "kf_start_pref_f": params.get("kf_start_pref_f", 1), - "kf_limit_pref_f": params.get("kf_limit_pref_f", 1), - } + """ + Extract optimizer parameters. + + Note: Default values are already filled by argcheck.normalize() + before this function is called. + """ + opt_type = params.get("type", "Adam") + if opt_type != "Adam": + raise ValueError(f"Not supported optimizer type '{opt_type}'") + opt_param = dict(params) + opt_param.pop("type", None) return opt_type, opt_param def get_data_loader( @@ -256,22 +260,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR: return lr_schedule # Optimizer - if self.multi_task and training_params.get("optim_dict", None) is not None: - self.optim_dict = training_params.get("optim_dict") - missing_keys = [ - key for key in self.model_keys if key not in self.optim_dict - ] - assert not missing_keys, ( - f"These keys are not in optim_dict: {missing_keys}!" - ) - self.opt_type = {} - self.opt_param = {} - for model_key in self.model_keys: - self.opt_type[model_key], self.opt_param[model_key] = get_opt_param( - self.optim_dict[model_key] - ) - else: - self.opt_type, self.opt_param = get_opt_param(training_params) + self.opt_type, self.opt_param = get_opt_param(optimizer_params) # loss_param_tmp for Hessian activation loss_param_tmp = None @@ -677,7 +666,11 @@ def single_model_finetune( ), ) self.optimizer = paddle.optimizer.Adam( - learning_rate=self.scheduler, parameters=self.wrapper.parameters() + learning_rate=self.scheduler, + parameters=self.wrapper.parameters(), + beta1=float(self.opt_param["adam_beta1"]), + beta2=float(self.opt_param["adam_beta2"]), + weight_decay=float(self.opt_param["weight_decay"]), ) if optimizer_state_dict is not None and self.restart_training: self.optimizer.set_state_dict(optimizer_state_dict) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 4b29be0463..9bea9edbcd 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -142,6 +142,7 @@ def __init__( self.restart_training = restart_model is not None model_params = config["model"] training_params = config["training"] + optimizer_params = config.get("optimizer", {}) self.multi_task = "model_dict" in model_params self.finetune_links = finetune_links self.finetune_update_stat = False @@ -185,26 +186,17 @@ def __init__( self.lcurve_should_print_header = True def get_opt_param(params: dict[str, Any]) -> tuple[str, dict[str, Any]]: - opt_type = params.get("opt_type", "Adam") - opt_param = { - # LKF parameters - "kf_blocksize": params.get("kf_blocksize", 5120), - "kf_start_pref_e": params.get("kf_start_pref_e", 1), - "kf_limit_pref_e": params.get("kf_limit_pref_e", 1), - "kf_start_pref_f": 
params.get("kf_start_pref_f", 1), - "kf_limit_pref_f": params.get("kf_limit_pref_f", 1), - # Common parameters - "weight_decay": params.get("weight_decay", 0.001), - # Muon/AdaMuon parameters - "momentum": params.get("momentum", 0.95), - "adam_beta1": params.get("adam_beta1", 0.9), - "adam_beta2": params.get("adam_beta2", 0.95), - "lr_adjust": params.get("lr_adjust", 10.0), - "lr_adjust_coeff": params.get("lr_adjust_coeff", 0.2), - "muon_2d_only": params.get("muon_2d_only", True), - "min_2d_dim": params.get("min_2d_dim", 1), - "flash_muon": params.get("flash_muon", True), - } + """ + Extract optimizer parameters. + + Note: Default values are already filled by argcheck.normalize() + before this function is called. + """ + opt_type = params.get("type", "Adam") + if opt_type not in ("Adam", "AdamW", "LKF", "AdaMuon", "HybridMuon"): + raise ValueError(f"Not supported optimizer type '{opt_type}'") + opt_param = dict(params) + opt_param.pop("type", None) return opt_type, opt_param def cycle_iterator(iterable: Iterable) -> Generator[Any, None, None]: @@ -313,22 +305,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR: return lr_schedule # Optimizer - if self.multi_task and training_params.get("optim_dict", None) is not None: - self.optim_dict = training_params.get("optim_dict") - missing_keys = [ - key for key in self.model_keys if key not in self.optim_dict - ] - assert not missing_keys, ( - f"These keys are not in optim_dict: {missing_keys}!" - ) - self.opt_type = {} - self.opt_param = {} - for model_key in self.model_keys: - self.opt_type[model_key], self.opt_param[model_key] = get_opt_param( - self.optim_dict[model_key] - ) - else: - self.opt_type, self.opt_param = get_opt_param(training_params) + self.opt_type, self.opt_param = get_opt_param(optimizer_params) if self.zero_stage > 0 and self.multi_task: raise ValueError( "training.zero_stage is currently only supported in single-task training." 
@@ -792,71 +769,48 @@ def single_model_finetune( # TODO add optimizers for multitask # author: iProzd initial_lr = self.lr_schedule.value(self.start_step) - if self.opt_type in ["Adam", "AdamW"]: - # Initialize optimizer with the actual learning rate at start_step - # to ensure warmup is applied from the first step - if self.opt_type == "Adam": - self.optimizer = self._create_optimizer( - torch.optim.Adam, - lr=initial_lr, - fused=DEVICE.type != "cpu", - ) - else: - self.optimizer = self._create_optimizer( - torch.optim.AdamW, - lr=initial_lr, - weight_decay=float(self.opt_param["weight_decay"]), - fused=DEVICE.type != "cpu", - ) - self._load_optimizer_state(optimizer_state_dict) - self.scheduler = torch.optim.lr_scheduler.LambdaLR( - self.optimizer, - lambda step: ( - self.lr_schedule.value(step + self.start_step) / initial_lr - ), - last_epoch=self.start_step - 1, - ) - elif self.opt_type == "LKF": + if self.opt_type == "LKF": self.optimizer = LKFOptimizer( self.wrapper.parameters(), 0.98, 0.99870, self.opt_param["kf_blocksize"] ) - elif self.opt_type == "AdaMuon": - self.optimizer = self._create_optimizer( - AdaMuonOptimizer, - lr=initial_lr, - momentum=float(self.opt_param["momentum"]), - weight_decay=float(self.opt_param["weight_decay"]), - adam_betas=( - float(self.opt_param["adam_beta1"]), - float(self.opt_param["adam_beta2"]), - ), - lr_adjust=float(self.opt_param["lr_adjust"]), - lr_adjust_coeff=float(self.opt_param["lr_adjust_coeff"]), - ) - if optimizer_state_dict is not None and self.restart_training: - self.optimizer.load_state_dict(optimizer_state_dict) - self.scheduler = torch.optim.lr_scheduler.LambdaLR( - self.optimizer, - lambda step: ( - self.lr_schedule.value(step + self.start_step) / initial_lr - ), - last_epoch=self.start_step - 1, + else: + # === Common path for gradient-based optimizers === + adam_betas = ( + float(self.opt_param["adam_beta1"]), + float(self.opt_param["adam_beta2"]), ) - elif self.opt_type == "HybridMuon": + weight_decay = float(self.opt_param["weight_decay"]) + + if self.opt_type in ("Adam", "AdamW"): + cls = torch.optim.Adam if self.opt_type == "Adam" else torch.optim.AdamW + extra = {"betas": adam_betas, "fused": DEVICE.type != "cpu"} + elif self.opt_type == "AdaMuon": + cls = AdaMuonOptimizer + extra = { + "adam_betas": adam_betas, + "momentum": float(self.opt_param["momentum"]), + "lr_adjust": float(self.opt_param["lr_adjust"]), + "lr_adjust_coeff": float(self.opt_param["lr_adjust_coeff"]), + } + elif self.opt_type == "HybridMuon": + cls = HybridMuonOptimizer + extra = { + "adam_betas": adam_betas, + "momentum": float(self.opt_param["momentum"]), + "lr_adjust": float(self.opt_param["lr_adjust"]), + "lr_adjust_coeff": float(self.opt_param["lr_adjust_coeff"]), + "muon_2d_only": bool(self.opt_param["muon_2d_only"]), + "min_2d_dim": int(self.opt_param["min_2d_dim"]), + "flash_muon": bool(self.opt_param["flash_muon"]), + } + else: + raise ValueError(f"Not supported optimizer type '{self.opt_type}'") + self.optimizer = self._create_optimizer( - HybridMuonOptimizer, + cls, lr=initial_lr, - momentum=float(self.opt_param["momentum"]), - weight_decay=float(self.opt_param["weight_decay"]), - adam_betas=( - float(self.opt_param["adam_beta1"]), - float(self.opt_param["adam_beta2"]), - ), - lr_adjust=float(self.opt_param["lr_adjust"]), - lr_adjust_coeff=float(self.opt_param["lr_adjust_coeff"]), - muon_2d_only=bool(self.opt_param["muon_2d_only"]), - min_2d_dim=int(self.opt_param["min_2d_dim"]), - flash_muon=bool(self.opt_param["flash_muon"]), + 
weight_decay=weight_decay,
+                **extra,
             )
             self._load_optimizer_state(optimizer_state_dict)
             self.scheduler = torch.optim.lr_scheduler.LambdaLR(
                 self.optimizer,
                 lambda step: (
                     self.lr_schedule.value(step + self.start_step) / initial_lr
                 ),
                 last_epoch=self.start_step - 1,
             )
-        else:
-            raise ValueError(f"Not supported optimizer type '{self.opt_type}'")

         if self.zero_stage > 0 and self.rank == 0:
             if self.zero_stage == 1:
diff --git a/deepmd/tf/entrypoints/train.py b/deepmd/tf/entrypoints/train.py
index 419ab4a1fe..c0031a9def 100755
--- a/deepmd/tf/entrypoints/train.py
+++ b/deepmd/tf/entrypoints/train.py
@@ -167,7 +167,6 @@ def train(
         jdata["model"] = json.loads(t_training_script)["model"]

     jdata = update_deepmd_input(jdata, warning=True, dump="input_v2_compat.json")
-
     jdata = normalize(jdata)

     if not is_compress and not skip_neighbor_stat:
diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py
index 97651b1543..983e03c443 100644
--- a/deepmd/tf/train/trainer.py
+++ b/deepmd/tf/train/trainer.py
@@ -136,6 +136,22 @@ def get_lr_and_coef(
         # learning rate
         lr_param = jdata["learning_rate"]
         self.lr, self.scale_lr_coef = get_lr_and_coef(lr_param)
+        # optimizer
+        # Note: argcheck.normalize() fills the defaults; fallbacks here match argcheck
+        optimizer_param = jdata.get("optimizer", {})
+        self.optimizer_type = optimizer_param.get("type", "Adam")
+        self.optimizer_beta1 = float(optimizer_param.get("adam_beta1", 0.9))
+        self.optimizer_beta2 = float(optimizer_param.get("adam_beta2", 0.999))
+        self.optimizer_weight_decay = float(optimizer_param.get("weight_decay", 0.0))
+        if self.optimizer_type != "Adam":
+            raise RuntimeError(
+                f"Unsupported optimizer type {self.optimizer_type} for TensorFlow backend."
+            )
+        if self.optimizer_weight_decay != 0.0:
+            raise RuntimeError(
+                "TensorFlow Adam optimizer does not support weight_decay. "
+                "Set optimizer/weight_decay to 0."
+            )
         # loss
         # infer loss type by fitting_type
         loss_param = jdata.get("loss", {})
@@ -328,17 +344,31 @@ def _build_network(self, data: DeepmdDataSystem, suffix: str = "") -> None:
         log.info("built network")

     def _build_optimizer(self) -> Any:
+        if self.optimizer_type != "Adam":
+            raise RuntimeError(
+                f"Unsupported optimizer type {self.optimizer_type} for TensorFlow backend."
+ ) if self.run_opt.is_distrib: if self.scale_lr_coef > 1.0: log.info("Scale learning rate by coef: %f", self.scale_lr_coef) optimizer = tf.train.AdamOptimizer( - self.learning_rate * self.scale_lr_coef + self.learning_rate * self.scale_lr_coef, + beta1=self.optimizer_beta1, + beta2=self.optimizer_beta2, ) else: - optimizer = tf.train.AdamOptimizer(self.learning_rate) + optimizer = tf.train.AdamOptimizer( + self.learning_rate, + beta1=self.optimizer_beta1, + beta2=self.optimizer_beta2, + ) optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: - optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + optimizer = tf.train.AdamOptimizer( + learning_rate=self.learning_rate, + beta1=self.optimizer_beta1, + beta2=self.optimizer_beta2, + ) if self.mixed_prec is not None: _TF_VERSION = Version(TF_VERSION) diff --git a/deepmd/tf/utils/compat.py b/deepmd/tf/utils/compat.py index e80a366b83..87b41ed23d 100644 --- a/deepmd/tf/utils/compat.py +++ b/deepmd/tf/utils/compat.py @@ -4,6 +4,7 @@ from deepmd.utils.compat import ( convert_input_v0_v1, convert_input_v1_v2, + convert_optimizer_v31_to_v32, deprecate_numb_test, update_deepmd_input, ) @@ -11,6 +12,7 @@ __all__ = [ "convert_input_v0_v1", "convert_input_v1_v2", + "convert_optimizer_v31_to_v32", "deprecate_numb_test", "update_deepmd_input", ] diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index eaa0892369..d1331a711a 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2742,6 +2742,302 @@ def _check_lr_args(data: dict[str, Any]) -> bool: ) +# --- Optimizer configurations: --- # +opt_args_plugin = ArgsPlugin() + + +@opt_args_plugin.register("Adam") +def optimizer_adam() -> list[Argument]: + doc_adam_beta1 = "Adam beta1 coefficient for first moment decay." + doc_adam_beta2 = "Adam beta2 coefficient for second moment decay." + doc_weight_decay = ( + "Weight decay coefficient for Adam. In PyTorch and Paddle, this is an L2 " + "penalty applied to gradients. TensorFlow does not support weight_decay and " + "requires this value to be 0." + ) + return [ + Argument( + "adam_beta1", + float, + optional=True, + default=0.9, + doc=doc_adam_beta1, + ), + Argument( + "adam_beta2", + float, + optional=True, + default=0.999, + doc=doc_adam_beta2, + ), + Argument( + "weight_decay", + float, + optional=True, + default=0.0, + doc=doc_weight_decay, + ), + ] + + +@opt_args_plugin.register("AdamW", doc=doc_only_pt_supported) +def optimizer_adamw() -> list[Argument]: + doc_adam_beta1 = "AdamW beta1 coefficient for first moment decay." + doc_adam_beta2 = "AdamW beta2 coefficient for second moment decay." + doc_weight_decay = ( + "Decoupled weight decay coefficient for AdamW optimizer (PyTorch only)." + ) + return [ + Argument( + "adam_beta1", + float, + optional=True, + default=0.9, + doc=doc_only_pt_supported + doc_adam_beta1, + ), + Argument( + "adam_beta2", + float, + optional=True, + default=0.999, + doc=doc_only_pt_supported + doc_adam_beta2, + ), + Argument( + "weight_decay", + float, + optional=True, + default=0.001, + doc=doc_only_pt_supported + doc_weight_decay, + ), + ] + + +@opt_args_plugin.register("LKF", doc=doc_only_pt_supported) +def optimizer_lkf() -> list[Argument]: + doc_kf_blocksize = "The blocksize for the Kalman filter." + doc_kf_start_pref_e = ( + "The prefactor of energy loss at the start of Kalman filter updates." + ) + doc_kf_limit_pref_e = ( + "The prefactor of energy loss at the end of training for Kalman filter updates." 
+ ) + doc_kf_start_pref_f = ( + "The prefactor of force loss at the start of Kalman filter updates." + ) + doc_kf_limit_pref_f = ( + "The prefactor of force loss at the end of training for Kalman filter updates." + ) + return [ + Argument( + "kf_blocksize", + int, + optional=True, + default=5120, + doc=doc_only_pt_supported + doc_kf_blocksize, + ), + Argument( + "kf_start_pref_e", + float, + optional=True, + default=1.0, + doc=doc_only_pt_supported + doc_kf_start_pref_e, + ), + Argument( + "kf_limit_pref_e", + float, + optional=True, + default=1.0, + doc=doc_only_pt_supported + doc_kf_limit_pref_e, + ), + Argument( + "kf_start_pref_f", + float, + optional=True, + default=1.0, + doc=doc_only_pt_supported + doc_kf_start_pref_f, + ), + Argument( + "kf_limit_pref_f", + float, + optional=True, + default=1.0, + doc=doc_only_pt_supported + doc_kf_limit_pref_f, + ), + ] + + +@opt_args_plugin.register("AdaMuon", doc=doc_only_pt_supported) +def optimizer_adamuon() -> list[Argument]: + return [ + Argument( + "momentum", + float, + optional=True, + default=0.95, + alias=["muon_momentum"], + doc=doc_only_pt_supported + "Momentum coefficient for AdaMuon optimizer.", + ), + Argument( + "adam_beta1", + float, + optional=True, + default=0.9, + doc=doc_only_pt_supported + "Adam beta1 coefficient for AdaMuon optimizer.", + ), + Argument( + "adam_beta2", + float, + optional=True, + default=0.95, + doc=doc_only_pt_supported + "Adam beta2 coefficient for AdaMuon optimizer.", + ), + Argument( + "weight_decay", + float, + optional=True, + default=0.001, + doc=doc_only_pt_supported + + "Weight decay coefficient. Applied only to >=2D parameters (AdaMuon path).", + ), + Argument( + "lr_adjust", + float, + optional=True, + default=10.0, + doc=doc_only_pt_supported + + "Learning rate adjustment factor for Adam (1D params). " + "If lr_adjust <= 0: use match-RMS scaling (scale = lr_adjust_coeff * sqrt(max(m, n))), Adam uses lr directly. " + "If lr_adjust > 0: use rectangular correction (scale = sqrt(max(1.0, m/n))), Adam uses lr/lr_adjust.", + ), + Argument( + "lr_adjust_coeff", + float, + optional=True, + default=0.2, + doc=doc_only_pt_supported + + "Coefficient for match-RMS scaling. Only effective when lr_adjust <= 0.", + ), + ] + + +@opt_args_plugin.register("HybridMuon", doc=doc_only_pt_supported) +def optimizer_hybrid_muon() -> list[Argument]: + return [ + Argument( + "momentum", + float, + optional=True, + default=0.95, + alias=["muon_momentum"], + doc=doc_only_pt_supported + + "Momentum coefficient for HybridMuon optimizer (>=2D params). " + "Used in Nesterov momentum update: m_t = beta*m_{t-1} + (1-beta)*g_t.", + ), + Argument( + "adam_beta1", + float, + optional=True, + default=0.9, + doc=doc_only_pt_supported + + "Adam beta1 coefficient for 1D parameters (biases, norms).", + ), + Argument( + "adam_beta2", + float, + optional=True, + default=0.95, + doc=doc_only_pt_supported + + "Adam beta2 coefficient for 1D parameters (biases, norms).", + ), + Argument( + "weight_decay", + float, + optional=True, + default=0.001, + doc=doc_only_pt_supported + + "Weight decay coefficient. Applied only to Muon-routed parameters", + ), + Argument( + "lr_adjust", + float, + optional=True, + default=10.0, + doc=doc_only_pt_supported + + "Learning rate adjustment mode for HybridMuon scaling and Adam learning rate. " + "If lr_adjust <= 0: use match-RMS scaling (scale = coeff*sqrt(max(m,n))), Adam uses lr directly. " + "If lr_adjust > 0: use rectangular correction (scale = sqrt(max(1, m/n))), Adam uses lr/lr_adjust. 
" + "Default is 10.0 (Adam lr = lr/10).", + ), + Argument( + "lr_adjust_coeff", + float, + optional=True, + default=0.2, + doc=doc_only_pt_supported + + "Coefficient for match-RMS scaling. Only effective when lr_adjust <= 0.", + ), + Argument( + "muon_2d_only", + bool, + optional=True, + default=True, + doc=doc_only_pt_supported + + "If True, only 2D parameters use Muon (matching PyTorch's torch.optim.Muon). " + + "Parameters with ndim > 2 use Adam without weight decay. " + + "If False, all >=2D parameters use Muon.", + ), + Argument( + "min_2d_dim", + int, + optional=True, + default=1, + alias=["muon_min_2d_dim"], + doc=doc_only_pt_supported + + "Minimum min(m, n) threshold for HybridMuon on 2D matrices. " + "Matrices with min(m, n) >= min_2d_dim use HybridMuon; " + "those with min(m, n) < min_2d_dim use Adam fallback. " + "Set to 1 to disable fallback.", + ), + Argument( + "flash_muon", + bool, + optional=True, + default=True, + doc=doc_only_pt_supported + + "Enable triton-accelerated Newton-Schulz orthogonalization. " + "Requires triton and CUDA. Falls back to PyTorch implementation " + "when triton is unavailable or running on CPU.", + ), + ] + + +def optimizer_variant_type_args() -> Variant: + doc_opt_type = "The type of optimizer to use." + return Variant( + "type", + opt_args_plugin.get_all_argument(), + optional=True, + default_tag="Adam", + doc=doc_opt_type, + ) + + +def optimizer_args(fold_subdoc: bool = False) -> Argument: + doc_optimizer = ( + "The definition of optimizer. Supported optimizer types depend on backend: " + "TensorFlow/Paddle: Adam; PyTorch: Adam, AdamW, LKF, AdaMuon, HybridMuon." + ) + return Argument( + "optimizer", + dict, + [], + [optimizer_variant_type_args()], + optional=True, + doc=doc_optimizer, + fold_subdoc=fold_subdoc, + ) + + # --- Loss configurations: --- # def start_pref(item: str, label: str | None = None, abbr: str | None = None) -> str: if label is None: @@ -3457,8 +3753,6 @@ def training_args( "If the file extension is .h5 or .hdf5, an HDF5 file is used to store the statistics; " "otherwise, a directory containing NumPy binary files are used." ) - doc_opt_type = "The type of optimizer to use." - doc_kf_blocksize = "The blocksize for the Kalman filter." doc_model_prob = ( "The visiting probability of each model for each training step in the " "multi-task mode. Only used when num_epoch_dict is not set. If not set " @@ -3618,190 +3912,6 @@ def training_args( doc=doc_only_pt_supported + doc_zero_stage, ), ] - variants = [ - Variant( - "opt_type", - choices=[ - Argument("Adam", dict, [], [], optional=True), - Argument("AdamW", dict, [], [], optional=True), - Argument( - "LKF", - dict, - [ - Argument( - "kf_blocksize", - int, - optional=True, - doc=doc_only_pt_supported + doc_kf_blocksize, - ), - ], - [], - optional=True, - ), - Argument( - "AdaMuon", - dict, - [ - Argument( - "momentum", - float, - optional=True, - default=0.95, - alias=["muon_momentum"], - doc=doc_only_pt_supported - + "Momentum coefficient for AdaMuon optimizer.", - ), - Argument( - "adam_beta1", - float, - optional=True, - default=0.9, - doc=doc_only_pt_supported - + "Adam beta1 coefficient for AdaMuon optimizer.", - ), - Argument( - "adam_beta2", - float, - optional=True, - default=0.95, - doc=doc_only_pt_supported - + "Adam beta2 coefficient for AdaMuon optimizer.", - ), - Argument( - "weight_decay", - float, - optional=True, - default=0.001, - doc=doc_only_pt_supported - + "Weight decay coefficient. 
Applied only to >=2D parameters (AdaMuon path).", - ), - Argument( - "lr_adjust", - float, - optional=True, - default=10.0, - doc=doc_only_pt_supported - + "Learning rate adjustment factor for Adam (1D params). " - "If lr_adjust <= 0: use match-RMS scaling (scale = lr_adjust_coeff * sqrt(max(m, n))), Adam uses lr directly. " - "If lr_adjust > 0: use rectangular correction (scale = sqrt(max(1.0, m/n))), Adam uses lr/lr_adjust.", - ), - Argument( - "lr_adjust_coeff", - float, - optional=True, - default=0.2, - doc=doc_only_pt_supported - + "Coefficient for match-RMS scaling. Only effective when lr_adjust <= 0.", - ), - ], - [], - optional=True, - ), - Argument( - "HybridMuon", - dict, - [ - Argument( - "momentum", - float, - optional=True, - default=0.95, - alias=["muon_momentum"], - doc=doc_only_pt_supported - + "Momentum coefficient for HybridMuon optimizer (>=2D params). " - "Used in Nesterov momentum update: m_t = beta*m_{t-1} + (1-beta)*g_t.", - ), - Argument( - "adam_beta1", - float, - optional=True, - default=0.9, - doc=doc_only_pt_supported - + "Adam beta1 coefficient for 1D parameters (biases, norms).", - ), - Argument( - "adam_beta2", - float, - optional=True, - default=0.95, - doc=doc_only_pt_supported - + "Adam beta2 coefficient for 1D parameters (biases, norms).", - ), - Argument( - "weight_decay", - float, - optional=True, - default=0.001, - doc=doc_only_pt_supported - + "Weight decay coefficient. Applied to Muon-routed parameters and >=2D Adam-routed parameters (AdamW-style decoupled decay). Not applied to 1D Adam parameters.", - ), - Argument( - "lr_adjust", - float, - optional=True, - default=10.0, - doc=doc_only_pt_supported - + "Learning rate adjustment mode for HybridMuon scaling and Adam learning rate. " - "If lr_adjust <= 0: use match-RMS scaling (scale = coeff*sqrt(max(m,n))), Adam uses lr directly. " - "If lr_adjust > 0: use rectangular correction (scale = sqrt(max(1, m/n))), Adam uses lr/lr_adjust. " - "Default is 10.0 (Adam lr = lr/10).", - ), - Argument( - "lr_adjust_coeff", - float, - optional=True, - default=0.2, - doc=doc_only_pt_supported - + "Coefficient for match-RMS scaling. Only effective when lr_adjust <= 0.", - ), - Argument( - "muon_2d_only", - bool, - optional=True, - default=True, - doc=doc_only_pt_supported - + "If True, only 2D parameters use Muon (matching PyTorch's torch.optim.Muon). " - + "Parameters with ndim > 2 use AdamW-style updates. " - + "If False, all >=2D parameters are eligible for Muon (with min_2d_dim fallback to AdamW-style updates).", - ), - Argument( - "min_2d_dim", - int, - optional=True, - default=1, - alias=["muon_min_2d_dim"], - doc=doc_only_pt_supported - + "Minimum min(m, n) threshold for HybridMuon on matrix-view parameters. " - "Parameters with min(m, n) >= min_2d_dim use HybridMuon; " - "those with min(m, n) < min_2d_dim use Adam fallback. " - "Set to 1 to disable fallback.", - ), - Argument( - "flash_muon", - bool, - optional=True, - default=True, - doc=doc_only_pt_supported - + "Enable triton-accelerated Newton-Schulz orthogonalization. " - "Requires triton and CUDA. Falls back to PyTorch implementation " - "when triton is unavailable or running on CPU.", - ), - ], - [], - optional=True, - doc=doc_only_pt_supported - + "HybridMuon optimizer (DeePMD-kit custom implementation). " - + "This is a Hybrid optimizer that automatically combines Muon and Adam. " - + "For >=2D params: Muon update with Newton-Schulz (or Adam fallback when matrix-view dimensions are too small). " - + "For 1D params: Standard Adam. 
" - + "This is DIFFERENT from PyTorch's torch.optim.Muon which ONLY supports 2D parameters.", - ), - ], - optional=True, - default_tag="Adam", - doc=doc_only_pt_supported + doc_opt_type, - ) - ] def training_extra_check(data: dict | None) -> bool: if data is None: @@ -3845,7 +3955,7 @@ def training_extra_check(data: dict | None) -> bool: "training", dict, args, - variants, + [], doc=doc_training, extra_check=training_extra_check, ) @@ -3922,6 +4032,7 @@ def gen_args(multi_task: bool = False) -> list[Argument]: return [ model_args(), learning_rate_args(), + optimizer_args(), loss_args(), training_args(multi_task=multi_task), nvnmd_args(), @@ -3930,6 +4041,7 @@ def gen_args(multi_task: bool = False) -> list[Argument]: return [ multi_model_args(), learning_rate_args(fold_subdoc=True), + optimizer_args(fold_subdoc=True), multi_loss_args(), training_args(multi_task=multi_task), nvnmd_args(fold_subdoc=True), diff --git a/deepmd/utils/compat.py b/deepmd/utils/compat.py index c82b3a739e..c6b839a8f6 100644 --- a/deepmd/utils/compat.py +++ b/deepmd/utils/compat.py @@ -457,4 +457,92 @@ def is_deepmd_v1_input(jdata: dict[str, Any]) -> bool: jdata = deprecate_numb_test(jdata, warning, dump) jdata = migrate_training_warmup(jdata, warning=warning) + jdata = convert_optimizer_v31_to_v32(jdata, warning=warning) + return jdata + + +def convert_optimizer_v31_to_v32( + jdata: dict[str, Any], warning: bool = True +) -> dict[str, Any]: + """Convert optimizer format from v3.1 to v3.2. + + v3.1 format: optimizer parameters (opt_type, kf_blocksize, etc.) in training section. + v3.2 format: separate optimizer section with type field. + + Parameters + ---------- + jdata : dict[str, Any] + loaded json/yaml file + warning : bool, optional + whether to show deprecation warning, by default True + + Returns + ------- + dict[str, Any] + converted output with optimizer section + """ + # Default optimizer values (must match argcheck.py defaults) + default_optimizer = { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0, + } + + training_cfg = jdata.get("training", {}) + optimizer_cfg = jdata.get("optimizer", {}) + + # === Step 1. Extract legacy optimizer parameters from training === + optimizer_keys = [ + "opt_type", + "kf_blocksize", + "kf_start_pref_e", + "kf_limit_pref_e", + "kf_start_pref_f", + "kf_limit_pref_f", + "weight_decay", + "momentum", + "muon_momentum", + "adam_beta1", + "adam_beta2", + "lr_adjust", + "lr_adjust_coeff", + "muon_2d_only", + "min_2d_dim", + ] + has_legacy_optimizer = any(key in training_cfg for key in optimizer_keys) + if has_legacy_optimizer: + extracted_cfg = {} + for key in optimizer_keys: + if key in training_cfg: + extracted_cfg[key] = training_cfg.pop(key) + + # Convert opt_type to type for new format + if "opt_type" in extracted_cfg: + extracted_cfg["type"] = extracted_cfg.pop("opt_type") + + # Merge with existing optimizer config (conversion takes precedence) + optimizer_cfg = {**optimizer_cfg, **extracted_cfg} + + if warning: + warnings.warn( + "Placing optimizer parameters (opt_type, kf_blocksize, etc.) in the training section " + "is deprecated. Use a separate 'optimizer' section with 'type' field instead.", + DeprecationWarning, + stacklevel=2, + ) + + # === Step 2. 
Fill in missing defaults ===
+    if "type" not in optimizer_cfg:
+        optimizer_cfg["type"] = default_optimizer["type"]
+
+    # Fill in defaults for the Adam-family optimizer types
+    if optimizer_cfg["type"] in ("Adam", "AdamW"):
+        defaults = dict(default_optimizer)
+        if optimizer_cfg["type"] == "AdamW":
+            # argcheck's default weight_decay for AdamW is 0.001, not Adam's 0.0
+            defaults["weight_decay"] = 0.001
+        for key, value in defaults.items():
+            if key not in optimizer_cfg:
+                optimizer_cfg[key] = value
+
+    # Set/update the optimizer section
+    jdata["optimizer"] = optimizer_cfg
+    return jdata
diff --git a/doc/model/train-fitting-dos.md b/doc/model/train-fitting-dos.md
index 2a007d55ad..386479acc3 100644
--- a/doc/model/train-fitting-dos.md
+++ b/doc/model/train-fitting-dos.md
@@ -16,7 +16,7 @@ $deepmd_source_dir/examples/dos/input.json

 The training and validation data are also provided our examples. But note that **the data provided along with the examples are of limited amount, and should not be used to train a production model.**

-Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found [here](train-se-e2-a.md). To fit the `dos`, one needs to modify {ref}`model[standard]/fitting_net <model[standard]/fitting_net>` and {ref}`loss <loss>`.
+Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`optimizer <optimizer>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found in the [SE-E2-A guide](train-se-e2-a.md). To fit the `dos`, one needs to modify {ref}`model[standard]/fitting_net <model[standard]/fitting_net>` and {ref}`loss <loss>`.

 ## The fitting Network
diff --git a/doc/model/train-fitting-property.md b/doc/model/train-fitting-property.md
index 895232b817..6517658b7a 100644
--- a/doc/model/train-fitting-property.md
+++ b/doc/model/train-fitting-property.md
@@ -14,7 +14,7 @@ $deepmd_source_dir/examples/property/train

 The training and validation data are also provided our examples. But note that **the data provided along with the examples are of limited amount, and should not be used to train a production model.**

-Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found [here](train-se-atten.md). To fit the `property`, one needs to modify {ref}`model[standard]/fitting_net <model[standard]/fitting_net>` and {ref}`loss <loss>`.
+Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`optimizer <optimizer>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found in the [SE-Atten guide](train-se-atten.md). To fit the `property`, one needs to modify {ref}`model[standard]/fitting_net <model[standard]/fitting_net>` and {ref}`loss <loss>`.

 ## The fitting Network
diff --git a/doc/model/train-fitting-tensor.md b/doc/model/train-fitting-tensor.md
index 7f8505de27..96c7324d7e 100644
--- a/doc/model/train-fitting-tensor.md
+++ b/doc/model/train-fitting-tensor.md
@@ -28,7 +28,7 @@ $deepmd_source_dir/examples/water_tensor/polar/polar_input_torch.json

 The training and validation data are also provided our examples. But note that **the data provided along with the examples are of limited amount, and should not be used to train a production model.**

-Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`loss <loss>` and {ref}`training <training>`. 
Most keywords remain the same as `ener` mode, and their meaning can be found [here](train-se-e2-a.md).
+Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`optimizer <optimizer>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found in the [SE-E2-A guide](train-se-e2-a.md).
 To fit a tensor, one needs to modify {ref}`fitting_net <model[standard]/fitting_net>` and {ref}`loss <loss>`.

 ## Theory
@@ -99,7 +99,7 @@ The JSON of `polar` type should be provided like
 },
 ```

-- `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see [here](train-se-e2-a.md).
+- `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see the [SE-E2-A guide](train-se-e2-a.md).
 - `sel_type` is a list specifying which type of atoms have the quantity you want to fit. For example, in the water system, `sel_type` is `[0]` since `0` represents atom `O`. If left unset, all types of atoms will be fitted.
 - The rest arguments have the same meaning as they do in `ener` mode.
 :::
@@ -134,7 +134,7 @@ The JSON of `polar` type should be provided like
 },
 ```

-- `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see [here](train-se-e2-a.md).
+- `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see the [SE-E2-A guide](train-se-e2-a.md).
 - `atom_exclude_types` is a list specifying the which type of atoms have the quantity you want to set to zero. For example, in the water system, `atom_exclude_types` is `[1]` since `1` represents atom `H`.
 - The rest arguments have the same meaning as they do in `ener` mode.
 :::
diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md
index 8ad955aa81..179041f6d9 100644
--- a/doc/train/training-advanced.md
+++ b/doc/train/training-advanced.md
@@ -6,6 +6,22 @@ In this section, we will take `$deepmd_source_dir/examples/water/se_e2_a/input.j

 See {doc}`learning-rate` for detailed documentation on learning rate schedules.

+## Optimizer
+
+The {ref}`optimizer <optimizer>` section in `input.json` is given as follows
+
+```json
+    "optimizer" :{
+        "type": "Adam",
+        "_comment": "that's all"
+    }
+```
+
+- TensorFlow/Paddle: only {ref}`Adam <optimizer[Adam]>` is supported.
+- PyTorch: {ref}`Adam <optimizer[Adam]>`, {ref}`AdamW <optimizer[AdamW]>`, {ref}`LKF <optimizer[LKF]>`, {ref}`AdaMuon <optimizer[AdaMuon]>`, {ref}`HybridMuon <optimizer[HybridMuon]>`.
+- {ref}`adam_beta1 <optimizer[Adam]/adam_beta1>` and {ref}`adam_beta2 <optimizer[Adam]/adam_beta2>` control the Adam/AdamW moment decay.
+- {ref}`weight_decay <optimizer[Adam]/weight_decay>` applies an L2 penalty in Adam, while {ref}`weight_decay <optimizer[AdamW]/weight_decay>` is decoupled in AdamW. TensorFlow does not support weight decay in Adam.
+
 ## Training parameters

 Other training parameters are given in the {ref}`training <training>` section.
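For the PyTorch backend, a fuller `optimizer` section can spell out the AdamW hyperparameters explicitly; a minimal sketch, mirroring the values used by `examples/water/dpa3/input_torch.json` in this patch:

```json
"optimizer": {
    "type": "AdamW",
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "weight_decay": 0.001
}
```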
diff --git a/examples/water/dpa3/input_torch.json b/examples/water/dpa3/input_torch.json
index 7b67e8cf55..6aaa6e2776 100644
--- a/examples/water/dpa3/input_torch.json
+++ b/examples/water/dpa3/input_torch.json
@@ -68,6 +68,12 @@
     "limit_pref_v": 1,
     "_comment": " that's all"
   },
+  "optimizer": {
+    "type": "AdamW",
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "weight_decay": 0.001
+  },
   "training": {
     "stat_file": "./dpa3.hdf5",
     "training_data": {
diff --git a/examples/water/dpa3/input_torch_dynamic.json b/examples/water/dpa3/input_torch_dynamic.json
index edb7e53414..dbd93cd4e7 100644
--- a/examples/water/dpa3/input_torch_dynamic.json
+++ b/examples/water/dpa3/input_torch_dynamic.json
@@ -70,6 +70,12 @@
     "limit_pref_v": 1,
     "_comment": " that's all"
   },
+  "optimizer": {
+    "type": "AdamW",
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "weight_decay": 0.001
+  },
   "training": {
     "stat_file": "./dpa3.hdf5",
     "training_data": {
diff --git a/source/tests/common/test_compat_optimizer.py b/source/tests/common/test_compat_optimizer.py
new file mode 100644
index 0000000000..729b8fe207
--- /dev/null
+++ b/source/tests/common/test_compat_optimizer.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+from deepmd.utils.compat import (
+    convert_optimizer_v31_to_v32,
+)
+
+
+class TestConvertOptimizerV31ToV32(unittest.TestCase):
+    """Test convert_optimizer_v31_to_v32 function."""
+
+    def test_legacy_opt_type_conversion(self) -> None:
+        """Test conversion of opt_type to type."""
+        jdata = {
+            "training": {
+                "opt_type": "Adam",
+                "adam_beta1": 0.95,
+                "adam_beta2": 0.99,
+            }
+        }
+        result = convert_optimizer_v31_to_v32(jdata, warning=False)
+        self.assertIn("optimizer", result)
+        self.assertEqual(result["optimizer"]["type"], "Adam")
+        self.assertEqual(result["optimizer"]["adam_beta1"], 0.95)
+        self.assertEqual(result["optimizer"]["adam_beta2"], 0.99)
+        self.assertNotIn("opt_type", result["training"])
+
+    def test_kf_optimizer_parameters(self) -> None:
+        """Test extraction of LKF optimizer parameters."""
+        jdata = {
+            "training": {
+                "opt_type": "LKF",
+                "kf_blocksize": 5120,
+                "kf_start_pref_e": 1.0,
+                "kf_limit_pref_e": 2.0,
+                "kf_start_pref_f": 1.0,
+                "kf_limit_pref_f": 1.0,
+            }
+        }
+        result = convert_optimizer_v31_to_v32(jdata, warning=False)
+        self.assertEqual(result["optimizer"]["type"], "LKF")
+        self.assertEqual(result["optimizer"]["kf_blocksize"], 5120)
+        self.assertEqual(result["optimizer"]["kf_start_pref_e"], 1.0)
+        self.assertEqual(result["optimizer"]["kf_limit_pref_e"], 2.0)
+        self.assertNotIn("kf_blocksize", result["training"])
+
+    def test_default_values_adam(self) -> None:
+        """Test default values are filled for Adam optimizer."""
+        jdata = {"training": {}}
+        result = convert_optimizer_v31_to_v32(jdata, warning=False)
+        self.assertEqual(result["optimizer"]["type"], "Adam")
+        self.assertEqual(result["optimizer"]["adam_beta1"], 0.9)
+        self.assertEqual(result["optimizer"]["adam_beta2"], 0.999)
+        self.assertEqual(result["optimizer"]["weight_decay"], 0.0)
+
+    def test_default_values_adamw(self) -> None:
+        """Test default values are filled for AdamW optimizer."""
+        jdata = {"training": {"opt_type": "AdamW"}}
+        result = convert_optimizer_v31_to_v32(jdata, warning=False)
+        self.assertEqual(result["optimizer"]["type"], "AdamW")
+        self.assertEqual(result["optimizer"]["adam_beta1"], 0.9)
+        self.assertEqual(result["optimizer"]["adam_beta2"], 0.999)
+        self.assertEqual(result["optimizer"]["weight_decay"], 0.001)
+
+    def test_existing_optimizer_section(self) -> None:
+        """Test merging with 
existing optimizer section.""" + jdata = { + "training": { + "opt_type": "Adam", + "adam_beta1": 0.95, + }, + "optimizer": { + "weight_decay": 0.01, + }, + } + result = convert_optimizer_v31_to_v32(jdata, warning=False) + # Legacy parameters should take precedence + self.assertEqual(result["optimizer"]["type"], "Adam") + self.assertEqual(result["optimizer"]["adam_beta1"], 0.95) + self.assertEqual(result["optimizer"]["weight_decay"], 0.01) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/water/multitask.json b/source/tests/pd/model/water/multitask.json index 0f748cd570..6d768ceb45 100644 --- a/source/tests/pd/model/water/multitask.json +++ b/source/tests/pd/model/water/multitask.json @@ -68,6 +68,12 @@ "stop_lr": 3.51e-08, "_comment": "that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "loss_dict": { "model_1": { "type": "ener", diff --git a/source/tests/pd/model/water/multitask_sharefit.json b/source/tests/pd/model/water/multitask_sharefit.json index 41990fd212..1d45c04645 100644 --- a/source/tests/pd/model/water/multitask_sharefit.json +++ b/source/tests/pd/model/water/multitask_sharefit.json @@ -61,6 +61,12 @@ "stop_lr": 3.51e-08, "_comment": "that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "loss_dict": { "model_1": { "type": "ener", diff --git a/source/tests/pd/model/water/se_atten.json b/source/tests/pd/model/water/se_atten.json index 70abf6759c..7d9377473f 100644 --- a/source/tests/pd/model/water/se_atten.json +++ b/source/tests/pd/model/water/se_atten.json @@ -56,6 +56,12 @@ "limit_pref_v": 0, "_comment": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ diff --git a/source/tests/pd/model/water/se_e2_a.json b/source/tests/pd/model/water/se_e2_a.json index 96f51ba5aa..83d304e0dd 100644 --- a/source/tests/pd/model/water/se_e2_a.json +++ b/source/tests/pd/model/water/se_e2_a.json @@ -50,6 +50,12 @@ "limit_pref_f": 1, "_comment": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index 692a8fb32f..e125335365 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -21,6 +21,9 @@ from deepmd.pd.utils.finetune import ( get_finetune_rules, ) +from deepmd.utils.compat import ( + convert_optimizer_v31_to_v32, +) from .model.test_permutation import ( model_dpa1, @@ -151,6 +154,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -168,6 +172,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file 
self.config["training"]["validation_data"]["systems"] = data_file @@ -188,6 +193,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -209,6 +215,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -225,6 +232,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file diff --git a/source/tests/pt/dos/input.json b/source/tests/pt/dos/input.json index a76afe368a..b5a0653310 100644 --- a/source/tests/pt/dos/input.json +++ b/source/tests/pt/dos/input.json @@ -51,6 +51,12 @@ "decay_steps": 5000, "stop_lr": 1e-08 }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "stop_batch": 100000, "seed": 1, diff --git a/source/tests/pt/model/models/se_e2_a.json b/source/tests/pt/model/models/se_e2_a.json index d3b83b5a5c..814c59f122 100644 --- a/source/tests/pt/model/models/se_e2_a.json +++ b/source/tests/pt/model/models/se_e2_a.json @@ -48,6 +48,12 @@ "limit_pref_f": 1, "_comment": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ diff --git a/source/tests/pt/model/water/lkf.json b/source/tests/pt/model/water/lkf.json index 4385d02136..b597d10d28 100644 --- a/source/tests/pt/model/water/lkf.json +++ b/source/tests/pt/model/water/lkf.json @@ -42,6 +42,11 @@ "stop_lr": 3.51e-8, "_comment": "that's all" }, + "optimizer": { + "type": "LKF", + "kf_blocksize": 1024, + "_comment": "that's all" + }, "loss": { "type": "ener", "start_pref_e": 0.02, @@ -71,8 +76,6 @@ "disp_file": "lcurve.out", "disp_freq": 1, "save_freq": 1, - "opt_type": "LKF", - "kf_blocksize": 1024, "_comment": "that's all" }, "_comment": "that's all" diff --git a/source/tests/pt/model/water/multitask.json b/source/tests/pt/model/water/multitask.json index fa0bb0ae31..f3a8631073 100644 --- a/source/tests/pt/model/water/multitask.json +++ b/source/tests/pt/model/water/multitask.json @@ -68,6 +68,12 @@ "stop_lr": 3.51e-08, "_comment": "that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "loss_dict": { "model_1": { "type": "ener", diff --git a/source/tests/pt/model/water/multitask_sharefit.json b/source/tests/pt/model/water/multitask_sharefit.json index a43a2eaf9a..6203ecfac8 100644 --- a/source/tests/pt/model/water/multitask_sharefit.json +++ b/source/tests/pt/model/water/multitask_sharefit.json @@ -61,6 +61,12 @@ "stop_lr": 
3.51e-08, "_comment": "that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "loss_dict": { "model_1": { "type": "ener", diff --git a/source/tests/pt/model/water/se_atten.json b/source/tests/pt/model/water/se_atten.json index 4b4c54e0d2..6225882f97 100644 --- a/source/tests/pt/model/water/se_atten.json +++ b/source/tests/pt/model/water/se_atten.json @@ -56,6 +56,12 @@ "limit_pref_v": 0, "_comment": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ diff --git a/source/tests/pt/model/water/se_e2_a.json b/source/tests/pt/model/water/se_e2_a.json index 425ca3cbf5..f93d5cfb29 100644 --- a/source/tests/pt/model/water/se_e2_a.json +++ b/source/tests/pt/model/water/se_e2_a.json @@ -50,6 +50,12 @@ "limit_pref_f": 1, "_comment": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ diff --git a/source/tests/pt/model/water/zbl.json b/source/tests/pt/model/water/zbl.json index cb5602d92d..e180b0eb6f 100644 --- a/source/tests/pt/model/water/zbl.json +++ b/source/tests/pt/model/water/zbl.json @@ -42,7 +42,6 @@ }, "_comment4": " that's all" }, - "learning_rate": { "type": "exp", "decay_steps": 5000, @@ -50,7 +49,6 @@ "stop_lr": 3.51e-8, "_comment5": "that's all" }, - "loss": { "type": "ener", "start_pref_e": 0.02, @@ -61,7 +59,12 @@ "limit_pref_v": 0, "_comment6": " that's all" }, - + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ @@ -87,6 +90,5 @@ "save_freq": 1000, "_comment9": "that's all" }, - "_comment10": "that's all" } diff --git a/source/tests/pt/model_compression/input.json b/source/tests/pt/model_compression/input.json index 69adad0f35..566f4b63b2 100644 --- a/source/tests/pt/model_compression/input.json +++ b/source/tests/pt/model_compression/input.json @@ -57,6 +57,13 @@ "_comment6": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, + "training": { "training_data": { "systems": [ diff --git a/source/tests/pt/property/input.json b/source/tests/pt/property/input.json index 16ee9288f0..77921020ac 100644 --- a/source/tests/pt/property/input.json +++ b/source/tests/pt/property/input.json @@ -51,6 +51,12 @@ "type": "property", "_comment": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "training": { "training_data": { "systems": [ diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index ff4f00f912..bb9ca09503 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -19,6 +19,9 @@ from deepmd.pt.utils.finetune import ( get_finetune_rules, ) +from deepmd.utils.compat import ( + convert_optimizer_v31_to_v32, +) from .model.test_permutation import ( model_dos, @@ -178,6 +181,8 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + # Backward compatibility: convert old optimizer format + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file 
self.config["training"]["validation_data"]["systems"] = data_file @@ -215,6 +220,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "dos/input.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "dos/data/atomic_system")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -233,6 +239,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/zbl.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -251,6 +258,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -272,6 +280,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -284,6 +293,7 @@ def setUp(self) -> None: input_json_zbl = str(Path(__file__).parent / "water/zbl.json") with open(input_json_zbl) as f: self.config_zbl = json.load(f) + self.config_zbl = convert_optimizer_v31_to_v32(self.config_zbl, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config_zbl["training"]["training_data"]["systems"] = data_file self.config_zbl["training"]["validation_data"]["systems"] = data_file @@ -300,6 +310,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -317,6 +328,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -333,6 +345,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water_tensor/se_e2_a.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file_atomic = str( Path(__file__).parent / "water_tensor/dipole/atomic_system" ) @@ -362,6 +375,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water_tensor/se_e2_a.json") with 
open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file_atomic = str( Path(__file__).parent / "water_tensor/dipole/atomic_system" ) @@ -391,6 +405,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water_tensor/se_e2_a.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file_atomic = str( Path(__file__).parent / "water_tensor/dipole/atomic_system" ) @@ -420,6 +435,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water_tensor/se_e2_a.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file_atomic = str( Path(__file__).parent / "water_tensor/polar/atomic_system" ) @@ -454,6 +470,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water_tensor/se_e2_a.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file_atomic = str( Path(__file__).parent / "water_tensor/polar/atomic_system" ) @@ -488,6 +505,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water_tensor/se_e2_a.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file_atomic = str( Path(__file__).parent / "water_tensor/polar/atomic_system" ) @@ -522,6 +540,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file @@ -533,6 +552,9 @@ def setUp(self) -> None: property_input = str(Path(__file__).parent / "property/input.json") with open(property_input) as f: self.config_property = json.load(f) + self.config_property = convert_optimizer_v31_to_v32( + self.config_property, warning=False + ) prop_data_file = [str(Path(__file__).parent / "property/double")] self.config_property["training"]["training_data"]["systems"] = prop_data_file self.config_property["training"]["validation_data"]["systems"] = prop_data_file @@ -590,6 +612,7 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) + self.config = convert_optimizer_v31_to_v32(self.config, warning=False) self.config["training"]["training_data"]["rglob_patterns"] = [ "water/data/data_*" ] diff --git a/source/tests/pt/water_tensor/se_e2_a.json b/source/tests/pt/water_tensor/se_e2_a.json index fc01107895..e12e9e3429 100644 --- a/source/tests/pt/water_tensor/se_e2_a.json +++ b/source/tests/pt/water_tensor/se_e2_a.json @@ -55,6 +55,12 @@ "pref_atomic": 1.0, "_comment6": " that's all" }, + "optimizer": { + "type": "Adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "weight_decay": 0.0 + }, "_comment7": " traing controls", "training": { "training_data": { diff --git a/source/tests/tf/data_modifier/dipole.json b/source/tests/tf/data_modifier/dipole.json index f4880b12c3..53294e7c27 100644 --- a/source/tests/tf/data_modifier/dipole.json +++ b/source/tests/tf/data_modifier/dipole.json @@ -53,26 +53,23 @@ "_comment4": " that's all" }, - "_comment5": " traing 
controls", - "training": { - "systems": [ - "data_modifier/sys_10" - ], - "stop_batch": 1000000, - "batch_size": 4, + "optimizer": { + "type": "Adam" + }, + "training": { + "training_data": { + "systems": [ + "data_modifier/sys_10" + ], + "batch_size": 4 + }, + "numb_steps": 1000000, "seed": 1, - - "_comment6": " display and restart", - "_comment7": " frequencies counted in batch", "disp_file": "lcurve.out", "disp_freq": 100, - "numb_test": 5, "save_freq": 500, "save_ckpt": "model.ckpt", - "load_ckpt": "model.ckpt", - "disp_training": true, - "time_training": true, "_comment8": "that's all" }, diff --git a/source/tests/tf/test_data_modifier.py b/source/tests/tf/test_data_modifier.py index 7a7558793e..55ae8875e5 100644 --- a/source/tests/tf/test_data_modifier.py +++ b/source/tests/tf/test_data_modifier.py @@ -19,6 +19,9 @@ from deepmd.tf.utils.data_system import ( DeepmdDataSystem, ) +from deepmd.utils.argcheck import ( + normalize, +) from .common import ( Data, @@ -53,17 +56,18 @@ def _setUp(self) -> None: restart=None, init_model=None, log_path=None, log_level=30, mpi_log="master" ) jdata = j_loader(INPUT) + jdata = normalize(jdata) # init model model = DPTrainer(jdata, run_opt=run_opt) rcut = model.model.get_rcut() # init data system - systems = jdata["training"]["systems"] + systems = jdata["training"]["training_data"]["systems"] # systems[0] = tests_path / systems[0] systems = [tests_path / ii for ii in systems] set_pfx = "set" - batch_size = jdata["training"]["batch_size"] - test_size = jdata["training"]["numb_test"] + batch_size = jdata["training"]["training_data"]["batch_size"] + test_size = 1 data = DeepmdDataSystem( systems, batch_size, test_size, rcut, set_prefix=set_pfx ) diff --git a/source/tests/tf/test_data_modifier_shuffle.py b/source/tests/tf/test_data_modifier_shuffle.py index 49b46ead6a..154765db93 100644 --- a/source/tests/tf/test_data_modifier_shuffle.py +++ b/source/tests/tf/test_data_modifier_shuffle.py @@ -23,6 +23,9 @@ from deepmd.tf.utils.data_system import ( DeepmdDataSystem, ) +from deepmd.utils.argcheck import ( + normalize, +) from ..seed import ( GLOBAL_SEED, @@ -58,6 +61,7 @@ def _setUp(self) -> None: restart=None, init_model=None, log_path=None, log_level=30, mpi_log="master" ) jdata = self._setUp_jdata() + jdata = normalize(jdata) self._setUp_data() # init model @@ -65,10 +69,10 @@ def _setUp(self) -> None: rcut = model.model.get_rcut() # init data system - systems = jdata["training"]["systems"] + systems = jdata["training"]["training_data"]["systems"] set_pfx = "set" - batch_size = jdata["training"]["batch_size"] - test_size = jdata["training"]["numb_test"] + batch_size = jdata["training"]["training_data"]["batch_size"] + test_size = 1 data = DeepmdDataSystem( systems, batch_size, test_size, rcut, set_prefix=set_pfx ) @@ -173,11 +177,13 @@ def _setUp_jdata(self): "decay_steps": 5000, "decay_rate": 0.95, }, + "optimizer": {"type": "Adam"}, "training": { - "systems": ["data_modifier/sys_test_0"], - "stop_batch": 1000000, - "batch_size": 1, - "numb_test": 2, + "training_data": { + "systems": ["data_modifier/sys_test_0"], + "batch_size": 1, + }, + "numb_steps": 1000000, }, } return jdata