37 changes: 22 additions & 15 deletions deepmd/pd/train/training.py
@@ -133,6 +133,9 @@ def __init__(
 
         # Iteration config
         self.num_steps = training_params["numb_steps"]
+        self.acc_freq: int = training_params.get(
+            "acc_freq", 1
+        )  # gradient accumulation steps
         self.disp_file = training_params.get("disp_file", "lcurve.out")
         self.disp_freq = training_params.get("disp_freq", 1000)
         self.save_ckpt = training_params.get("save_ckpt", "model.ckpt")
@@ -744,7 +747,6 @@ def step(_step_id, task_key="Default") -> None:
                 _lr = self.lr_exp
                 cur_lr = _lr.value(_step_id)
                 pref_lr = cur_lr
-                self.optimizer.clear_grad(set_to_zero=False)
 
                 with nvprof_context(enable_profiling, "Fetching data"):
                     input_dict, label_dict, log_dict = self.get_data(
@@ -780,22 +782,27 @@ def step(_step_id, task_key="Default") -> None:
                 with nvprof_context(enable_profiling, "Backward pass"):
                     loss.backward()
 
-                # fuse + allreduce manually before optimization if use DDP + no_sync
-                # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
-                if self.world_size > 1:
-                    hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
-
-                if self.gradient_max_norm > 0.0:
-                    with nvprof_context(enable_profiling, "Gradient clip"):
-                        paddle.nn.utils.clip_grad_norm_(
-                            self.wrapper.parameters(),
-                            self.gradient_max_norm,
-                            error_if_nonfinite=True,
-                        )
-
-                with nvprof_context(enable_profiling, "Adam update"):
-                    self.optimizer.step()
-                    self.scheduler.step()
+                # gradient accumulation
+                if (_step_id + 1) % self.acc_freq == 0:
+                    # fuse + allreduce manually before optimization if use DDP + no_sync
+                    # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
+                    if self.world_size > 1:
+                        hpu.fused_allreduce_gradients(
+                            list(self.wrapper.parameters()), None
+                        )
+
+                    if self.gradient_max_norm > 0.0:
+                        with nvprof_context(enable_profiling, "Gradient clip"):
+                            paddle.nn.utils.clip_grad_norm_(
+                                self.wrapper.parameters(),
+                                self.gradient_max_norm,
+                                error_if_nonfinite=True,
+                            )
+
+                    with nvprof_context(enable_profiling, "Adam update"):
+                        self.optimizer.step()
+                        self.optimizer.clear_grad(set_to_zero=False)
+                        self.scheduler.step()
 
             else:
                 raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
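The net effect of this hunk: clear_grad moves from the top of every step to just after optimizer.step(), and the allreduce, gradient clip, and optimizer/scheduler updates run only on steps where (_step_id + 1) % self.acc_freq == 0, so gradients from acc_freq consecutive backward passes sum before each parameter update. Note the loss is not rescaled by acc_freq, so the accumulated gradient is a sum rather than a mean of the micro-step gradients. Below is a minimal standalone sketch of the same pattern; the toy model and data are hypothetical, not the trainer's actual code.

# Minimal sketch of the accumulation pattern this hunk implements
# (hypothetical toy model/data for illustration only).
import paddle

model = paddle.nn.Linear(4, 1)
opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=model.parameters())
acc_freq = 4  # apply an update once every `acc_freq` backward passes

for step_id in range(16):
    x = paddle.randn([8, 4])
    y = paddle.randn([8, 1])
    loss = paddle.nn.functional.mse_loss(model(x), y)
    loss.backward()  # gradients accumulate across calls until cleared

    if (step_id + 1) % acc_freq == 0:
        opt.step()                         # apply the accumulated gradients
        opt.clear_grad(set_to_zero=False)  # reset only after the update

Deferring clear_grad until after the update is what makes the accumulation work: Paddle's dynamic graph, like PyTorch, sums gradients across successive backward() calls by default.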
9 changes: 9 additions & 0 deletions deepmd/utils/argcheck.py
@@ -40,6 +40,7 @@
 
 doc_only_tf_supported = "(Supported Backend: TensorFlow) "
 doc_only_pt_supported = "(Supported Backend: PyTorch) "
+doc_only_pd_supported = "(Supported Backend: Paddle) "
 # descriptors
 doc_loc_frame = "Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame."
 doc_se_e2_a = "Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor."
@@ -3167,6 +3168,7 @@ def training_args(
     doc_kf_blocksize = "The blocksize for the Kalman filter."
     doc_model_prob = "The visiting probability of each model for each training step in the multi-task mode."
     doc_data_dict = "The multiple definition of the data, used in the multi-task mode."
+    doc_acc_freq = "Gradient accumulation steps (number of steps to accumulate gradients before performing an update)."
 
     arg_training_data = training_data_args()
     arg_validation_data = validation_data_args()
@@ -3269,6 +3271,13 @@ def training_args(
             optional=True,
             doc=doc_only_pt_supported + doc_gradient_max_norm,
         ),
+        Argument(
+            "acc_freq",
+            int,
+            optional=True,
+            default=1,
+            doc=doc_only_pd_supported + doc_acc_freq,
+        ),
     ]
     variants = [
         Variant(
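With the argument registered, acc_freq becomes a validated key of the "training" section of the input script, defaulting to 1 (no accumulation) and flagged as Paddle-only via doc_only_pd_supported. A hypothetical fragment of such a section, written here as a Python dict for illustration (the real input is a JSON file; all paths and values below are placeholders):

# Hypothetical "training" section with gradient accumulation enabled;
# keys besides "acc_freq" are illustrative context, not a complete input.
training = {
    "training_data": {"systems": ["path/to/data_system"]},
    "numb_steps": 100000,
    "acc_freq": 4,  # accumulate gradients over 4 steps per parameter update
    "disp_freq": 1000,
    "save_freq": 10000,
}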
20 changes: 18 additions & 2 deletions source/tests/pd/test_training.py
@@ -150,9 +150,25 @@ def setUp(self) -> None:
         self.config["model"] = deepcopy(model_se_e2_a)
         self.config["training"]["numb_steps"] = 1
         self.config["training"]["save_freq"] = 1
-        # import paddle
         enable_prim(True)
-        # assert paddle.framework.core._is_eager_prim_enabled()
 
     def tearDown(self) -> None:
         DPTrainTest.tearDown(self)
+
+
+class TestEnergyModelGradientAccumulation(unittest.TestCase, DPTrainTest):
+    def setUp(self) -> None:
+        input_json = str(Path(__file__).parent / "water/se_atten.json")
+        with open(input_json) as f:
+            self.config = json.load(f)
+        data_file = [str(Path(__file__).parent / "water/data/data_0")]
+        self.config["training"]["training_data"]["systems"] = data_file
+        self.config["training"]["validation_data"]["systems"] = data_file
+        self.config["model"] = deepcopy(model_se_e2_a)
+        self.config["training"]["numb_steps"] = 1
+        self.config["training"]["save_freq"] = 1
+        self.config["training"]["acc_freq"] = 4
+        enable_prim(True)
+
+    def tearDown(self) -> None:
+        DPTrainTest.tearDown(self)
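A side note on the new test's configuration: with numb_steps = 1 and acc_freq = 4, and assuming step ids start at 0 as in the trainer loop above, (_step_id + 1) % acc_freq never reaches 0, so this test exercises the accumulate-without-update path rather than an actual optimizer step. A small illustration of the update cadence (hypothetical helper, not part of the test suite):

# Hypothetical helper showing which step ids trigger a parameter update,
# assuming 0-based step ids as in the trainer loop.
def update_steps(numb_steps: int, acc_freq: int) -> list[int]:
    return [s for s in range(numb_steps) if (s + 1) % acc_freq == 0]

print(update_steps(numb_steps=8, acc_freq=4))  # [3, 7]
print(update_steps(numb_steps=1, acc_freq=4))  # [] -> accumulation only, no update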