Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions deepmd/entrypoints/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from deepmd.common import j_loader
from deepmd.utils.argcheck import normalize
from deepmd.utils.compat import convert_input_v0_v1
from deepmd.utils.errors import GraphTooLargeError

from .freeze import freeze
from .train import train
Expand Down Expand Up @@ -79,7 +80,6 @@ def compress(
jdata["training"]["numb_steps"] = jdata["training"]["save_freq"]
jdata = normalize(jdata)


# check the descriptor info of the input file
assert (
jdata["model"]["descriptor"]["type"] == "se_a" or jdata["model"]["descriptor"]["type"] == "se_e2_a"
Expand All @@ -94,15 +94,23 @@ def compress(
control_file = "compress.json"
with open(control_file, "w") as fp:
json.dump(jdata, fp, indent=4)
train(
INPUT=control_file,
init_model=None,
restart=None,
output=control_file,
mpi_log=mpi_log,
log_level=log_level,
log_path=log_path,
)
try:
train(
INPUT=control_file,
init_model=None,
restart=None,
output=control_file,
mpi_log=mpi_log,
log_level=log_level,
log_path=log_path,
)
except GraphTooLargeError as e:
raise RuntimeError(
"The uniform step size of the tabulation's first table is %f, "
"which is too small. This leads to a very large graph size, "
"exceeding protobuf's limitation (2 GB). You should try to "
"increase the step size." % step
) from e

# stage 2: freeze the model
log.info("\n\n")
Expand Down
11 changes: 10 additions & 1 deletion deepmd/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import time
import shutil
import google.protobuf.message
import numpy as np
from deepmd.env import tf
from deepmd.env import default_tf_session_config
Expand All @@ -19,6 +20,7 @@
from deepmd.descriptor import DescrptHybrid
from deepmd.model import EnerModel, WFCModel, DipoleModel, PolarModel, GlobalPolarModel
from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss
from deepmd.utils.errors import GraphTooLargeError
from deepmd.utils.learning_rate import LearningRateExp
from deepmd.utils.neighbor_stat import NeighborStat
from deepmd.utils.sess import run_sess
Expand Down Expand Up @@ -542,7 +544,14 @@ def train (self, train_data, valid_data=None) :
train_time = 0
if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.run_opt.is_chief :
if self.saver is not None :
self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt)
try:
self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt)
except google.protobuf.message.DecodeError as e:
raise GraphTooLargeError(
"The graph size exceeds 2 GB, the hard limitation of protobuf."
" Then a DecodeError was raised by protobuf. You should "
"reduce the size of your model."
) from e
log.info("saved checkpoint %s" % self.save_ckpt)
if self.run_opt.is_chief:
fp.close ()
Expand Down
2 changes: 2 additions & 0 deletions deepmd/utils/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class GraphTooLargeError(Exception):
    """Signal that the TensorFlow graph is too large to be serialized.

    Raised when saving a checkpoint fails with a protobuf ``DecodeError``,
    which is reported as the graph size exceeding protobuf's hard limit
    of 2 GB. Callers can catch this error to give the user more specific
    advice on how to shrink the model.
    """