diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py index 1222b1e51b..3c80965bce 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -7,6 +7,7 @@ from deepmd.common import j_loader from deepmd.utils.argcheck import normalize from deepmd.utils.compat import convert_input_v0_v1 +from deepmd.utils.errors import GraphTooLargeError from .freeze import freeze from .train import train @@ -79,7 +80,6 @@ def compress( jdata["training"]["numb_steps"] = jdata["training"]["save_freq"] jdata = normalize(jdata) - # check the descriptor info of the input file assert ( jdata["model"]["descriptor"]["type"] == "se_a" or jdata["model"]["descriptor"]["type"] == "se_e2_a" @@ -94,15 +94,23 @@ def compress( control_file = "compress.json" with open(control_file, "w") as fp: json.dump(jdata, fp, indent=4) - train( - INPUT=control_file, - init_model=None, - restart=None, - output=control_file, - mpi_log=mpi_log, - log_level=log_level, - log_path=log_path, - ) + try: + train( + INPUT=control_file, + init_model=None, + restart=None, + output=control_file, + mpi_log=mpi_log, + log_level=log_level, + log_path=log_path, + ) + except GraphTooLargeError as e: + raise RuntimeError( + "The uniform step size of the tabulation's first table is %f, " + "which is too small. This leads to a very large graph size, " + "exceeding protobuf's limitation (2 GB). You should try to " + "increase the step size." % step + ) from e # stage 2: freeze the model log.info("\n\n") diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 7fa29635cf..8f283b61cf 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -3,6 +3,7 @@ import os import time import shutil +import google.protobuf.message import numpy as np from deepmd.env import tf from deepmd.env import default_tf_session_config @@ -19,6 +20,7 @@ from deepmd.descriptor import DescrptHybrid from deepmd.model import EnerModel, WFCModel, DipoleModel, PolarModel, GlobalPolarModel from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss +from deepmd.utils.errors import GraphTooLargeError from deepmd.utils.learning_rate import LearningRateExp from deepmd.utils.neighbor_stat import NeighborStat from deepmd.utils.sess import run_sess @@ -542,7 +544,14 @@ def train (self, train_data, valid_data=None) : train_time = 0 if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.run_opt.is_chief : if self.saver is not None : - self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt) + try: + self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt) + except google.protobuf.message.DecodeError as e: + raise GraphTooLargeError( + "The graph size exceeds 2 GB, the hard limitation of protobuf." + " Then a DecodeError was raised by protobuf. You should " + "reduce the size of your model." + ) from e log.info("saved checkpoint %s" % self.save_ckpt) if self.run_opt.is_chief: fp.close () diff --git a/deepmd/utils/errors.py b/deepmd/utils/errors.py new file mode 100644 index 0000000000..d7b62383f1 --- /dev/null +++ b/deepmd/utils/errors.py @@ -0,0 +1,2 @@ +class GraphTooLargeError(Exception): + pass