Skip to content

"RuntimeError: can not compress model when attention layer is not 0." when using se_atten_v2 #3643

@user-ting

Description

@user-ting

Summary

I was training a model with "type": "se_atten_v2" in DeePMD-kit 2.2.7. I believe the documentation says that "Descriptors with se_e2_a, se_e3, se_e2_r and se_atten_v2 types are supported by the model compression feature." However, I still got the following error when compressing the model: "RuntimeError: can not compress model when attention layer is not 0."
Is there anything wrong with my training settings? Thank you.

DeePMD-kit Version

DeePMD-kit v2.2.7

TensorFlow Version

2.9.0

Python Version, CUDA Version, GCC Version, LAMMPS Version, etc

python 3.10.13; gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-16)

Details

  1. I trained the model in a dpgen workflow.
    The para.json was:
    {
    "type_map": [
    "C"
    ],
    "mass_map": [
    12
    ],
    "init_data_prefix": "/home1/zhangzt/mlPotential_DP/dpgen_test_attention/init/",
    "init_data_sys": [
    "density_varying_init/1.0_3000K/deepmd_data",
    "density_varying_init/1.5_3000K/deepmd_data",
    "density_varying_init/2.0_3000K/deepmd_data",
    "density_varying_init/2.4_3000K/deepmd_data",
    "density_varying_init/2.8_3000K/deepmd_data",
    "density_varying_init/3.0_4000K/deepmd_data",
    "density_varying_init/3.0_5000K/deepmd_data",
    "density_varying_init/3.2_3000K/deepmd_data",
    "density_varying_init/3.4_4000K/deepmd_data",
    "density_varying_init/3.4_5000K/deepmd_data",
    "density_varying_init/3.6_3000K/deepmd_data",
    "density_varying_init/4.0_5000K/deepmd_data",
    "density_varying_init/4.4_5000K/deepmd_data"
    ],
    "init_batch_size": [10,10,10,10,10,10,10,10,10,10,10,10,10],
    "sys_configs": [
    ["../mdconfigs/dense_1.0/00*/POSCAR"],
    ["../mdconfigs/dense_1.5/00*/POSCAR"],
    ["../mdconfigs/dense_2.0/00*/POSCAR"],
    ["../mdconfigs/dense_2.4/00*/POSCAR"],
    ["../mdconfigs/dense_2.8/00*/POSCAR"],
    ["../mdconfigs/dense_3.0/00*/POSCAR"],
    ["../mdconfigs/dense_3.2/00*/POSCAR"],
    ["../mdconfigs/dense_3.4/00*/POSCAR"],
    ["../mdconfigs/dense_3.6/00*/POSCAR"],
    ["../mdconfigs/dense_4.0/00*/POSCAR"],
    ["../mdconfigs/dense_4.4/00*/POSCAR"]
    ],
    "sys_batch_size": [8,8,8,8,8,8,8,8,8,8,8],
    "numb_models": 4,
    "default_training_param": {
    "model": {
    "_comment": " model parameters",
    "type_map": [
    "C"
    ],
    "descriptor": {
    "type": "se_atten_v2",
    "sel": [
    80
    ],
    "rcut": 4.0,
    "rcut_smth": 3.5,
    "neuron": [
    10,
    20,
    40
    ],
    "resnet_dt": false,
    "seed": 17
    },
    "fitting_net": {
    "type": "ener",
    "neuron": [
    120,
    120,
    120
    ],
    "resnet_dt": false,
    "seed": 9
    }
    },
    "learning_rate": {
    "type": "exp",
    "start_lr": 0.001,
    "stop_lr": 1e-08,
    "decay_steps": 500
    },
    "loss": {
    "type": "ener",
    "start_pref_e": 0.02,
    "limit_pref_e": 2.0,
    "start_pref_f": 1000,
    "limit_pref_f": 1.0,
    "start_pref_v": 0,
    "limit_pref_v": 0
    },
    "training": {
    "numb_steps": 10000,
    "seed": 11,
    "disp_file": "lcurve.out",
    "disp_freq": 1,
    "save_freq": 40,
    "save_ckpt": "model.ckpt",
    "disp_training": true,
    "time_training": true,
    "profiling": true,
    "profiling_file": "timeline.json"

    }
    },
    "dp_compress": true,
    "fp_task_max": 100,
    "fp_task_min": 10,
    "model_devi_engine": "lammps",
    "model_devi_jobs": [
    {
    "sys_idx": [0,3,
    4,5,8,
    9,10],
    "temps": [300,1000],
    "trj_freq": 10,
    "nsteps": 2000,
    "ensemble": "nvt"
    },
    {
    "sys_idx": [1,2,
    4,6,7,
    9,10],
    "temps": [300,2000],
    "trj_freq": 10,
    "nsteps": 2000,
    "ensemble": "nvt"
    }
    ],
    "model_devi_dt": 0.002,
    "model_devi_skip": 0,
    "model_devi_f_trust_lo": 0.01,
    "model_devi_f_trust_hi": 1,
    "model_devi_clean_traj": false,
    "shuffle_poscar": false,
    "fp_style": "vasp",
    "fp_pp_path": "../ff",
    "fp_pp_files": [
    "POTCAR"
    ],
    "fp_incar": "../incar/INCAR"
    }

  2. And the input.json in the work path was:
    {
    "model": {
    "type_map": [
    "C"
    ],
    "descriptor": {
    "type": "se_atten_v2",
    "sel": 80,
    "rcut": 4.0,
    "rcut_smth": 3.5,
    "neuron": [
    10,
    20,
    40
    ],
    "resnet_dt": false,
    "seed": 1509306854,
    "axis_neuron": 4,
    "activation_function": "tanh",
    "type_one_side": false,
    "precision": "default",
    "trainable": true,
    "exclude_types": [],
    "attn": 128,
    "attn_layer": 2,
    "attn_dotr": true,
    "attn_mask": false,
    "set_davg_zero": false
    },
    "fitting_net": {
    "type": "ener",
    "neuron": [
    120,
    120,
    120
    ],
    "resnet_dt": false,
    "seed": 3358502526,
    "numb_fparam": 0,
    "numb_aparam": 0,
    "activation_function": "tanh",
    "precision": "default",
    "trainable": true,
    "rcond": null,
    "atom_ener": [],
    "use_aparam_as_mask": false
    },
    "data_stat_nbatch": 10,
    "data_stat_protect": 0.01,
    "data_bias_nsample": 10,
    "srtab_add_bias": true,
    "type": "standard",
    "compress": {
    "model_file": "frozen_model.pb",
    "min_nbor_dist": 1.0073871479227832,
    "table_config": [
    5,
    0.01,
    0.1,
    -1
    ],
    "type": "se_e2_a"
    }
    },
    "learning_rate": {
    "type": "exp",
    "start_lr": 0.001,
    "stop_lr": 1e-08,
    "decay_steps": 500,
    "scale_by_worker": "linear"
    },
    "loss": {
    "type": "ener",
    "start_pref_e": 0.02,
    "limit_pref_e": 2.0,
    "start_pref_f": 1000,
    "limit_pref_f": 1.0,
    "start_pref_v": 0,
    "limit_pref_v": 0,
    "start_pref_ae": 0.0,
    "limit_pref_ae": 0.0,
    "start_pref_pf": 0.0,
    "limit_pref_pf": 0.0,
    "enable_atom_ener_coeff": false,
    "start_pref_gf": 0.0,
    "limit_pref_gf": 0.0,
    "numb_generalized_coord": 0
    },
    "training": {
    "numb_steps": 10000,
    "seed": 992369420,
    "disp_file": "lcurve.out",
    "disp_freq": 1,
    "save_freq": 40,
    "save_ckpt": "model-compression/model.ckpt",
    "disp_training": true,
    "time_training": true,
    "profiling": true,
    "profiling_file": "timeline.json",
    "training_data": {
    "systems": [
    "../data.init/density_varying_init/1.0_3000K/deepmd_data",
    "../data.init/density_varying_init/1.5_3000K/deepmd_data",
    "../data.init/density_varying_init/2.0_3000K/deepmd_data",
    "../data.init/density_varying_init/2.4_3000K/deepmd_data",
    "../data.init/density_varying_init/2.8_3000K/deepmd_data",
    "../data.init/density_varying_init/3.0_4000K/deepmd_data",
    "../data.init/density_varying_init/3.0_5000K/deepmd_data",
    "../data.init/density_varying_init/3.2_3000K/deepmd_data",
    "../data.init/density_varying_init/3.4_4000K/deepmd_data",
    "../data.init/density_varying_init/3.4_5000K/deepmd_data",
    "../data.init/density_varying_init/3.6_3000K/deepmd_data",
    "../data.init/density_varying_init/4.0_5000K/deepmd_data",
    "../data.init/density_varying_init/4.4_5000K/deepmd_data"
    ],
    "batch_size": [
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10
    ],
    "set_prefix": "set",
    "auto_prob": "prob_sys_size",
    "sys_probs": null
    },
    "validation_data": null,
    "enable_profiler": false,
    "tensorboard": false,
    "tensorboard_log_dir": "log",
    "tensorboard_freq": 1
    }
    }

  3. However, I got the following error message in train.log when compressing the model:

DEEPMD INFO saved checkpoint model.ckpt
DEEPMD INFO average training time: 4.8215 s/batch (exclude first 1 batches)
DEEPMD INFO finished training
DEEPMD INFO wall time: 65748.233 s
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
WARNING:root:To get the best performance, it is recommended to adjust the number of threads by setting the environment variables OMP_NUM_THREADS, TF_INTRA_OP_PARALLELISM_THREADS, and TF_INTER_OP_PARALLELISM_THREADS. See https://deepmd.rtfd.io/parallelism/ for more information.
DEEPMD WARNING The following nodes are not in the graph: {'fitting_attr/aparam_nall', 'spin_attr/ntypes_spin'}. Skip freezeing these nodes. You may be freezing a checkpoint generated by an old version.
DEEPMD INFO The following nodes will be frozen: ['o_energy', 'model_attr/model_version', 'fitting_attr/dfparam', 'model_attr/tmap', 'o_virial', 'train_attr/min_nbor_dist', 'o_atom_energy', 'descrpt_attr/rcut', 'model_attr/model_type', 't_mesh', 'o_force', 'o_atom_virial', 'model_type', 'fitting_attr/daparam', 'train_attr/training_script', 'descrpt_attr/ntypes']
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/freeze.py:370: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/freeze.py:370: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/framework/convert_to_constants.py:925: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/framework/convert_to_constants.py:925: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
DEEPMD INFO 1640 ops in the final graph.
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
WARNING:root:To get the best performance, it is recommended to adjust the number of threads by setting the environment variables OMP_NUM_THREADS, TF_INTRA_OP_PARALLELISM_THREADS, and TF_INTER_OP_PARALLELISM_THREADS. See https://deepmd.rtfd.io/parallelism/ for more information.
DEEPMD INFO

DEEPMD INFO stage 1: compress the model
DEEPMD WARNING Switch to serial execution due to lack of horovod module.
DEEPMD INFO _____ _____ __ __ _____ _ _ _
DEEPMD INFO | __ \ | __ \ | / || __ \ | | ()| |
DEEPMD INFO | | | | ___ ___ | |__) || \ / || | | | ______ | | __ _ | |

DEEPMD INFO | | | | / _ \ / _ | / | |/| || | | |||| |/ /| || |
DEEPMD INFO | || || /| /| | | | | || || | | < | || |
DEEPMD INFO |
/ _| _||| || |_||____/ ||_|| __|
DEEPMD INFO Please read and cite:
DEEPMD INFO Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)
DEEPMD INFO Zeng et al, J. Chem. Phys., 159, 054801 (2023)
DEEPMD INFO See https://deepmd.rtfd.io/credits/ for details.
DEEPMD INFO installed to: /home1/zhangzt/anaconda3/envs/deepmd
DEEPMD INFO source : v2.2.7
DEEPMD INFO source brach: HEAD
DEEPMD INFO source commit: 839f4fe
DEEPMD INFO source commit at: 2023-10-27 21:10:24 +0800
DEEPMD INFO build float prec: double
DEEPMD INFO build variant: cpu
DEEPMD INFO build with tf inc: /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/include;/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/../../../../include
DEEPMD INFO build with tf lib:
DEEPMD INFO ---Summary of the training---------------------------------------
DEEPMD INFO running on: n0125
DEEPMD INFO computing device: cpu:0
DEEPMD INFO Count of visible GPU: 0
DEEPMD INFO num_intra_threads: 0
DEEPMD INFO num_inter_threads: 0
DEEPMD INFO -----------------------------------------------------------------
DEEPMD INFO training without frame parameter
Traceback (most recent call last):
File "/home1/zhangzt/anaconda3/envs/deepmd/bin/dp", line 10, in
sys.exit(main())
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd_cli/main.py", line 635, in main
deepmd_main(args)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/main.py", line 82, in main
compress(**dict_args)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/compress.py", line 150, in compress
train(
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/train.py", line 168, in train
_do_work(jdata, run_opt, is_compress)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/train.py", line 280, in _do_work
model.build(train_data, stop_batch, origin_type_map=origin_type_map)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/train/trainer.py", line 298, in build
self.model.enable_compression()
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/model/model.py", line 618, in enable_compression
self.descrpt.enable_compression(
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/descriptor/se_atten.py", line 391, in enable_compression
raise RuntimeError("can not compress model when attention layer is not 0.")
RuntimeError: can not compress model when attention layer is not 0.

Metadata

Metadata

Assignees

Labels

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions