I trained the model in a dpgen workflow.
The para.json was:
{
"type_map": [
"C"
],
"mass_map": [
12
],
"init_data_prefix": "/home1/zhangzt/mlPotential_DP/dpgen_test_attention/init/",
"init_data_sys": [
"density_varying_init/1.0_3000K/deepmd_data",
"density_varying_init/1.5_3000K/deepmd_data",
"density_varying_init/2.0_3000K/deepmd_data",
"density_varying_init/2.4_3000K/deepmd_data",
"density_varying_init/2.8_3000K/deepmd_data",
"density_varying_init/3.0_4000K/deepmd_data",
"density_varying_init/3.0_5000K/deepmd_data",
"density_varying_init/3.2_3000K/deepmd_data",
"density_varying_init/3.4_4000K/deepmd_data",
"density_varying_init/3.4_5000K/deepmd_data",
"density_varying_init/3.6_3000K/deepmd_data",
"density_varying_init/4.0_5000K/deepmd_data",
"density_varying_init/4.4_5000K/deepmd_data"
],
"init_batch_size": [10,10,10,10,10,10,10,10,10,10,10,10,10],
"sys_configs": [
["../mdconfigs/dense_1.0/00*/POSCAR"],
["../mdconfigs/dense_1.5/00*/POSCAR"],
["../mdconfigs/dense_2.0/00*/POSCAR"],
["../mdconfigs/dense_2.4/00*/POSCAR"],
["../mdconfigs/dense_2.8/00*/POSCAR"],
["../mdconfigs/dense_3.0/00*/POSCAR"],
["../mdconfigs/dense_3.2/00*/POSCAR"],
["../mdconfigs/dense_3.4/00*/POSCAR"],
["../mdconfigs/dense_3.6/00*/POSCAR"],
["../mdconfigs/dense_4.0/00*/POSCAR"],
["../mdconfigs/dense_4.4/00*/POSCAR"]
],
"sys_batch_size": [8,8,8,8,8,8,8,8,8,8,8],
"numb_models": 4,
"default_training_param": {
"model": {
"_comment": " model parameters",
"type_map": [
"C"
],
"descriptor": {
"type": "se_atten_v2",
"sel": [
80
],
"rcut": 4.0,
"rcut_smth": 3.5,
"neuron": [
10,
20,
40
],
"resnet_dt": false,
"seed": 17
},
"fitting_net": {
"type": "ener",
"neuron": [
120,
120,
120
],
"resnet_dt": false,
"seed": 9
}
},
"learning_rate": {
"type": "exp",
"start_lr": 0.001,
"stop_lr": 1e-08,
"decay_steps": 500
},
"loss": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 2.0,
"start_pref_f": 1000,
"limit_pref_f": 1.0,
"start_pref_v": 0,
"limit_pref_v": 0
},
"training": {
"numb_steps": 10000,
"seed": 11,
"disp_file": "lcurve.out",
"disp_freq": 1,
"save_freq": 40,
"save_ckpt": "model.ckpt",
"disp_training": true,
"time_training": true,
"profiling": true,
"profiling_file": "timeline.json"
}
},
"dp_compress": true,
"fp_task_max": 100,
"fp_task_min": 10,
"model_devi_engine": "lammps",
"model_devi_jobs": [
{
"sys_idx": [0,3,
4,5,8,
9,10],
"temps": [300,1000],
"trj_freq": 10,
"nsteps": 2000,
"ensemble": "nvt"
},
{
"sys_idx": [1,2,
4,6,7,
9,10],
"temps": [300,2000],
"trj_freq": 10,
"nsteps": 2000,
"ensemble": "nvt"
}
],
"model_devi_dt": 0.002,
"model_devi_skip": 0,
"model_devi_f_trust_lo": 0.01,
"model_devi_f_trust_hi": 1,
"model_devi_clean_traj": false,
"shuffle_poscar": false,
"fp_style": "vasp",
"fp_pp_path": "../ff",
"fp_pp_files": [
"POTCAR"
],
"fp_incar": "../incar/INCAR"
}
And the input.json in the work path was:
{
"model": {
"type_map": [
"C"
],
"descriptor": {
"type": "se_atten_v2",
"sel": 80,
"rcut": 4.0,
"rcut_smth": 3.5,
"neuron": [
10,
20,
40
],
"resnet_dt": false,
"seed": 1509306854,
"axis_neuron": 4,
"activation_function": "tanh",
"type_one_side": false,
"precision": "default",
"trainable": true,
"exclude_types": [],
"attn": 128,
"attn_layer": 2,
"attn_dotr": true,
"attn_mask": false,
"set_davg_zero": false
},
"fitting_net": {
"type": "ener",
"neuron": [
120,
120,
120
],
"resnet_dt": false,
"seed": 3358502526,
"numb_fparam": 0,
"numb_aparam": 0,
"activation_function": "tanh",
"precision": "default",
"trainable": true,
"rcond": null,
"atom_ener": [],
"use_aparam_as_mask": false
},
"data_stat_nbatch": 10,
"data_stat_protect": 0.01,
"data_bias_nsample": 10,
"srtab_add_bias": true,
"type": "standard",
"compress": {
"model_file": "frozen_model.pb",
"min_nbor_dist": 1.0073871479227832,
"table_config": [
5,
0.01,
0.1,
-1
],
"type": "se_e2_a"
}
},
"learning_rate": {
"type": "exp",
"start_lr": 0.001,
"stop_lr": 1e-08,
"decay_steps": 500,
"scale_by_worker": "linear"
},
"loss": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 2.0,
"start_pref_f": 1000,
"limit_pref_f": 1.0,
"start_pref_v": 0,
"limit_pref_v": 0,
"start_pref_ae": 0.0,
"limit_pref_ae": 0.0,
"start_pref_pf": 0.0,
"limit_pref_pf": 0.0,
"enable_atom_ener_coeff": false,
"start_pref_gf": 0.0,
"limit_pref_gf": 0.0,
"numb_generalized_coord": 0
},
"training": {
"numb_steps": 10000,
"seed": 992369420,
"disp_file": "lcurve.out",
"disp_freq": 1,
"save_freq": 40,
"save_ckpt": "model-compression/model.ckpt",
"disp_training": true,
"time_training": true,
"profiling": true,
"profiling_file": "timeline.json",
"training_data": {
"systems": [
"../data.init/density_varying_init/1.0_3000K/deepmd_data",
"../data.init/density_varying_init/1.5_3000K/deepmd_data",
"../data.init/density_varying_init/2.0_3000K/deepmd_data",
"../data.init/density_varying_init/2.4_3000K/deepmd_data",
"../data.init/density_varying_init/2.8_3000K/deepmd_data",
"../data.init/density_varying_init/3.0_4000K/deepmd_data",
"../data.init/density_varying_init/3.0_5000K/deepmd_data",
"../data.init/density_varying_init/3.2_3000K/deepmd_data",
"../data.init/density_varying_init/3.4_4000K/deepmd_data",
"../data.init/density_varying_init/3.4_5000K/deepmd_data",
"../data.init/density_varying_init/3.6_3000K/deepmd_data",
"../data.init/density_varying_init/4.0_5000K/deepmd_data",
"../data.init/density_varying_init/4.4_5000K/deepmd_data"
],
"batch_size": [
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10
],
"set_prefix": "set",
"auto_prob": "prob_sys_size",
"sys_probs": null
},
"validation_data": null,
"enable_profiler": false,
"tensorboard": false,
"tensorboard_log_dir": "log",
"tensorboard_freq": 1
}
}
Summary
I was training a model with "type": "se_atten_v2" in DeePMD-kit 2.2.7. I believe the documentation says that "Descriptors with se_e2_a, se_e3, se_e2_r and se_atten_v2 types are supported by the model compression feature." However, I still got the following error when compressing the model: "RuntimeError: can not compress model when attention layer is not 0."
Is there anything wrong with my training settings? Thank you.
DeePMD-kit Version
DeePMD-kit v2.2.7
TensorFlow Version
2.9.0
Python Version, CUDA Version, GCC Version, LAMMPS Version, etc
python 3.10.13; gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-16)
Details
I trained the model in a dpgen workflow.
The para.json was:
{
"type_map": [
"C"
],
"mass_map": [
12
],
"init_data_prefix": "/home1/zhangzt/mlPotential_DP/dpgen_test_attention/init/",
"init_data_sys": [
"density_varying_init/1.0_3000K/deepmd_data",
"density_varying_init/1.5_3000K/deepmd_data",
"density_varying_init/2.0_3000K/deepmd_data",
"density_varying_init/2.4_3000K/deepmd_data",
"density_varying_init/2.8_3000K/deepmd_data",
"density_varying_init/3.0_4000K/deepmd_data",
"density_varying_init/3.0_5000K/deepmd_data",
"density_varying_init/3.2_3000K/deepmd_data",
"density_varying_init/3.4_4000K/deepmd_data",
"density_varying_init/3.4_5000K/deepmd_data",
"density_varying_init/3.6_3000K/deepmd_data",
"density_varying_init/4.0_5000K/deepmd_data",
"density_varying_init/4.4_5000K/deepmd_data"
],
"init_batch_size": [10,10,10,10,10,10,10,10,10,10,10,10,10],
"sys_configs": [
["../mdconfigs/dense_1.0/00*/POSCAR"],
["../mdconfigs/dense_1.5/00*/POSCAR"],
["../mdconfigs/dense_2.0/00*/POSCAR"],
["../mdconfigs/dense_2.4/00*/POSCAR"],
["../mdconfigs/dense_2.8/00*/POSCAR"],
["../mdconfigs/dense_3.0/00*/POSCAR"],
["../mdconfigs/dense_3.2/00*/POSCAR"],
["../mdconfigs/dense_3.4/00*/POSCAR"],
["../mdconfigs/dense_3.6/00*/POSCAR"],
["../mdconfigs/dense_4.0/00*/POSCAR"],
["../mdconfigs/dense_4.4/00*/POSCAR"]
],
"sys_batch_size": [8,8,8,8,8,8,8,8,8,8,8],
"numb_models": 4,
"default_training_param": {
"model": {
"_comment": " model parameters",
"type_map": [
"C"
],
"descriptor": {
"type": "se_atten_v2",
"sel": [
80
],
"rcut": 4.0,
"rcut_smth": 3.5,
"neuron": [
10,
20,
40
],
"resnet_dt": false,
"seed": 17
},
"fitting_net": {
"type": "ener",
"neuron": [
120,
120,
120
],
"resnet_dt": false,
"seed": 9
}
},
"learning_rate": {
"type": "exp",
"start_lr": 0.001,
"stop_lr": 1e-08,
"decay_steps": 500
},
"loss": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 2.0,
"start_pref_f": 1000,
"limit_pref_f": 1.0,
"start_pref_v": 0,
"limit_pref_v": 0
},
"training": {
"numb_steps": 10000,
"seed": 11,
"disp_file": "lcurve.out",
"disp_freq": 1,
"save_freq": 40,
"save_ckpt": "model.ckpt",
"disp_training": true,
"time_training": true,
"profiling": true,
"profiling_file": "timeline.json"
}
},
"dp_compress": true,
"fp_task_max": 100,
"fp_task_min": 10,
"model_devi_engine": "lammps",
"model_devi_jobs": [
{
"sys_idx": [0,3,
4,5,8,
9,10],
"temps": [300,1000],
"trj_freq": 10,
"nsteps": 2000,
"ensemble": "nvt"
},
{
"sys_idx": [1,2,
4,6,7,
9,10],
"temps": [300,2000],
"trj_freq": 10,
"nsteps": 2000,
"ensemble": "nvt"
}
],
"model_devi_dt": 0.002,
"model_devi_skip": 0,
"model_devi_f_trust_lo": 0.01,
"model_devi_f_trust_hi": 1,
"model_devi_clean_traj": false,
"shuffle_poscar": false,
"fp_style": "vasp",
"fp_pp_path": "../ff",
"fp_pp_files": [
"POTCAR"
],
"fp_incar": "../incar/INCAR"
}
And the input.json in the work path was:
{
"model": {
"type_map": [
"C"
],
"descriptor": {
"type": "se_atten_v2",
"sel": 80,
"rcut": 4.0,
"rcut_smth": 3.5,
"neuron": [
10,
20,
40
],
"resnet_dt": false,
"seed": 1509306854,
"axis_neuron": 4,
"activation_function": "tanh",
"type_one_side": false,
"precision": "default",
"trainable": true,
"exclude_types": [],
"attn": 128,
"attn_layer": 2,
"attn_dotr": true,
"attn_mask": false,
"set_davg_zero": false
},
"fitting_net": {
"type": "ener",
"neuron": [
120,
120,
120
],
"resnet_dt": false,
"seed": 3358502526,
"numb_fparam": 0,
"numb_aparam": 0,
"activation_function": "tanh",
"precision": "default",
"trainable": true,
"rcond": null,
"atom_ener": [],
"use_aparam_as_mask": false
},
"data_stat_nbatch": 10,
"data_stat_protect": 0.01,
"data_bias_nsample": 10,
"srtab_add_bias": true,
"type": "standard",
"compress": {
"model_file": "frozen_model.pb",
"min_nbor_dist": 1.0073871479227832,
"table_config": [
5,
0.01,
0.1,
-1
],
"type": "se_e2_a"
}
},
"learning_rate": {
"type": "exp",
"start_lr": 0.001,
"stop_lr": 1e-08,
"decay_steps": 500,
"scale_by_worker": "linear"
},
"loss": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 2.0,
"start_pref_f": 1000,
"limit_pref_f": 1.0,
"start_pref_v": 0,
"limit_pref_v": 0,
"start_pref_ae": 0.0,
"limit_pref_ae": 0.0,
"start_pref_pf": 0.0,
"limit_pref_pf": 0.0,
"enable_atom_ener_coeff": false,
"start_pref_gf": 0.0,
"limit_pref_gf": 0.0,
"numb_generalized_coord": 0
},
"training": {
"numb_steps": 10000,
"seed": 992369420,
"disp_file": "lcurve.out",
"disp_freq": 1,
"save_freq": 40,
"save_ckpt": "model-compression/model.ckpt",
"disp_training": true,
"time_training": true,
"profiling": true,
"profiling_file": "timeline.json",
"training_data": {
"systems": [
"../data.init/density_varying_init/1.0_3000K/deepmd_data",
"../data.init/density_varying_init/1.5_3000K/deepmd_data",
"../data.init/density_varying_init/2.0_3000K/deepmd_data",
"../data.init/density_varying_init/2.4_3000K/deepmd_data",
"../data.init/density_varying_init/2.8_3000K/deepmd_data",
"../data.init/density_varying_init/3.0_4000K/deepmd_data",
"../data.init/density_varying_init/3.0_5000K/deepmd_data",
"../data.init/density_varying_init/3.2_3000K/deepmd_data",
"../data.init/density_varying_init/3.4_4000K/deepmd_data",
"../data.init/density_varying_init/3.4_5000K/deepmd_data",
"../data.init/density_varying_init/3.6_3000K/deepmd_data",
"../data.init/density_varying_init/4.0_5000K/deepmd_data",
"../data.init/density_varying_init/4.4_5000K/deepmd_data"
],
"batch_size": [
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10
],
"set_prefix": "set",
"auto_prob": "prob_sys_size",
"sys_probs": null
},
"validation_data": null,
"enable_profiler": false,
"tensorboard": false,
"tensorboard_log_dir": "log",
"tensorboard_freq": 1
}
}
However, I got the following error message in train.log when compressing the model:
DEEPMD INFO saved checkpoint model.ckpt
DEEPMD INFO average training time: 4.8215 s/batch (exclude first 1 batches)
DEEPMD INFO finished training
DEEPMD INFO wall time: 65748.233 s
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
WARNING:root:To get the best performance, it is recommended to adjust the number of threads by setting the environment variables OMP_NUM_THREADS, TF_INTRA_OP_PARALLELISM_THREADS, and TF_INTER_OP_PARALLELISM_THREADS. See https://deepmd.rtfd.io/parallelism/ for more information.
DEEPMD WARNING The following nodes are not in the graph: {'fitting_attr/aparam_nall', 'spin_attr/ntypes_spin'}. Skip freezeing these nodes. You may be freezing a checkpoint generated by an old version.
DEEPMD INFO The following nodes will be frozen: ['o_energy', 'model_attr/model_version', 'fitting_attr/dfparam', 'model_attr/tmap', 'o_virial', 'train_attr/min_nbor_dist', 'o_atom_energy', 'descrpt_attr/rcut', 'model_attr/model_type', 't_mesh', 'o_force', 'o_atom_virial', 'model_type', 'fitting_attr/daparam', 'train_attr/training_script', 'descrpt_attr/ntypes']
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/freeze.py:370: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use
tf.compat.v1.graph_util.convert_variables_to_constants
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/freeze.py:370: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use
tf.compat.v1.graph_util.convert_variables_to_constants
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/framework/convert_to_constants.py:925: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use
tf.compat.v1.graph_util.extract_sub_graph
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/framework/convert_to_constants.py:925: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use
tf.compat.v1.graph_util.extract_sub_graph
DEEPMD INFO 1640 ops in the final graph.
WARNING:tensorflow:From /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
WARNING:root:To get the best performance, it is recommended to adjust the number of threads by setting the environment variables OMP_NUM_THREADS, TF_INTRA_OP_PARALLELISM_THREADS, and TF_INTER_OP_PARALLELISM_THREADS. See https://deepmd.rtfd.io/parallelism/ for more information.
DEEPMD INFO
DEEPMD INFO stage 1: compress the model
DEEPMD WARNING Switch to serial execution due to lack of horovod module.
DEEPMD INFO     _____               _____   __  __  _____           _     _  _
DEEPMD INFO    |  __ \             |  __ \ |  \/  ||  __ \         | |   (_)| |
DEEPMD INFO    | |  | |  ___   ___ | |__) || \  / || |  | | ______ | | __ _ | |_
DEEPMD INFO    | |  | | / _ \ / _ \|  ___/ | |\/| || |  | ||______|| |/ /| || __|
DEEPMD INFO    | |__| ||  __/|  __/| |     | |  | || |__| |        |   < | || |_
DEEPMD INFO    |_____/  \___| \___||_|     |_|  |_||_____/         |_|\_\|_| \__|
DEEPMD INFO Please read and cite:
DEEPMD INFO Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)
DEEPMD INFO Zeng et al, J. Chem. Phys., 159, 054801 (2023)
DEEPMD INFO See https://deepmd.rtfd.io/credits/ for details.
DEEPMD INFO installed to: /home1/zhangzt/anaconda3/envs/deepmd
DEEPMD INFO source : v2.2.7
DEEPMD INFO source brach: HEAD
DEEPMD INFO source commit: 839f4fe
DEEPMD INFO source commit at: 2023-10-27 21:10:24 +0800
DEEPMD INFO build float prec: double
DEEPMD INFO build variant: cpu
DEEPMD INFO build with tf inc: /home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/include;/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/tensorflow/../../../../include
DEEPMD INFO build with tf lib:
DEEPMD INFO ---Summary of the training---------------------------------------
DEEPMD INFO running on: n0125
DEEPMD INFO computing device: cpu:0
DEEPMD INFO Count of visible GPU: 0
DEEPMD INFO num_intra_threads: 0
DEEPMD INFO num_inter_threads: 0
DEEPMD INFO -----------------------------------------------------------------
DEEPMD INFO training without frame parameter
Traceback (most recent call last):
File "/home1/zhangzt/anaconda3/envs/deepmd/bin/dp", line 10, in <module>
sys.exit(main())
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd_cli/main.py", line 635, in main
deepmd_main(args)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/main.py", line 82, in main
compress(**dict_args)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/compress.py", line 150, in compress
train(
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/train.py", line 168, in train
_do_work(jdata, run_opt, is_compress)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/entrypoints/train.py", line 280, in _do_work
model.build(train_data, stop_batch, origin_type_map=origin_type_map)
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/train/trainer.py", line 298, in build
self.model.enable_compression()
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/model/model.py", line 618, in enable_compression
self.descrpt.enable_compression(
File "/home1/zhangzt/anaconda3/envs/deepmd/lib/python3.10/site-packages/deepmd/descriptor/se_atten.py", line 391, in enable_compression
raise RuntimeError("can not compress model when attention layer is not 0.")
RuntimeError: can not compress model when attention layer is not 0.