When I enable TensorBoard, I get an error. After I comment out the following line, where the error is thrown, it works.
Traceback (most recent call last):
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1375, in _do_call
return fn(*args)
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1359, in _run_fn
return self._call_tf_sessionrun(options, feed_dict, fetch_list,
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1451, in _call_tf_sessionrun
return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Nan in summary histogram for: rij
[[{{node rij}}]]
[[filter_type_0/matrix_1_0/read/_92]]
(1) Invalid argument: Nan in summary histogram for: rij
[[{{node rij}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jz748/anaconda3/envs/dpdev/bin/dp", line 33, in <module>
sys.exit(load_entry_point('deepmd-kit', 'console_scripts', 'dp')())
File "/home/jz748/codes/deepmd-kit/deepmd/entrypoints/main.py", line 383, in main
train(**dict_args)
File "/home/jz748/codes/deepmd-kit/deepmd/entrypoints/train.py", line 211, in train
_do_work(jdata, run_opt)
File "/home/jz748/codes/deepmd-kit/deepmd/entrypoints/train.py", line 265, in _do_work
model.train(train_data, valid_data)
File "/home/jz748/codes/deepmd-kit/deepmd/train/trainer.py", line 519, in train
summary, _ = self.sess.run([summary_merged_op, self.train_op], feed_dict=train_feed_dict,
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 967, in run
result = self._run(None, fetches, feed_dict, options_ptr,
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1190, in _run
results = self._do_run(handle, final_targets, final_fetches,
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1368, in _do_run
return self._do_call(_run_fn, feeds, fetches, targets, options,
File "/home/jz748/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1394, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Nan in summary histogram for: rij
[[node rij (defined at /home/jz748/codes/deepmd-kit/deepmd/descriptor/se_a.py:367) ]]
[[filter_type_0/matrix_1_0/read/_92]]
(1) Invalid argument: Nan in summary histogram for: rij
[[node rij (defined at /home/jz748/codes/deepmd-kit/deepmd/descriptor/se_a.py:367) ]]
0 successful operations.
0 derived errors ignored.
Errors may have originated from an input operation.
Input Source operations connected to node rij:
ProdEnvMatA (defined at <string>:1307)
Input Source operations connected to node rij:
ProdEnvMatA (defined at <string>:1307)
Original stack trace for 'rij':
File "/anaconda3/envs/dpdev/bin/dp", line 33, in <module>
sys.exit(load_entry_point('deepmd-kit', 'console_scripts', 'dp')())
File "/codes/deepmd-kit/deepmd/entrypoints/main.py", line 383, in main
train(**dict_args)
File "/codes/deepmd-kit/deepmd/entrypoints/train.py", line 211, in train
_do_work(jdata, run_opt)
File "/codes/deepmd-kit/deepmd/entrypoints/train.py", line 261, in _do_work
model.build(train_data, stop_batch)
File "/codes/deepmd-kit/deepmd/train/trainer.py", line 315, in build
self._build_network(data)
File "/codes/deepmd-kit/deepmd/train/trainer.py", line 342, in _build_network
= self.model.build (self.place_holders['coord'],
File "/codes/deepmd-kit/deepmd/model/ener.py", line 154, in build
= self.descrpt.build(coord_,
File "/codes/deepmd-kit/deepmd/descriptor/se_a.py", line 367, in build
tf.summary.histogram('rij', self.rij)
File "/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/summary/summary.py", line 178, in histogram
val = _gen_logging_ops.histogram_summary(
File "/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 285, in histogram_summary
_, _, _op, _outputs = _op_def_library._apply_op_helper(
File "/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/framework/op_def_library.py", line 748, in _apply_op_helper
op = g._create_op_internal(op_type_name, inputs, dtypes=None,
File "/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 3528, in _create_op_internal
ret = Operation(
File "/anaconda3/envs/dpdev/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 1990, in __init__
self._traceback = tf_stack.extract_stack()
Summary
When I enable TensorBoard, I get an error. After I comment out the following line, where the error is thrown, it works.
deepmd-kit/deepmd/descriptor/se_a.py
Line 367 in 86db65d
DeePMD-kit version, installation method, input file, running commands, error log, etc.
v2.0.0.b2, conda, examples/water/se_e2_a (with tensorboard enabled)
Steps to Reproduce
cd examples/water/se_e2_a
After setting tensorboard to true, run training with dp train.
Further Information, Files, and Links