Description
I built MXNet, and when I ran FCIS I got the following problem. I suspect it is caused by the cuDNN version (CUDA 8.0, cuDNN 6).
loading annotations into memory...
Done (t=0.52s)
creating index...
index created!
num_images 6048
prepare gt_sdsdb using 1.19558501244 seconds
generate cache_seg_inst using 0.203454017639 seconds
append flipped images to roidb
filtered 0 roidb entries: 12096 -> 12096
providing maximum shape [('data', (1, 3, 756, 1008)), ('gt_boxes', (1, 100, 5)), ('gt_masks', (1, 100, 756, 1008))] [('proposal_label', (1, 45360)), ('proposal_bbox_target', (1, 60, 48, 63)), ('proposal_bbox_weight', (1, 60, 48, 63))]
data shape:
{'data': (1L, 3L, 756L, 1008L),
'gt_boxes': (1L, 9L, 5L),
'gt_masks': (9L, 756L, 1008L),
'im_info': (1L, 3L),
'proposal_bbox_target': (1L, 60L, 48L, 63L),
'proposal_bbox_weight': (1L, 60L, 48L, 63L),
'proposal_label': (1L, 45360L)}
lr 0.0001 lr_epoch_diff [5.0] lr_iters [60480]
Traceback (most recent call last):
File "experiments/fcis/fcis_end2end_train_test.py", line 13, in
train_end2end.main()
File "experiments/fcis/../../fcis/train_end2end.py", line 187, in main
config.TRAIN.lr, config.TRAIN.lr_step)
File "experiments/fcis/../../fcis/train_end2end.py", line 179, in train_net
arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
File "experiments/fcis/../../fcis/core/module.py", line 961, in fit
self.update_metric(eval_metric, data_batch.label)
File "experiments/fcis/../../fcis/core/module.py", line 1055, in update_metric
self._curr_module.update_metric(eval_metric, labels)
File "experiments/fcis/../../fcis/core/module.py", line 669, in update_metric
self._exec_group.update_metric(eval_metric, labels)
File "experiments/fcis/../../fcis/core/DataParallelExecutorGroup.py", line 529, in update_metric
eval_metric.update(labels, texec.outputs)
File "/home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/metric.py", line 318, in update
metric.update(labels, preds)
File "experiments/fcis/../../fcis/core/metric.py", line 45, in update
pred_label = mx.ndarray.argmax_channel(pred).asnumpy().astype('int32')
File "/home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/ndarray/ndarray.py", line 1980, in asnumpy
ctypes.c_size_t(data.size)))
File "/home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/base.py", line 252, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [16:31:18] src/operator/nn/./cudnn/cudnn_convolution-inl.h:449: Check failed: e == CUDNN_STATUS_SUCCESS (3 vs. 0) cuDNN: CUDNN_STATUS_BAD_PARAM
Stack trace returned 10 entries:
[bt] (0) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(dmlc::StackTrace[abi:cxx11]()+0x5b) [0x7fd55ea8c2bb]
[bt] (1) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x28) [0x7fd55ea8ce28]
[bt] (2) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(mxnet::op::CuDNNConvolutionOp::InitDescriptors(std::vector<nnvm::TShape, std::allocator<nnvm::TShape> > const&, std::vector<nnvm::TShape, std::allocator<nnvm::TShape> > const&, cudnnDataType_t, cudnnDataType_t)+0x149a) [0x7fd56393523a]
[bt] (3) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(+0x5bd0d7b) [0x7fd563912d7b]
[bt] (4) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(void mxnet::op::ConvolutionCompute<mshadow::gpu>(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0xe30) [0x7fd563917910]
[bt] (5) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(mxnet::exec::FComputeExecutor::Run(mxnet::RunContext, bool)+0x59) [0x7fd561caa239]
[bt] (6) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(+0x3f32f60) [0x7fd561c74f60]
[bt] (7) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0x8f5) [0x7fd561bc39c5]
[bt] (8) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(void mxnet::engine::ThreadedEnginePerDevice::GPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context, bool, mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>, std::shared_ptr<dmlc::ManualEvent> const&)+0xeb) [0x7fd561bda10b]
[bt] (9) /home/scau2/anaconda2/lib/python2.7/site-packages/mxnet/libmxnet.so(std::_Function_handler<void (std::shared_ptr<dmlc::ManualEvent>), mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock, bool)::{lambda()#4}::operator()() const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data const&, std::shared_ptr<dmlc::ManualEvent>&&)+0x4e) [0x7fd561bda37e]
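
If the CUDA 8.0 / cuDNN 6 build itself is broken, any cuDNN convolution should fail, not just FCIS. A minimal sanity check along these lines (just a sketch, assuming MXNet was compiled with CUDA/cuDNN support and that GPU 0 is visible) can help separate a bad build from an FCIS-specific parameter problem:

```python
import mxnet as mx

print(mx.__version__)

# Run one small convolution on the GPU. This exercises the same
# cudnn_convolution code path that raises CUDNN_STATUS_BAD_PARAM above.
ctx = mx.gpu(0)
data = mx.nd.ones((1, 3, 224, 224), ctx=ctx)
weight = mx.nd.ones((16, 3, 3, 3), ctx=ctx)
out = mx.nd.Convolution(data=data, weight=weight, no_bias=True,
                        kernel=(3, 3), num_filter=16)
out.wait_to_read()  # force execution so any cuDNN error surfaces here
print('cuDNN convolution OK, output shape:', out.shape)
```

If this snippet runs cleanly, the cuDNN install is probably fine and the BAD_PARAM is more likely coming from a shape or parameter that FCIS passes to a convolution (or from an MXNet version newer than the one this FCIS code targets); if it fails the same way, the MXNet / cuDNN 6 build is the place to look.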