Skip to content

TensorRT show no improvement in inference speed #43

@Tigerold

Description

@Tigerold

I attempted to deploy the DSVT model to TensorRT following your deployment code. Based on the official TensorRT example code, I used dynamic shapes for the dsvt_block model input; model inference time is about 260 ms. However, the PyTorch version takes less time, about 140 ms. Why does inference take longer with the TensorRT C++ code?

Environment
TensorRT Version: 8.5.1.7
CUDA Version: 11.8
CUDNN Version: 8.6
Hardware GPU: p4000
(all other environment details are standard/default)

inference code

#include "trt_infer.h"

#include <algorithm>
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <vector>

#include "cnpy.h"
// Constructs the TensorRT inference wrapper: builds (or deserializes) the
// engine from the ONNX model referenced by trt_config and creates the CUDA
// stream used for all subsequent transfers and kernel launches.
//
// Throws std::runtime_error if engine construction fails.
TRTInfer::TRTInfer(TrtConfig trt_config): mEngine_(nullptr)
{
    // Accumulators for profiling host<->device copy time across calls.
    sum_cpy_feature_ = 0.0f;
    sum_cpy_output_ = 0.0f;
    count_ = 0;
    trt_config_ = trt_config;

    input_cpy_kind_ = cudaMemcpyHostToDevice;
    output_cpy_kind_ = cudaMemcpyDeviceToHost;

    // BUG FIX: build() reports failure through its return value, but the
    // original constructor ignored it and continued with a null engine,
    // deferring the crash to the first inference call. Fail fast instead.
    if (!build())
    {
        throw std::runtime_error("TRTInfer: failed to build TensorRT engine from " + trt_config_.model_file);
    }

    CHECKCUDA(cudaStreamCreate(&stream_), "failed to create cuda stream");

    std::cout << "tensorrt init done." << std::endl;
}


// Builds a TensorRT engine from the ONNX file named in trt_config_ and
// creates an execution context (context_). Returns false on any failure.
//
// Pipeline: create builder/runtime -> parse ONNX into an explicit-batch
// network -> configure precision and the dynamic-shape optimization
// profile -> serialize + deserialize the engine -> create the context.
bool TRTInfer::build()
{
    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
    if (!builder)
    {
        return false;
    }

    // NOTE(review): this runtime is a local and is destroyed when build()
    // returns, yet mEngine_ below is deserialized by it. TensorRT requires
    // the IRuntime to outlive engines it created — consider promoting the
    // runtime to a class member; verify against the TensorRT release notes
    // for version 8.5.
    SampleUniquePtr<nvinfer1::IRuntime> runtime{createInferRuntime(sample::gLogger.getTRTLogger())};
    if (!runtime)
    {
        return false;
    }

    // CUDA stream used for profiling by the builder.
    auto profileStream = samplesCommon::makeCudaStream();
    if (!profileStream)
    {
        return false;
    }

    // ONNX models require an explicit-batch network definition.
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
    if (!network)
    {
        return false;
    }

    auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config)
    {
        return false;
    }

    auto parser = SampleUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger()));
    if (!parser)
    {
        return false;
    }

    // auto constructed = constructNetwork(builder, network, config, parser);
    // if (!constructed)
    // {
    //     return false;
    // }

    //replace conscructNetwork with following code:
    // Parse the ONNX model straight from disk into `network`.
    auto parsed = parser->parseFromFile(trt_config_.model_file.c_str(), static_cast<int>(sample::gLogger.getReportableSeverity()));
    if (!parsed)
    {
        return false;
    }

    // Cache the (possibly dynamic, i.e. containing -1) I/O dimensions; the
    // dynamic extents are filled in per-frame by infer_dynamic().
    for (int i = 0; i < network->getNbInputs(); i++) {
        std::cout << "network->getInput(i)->getDimensions(): " << network->getInput(i)->getDimensions() << std::endl;
        mInputDims.push_back(network->getInput(i)->getDimensions());
    }
    for (int i = 0; i < network->getNbOutputs(); i++) {
        mOutputDims.push_back(network->getOutput(i)->getDimensions());
    }

    config->setProfileStream(*profileStream);


    // A single timing iteration speeds up engine building but gives the
    // auto-tuner less data; tactic choices may be noisier as a result.
    config->setAvgTimingIterations(1);
    config->setMinTimingIterations(1);
    // Workspace budget in MiB from the config (shifted to bytes). Too small
    // a workspace can exclude the fastest tactics — worth checking given
    // the slow inference reported for this engine.
    config->setMaxWorkspaceSize(static_cast<size_t>(trt_config_.max_workspace)<<20);
    if (builder->platformHasFastFp16() && trt_config_.fp16mode)
    {
        // NOTE(review): the Quadro P4000 (Pascal) has very slow native FP16
        // throughput; enabling kFP16 there can make the engine SLOWER than
        // FP32 — confirm against the GPU's FP16 rate before enabling.
        config->setFlag(BuilderFlag::kFP16);
    }
    if (builder->platformHasFastInt8() && trt_config_.int8mode)
    {
        config->setFlag(BuilderFlag::kINT8);
        // samplesCommon::setAllDynamicRanges(network.get(), 127.0f, 127.0f); // in case use int8 without calibration
    }
    // Explicit-batch networks ignore max batch size; harmless, kept as-is.
    builder->setMaxBatchSize(1);
    
    std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator;
    if (builder->platformHasFastInt8() && trt_config_.int8mode)
    {
        // Entropy calibrator fed from the calibration data set; only used
        // during buildSerializedNetwork(), so locals here are safe.
        MNISTBatchStream calibrationStream(trt_config_.calib_data);
        calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(calibrationStream, -1, trt_config_.net_name.c_str(), trt_config_.input_name.c_str()));
        config->setInt8Calibrator(calibrator.get());
    }

    // Dynamic-shape optimization profile. Kernels are tuned for the kOPT
    // shapes; the further a runtime shape is from kOPT (ranges here span
    // 1000..100000 points), the worse the selected tactics may perform.
    // NOTE(review): tightening kMIN/kMAX around realistic workloads, or
    // adding multiple profiles, may address the reported slowness — verify
    // with trtexec profiling.
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions("src", OptProfileSelector::kMIN, Dims2(1000,128));
    profile->setDimensions("src", OptProfileSelector::kOPT, Dims2(24629,128));
    profile->setDimensions("src", OptProfileSelector::kMAX, Dims2(100000,128));
    profile->setDimensions("set_voxel_inds_tensor_shift_0", OptProfileSelector::kMIN, Dims3(2,50,36));
    profile->setDimensions("set_voxel_inds_tensor_shift_0", OptProfileSelector::kOPT, Dims3(2,1156,36));
    profile->setDimensions("set_voxel_inds_tensor_shift_0", OptProfileSelector::kMAX, Dims3(2,5000,36));
    profile->setDimensions("set_voxel_inds_tensor_shift_1", OptProfileSelector::kMIN, Dims3(2,50,36));
    profile->setDimensions("set_voxel_inds_tensor_shift_1", OptProfileSelector::kOPT, Dims3(2,834,36));
    profile->setDimensions("set_voxel_inds_tensor_shift_1", OptProfileSelector::kMAX, Dims3(2,3200,36));
    profile->setDimensions("set_voxel_masks_tensor_shift_0", OptProfileSelector::kMIN, Dims3(2,50,36));
    profile->setDimensions("set_voxel_masks_tensor_shift_0", OptProfileSelector::kOPT, Dims3(2,1156,36));
    profile->setDimensions("set_voxel_masks_tensor_shift_0", OptProfileSelector::kMAX, Dims3(2,5000,36));
    profile->setDimensions("set_voxel_masks_tensor_shift_1", OptProfileSelector::kMIN, Dims3(2,50,36));
    profile->setDimensions("set_voxel_masks_tensor_shift_1", OptProfileSelector::kOPT, Dims3(2,834,36));
    profile->setDimensions("set_voxel_masks_tensor_shift_1", OptProfileSelector::kMAX, Dims3(2,3200,36));
    profile->setDimensions("pos_embed_tensor", OptProfileSelector::kMIN, Dims4(4,2,1000,128));
    profile->setDimensions("pos_embed_tensor", OptProfileSelector::kOPT, Dims4(4,2,24629,128));
    profile->setDimensions("pos_embed_tensor", OptProfileSelector::kMAX, Dims4(4,2,100000,128));
    config->addOptimizationProfile(profile);

    // Build the serialized plan, then immediately deserialize it into a
    // usable engine (avoids a second disk round-trip).
    SampleUniquePtr<nvinfer1::IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};
    if (!plan)
    {
        return false;
    }

    mEngine_ = std::shared_ptr<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(plan->data(), plan->size()), samplesCommon::InferDeleter());
    if (!mEngine_)
    {
        return false;
    }

    // Create RAII buffer manager object
    context_ = mEngine_->createExecutionContext();
    if (!context_)
    {
        return false;
    }

    return true;

}


// Public entry point: runs one inference pass and blocks until all work
// queued on stream_ has completed.
//
// BUG FIX: the bool result of infer_dynamic() was silently discarded; the
// signature must stay void for existing callers, so a failure is now at
// least reported on stderr instead of vanishing.
void TRTInfer::doinference(std::vector<void*> &inputs, std::vector<float*> &outputs, std::vector<int> &input_dynamic)
{
    if (!infer_dynamic(inputs, outputs, input_dynamic))
    {
        std::cerr << "TRTInfer::doinference: infer_dynamic failed" << std::endl;
    }
    cudaStreamSynchronize(stream_);
}


bool TRTInfer::infer_dynamic(std::vector<void*> &inputs, std::vector<float*> &outputs, std::vector<int> &input_dynamic)
{
    double t0 = getTime();
    mInputDims[0] = Dims2{input_dynamic[0], 128};
    mInputDims[1] = Dims3{2, input_dynamic[1], 36};
    mInputDims[2] = Dims3{2, input_dynamic[2], 36};
    mInputDims[3] = Dims3{2, input_dynamic[3], 36};
    mInputDims[4] = Dims3{2, input_dynamic[4], 36};
    mInputDims[5] = Dims4{4, 2, input_dynamic[5], 128};

    mInput[0].hostBuffer.resize(mInputDims[0]);
    mInput[1].hostBuffer.resize(mInputDims[1]);
    mInput[2].hostBuffer.resize(mInputDims[2]);
    mInput[3].hostBuffer.resize(mInputDims[3]);
    mInput[4].hostBuffer.resize(mInputDims[4]);
    mInput[5].hostBuffer.resize(mInputDims[5]);
    

    std::copy((float*)(inputs[0]), (float*)(inputs[0]) + 1, static_cast<float*>(mInput[0].hostBuffer.data()));
    std::copy((int*)inputs[1], (int*)inputs[1] + 2* input_dynamic[1] * 36, static_cast<int*>(mInput[1].hostBuffer.data()));
    std::copy((int*)inputs[2], (int*)inputs[2] + 2* input_dynamic[2] * 36, static_cast<int*>(mInput[2].hostBuffer.data()));
    std::copy((bool*)inputs[3], (bool*)inputs[3] + 2* input_dynamic[3] * 36, static_cast<bool*>(mInput[3].hostBuffer.data()));
    std::copy((bool*)inputs[4], (bool*)inputs[4] + 2* input_dynamic[4] * 36, static_cast<bool*>(mInput[4].hostBuffer.data()));
    std::copy((float*)inputs[5], (float*)inputs[5] + 4* 2* input_dynamic[5] * 128, static_cast<float*>(mInput[5].hostBuffer.data()));
    cudaStreamSynchronize(stream_);
    double t1 = getTime();

    mInput[0].deviceBuffer.resize(mInputDims[0]);
    mInput[1].deviceBuffer.resize(mInputDims[1]);
    mInput[2].deviceBuffer.resize(mInputDims[2]);
    mInput[3].deviceBuffer.resize(mInputDims[3]);
    mInput[4].deviceBuffer.resize(mInputDims[4]);
    mInput[5].deviceBuffer.resize(mInputDims[5]);

    CHECK(cudaMemcpy(mInput[0].deviceBuffer.data(), mInput[0].hostBuffer.data(), mInput[0].hostBuffer.nbBytes(), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(mInput[1].deviceBuffer.data(), mInput[1].hostBuffer.data(), mInput[1].hostBuffer.nbBytes(), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(mInput[2].deviceBuffer.data(), mInput[2].hostBuffer.data(), mInput[2].hostBuffer.nbBytes(), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(mInput[3].deviceBuffer.data(), mInput[3].hostBuffer.data(), mInput[3].hostBuffer.nbBytes(), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(mInput[4].deviceBuffer.data(), mInput[4].hostBuffer.data(), mInput[4].hostBuffer.nbBytes(), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(mInput[5].deviceBuffer.data(), mInput[5].hostBuffer.data(), mInput[5].hostBuffer.nbBytes(), cudaMemcpyHostToDevice));
    cudaStreamSynchronize(stream_);
    double t2 = getTime();

    context_->setBindingDimensions(0, mInputDims[0]);
    context_->setBindingDimensions(1, mInputDims[1]);
    context_->setBindingDimensions(2, mInputDims[2]);
    context_->setBindingDimensions(3, mInputDims[3]);
    context_->setBindingDimensions(4, mInputDims[4]);
    context_->setBindingDimensions(5, mInputDims[5]);
    // context_->setBindingDimensions(6, mInputDims[6]);
    std::cout << "mEngine_->getNbBindings(): " << mEngine_->getNbBindings() << std::endl;
    std::cout << " mEngine_->getBindingDimensions(i)" <<  mEngine_->getBindingDimensions(0) << std::endl;
    std::cout << " context_->getBindingDimensions(i)" <<  context_->getBindingDimensions(0) << std::endl;
    cudaStreamSynchronize(stream_);
    double t3 = getTime();

    // We can only run inference once all dynamic input shapes have been specified.
    if (!context_->allInputDimensionsSpecified())
    {
        return false;
    }
    mOutputDims[0] = mInputDims[0];
    mOutput[0].deviceBuffer.resize(mOutputDims[0]);
    mOutput[0].hostBuffer.resize(mOutputDims[0]);
    std::vector<void*> processorBindings = {mInput[0].deviceBuffer.data(),
                                            mInput[1].deviceBuffer.data(),
                                            mInput[2].deviceBuffer.data(),
                                            mInput[3].deviceBuffer.data(),
                                            mInput[4].deviceBuffer.data(),
                                            mInput[5].deviceBuffer.data(),
                                            mOutput[0].deviceBuffer.data()};
    cudaStreamSynchronize(stream_);
    double t4 = getTime();
    bool status = context_->executeV2(processorBindings.data());
    if (!status)
    {
        return false;
    }
    cudaStreamSynchronize(stream_);
    double t5 = getTime();

    CHECK(cudaMemcpy(mOutput[0].hostBuffer.data(), mOutput[0].deviceBuffer.data(), mOutput[0].deviceBuffer.nbBytes(),
        cudaMemcpyDeviceToHost));
    cudaStreamSynchronize(stream_);
    double t6 = getTime();
    // cnpy::npy_save("dsvt_output_tensor.npy", static_cast<float*>(mOutput[0].hostBuffer.data()), {mOutput[0].deviceBuffer.nbBytes()/4},"w");
    std::cout << "time elapse:" << t1-t0 << std::endl;
    std::cout << "time elapse:" << t2-t1 << std::endl;
    std::cout << "time elapse:" << t3-t2 << std::endl;
    std::cout << "time elapse:" << t4-t3 << std::endl;
    std::cout << "time elapse:" << t5-t4 << std::endl;
    std::cout << "time elapse:" << t6-t5 << std::endl;
    return true;

}

According to the results, the average time cost of each stage is as follows:
t1-t0:0.00860953
t2-t1:0.0124242
t3-t2:4.72069e-05
t4-t3:8.10623e-06
t5-t4:0.260188
t6-t5:0.00110817

Why does the C++ code take more time? Are there mistakes in my inference code?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions