Step 1: Convert the TF model to the TRT format
Step 2: Create the model parser
Step 3: Specify the input/output layer information
Step 4: Optimize the model and build the runtime engine
Step 5: Save the engine to a file
Step 6: Load the engine back from the file
Step 7: Run inference
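The seven steps above map directly onto TensorRT's C++ API. The sketch below is a minimal illustration of Steps 1-4 for a TensorFlow model, assuming the frozen graph has already been converted to UFF (Step 1, e.g. with NVIDIA's convert-to-uff tool); the file name, tensor names, and dimensions are placeholders, and the TRT 5/6-era UFF parser API is shown because the rest of this deck uses the same-generation Caffe parser. Steps 5-7 appear in the code excerpts later in the deck.

#include "NvInfer.h"
#include "NvUffParser.h"
using namespace nvinfer1;

// Sketch only: "sample.uff", "input_0", "prob", and the dimensions are assumptions.
ICudaEngine* buildEngineFromUff(IBuilder* builder) {
    // Step 2: create a parser for the converted (UFF) model
    INetworkDefinition* network = builder->createNetwork();
    nvuffparser::IUffParser* parser = nvuffparser::createUffParser();

    // Step 3: register the input/output layer information
    parser->registerInput("input_0", Dims3(3, 224, 224), nvuffparser::UffInputOrder::kNCHW);
    parser->registerOutput("prob");
    parser->parse("sample.uff", *network, DataType::kFLOAT);

    // Step 4: optimize the model and build the runtime engine
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(1 << 30);
    return builder->buildCudaEngine(*network);
}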
PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) {
    // Deep-copy the PReLU slope weights so the plugin owns its own host copy.
    mWeights = weights[0];
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type));
}
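The constructor (and the serialization code later) relies on a type2size helper that the slides do not show; a plausible definition, assumed to mirror the TensorRT sample plugins, is:

#include <cuda_fp16.h>

// Assumed helper: size in bytes of one element of the given TensorRT data type.
static size_t type2size(nvinfer1::DataType type) {
    return (type == nvinfer1::DataType::kFLOAT) ? sizeof(float) : sizeof(__half);
}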
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
                         cudaStream_t stream) {
    const float zerof{0.0f};
    const __half zeroh = fp16::__float2half(0.0f);
    if (mWeights.type == DataType::kFLOAT) {
        // FP32 path: launch the PReLU kernel over batchSize * C * H * W elements.
        CHECK(Forward_gpu<float>(batchSize * mNbInputCount, mNbInputChannels,
            mNbInputHeight * mNbInputWidth, reinterpret_cast<const float *>(mDeviceKernel),
            reinterpret_cast<const float *>(inputs[0]), reinterpret_cast<float *>(outputs[0]),
            zerof, mChannelShared ? mNbInputChannels : 1, stream));
    } else {
        // DataType::kHALF path (uses zeroh); elided on the slide.
    }
    return 0;
}
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data,
                             const Ftype* in, Ftype* out, const Ftype zero, const int div_factor) {
    CUDA_KERNEL_LOOP(index, n) {
        // div_factor == channels when the slope is shared, so c collapses to 0 for all elements.
        int c = (index / dim) % channels / div_factor;
        out[index] = (in[index] > zero) ? in[index] : in[index] * slope_data[c];
    }
}
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel,
const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) {
PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>
(count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
return cudaGetLastError();
}
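CUDA_KERNEL_LOOP, CAFFE_GET_BLOCKS, and CAFFE_CUDA_NUM_THREADS come from Caffe's CUDA utilities and are not defined on the slides; a minimal sketch of what they expand to (the thread count is an assumption, since Caffe picks 512 or 1024 depending on the architecture) is:

// Grid-stride loop over n elements (Caffe's CUDA_KERNEL_LOOP).
#define CUDA_KERNEL_LOOP(i, n)                                    \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
         i += blockDim.x * gridDim.x)

const int CAFFE_CUDA_NUM_THREADS = 512;  // assumption: Caffe uses 512 or 1024 per block

// Number of blocks needed to cover N elements with the chosen block size.
inline int CAFFE_GET_BLOCKS(const int N) {
    return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}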
IPluginExt *PReLUPlugin::clone() const {
    return new PReLUPlugin(&mWeights, 1);
}
IPlugin *PluginFactory::createPlugin(const char *layerName, const Weights *weights, int nbWeights) {
    // Build-time factory: called by the Caffe parser with the layer's weights.
    return new PReLUPlugin(weights, nbWeights);
}
// Steps 2-3: create the parser, register the plugin factory, and parse the network definition.
PluginFactory parserPluginFactory;
parser->setPluginFactoryExt(&parserPluginFactory);
const IBlobNameToTensor *blobNameToTensor =
    parser->parse(gParams.deployFile.c_str(), // caffe deploy file
                  gParams.modelFile.c_str(),  // caffe model file
                  *network,                   // network definition that the parser will populate
                  gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
// Step 4: optimize the network and build the runtime engine.
builder->setMaxBatchSize(gParams.batchSize);
builder->setMaxWorkspaceSize(size_t(gParams.workspaceSize) << 20);
builder->setFp16Mode(gParams.fp16);
ICudaEngine* engine = builder->buildCudaEngine(*network);
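Step 5 (saving the engine) is not shown on the slides; a minimal sketch, assuming a plan path of "model.plan" and the TRT 5/6-era API, is:

#include <fstream>

// Sketch: serialize the optimized engine to a plan file (Step 5).
nvinfer1::IHostMemory* plan = engine->serialize();
std::ofstream planOut("model.plan", std::ios::binary);
planOut.write(static_cast<const char*>(plan->data()), plan->size());
plan->destroy();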
void PReLUPlugin::serialize(void *buffer) {
char *d = static_cast<char *>(buffer), *a = d;
write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount);
write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type);
convertAndCopyToBuffer(d, mWeights);
assert(d == a + getSerializationSize());
}
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
const char *d = static_cast<const char *>(data), *a = d;
read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth);
read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count);
read<DataType>(d, mWeights.type);
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type));
assert(d == a + length);
}
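serialize() and the deserializing constructor use small write/read/deserializeToDevice helpers that the slides omit; a plausible sketch (the exact signatures are assumptions, mirroring the TensorRT sample plugins) is:

#include <cuda_runtime.h>

// Assumed helpers used by serialize() and the deserializing constructor.
template <typename T>
void write(char*& buffer, const T& val) {
    *reinterpret_cast<T*>(buffer) = val;
    buffer += sizeof(T);
}

template <typename T>
void read(const char*& buffer, T& val) {
    val = *reinterpret_cast<const T*>(buffer);
    buffer += sizeof(T);
}

// Copies `size` bytes of weights from the host buffer to freshly allocated
// device memory and advances the host pointer.
void deserializeToDevice(const char*& hostBuffer, void*& deviceWeights, size_t size) {
    cudaMalloc(&deviceWeights, size);
    cudaMemcpy(deviceWeights, hostBuffer, size, cudaMemcpyHostToDevice);
    hostBuffer += size;
}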
IPlugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) {
    // Runtime factory: called while deserializing an engine, with the plugin's serialized bytes.
    return new PReLUPlugin(serialData, serialLength);
}
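The slides show the two createPlugin definitions but never the PluginFactory class itself; a sketch of a declaration that ties them together (assumed to mirror the TensorRT samplePlugin pattern; the "prelu" layer-name check is an assumption) is:

#include <cstring>
#include "NvInfer.h"
#include "NvCaffeParser.h"

class PluginFactory : public nvcaffeparser1::IPluginFactoryExt, public nvinfer1::IPluginFactory {
public:
    bool isPlugin(const char* name) override { return isPluginExt(name); }
    bool isPluginExt(const char* name) override { return strstr(name, "prelu") != nullptr; }

    // Build-time overload, called by the Caffe parser (defined above).
    nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights,
                                    int nbWeights) override;

    // Runtime overload, called during engine deserialization (defined above).
    nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData,
                                    size_t serialLength) override;
};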
PluginFactory pluginFactory;
engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
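The infer runtime and the trt_plan_file buffer passed to deserializeCudaEngine above have to be created first; a minimal sketch of Step 6, assuming the plan was written to "model.plan" and that a gLogger instance exists as in the TensorRT samples, is:

#include <fstream>
#include <vector>

// Sketch: load the serialized plan from disk and create the runtime (Step 6).
std::ifstream planIn("model.plan", std::ios::binary | std::ios::ate);
size_t size = static_cast<size_t>(planIn.tellg());
planIn.seekg(0);
std::vector<char> planBlob(size);
planIn.read(planBlob.data(), size);

nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
const void* trt_plan_file = planBlob.data();  // buffer handed to deserializeCudaEngine above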
// Step 7: run inference. Create the execution context and stream, copy the input to the
// device, launch asynchronously, copy the output back, and wait for the stream to finish.
IExecutionContext* context = engine->createExecutionContext();
cudaStreamCreate(&stream);
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float),
                cudaMemcpyHostToDevice, stream);
context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr);
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float),
                cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);