// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP #define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP #include "../../op_cuda.hpp" #include "../csl/cudnn.hpp" #include "../csl/stream.hpp" #include "../csl/tensor.hpp" #include "../csl/tensor_ops.hpp" #include "../kernels/scale_shift.hpp" #include "../kernels/activations.hpp" #include "../kernels/activation_eltwise.hpp" #include "../kernels/bias_activation.hpp" #include "../kernels/bias_eltwise_activation.hpp" #include "../kernels/bias_activation_eltwise.hpp" #include "../kernels/activation_eltwise.hpp" #include "../kernels/eltwise_activation.hpp" #include "../kernels/eltwise_ops.hpp" #include #include #include #include #include #include namespace cv { namespace dnn { namespace cuda4dnn { struct ConvolutionConfiguration { /* the size of the following vectors must be equal to the kernel size */ std::vector kernel_size; std::vector dilations, strides; enum class PaddingMode { MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ VALID, /* no padding is added */ SAME /* TensorFlow logic is used for same padding */ }; /* explicit paddings are used if and only if padMode is set to manual */ PaddingMode padMode; std::vector pads_begin, pads_end; /* full shape inclusive of channel and batch axis */ std::vector input_shape; std::vector output_shape; /* group count for grouped convolution */ std::size_t groups; enum class FusionMode { NONE, ACTIVATION, /* act(conv) */ ELTWISE_SUM, /* eltwise + conv */ /* eltwise tensor is passed as second input to forward */ ELTWISE_SUM_THEN_ACTIVATION, /* act(conv + eltwise) */ ACTIVATION_THEN_ELTWISE_SUM, /* act(conv) + eltwise */ }; FusionMode fusion_mode; enum class ActivationType { IDENTITY, RELU, /* uses value provided in `relu_negative_slope` */ CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */ POWER, TANH, SIGMOID, SWISH, MISH }; ActivationType activation_type; float relu_negative_slope, crelu_floor, crelu_ceil; float power_exp, power_scale, power_shift; }; template class ConvolutionOp final : public CUDABackendNode { public: using wrapper_type = GetCUDABackendWrapperType; ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle_, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias) : stream(std::move(stream_)), cudnnHandle(std::move(handle_)) { const auto& kernel_size = config.kernel_size; const auto& dilations = config.dilations; const auto& strides = config.strides; const auto convolution_order = kernel_size.size(); CV_Assert(convolution_order == dilations.size()); CV_Assert(convolution_order == strides.size()); const auto& input_shape = config.input_shape; const auto& output_shape = config.output_shape; CV_Assert(input_shape.size() == output_shape.size()); CV_Assert(input_shape.size() == convolution_order + 2); const auto groups = config.groups; CV_Assert (1 <= convolution_order && convolution_order <= 3); const auto rank = input_shape.size(); const auto output_feature_maps = output_shape[1]; const auto input_feature_maps = input_shape[1]; const auto input_feature_maps_per_group = input_feature_maps / groups; CV_Assert(input_feature_maps % groups == 0); filtersTensor = csl::makeTensorHeader(filters); csl::copyMatToTensor(filters, filtersTensor, stream); if (!bias.empty()) { biasTensor = csl::makeTensorHeader(bias); csl::copyMatToTensor(bias, biasTensor, stream); } /* left and right are misleading as the padding is applicable for any number of dimensions * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` * * `common_padding` contains the amount of padding that has to be added to both sides * `padding_left` and `padding_right` contains the amount of padding that needs to be added * to a particular side in addition to the common padding */ std::vector common_padding(rank, 0); std::vector padding_left(rank, 0), padding_right(rank, 0); if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL) { const auto& pads_begin = config.pads_begin; const auto& pads_end = config.pads_end; CV_Assert(convolution_order == pads_begin.size()); CV_Assert(convolution_order == pads_end.size()); for (int i = 2; i < common_padding.size(); i++) { common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); padding_left[i] = pads_begin[i - 2] - common_padding[i]; padding_right[i] = pads_end[i - 2] - common_padding[i]; } } else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID) { /* nothing to do as the paddings are already preset to zero */ } else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME) { /* TensorFlow Logic: * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] * * if total padding is odd, the extra is added towards the end */ for (int i = 2; i < rank; i++) { const auto j = i - 2; /* filter index */ const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; const auto required_total_padding = std::max(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]); common_padding[i] = required_total_padding / 2; padding_left[i] = 0; padding_right[i] = required_total_padding % 2; } } /* in some scenarios, the extra padding at the end may not change the output at all */ for (int i = 2; i < rank; i++) { const auto j = i - 2; /* filter idx */ const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j]; /* the output shape doesn't change if we decrease the total padding by at most `rem` * provided that we decrease from the right */ if (rem && padding_right[i] > 0) padding_right[i] = std::max(0, padding_right[i] - rem); } auto is_not_zero = [](std::size_t i) { return i != 0; }; if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) || std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero)) { /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by * copying the input to a bigger tensor and padding the ends manually */ transformed_shape = input_shape; for (int i = 0; i < rank; i++) transformed_shape[i] += padding_left[i] + padding_right[i]; inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right); } typename csl::Convolution::params_type params; if (transformed_shape.empty()) { params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); } else { /* the convolution operation will be seeing the transformed input */ params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape)); } auto& fshape = params.filter_shape; fshape.resize(rank); fshape[0] = output_feature_maps; fshape[1] = input_feature_maps_per_group; std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); CV_Assert(fshape.size() == kernel_size.size() + 2); params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); params.stride = strides; params.dilation = dilations; params.groups = config.groups; fusion_mode = config.fusion_mode; activation = config.activation_type; relu_negative_slope = config.relu_negative_slope; crelu_floor = config.crelu_floor; crelu_ceil = config.crelu_ceil; power_exp = config.power_exp; power_scale = config.power_scale; power_shift = config.power_shift; /* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves * hence, the activation for cuDNN is IDENTITY by default */ fusion_location = InternalFusionLocation::NATIVE; /* i.e. we perform bias, act and eltwise */ params.eltwise = false; params.activation_type = csl::Convolution::ActivationType::IDENTITY; /* cuDNN can fuse the operations with convolution in some cases; try if it's possible */ if (!biasTensor.empty() && 0 && biasTensor.size() == output_feature_maps && /* cuDNN requirement */ activation == ConvolutionConfiguration::ActivationType::RELU && /* cuDNN requirement */ relu_negative_slope == 0.0 && /* cuDNN requirement */ (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION || /* act(conv + bias) */ fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) /* act(conv + bias + eltwise) */ ) { bool do_not_fuse = false; if(std::is_same::value) { /* performance degrades if fused with tensor core based convolutions in most cases */ int device; CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device)); int cc_major; CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); if (cc_major >= 7) do_not_fuse = true; } if (!do_not_fuse) { fusion_location = InternalFusionLocation::CUDNN; auto bias_shape = std::vector(rank, 1); bias_shape[1] = output_feature_maps; params.bias_shape = bias_shape; if (config.fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) params.eltwise = true; params.activation_type = csl::Convolution::ActivationType::RELU; } } convoluter = csl::Convolution(cudnnHandle, params); csl::WorkspaceBuilder builder; if (!transformed_shape.empty()) { auto& shape = transformed_shape; auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies()); builder.require(sz); } builder.require(convoluter.get_workspace_size()); scratch_mem_in_bytes = builder.required_workspace_size(); } void forward( const std::vector>& inputs, const std::vector>& outputs, csl::Workspace& workspace) override { /* input[0] = conv input, input[1] = bias (from fused eltwise layer) */ CV_Assert(inputs.size() == 1 || inputs.size() == 2); CV_Assert(outputs.size() == 1); csl::WorkspaceAllocator allocator(workspace); auto input_wrapper = inputs[0].dynamicCast(); auto input = input_wrapper->getView(); if (!transformed_shape.empty()) { auto& shape = transformed_shape; auto transformed_input = allocator.get_tensor_span(std::begin(shape), std::end(shape)); inputTransformer.transform(input, transformed_input); input = transformed_input; } auto conv_scratchpad = allocator.get_instance(); auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); if (fusion_location == InternalFusionLocation::CUDNN) { try { if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION) convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, conv_scratchpad); else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) { auto eltwise_wrapper = inputs[1].dynamicCast(); auto eltwise = eltwise_wrapper->getView(); CV_Assert(is_shape_same(eltwise, output)); convoluter.convolve_with_bias_eltwise_activation(output, input, filtersTensor, biasTensor, eltwise, conv_scratchpad); } } catch(const csl::cudnn::cuDNNException& ex) { if (ex.getCUDNNStatus() == CUDNN_STATUS_NOT_SUPPORTED) { /* drop cuDNN fusion and use the native fusion path */ fusion_location = InternalFusionLocation::NATIVE; } else throw; } } if (fusion_location == InternalFusionLocation::NATIVE) { convoluter.convolve(output, input, filtersTensor, conv_scratchpad); if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM || fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION || fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM) { CV_Assert(inputs.size() == 2); } if (!biasTensor.empty() && inputs.size() == 2) { /* bias and eltwise */ CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM || fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION || fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM); auto eltwise_wrapper = inputs[1].dynamicCast(); auto eltwise = eltwise_wrapper->getView(); CV_Assert(is_shape_same(eltwise, output)); std::size_t inner_size = output.size_range(2, output.rank()); if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM) { kernels::biasN_eltwise_sum_2_identity_inplace(stream, output, inner_size, biasTensor, eltwise); } else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) { /* activation(conv + bias + eltwise) */ switch (activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: kernels::biasN_eltwise_sum_2_identity_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::RELU: kernels::biasN_eltwise_sum_2_relu_inplace(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope); break; case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: kernels::biasN_eltwise_sum_2_clipped_relu_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: kernels::biasN_eltwise_sum_2_power_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::biasN_eltwise_sum_2_tanh_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::SIGMOID: kernels::biasN_eltwise_sum_2_sigmoid_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::SWISH: kernels::biasN_eltwise_sum_2_swish_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::MISH: kernels::biasN_eltwise_sum_2_mish_inplace(stream, output, inner_size, biasTensor, eltwise); break; } } else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM) { /* activation(conv + bias) + eltwise */ switch (activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: kernels::biasN_eltwise_sum_2_identity_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::RELU: kernels::biasN_relu_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope); break; case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: kernels::biasN_clipped_relu_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: kernels::biasN_power_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::biasN_tanh_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::SIGMOID: kernels::biasN_sigmoid_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::SWISH: kernels::biasN_swish_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); break; case ConvolutionConfiguration::ActivationType::MISH: kernels::biasN_mish_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); break; } } } else if (!biasTensor.empty() && inputs.size() == 1) { /* bias but no eltwise */ CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE || fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION); std::size_t inner_size = output.size_range(2, output.rank()); switch(activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: kernels::biasN(stream, output, output, inner_size, biasTensor); break; case ConvolutionConfiguration::ActivationType::RELU: kernels::biasN_relu_inplace(stream, output, inner_size, biasTensor, relu_negative_slope); break; case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: kernels::biasN_clipped_relu_inplace(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::biasN_tanh_inplace(stream, output, inner_size, biasTensor); break; case ConvolutionConfiguration::ActivationType::SIGMOID: kernels::biasN_sigmoid_inplace(stream, output, inner_size, biasTensor); break; case ConvolutionConfiguration::ActivationType::SWISH: kernels::biasN_swish_inplace(stream, output, inner_size, biasTensor); break; case ConvolutionConfiguration::ActivationType::MISH: kernels::biasN_mish_inplace(stream, output, inner_size, biasTensor); break; } } else if (biasTensor.empty() && inputs.size() == 2) { /* no bias but eltwise */ CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM || fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION || fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM); auto eltwise_wrapper = inputs[1].dynamicCast(); auto eltwise = eltwise_wrapper->getView(); CV_Assert(is_shape_same(eltwise, output)); /* we pass `eltwise` as `bias` (with `inner_size` as one) to bias-activation kernels */ if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM) { kernels::eltwise_sum_2(stream, output, output, eltwise); } else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) { switch (activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: kernels::eltwise_sum_2(stream, output, output, eltwise); break; case ConvolutionConfiguration::ActivationType::RELU: kernels::eltwise_sum_2_relu(stream, output, output, eltwise, relu_negative_slope); break; case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: kernels::eltwise_sum_2_clipped_relu(stream, output, output, eltwise, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: kernels::eltwise_sum_2_power(stream, output, output, eltwise, power_exp, power_scale, power_shift); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::eltwise_sum_2_tanh(stream, output, output, eltwise); break; case ConvolutionConfiguration::ActivationType::SIGMOID: kernels::eltwise_sum_2_sigmoid(stream, output, output, eltwise); break; case ConvolutionConfiguration::ActivationType::SWISH: kernels::eltwise_sum_2_swish(stream, output, output, eltwise); break; case ConvolutionConfiguration::ActivationType::MISH: kernels::eltwise_sum_2_mish(stream, output, output, eltwise); break; } } else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM) { switch (activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: kernels::eltwise_sum_2(stream, output, output, eltwise); break; case ConvolutionConfiguration::ActivationType::RELU: kernels::relu_eltwise_sum_2_inplace(stream, output, eltwise, relu_negative_slope); break; case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: kernels::clipped_relu_eltwise_sum_2_inplace(stream, output, eltwise, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: kernels::power_eltwise_sum_2_inplace(stream, output, eltwise, power_exp, power_scale, power_shift); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::tanh_eltwise_sum_2_inplace(stream, output, eltwise); break; case ConvolutionConfiguration::ActivationType::SIGMOID: kernels::sigmoid_eltwise_sum_2_inplace(stream, output, eltwise); break; case ConvolutionConfiguration::ActivationType::SWISH: kernels::swish_eltwise_sum_2_inplace(stream, output, eltwise); break; case ConvolutionConfiguration::ActivationType::MISH: kernels::mish_eltwise_sum_2_inplace(stream, output, eltwise); break; } } } else if(biasTensor.empty() && inputs.size() == 1) { /* no bias and no eltwise */ CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE || fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION); switch(activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: break; case ConvolutionConfiguration::ActivationType::RELU: kernels::relu(stream, output, output, relu_negative_slope); break; case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: kernels::clipped_relu(stream, output, output, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: kernels::power(stream, output, output, power_exp, power_scale, power_shift); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::tanh(stream, output, output); break; case ConvolutionConfiguration::ActivationType::SIGMOID: kernels::sigmoid(stream, output, output); break; case ConvolutionConfiguration::ActivationType::SWISH: kernels::swish(stream, output, output); break; case ConvolutionConfiguration::ActivationType::MISH: kernels::mish(stream, output, output); break; } } } } std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } private: csl::Stream stream; csl::cudnn::Handle cudnnHandle; csl::Tensor filtersTensor, biasTensor; csl::Convolution convoluter; std::vector transformed_shape; csl::TensorTransform inputTransformer; std::size_t scratch_mem_in_bytes; ConvolutionConfiguration::FusionMode fusion_mode; ConvolutionConfiguration::ActivationType activation; float relu_negative_slope, crelu_floor, crelu_ceil; float power_exp, power_scale, power_shift; enum class InternalFusionLocation { CUDNN, NATIVE } fusion_location; }; }}} /* namespace cv::dnn::cuda4dnn */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */