// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "atomics.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/fill_copy.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        /* The input is treated as a flattened [outer, mid, inner] tensor; the
         * reduction kernels map each linear index to its (outer, inner) pair
         * and accumulate over the mid axis into an [outer, inner] sums buffer.
         */
        template <class T>
        __global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride)
        {
            for (auto idx : grid_stride_range(input.size())) {
                const index_type outer_idx = idx / outer_stride;
                const index_type inner_idx = idx % mid_stride;

                const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                atomicAdd(&output[sum_idx], device::abs(input[idx]));
            }
        }

        template <class T>
        __global__ void reciprocal(Span<T> output, T epsilon)
        {
            for (auto idx : grid_stride_range(output.size()))
                output[idx] = T(1) / (output[idx] + epsilon);
        }

        template <class T>
        __global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride)
        {
            for (auto idx : grid_stride_range(input.size())) {
                const index_type outer_idx = idx / outer_stride;
                const index_type inner_idx = idx % mid_stride;

                const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                atomicAdd(&output[sum_idx], input[idx] * input[idx]);
            }
        }

        template <class T>
        __global__ void rsqrt(Span<T> output, T epsilon)
        {
            for (auto idx : grid_stride_range(output.size())) {
                using device::sqrt;
                output[idx] = T(1) / sqrt(output[idx] + epsilon);
            }
        }

        /* multiplies every input element by the precomputed reciprocal norm
         * of its (outer, inner) slice */
        template <class T>
        __global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums)
        {
            for (auto idx : grid_stride_range(output.size())) {
                const index_type outer_idx = idx / outer_stride;
                const index_type inner_idx = idx % mid_stride;

                const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                output[idx] = input[idx] * sums[sum_idx];
            }
        }
    }

    /* normalizes the mid axis of a flattened [outer_size, mid_size, inner_size]
     * tensor with the L1 norm (norm == 1) or the L2 norm (norm == 2):
     *
     *   L1: output[o, m, i] = input[o, m, i] / (sum_m |input[o, m, i]| + epsilon)
     *   L2: output[o, m, i] = input[o, m, i] / sqrt(sum_m input[o, m, i]^2 + epsilon)
     *
     * `workspace` must have room for the outer_size * inner_size partial sums.
     */
    template <class T>
    void normalize(
        const Stream& stream,
        Span<T> output,
        View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size,
        std::size_t norm, T epsilon,
        Span<T> workspace)
    {
        CV_Assert(output.size() == input.size());
        CV_Assert(output.size() == outer_size * mid_size * inner_size);
        CV_Assert(norm == 1 || norm == 2);
        CV_Assert(workspace.size() >= outer_size * inner_size);

        auto sums = Span<T>(workspace.data(), outer_size * inner_size);

        fill<T>(stream, sums, 0.0);

        if (norm == 1) {
            auto reduce_kernel = raw::reduce_sum_abs<T>;
            auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
            launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);

            auto reciprocal_kernel = raw::reciprocal<T>;
            policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
            launch_kernel(reciprocal_kernel, policy, sums, epsilon);
        } else {
            auto reduce_kernel = raw::reduce_sum_squared<T>;
            auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
            launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);

            auto rsqrt_kernel = raw::rsqrt<T>;
            policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
            launch_kernel(rsqrt_kernel, policy, sums, epsilon);
        }

        auto scale_kernel = raw::apply_norm<T>;
        auto policy = make_policy(scale_kernel, output.size(), 0, stream);
        launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
    }

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
    template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
#endif
    template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
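/* A minimal standalone sketch (illustrative only, not part of this file) of
 * the reduction pattern the kernels above rely on, stripped of the CSL
 * wrappers. All names here (sum_abs_over_mid, sums, in) are hypothetical.
 * It assumes a contiguous float tensor of shape [outer, mid, inner] and sums
 * absolute values over the mid axis, exactly as raw::reduce_sum_abs does:
 *
 *   #include <cuda_runtime.h>
 *   #include <cstddef>
 *
 *   __global__ void sum_abs_over_mid(float* sums, const float* in, std::size_t n,
 *                                    std::size_t outer_stride, std::size_t inner)
 *   {
 *       // grid-stride loop: each flattened element is visited exactly once
 *       for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
 *            i += (std::size_t)gridDim.x * blockDim.x)
 *       {
 *           std::size_t outer_idx = i / outer_stride; // outer_stride = mid * inner
 *           std::size_t inner_idx = i % inner;
 *
 *           // the `mid` elements of one (outer, inner) slice all target the
 *           // same cell, so the accumulation must be atomic
 *           atomicAdd(&sums[outer_idx * inner + inner_idx], fabsf(in[i]));
 *       }
 *   }
 *
 * The host function above follows the same three-phase structure: zero the
 * sums buffer, reduce with atomics, then invert (reciprocal or rsqrt) and
 * broadcast-multiply, which avoids a second reduction pass at the cost of
 * atomic contention when inner_size is small.
 */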