3rdparty/opencv-4.5.4/modules/dnn/src/cuda/normalize.cu
f4334277   Hu Chunming   commit 3rdparty

  // This file is part of OpenCV project.
  // It is subject to the license terms in the LICENSE file found in the top-level directory
  // of this distribution and at http://opencv.org/license.html.
  
  #include <cuda_runtime.h>
  #include <cuda_fp16.h>
  
  #include "array.hpp"
  #include "math.hpp"
  #include "types.hpp"
  #include "atomics.hpp"
  #include "grid_stride_range.hpp"
  #include "execution.hpp"
  
  #include "../cuda4dnn/csl/stream.hpp"
  #include "../cuda4dnn/csl/span.hpp"
  
  #include "../cuda4dnn/kernels/fill_copy.hpp"
  #include "../cuda4dnn/kernels/scale_shift.hpp"
  
  #include <opencv2/core.hpp>
  
  #include <cstddef>
  
  using namespace cv::dnn::cuda4dnn::csl;
  using namespace cv::dnn::cuda4dnn::csl::device;
  
  namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
  
      namespace raw {
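          /*
           * The input is treated as a [outer_size, mid_size, inner_size] tensor and
           * the norm is computed along the mid axis; each (outer, inner) pair owns
           * one slot in the `sums` buffer. Callers pass outer_stride = mid_size *
           * inner_size and mid_stride = inner_size.
           */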
          template <class T>
          __global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
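              // grid-stride loop: each thread accumulates |input[idx]| into the
              // sum slot of its (outer, inner) slice via atomicAdd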
              for (auto idx : grid_stride_range(input.size())) {
                  const index_type outer_idx = idx / outer_stride;
                  const index_type inner_idx = idx % mid_stride;
  
                  const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                  atomicAdd(&output[sum_idx], device::abs(input[idx]));
              }
          }
  
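          // convert the per-slice L1 sums into multiplicative factors: 1 / (sum + epsilon)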
          template <class T>
          __global__ void reciprocal(Span<T> output, T epsilon) {
              for (auto idx : grid_stride_range(output.size()))
                  output[idx] = T(1) / (output[idx] + epsilon);
          }
  
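          // same reduction as reduce_sum_abs, but accumulates squares (for the L2 norm)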
          template <class T>
          __global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
              for (auto idx : grid_stride_range(input.size())) {
                  const index_type outer_idx = idx / outer_stride;
                  const index_type inner_idx = idx % mid_stride;

                  const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                  atomicAdd(&output[sum_idx], input[idx] * input[idx]);
              }
          }
  
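          // convert the per-slice sums of squares into factors: 1 / sqrt(sum + epsilon)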
          template <class T>
          __global__ void rsqrt(Span<T> output, T epsilon) {
              for (auto idx : grid_stride_range(output.size())) {
                  using device::sqrt;
                  output[idx] = T(1) / sqrt(output[idx] + epsilon);
              }
          }
  
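          // scale every element by the precomputed factor of its (outer, inner) slice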
          template <class T>
          __global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
              for (auto idx : grid_stride_range(output.size())) {
                  const index_type outer_idx = idx / outer_stride;
                  const index_type inner_idx = idx % mid_stride;
  
                  const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                  output[idx] = input[idx] * sums[sum_idx];
              }
          }
      }
  
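      /*
       * Normalizes `input` along the mid axis with the L1 (norm == 1) or L2 (norm == 2)
       * norm and writes the result to `output`. `workspace` must provide at least
       * outer_size * inner_size elements of scratch space for the per-slice sums.
       */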
      template <class T>
      void normalize(
          const Stream& stream,
          Span<T> output,
          View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
          Span<T> workspace)
      {
          CV_Assert(output.size() == input.size());
          CV_Assert(output.size() == outer_size * mid_size * inner_size);
          CV_Assert(norm == 1 || norm == 2);
          CV_Assert(workspace.size() >= outer_size * inner_size);
  
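          // carve the per-slice accumulator out of the workspace and zero it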
          auto sums = Span<T>(workspace.data(), outer_size * inner_size);
  
          fill<T>(stream, sums, 0.0);
  
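          // L1: factors are 1 / (sum |x| + eps);  L2: factors are 1 / sqrt(sum x^2 + eps)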
          if (norm == 1) {
              auto reduce_kernel = raw::reduce_sum_abs<T>;
              auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
              launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
  
              auto reciprocal_kernel = raw::reciprocal<T>;
              policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
              launch_kernel(reciprocal_kernel, policy, sums, epsilon);
          } else {
              auto reduce_kernel = raw::reduce_sum_squared<T>;
              auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
              launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
  
              auto rsqrt_kernel = raw::rsqrt<T>;
              policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
              launch_kernel(rsqrt_kernel, policy, sums, epsilon);
          }
  
          auto scale_kernel = raw::apply_norm<T>;
          auto policy = make_policy(scale_kernel, output.size(), 0, stream);
          launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
      }
  
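      // __half arithmetic requires compute capability 5.3 or higher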
  #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
      template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
  #endif
      template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
  
  }}}} /* namespace cv::dnn::cuda4dnn::kernels */