  // 3rdparty/opencv-4.5.4/modules/stitching/src/cuda/multiband_blend.cu

  #if !defined CUDA_DISABLER
  
  #include "opencv2/core/cuda/common.hpp"
  #include "opencv2/core/types.hpp"
  
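  // Device-side kernels for multi-band blending: the addSrcWeight* kernels accumulate a
  // weighted source layer into the destination accumulator, and the normalizeUsingWeight*
  // kernels divide the accumulated sums by the accumulated weights. Each operation has a
  // 16-bit fixed-point variant (weights scaled by 256) and a 32-bit floating-point variant.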
  namespace cv { namespace cuda { namespace device
  {
      namespace blend
      {
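          // Per-pixel accumulation with 16-bit fixed-point weights (8 fractional bits):
          // dst += (src * w) >> 8 for each channel, dst_weight += w.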
          __global__ void addSrcWeightKernel16S(const PtrStep<short> src, const PtrStep<short> src_weight,
              PtrStep<short> dst, PtrStep<short> dst_weight, int rows, int cols)
          {
              int x = blockIdx.x * blockDim.x + threadIdx.x;
              int y = blockIdx.y * blockDim.y + threadIdx.y;
  
              if (y < rows && x < cols)
              {
                  const short3 v = ((const short3*)src.ptr(y))[x];
                  short w = src_weight.ptr(y)[x];
                  ((short3*)dst.ptr(y))[x].x += short((v.x * w) >> 8);
                  ((short3*)dst.ptr(y))[x].y += short((v.y * w) >> 8);
                  ((short3*)dst.ptr(y))[x].z += short((v.z * w) >> 8);
                  dst_weight.ptr(y)[x] += w;
              }
          }
  
          void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
              PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc)
          {
              dim3 threads(16, 16);
              dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
              addSrcWeightKernel16S<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
              cudaSafeCall(cudaGetLastError());
          }
  
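          // Same accumulation as above, but with 32-bit floating-point weights,
          // so no fixed-point rescaling is needed.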
          __global__ void addSrcWeightKernel32F(const PtrStep<short> src, const PtrStepf src_weight,
              PtrStep<short> dst, PtrStepf dst_weight, int rows, int cols)
          {
              int x = blockIdx.x * blockDim.x + threadIdx.x;
              int y = blockIdx.y * blockDim.y + threadIdx.y;
  
              if (y < rows && x < cols)
              {
                  const short3 v = ((const short3*)src.ptr(y))[x];
                  float w = src_weight.ptr(y)[x];
                  ((short3*)dst.ptr(y))[x].x += static_cast<short>(v.x * w);
                  ((short3*)dst.ptr(y))[x].y += static_cast<short>(v.y * w);
                  ((short3*)dst.ptr(y))[x].z += static_cast<short>(v.z * w);
                  dst_weight.ptr(y)[x] += w;
              }
          }
  
          void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
              PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc)
          {
              dim3 threads(16, 16);
              dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
              addSrcWeightKernel32F<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
              cudaSafeCall(cudaGetLastError());
          }
  
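          // Normalizes the accumulated pixel values by the accumulated fixed-point
          // weights ((v << 8) / w). Assumes the weight is non-zero at every pixel
          // the kernel is launched over.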
          __global__ void normalizeUsingWeightKernel16S(const PtrStep<short> weight, PtrStep<short> src,
              const int width, const int height)
          {
              int x = (blockIdx.x * blockDim.x) + threadIdx.x;
              int y = (blockIdx.y * blockDim.y) + threadIdx.y;
  
              if (x < width && y < height)
              {
                  const short3 v = ((short3*)src.ptr(y))[x];
                  short w = weight.ptr(y)[x];
                  ((short3*)src.ptr(y))[x] = make_short3(short((v.x << 8) / w),
                      short((v.y << 8) / w), short((v.z << 8) / w));
              }
          }
  
          void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
                                             const int width, const int height)
          {
              dim3 threads(16, 16);
              dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
              normalizeUsingWeightKernel16S<<<grid, threads>>>(weight, src, width, height);
              cudaSafeCall(cudaGetLastError());
          }
  
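          // Floating-point variant of the normalization; WEIGHT_EPS guards
          // against division by a (near) zero accumulated weight.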
          __global__ void normalizeUsingWeightKernel32F(const PtrStepf weight, PtrStep<short> src,
              const int width, const int height)
          {
              int x = (blockIdx.x * blockDim.x) + threadIdx.x;
              int y = (blockIdx.y * blockDim.y) + threadIdx.y;
  
              if (x < width && y < height)
              {
                  const float WEIGHT_EPS = 1e-5f;
                  const short3 v = ((short3*)src.ptr(y))[x];
                  float w = weight.ptr(y)[x];
                  ((short3*)src.ptr(y))[x] = make_short3(static_cast<short>(v.x / (w + WEIGHT_EPS)),
                      static_cast<short>(v.y / (w + WEIGHT_EPS)),
                      static_cast<short>(v.z / (w + WEIGHT_EPS)));
              }
          }
  
          void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
                                             const int width, const int height)
          {
              dim3 threads(16, 16);
              dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
              normalizeUsingWeightKernel32F<<<grid, threads>>>(weight, src, width, height);
              cudaSafeCall(cudaGetLastError());
          }
      }
  }}}
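
  // Minimal host-side usage sketch (illustrative only; the GpuMat names below are
  // hypothetical, the real call sites are the stitching module's blenders).
  // cv::cuda::GpuMat converts implicitly to PtrStep<T>, so a caller could do:
  //
  //   cv::cuda::GpuMat src_laplacian;   // CV_16SC3 source pyramid layer
  //   cv::cuda::GpuMat src_weight;      // CV_16SC1 weights, fixed point (x256)
  //   cv::cuda::GpuMat dst_laplacian;   // CV_16SC3 accumulator
  //   cv::cuda::GpuMat dst_weight;      // CV_16SC1 accumulated weights
  //   cv::Rect rc(0, 0, src_laplacian.cols, src_laplacian.rows);
  //
  //   cv::cuda::device::blend::addSrcWeightGpu16S(src_laplacian, src_weight,
  //                                               dst_laplacian, dst_weight, rc);
  //   // ... after every source image has been added:
  //   cv::cuda::device::blend::normalizeUsingWeightMapGpu16S(dst_weight, dst_laplacian,
  //                                                          rc.width, rc.height);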
  
  #endif