padding.cu
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <utility>
#include <algorithm>
#include <functional>
#include <numeric>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t Rank>
        __global__ void copy_with_reflection101(
            Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
            View<T> input, array<size_type, Rank> in_strides)
        {
            for (auto i : grid_stride_range(output.size())) {
                /* compute output axis indices corresponding to element 'i' */
                array<index_type, Rank> out_index;
                out_index[0] = i / out_strides[0];
                for (int j = 1; j < Rank; j++)
                    out_index[j] = (i % out_strides[j - 1]) / out_strides[j];
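                /* illustrative example (numbers assumed for exposition): with out_strides = [12, 4, 1],
                 * the flat element number i = 17 decomposes into out_index = [1, 1, 1]
                 */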

                /* compute input axis indices corresponding to output axis indices */
                array<index_type, Rank> in_index;
                for (int j = 0; j < Rank; j++) {
                    /* if out_index < start, the point is in the left reflection region
                     * the reflected value's index is the absolute value of the difference
                     *
                     * otherwise, if the value is in the copy region, out_index - start gives the input index
                     */
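                    /* illustrative example (values assumed for exposition): for an input axis of size 4
                     * padded by two on each side, start = 2 and end = 6; the eight output indices 0..7
                     * then map to the input indices 2 1 | 0 1 2 3 | 2 1, i.e. reflection101 padding,
                     * which does not repeat the border element (the right half uses the adjustment below)
                     */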
                    using device::abs;
                    in_index[j] = abs(out_index[j] - start[j]);

                    /* if out_index >= end, it's in the right reflection region */
                    if (out_index[j] >= end[j])
                        in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
                }

                /* compute input element number from input axis indices */
                index_type iidx = 0;
                for (int j = 0; j < Rank; j++)
                    iidx += in_index[j] * in_strides[j];

                output[i] = input[iidx];
            }
        }
    }

    template <class T, std::size_t Rank> static
    void launch_copy_with_reflection101(
        const Stream& stream,
        Span<T> output, const std::vector<std::size_t>& outStride,
        View<T> input, const std::vector<std::size_t>& inStride,
        const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
    {
        CV_Assert(outStride.size() == Rank);
        CV_Assert(inStride.size() == Rank);
        CV_Assert(ranges.size() == Rank);

        array<size_type, Rank> outStride_k, inStride_k;
        outStride_k.assign(std::begin(outStride), std::end(outStride));
        inStride_k.assign(std::begin(inStride), std::end(inStride));

        array<index_type, Rank> start_k, end_k;
        for (int i = 0; i < Rank; i++) {
            start_k[i] = ranges[i].first;
            end_k[i] = ranges[i].second;
        }

        auto kernel = raw::copy_with_reflection101<T, Rank>;
        auto policy = make_policy(kernel, output.size(), 0, stream);
        launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
    }
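
    /* the macro below generates copy_with_reflection101_dispatcher, which forwards to the launcher
     * instantiation matching the runtime rank (within the given bounds); see kernel_dispatcher.hpp
     */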
    GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);

    template <class T>
    void copy_with_reflection101(
        const Stream& stream,
        TensorSpan<T> output, TensorView<T> input,
        std::vector<std::pair<std::size_t, std::size_t>> ranges)
    {
        CV_Assert(output.rank() == input.rank());
        CV_Assert(output.rank() == ranges.size());

        /* squeezable axes at the beginning of both tensors can be eliminated
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, ...]. Its indices in the
         * output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
         * The padding operation essentially copies items from the input tensor to new locations in the output tensor
         * and pads the remaining.
         *
         * If the size of the first axis of the input and output tensor is unity, the input and output indices
         * for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
         * there cannot be extra padding since the axes have unit size. The first index does not contribute to the
         * element's address calculation and hence does nothing apart from eating up a few cycles.
         */
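        /* illustrative example (shapes assumed for exposition): padding a [1, 3, 32, 32] input to a
         * [1, 3, 34, 34] output leaves the unit batch axis untouched, so the loop below reduces the
         * problem to a rank-3 copy from [3, 32, 32] into [3, 34, 34]
         */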
        while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
            CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);

            input.squeeze(0);
            output.squeeze(0);
            ranges.erase(std::begin(ranges));

            CV_Assert(output.rank() == input.rank());
            CV_Assert(output.rank() == ranges.size());
        }

        auto inShape = input.shape_as_vector();
        auto outShape = output.shape_as_vector();

        /* contiguous axes which do not have any padding can be combined into one axis
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, i3, ...]. Let the first two axes not have any
         * padding. Its indices in the output tensor will be [i1, i2, i3 + off3, ...].
         *
         * Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
         * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
         * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
         * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
         */
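        /* illustrative example (shapes assumed for exposition): if only the last axis of a [2, 3, 4, 5]
         * tensor is padded, the three leading unpadded axes can be merged, turning the copy into a
         * rank-2 problem with an input shape of [24, 5]
         */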
        for (int i = 0; i < inShape.size(); i++) {
            /* check if axis `i` requires any padding */
            if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
                /* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
                CV_Assert(inShape[i] == outShape[i]);

                /* we now iterate through the axes which follow and try to merge */
                int j = i + 1; /* `j` is the axis which we will attempt to merge */
                while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
                    CV_Assert(inShape[j] == outShape[j]);

                    /* `j` is also unpadded; merge `i` and `j` */
                    auto new_size = inShape[i] * inShape[j];
                    inShape[i] = new_size;
                    outShape[i] = new_size;
                    ranges[i].second = new_size;

                    /* delete axis `j` */
                    inShape.erase(std::begin(inShape) + j);
                    outShape.erase(std::begin(outShape) + j);
                    ranges.erase(std::begin(ranges) + j);

                    /* optimizations should not break the invariants */
                    CV_Assert(inShape.size() == outShape.size());
                    CV_Assert(inShape.size() == ranges.size());
                    CV_Assert(inShape[i] == outShape[i]);
                    CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
                }
            }
        }

        auto rank = inShape.size();
        std::vector<std::size_t> inStride(rank), outStride(rank);

        inStride.back() = 1;
        outStride.back() = 1;
        /* garbage, ..., garbage, 1 */

        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
        /* dim[1], dim[2], ..., dim[-1], 1 */

        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
        /* stride[0], stride[1], ..., stride[-2], 1 */
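        /* illustrative example (shape assumed for exposition): for a shape of [2, 3, 4], the buffer holds
         * [3, 4, 1] after the copy and the reverse partial_sum turns it into the strides [12, 4, 1]
         */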

        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
        copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
    }
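
    /* usage sketch (parameter values assumed, not taken from a caller in this file): reflection-padding
     * the spatial axes of an NCHW tensor by one pixel on each side corresponds to an output of shape
     * [N, C, H + 2, W + 2] and ranges = {{0, N}, {0, C}, {1, H + 1}, {1, W + 1}}, i.e. each range holds
     * the half-open interval of output indices along that axis which receive copied (non-padded) values
     */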

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
    template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
#endif
    template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */