memory.hpp
10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
#include "error.hpp"
#include "pointer.hpp"
#include <opencv2/core.hpp>
#include <cuda_runtime_api.h>
#include <cstddef>
#include <type_traits>
#include <memory>
#include <utility>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
/* @brief smart device pointer with allocation/deallocation methods
*
* ManagedPtr is a smart shared device pointer which also handles memory allocation.
*/
template <class T>
class ManagedPtr {
static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
public:
using element_type = T;
using pointer = DevicePtr<element_type>;
using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;
using size_type = std::size_t;
ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
ManagedPtr(const ManagedPtr&) noexcept = default;
ManagedPtr(ManagedPtr&& other) noexcept
: wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
{
other.reset();
}
/** allocates device memory for \p count number of element */
ManagedPtr(size_type count) {
if (count <= 0) {
CV_Error(Error::StsBadArg, "number of elements is zero or negative");
}
void* temp = nullptr;
CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));
auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
wrapped.reset(ptr, [](element_type* ptr) {
if (ptr != nullptr) {
/* contract violation for std::shared_ptr if cudaFree throws */
try {
CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
} catch (const CUDAException& ex) {
std::ostringstream os;
os << "Device memory deallocation failed in deleter.\n";
os << ex.what();
os << "Exception will be ignored.\n";
CV_LOG_WARNING(0, os.str().c_str());
}
}
});
/* std::shared_ptr<T>::reset invokves the deleter if an exception occurs; hence, we don't
* need to have a try-catch block to free the allocated device memory
*/
n = capacity = count;
}
ManagedPtr& operator=(ManagedPtr&& other) noexcept {
wrapped = std::move(other.wrapped);
n = other.n;
capacity = other.capacity;
other.reset();
return *this;
}
size_type size() const noexcept { return n; }
void reset() noexcept { wrapped.reset(); n = capacity = 0; }
/**
* deallocates any previously allocated memory and allocates device memory
* for \p count number of elements
*
* @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
* @note use move constructor to guarantee a deallocation of the previously allocated memory
*
* Exception Guarantee: Strong
*/
void reset(size_type count) {
/* we need to fully own the memory to perform optimizations */
if (wrapped.use_count() == 1) {
/* avoid reallocation if the existing capacity is sufficient */
if (count <= capacity) {
n = count;
return;
}
}
/* no optimization performed; allocate memory */
ManagedPtr tmp(count);
swap(tmp, *this);
}
pointer get() const noexcept { return pointer(wrapped.get()); }
explicit operator bool() const noexcept { return wrapped; }
friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }
friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
using std::swap;
swap(lhs.wrapped, rhs.wrapped);
swap(lhs.n, rhs.n);
swap(lhs.capacity, rhs.capacity);
}
private:
std::shared_ptr<element_type> wrapped;
size_type n, capacity;
};
/** copies entire memory block pointed by \p src to \p dest
*
* \param[in] src device pointer
* \param[out] dest host pointer
*
* Pre-conditions:
* - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
*
* Exception Guarantee: Basic
*/
template <class T>
void memcpy(T *dest, const ManagedPtr<T>& src) {
memcpy<T>(dest, src.get(), src.size());
}
/** copies data from memory pointed by \p src to fully fill \p dest
*
* \param[in] src host pointer
* \param[out] dest device pointer
*
* Pre-conditions:
* - memory pointed by \p src must be at least as big as the memory block held by \p dest
*
* Exception Guarantee: Basic
*/
template <class T>
void memcpy(const ManagedPtr<T>& dest, const T* src) {
memcpy<T>(dest.get(), src, dest.size());
}
/** copies data from memory pointed by \p src to \p dest
*
* if the two \p src and \p dest have different sizes, the number of elements copied is
* equal to the size of the smaller memory block
*
* \param[in] src device pointer
* \param[out] dest device pointer
*
* Exception Guarantee: Basic
*/
template <class T>
void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()));
}
/** sets device memory block to a specific 8-bit value
*
* \param[in] src device pointer
* \param[out] ch 8-bit value to fill the device memory with
*
* Exception Guarantee: Basic
*/
template <class T>
void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
memset<T>(dest.get(), ch, dest.size());
}
/** copies entire memory block pointed by \p src to \p dest asynchronously
*
* \param[in] src device pointer
* \param[out] dest host pointer
* \param stream CUDA stream that has to be used for the memory transfer
*
* Pre-conditions:
* - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
* - \p dest points to page-locked memory
*
* Exception Guarantee: Basic
*/
template <class T>
void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
CV_Assert(stream);
memcpy<T>(dest, src.get(), src.size(), stream);
}
/** copies data from memory pointed by \p src to \p dest asynchronously
*
* \param[in] src host pointer
* \param[out] dest device pointer
* \param stream CUDA stream that has to be used for the memory transfer
*
* Pre-conditions:
* - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
* - \p src points to page-locked memory
*
* Exception Guarantee: Basic
*/
template <class T>
void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
CV_Assert(stream);
memcpy<T>(dest.get(), src, dest.size(), stream);
}
/** copies data from memory pointed by \p src to \p dest asynchronously
*
* \param[in] src device pointer
* \param[out] dest device pointer
* \param stream CUDA stream that has to be used for the memory transfer
*
* if the two \p src and \p dest have different sizes, the number of elements copied is
* equal to the size of the smaller memory block
*
* Exception Guarantee: Basic
*/
template <class T>
void memcpy(ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
CV_Assert(stream);
memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream);
}
/** sets device memory block to a specific 8-bit value asynchronously
*
* \param[in] src device pointer
* \param[out] ch 8-bit value to fill the device memory with
* \param stream CUDA stream that has to be used for the memory operation
*
* Exception Guarantee: Basic
*/
template <class T>
void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) {
CV_Assert(stream);
memset<T>(dest.get(), ch, dest.size(), stream);
}
/** @brief registers host memory as page-locked and unregisters on destruction */
class MemoryLockGuard {
public:
MemoryLockGuard() noexcept : ptr { nullptr } { }
MemoryLockGuard(const MemoryLockGuard&) = delete;
MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
other.ptr = nullptr;
}
/** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
*
* Pre-conditions:
* - host memory should be unregistered
*/
MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) {
CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
ptr = ptr_;
}
MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
if (&other != this) {
if(ptr != nullptr) {
/* cudaHostUnregister does not throw for a valid ptr */
CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
}
ptr = other.ptr;
other.ptr = nullptr;
}
return *this;
}
~MemoryLockGuard() {
if(ptr != nullptr) {
/* cudaHostUnregister does not throw for a valid ptr */
CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
}
}
private:
void *ptr;
};
}}}} /* namespace cv::dnn::cuda4dnn::csl */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */