// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP

#include "error.hpp"
#include "stream.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>

#include <cublas_v2.h>

#include <cstddef>
#include <memory>
#include <utility>

#define CUDA4DNN_CHECK_CUBLAS(call) \
    ::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {

    /** @brief exception class for errors thrown by the cuBLAS API */
    class cuBLASException : public CUDAException {
    public:
        using CUDAException::CUDAException;
    };

    namespace detail {
        static void check(cublasStatus_t status, const char* func, const char* file, int line) {
            auto cublasGetErrorString = [](cublasStatus_t err) {
                switch (err) {
                case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
                case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
                case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
                case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
                case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
                case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
                case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
                case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
                case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
                case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
                }
                return "UNKNOWN_CUBLAS_ERROR";
            };

            if (status != CUBLAS_STATUS_SUCCESS)
                throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
        }
    }

    /** @brief non-copyable cuBLAS smart handle
     *
     * UniqueHandle is a smart non-sharable wrapper for a cuBLAS handle which ensures that the handle
     * is destroyed after use. The handle must always be associated with a non-default stream. The
     * stream must be specified during construction.
     *
     * Refer to the stream API for more information on the choice of forcing non-default streams.
     */
    class UniqueHandle {
    public:
        UniqueHandle() noexcept : handle{ nullptr } { }
        UniqueHandle(UniqueHandle&) = delete;
        UniqueHandle(UniqueHandle&& other) noexcept {
            stream = std::move(other.stream);
            handle = other.handle;
            other.handle = nullptr;
        }

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        UniqueHandle(Stream strm) : stream(std::move(strm)) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
            try {
                CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
            } catch (...) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
                throw;
            }
        }

        ~UniqueHandle() noexcept {
            if (handle) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
            }
        }

        UniqueHandle& operator=(const UniqueHandle&) = delete;
        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
            CV_Assert(other);
            if (&other != this) {
                UniqueHandle(std::move(*this)); /* destroy current handle */
                stream = std::move(other.stream);
                handle = other.handle;
                other.handle = nullptr;
            }
            return *this;
        }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle;
        }

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

    private:
        Stream stream;
        cublasHandle_t handle;
    };

    /** @brief sharable cuBLAS smart handle
     *
     * Handle is a smart sharable wrapper for a cuBLAS handle which ensures that the handle
     * is destroyed after all references to the handle are destroyed. The handle must always
     * be associated with a non-default stream. The stream must be specified during construction.
     *
     * @note Moving a Handle object to another invalidates the former
     */
    class Handle {
    public:
        Handle() = default;
        Handle(const Handle&) = default;
        Handle(Handle&&) = default;

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }

        Handle& operator=(const Handle&) = default;
        Handle& operator=(Handle&&) = default;

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle->get();
        }

    private:
        std::shared_ptr<UniqueHandle> handle;
    };
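    /* A minimal usage sketch (illustrative only, not part of the CSL API). The Stream(bool)
     * constructor argument shown below is an assumption about stream.hpp; any valid non-default
     * csl::Stream can be used instead:
     *
     *     Stream stream(true);               // create a new non-default stream
     *     Handle handle(stream);             // cuBLAS handle bound to that stream
     *     Handle shared_copy = handle;       // copies share the same underlying cuBLAS handle
     *     cublasHandle_t raw = handle.get(); // raw handle for direct cuBLAS calls
     */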
    /** @brief GEMM for column-major matrices
     *
     * \f$ C = \alpha AB + \beta C \f$
     *
     * @tparam T matrix element type (must be `half` or `float`)
     *
     * @param handle valid cuBLAS Handle
     * @param transa use transposed matrix of A for computation
     * @param transb use transposed matrix of B for computation
     * @param rows_c number of rows in C
     * @param cols_c number of columns in C
     * @param common_dim common dimension of A (or trans A) and B (or trans B)
     * @param alpha scale factor for AB
     * @param[in] A pointer to column-major matrix A in device memory
     * @param lda leading dimension of matrix A
     * @param[in] B pointer to column-major matrix B in device memory
     * @param ldb leading dimension of matrix B
     * @param beta scale factor for C
     * @param[in,out] C pointer to column-major matrix C in device memory
     * @param ldc leading dimension of matrix C
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void gemm(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        T alpha, const DevicePtr<const T> A, std::size_t lda,
        const DevicePtr<const T> B, std::size_t ldb,
        T beta, const DevicePtr<T> C, std::size_t ldc);

    template <> inline
    void gemm<half>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        half alpha, const DevicePtr<const half> A, std::size_t lda,
        const DevicePtr<const half> B, std::size_t ldb,
        half beta, const DevicePtr<half> C, std::size_t ldc)
    {
        CV_Assert(handle);

        auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
             opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        int irows_c = static_cast<int>(rows_c),
            icols_c = static_cast<int>(cols_c),
            icommon_dim = static_cast<int>(common_dim),
            ilda = static_cast<int>(lda),
            ildb = static_cast<int>(ldb),
            ildc = static_cast<int>(ldc);

        CUDA4DNN_CHECK_CUBLAS(
            cublasHgemm(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda,
                B.get(), ildb,
                &beta, C.get(), ildc
            )
        );
    }

    template <> inline
    void gemm<float>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        float alpha, const DevicePtr<const float> A, std::size_t lda,
        const DevicePtr<const float> B, std::size_t ldb,
        float beta, const DevicePtr<float> C, std::size_t ldc)
    {
        CV_Assert(handle);

        auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
             opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        int irows_c = static_cast<int>(rows_c),
            icols_c = static_cast<int>(cols_c),
            icommon_dim = static_cast<int>(common_dim),
            ilda = static_cast<int>(lda),
            ildb = static_cast<int>(ldb),
            ildc = static_cast<int>(ldc);

        CUDA4DNN_CHECK_CUBLAS(
            cublasSgemm(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda,
                B.get(), ildb,
                &beta, C.get(), ildc
            )
        );
    }
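    /* A usage sketch for gemm (illustrative; `handle`, `A_ptr`, `B_ptr`, `C_ptr`, M, N and K are
     * assumed to exist elsewhere; the pointers are DevicePtr<const float>/DevicePtr<float>):
     *
     *     // C (M x N) = A (M x K) * B (K x N), all column-major, no transposition
     *     csl::cublas::gemm<float>(handle,
     *         false, false,
     *         M, N, K,
     *         1.0f, A_ptr, M,    // lda = number of rows of A
     *         B_ptr, K,          // ldb = number of rows of B
     *         0.0f, C_ptr, M);   // ldc = number of rows of C
     */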
    /** @brief strided batched GEMM for column-major matrices
     *
     * \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
     *
     * @tparam T matrix element type (must be `half` or `float`)
     *
     * @param handle valid cuBLAS Handle
     * @param transa use transposed matrix of A_i for computation
     * @param transb use transposed matrix of B_i for computation
     * @param rows_c number of rows in C_i
     * @param cols_c number of columns in C_i
     * @param common_dim common dimension of A_i (or trans A_i) and B_i (or trans B_i)
     * @param alpha scale factor for A_i B_i
     * @param[in] A pointer to stack of column-major matrices A in device memory
     * @param lda leading dimension of matrix A_i
     * @param strideA stride between matrices in A
     * @param[in] B pointer to stack of column-major matrices B in device memory
     * @param ldb leading dimension of matrix B_i
     * @param strideB stride between matrices in B
     * @param beta scale factor for C_i
     * @param[in,out] C pointer to stack of column-major matrices C in device memory
     * @param ldc leading dimension of matrix C_i
     * @param strideC stride between matrices in C
     * @param batchCount number of matrices in the batch
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void gemmStridedBatched(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB,
        T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount);

    template <> inline
    void gemmStridedBatched<half>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB,
        half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount)
    {
        CV_Assert(handle);

        const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
                   opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        const auto irows_c = static_cast<int>(rows_c),
                   icols_c = static_cast<int>(cols_c),
                   icommon_dim = static_cast<int>(common_dim),
                   ilda = static_cast<int>(lda),
                   ildb = static_cast<int>(ldb),
                   ildc = static_cast<int>(ldc);
        const auto batch_count = static_cast<int>(batchCount);
        const auto stride_a = static_cast<long long int>(strideA),
                   stride_b = static_cast<long long int>(strideB),
                   stride_c = static_cast<long long int>(strideC);

        CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap

        CUDA4DNN_CHECK_CUBLAS(
            cublasHgemmStridedBatched(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda, stride_a,
                B.get(), ildb, stride_b,
                &beta, C.get(), ildc, stride_c,
                batch_count
            )
        );
    }

    template <> inline
    void gemmStridedBatched<float>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB,
        float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount)
    {
        CV_Assert(handle);

        const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
                   opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        const auto irows_c = static_cast<int>(rows_c),
                   icols_c = static_cast<int>(cols_c),
                   icommon_dim = static_cast<int>(common_dim),
                   ilda = static_cast<int>(lda),
                   ildb = static_cast<int>(ldb),
                   ildc = static_cast<int>(ldc);
        const auto batch_count = static_cast<int>(batchCount);
        const auto stride_a = static_cast<long long int>(strideA),
                   stride_b = static_cast<long long int>(strideB),
                   stride_c = static_cast<long long int>(strideC);

        CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap

        CUDA4DNN_CHECK_CUBLAS(
            cublasSgemmStridedBatched(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda, stride_a,
                B.get(), ildb, stride_b,
                &beta, C.get(), ildc, stride_c,
                batch_count
            )
        );
    }
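    /* A usage sketch for gemmStridedBatched (illustrative; `handle`, the device pointers and the
     * sizes are assumed to exist elsewhere). Each stack stores its matrices back to back:
     *
     *     // C_i (M x N) = A_i (M x K) * B_i (K x N) for i in [0, batch)
     *     csl::cublas::gemmStridedBatched<float>(handle,
     *         false, false,
     *         M, N, K,
     *         1.0f, A_ptr, M, M * K,   // strideA = elements per A_i
     *         B_ptr, K, K * N,         // strideB = elements per B_i
     *         0.0f, C_ptr, M, M * N,   // strideC = elements per C_i
     *         batch);
     */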
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */