// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP #define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP #include "../../op_cuda.hpp" #include "../csl/stream.hpp" #include "../csl/cublas.hpp" #include "../csl/tensor.hpp" #include "../csl/tensor_ops.hpp" #include #include namespace cv { namespace dnn { namespace cuda4dnn { template class MatMulOp final : public CUDABackendNode { public: using wrapper_type = GetCUDABackendWrapperType; MatMulOp(csl::Stream stream_, csl::cublas::Handle handle) : stream(std::move(stream_)), cublasHandle(std::move(handle)) { } void forward( const std::vector>& inputs, const std::vector>& outputs, csl::Workspace& workspace) override { CV_Assert(inputs.size() == 2 && outputs.size() == 1); auto input1_wrapper = inputs[0].dynamicCast(); auto input1 = input1_wrapper->getView(); auto input2_wrapper = inputs[1].dynamicCast(); auto input2 = input2_wrapper->getView(); auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); auto rank = output.rank(); CV_Assert(rank == input1.rank()); CV_Assert(rank == input2.rank()); CV_Assert(rank >= 2); // 1D MatMul not supported for (int i = 0; i < rank - 2; i++) { // broadcasting not supported auto size = output.get_axis_size(i); CV_Assert(input1.get_axis_size(i) == size); CV_Assert(input2.get_axis_size(i) == size); } auto m = input1.get_axis_size(-2); auto n = input1.get_axis_size(-1); auto k = input2.get_axis_size(-1); auto b = input1.size() / m / n; CV_Assert(input2.get_axis_size(-2) == n); CV_Assert(output.get_axis_size(-2) == m); CV_Assert(output.get_axis_size(-1) == k); if (get_effective_rank(output) <= 2) { CV_Assert(b == 1); CV_Assert(get_effective_rank(input1) <= 2); CV_Assert(get_effective_rank(input2) <= 2); csl::tensor_ops::gemm(cublasHandle, 0.0, output, 1.0, false, input1, false, input2); } else { CV_Assert(rank >= 3); input1.reshape(b, m, n); input2.reshape(b, n, k); output.reshape(b, m, k); input1.squeeze_to(3); input2.squeeze_to(3); output.squeeze_to(3); csl::tensor_ops::gemmStridedBatched(cublasHandle, 0.0, output, 1.0, false, input1, false, input2); } } private: csl::Stream stream; csl::cublas::Handle cublasHandle; }; }}} /* namespace cv::dnn::cuda4dnn */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP */