matmul.hpp
3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP
#include "../../op_cuda.hpp"
#include "../csl/stream.hpp"
#include "../csl/cublas.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"
#include <opencv2/core.hpp>
#include <utility>
namespace cv { namespace dnn { namespace cuda4dnn {
template <class T>
class MatMulOp final : public CUDABackendNode {
public:
using wrapper_type = GetCUDABackendWrapperType<T>;
MatMulOp(csl::Stream stream_, csl::cublas::Handle handle)
: stream(std::move(stream_)), cublasHandle(std::move(handle))
{
}
void forward(
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
csl::Workspace& workspace) override
{
CV_Assert(inputs.size() == 2 && outputs.size() == 1);
auto input1_wrapper = inputs[0].dynamicCast<wrapper_type>();
auto input1 = input1_wrapper->getView();
auto input2_wrapper = inputs[1].dynamicCast<wrapper_type>();
auto input2 = input2_wrapper->getView();
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
auto output = output_wrapper->getSpan();
auto rank = output.rank();
CV_Assert(rank == input1.rank());
CV_Assert(rank == input2.rank());
CV_Assert(rank >= 2); // 1D MatMul not supported
for (int i = 0; i < rank - 2; i++)
{
// broadcasting not supported
auto size = output.get_axis_size(i);
CV_Assert(input1.get_axis_size(i) == size);
CV_Assert(input2.get_axis_size(i) == size);
}
auto m = input1.get_axis_size(-2);
auto n = input1.get_axis_size(-1);
auto k = input2.get_axis_size(-1);
auto b = input1.size() / m / n;
CV_Assert(input2.get_axis_size(-2) == n);
CV_Assert(output.get_axis_size(-2) == m);
CV_Assert(output.get_axis_size(-1) == k);
if (get_effective_rank(output) <= 2)
{
CV_Assert(b == 1);
CV_Assert(get_effective_rank(input1) <= 2);
CV_Assert(get_effective_rank(input2) <= 2);
csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input1, false, input2);
}
else
{
CV_Assert(rank >= 3);
input1.reshape(b, m, n);
input2.reshape(b, n, k);
output.reshape(b, m, k);
input1.squeeze_to(3);
input2.squeeze_to(3);
output.squeeze_to(3);
csl::tensor_ops::gemmStridedBatched<T>(cublasHandle, 0.0, output, 1.0, false, input1, false, input2);
}
}
private:
csl::Stream stream;
csl::cublas::Handle cublasHandle;
};
}}} /* namespace cv::dnn::cuda4dnn */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP */