sumpixels.avx512_skx.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2019-2020, Intel Corporation, all rights reserved.
#include "opencv2/core/hal/intrin.hpp"
namespace cv { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
namespace { // Anonymous namespace to avoid exposing the implementation classes
//
// NOTE: Look at the bottom of the file for the entry-point function for external callers
//
template<size_t num_channels> class IntegralCalculator;
template<size_t num_channels>
class IntegralCalculator {
public:
IntegralCalculator() {};
void calculate_integral_avx512(const uchar *src, size_t _srcstep,
double *sum, size_t _sumstep,
double *sqsum, size_t _sqsumstep,
int width, int height)
{
const int srcstep = (int)(_srcstep/sizeof(uchar));
const int sumstep = (int)(_sumstep/sizeof(double));
const int sqsumstep = (int)(_sqsumstep/sizeof(double));
const int ops_per_line = width * num_channels;
// Clear the first line of the sum as per spec (see integral documentation)
// Also adjust the index of sum and sqsum to be at the real 0th element
// and not point to the border pixel so it stays in sync with the src pointer
memset( sum, 0, (ops_per_line+num_channels)*sizeof(double));
sum += num_channels;
if (sqsum) {
memset( sqsum, 0, (ops_per_line+num_channels)*sizeof(double));
sqsum += num_channels;
}
// Now calculate the integral over the whole image one line at a time
for(int y = 0; y < height; y++) {
const uchar * src_line = &src[y*srcstep];
double * sum_above = &sum[y*sumstep];
double * sum_line = &sum_above[sumstep];
double * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep] : NULL;
double * sqsum_line = (sqsum) ? &sqsum_above[sqsumstep] : NULL;
calculate_integral_for_line(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);
}
}
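// For reference (added, scalar view): each call to calculate_integral_for_line below computes, for
// every element i of the line (with c = num_channels, adding only same-channel elements):
//   sum_line[i]   = sum_above[i]   + src_line[i]   + src_line[i-c]   + src_line[i-2c]   + ...
//   sqsum_line[i] = sqsum_above[i] + src_line[i]^2 + src_line[i-c]^2 + src_line[i-2c]^2 + ...
// The trailing per-channel row totals are what the vector code carries in its accumulators.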
static CV_ALWAYS_INLINE
void calculate_integral_for_line(const uchar *srcs,
double *sums, double *sums_above,
double *sqsums, double *sqsums_above,
int num_ops_in_line)
{
__m512i sum_accumulator = _mm512_setzero_si512(); // holds rolling sums for the line
__m512i sqsum_accumulator = _mm512_setzero_si512(); // holds rolling sqsums for the line
// The first element on each line must be zeroes as per spec (see integral documentation)
zero_out_border_pixel(sums, sqsums);
// Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk
aligned_integral( srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
}
static CV_ALWAYS_INLINE
void zero_out_border_pixel(double *sums, double *sqsums)
{
// Note the negative index is because the sums/sqsums pointers point to the first real pixel
// after the border pixel so we have to look backwards
_mm512_mask_storeu_epi64(&sums[-(ptrdiff_t)num_channels], (1<<num_channels)-1, _mm512_setzero_si512());
if (sqsums)
_mm512_mask_storeu_epi64(&sqsums[-(ptrdiff_t)num_channels], (1<<num_channels)-1, _mm512_setzero_si512());
}
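// Illustrative note (added): for 3 channels the store mask is (1<<3)-1 = 0b111, so exactly the
// three doubles of the border pixel (indices -3..-1 relative to the first real pixel) are zeroed.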
static CV_ALWAYS_INLINE
void aligned_integral(const uchar *&srcs,
double *&sums, double *&sums_above,
double *&sqsum, double *&sqsum_above,
__m512i &sum_accumulator, __m512i &sqsum_accumulator,
int num_ops_in_line)
{
// This function handles full 64 byte chunks of the source data at a time until it gets to the part of
// the line that no longer contains a full 64 byte chunk. Other code will handle the last part.
const int num_chunks = num_ops_in_line >> 6; // quick int divide by 64
for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){
integral_64_operations_avx512((__m512i *) srcs,
(__m512i *) sums, (__m512i *) sums_above,
(__m512i *) sqsum, (__m512i *) sqsum_above,
0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator);
srcs+=64; sums+=64; sums_above+=64;
if (sqsum){ sqsum+= 64; sqsum_above+=64; }
}
}
static CV_ALWAYS_INLINE
void post_aligned_integral(const uchar *srcs,
const double *sums, const double *sums_above,
const double *sqsum, const double *sqsum_above,
__m512i &sum_accumulator, __m512i &sqsum_accumulator,
int num_ops_in_line)
{
// This function handles the last few straggling operations that are not a full chunk of 64 operations
// We use the same algorithm, but we calculate a different operation mask using (num_ops % 64).
const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F; // Quick int modulo 64
if (num_operations > 0) {
__mmask64 operation_mask = (1ULL << num_operations) - 1ULL;
integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above,
(__m512i *) sqsum, (__m512i *) sqsum_above,
operation_mask, sum_accumulator, sqsum_accumulator);
}
}
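// Worked example (added for illustration): a 100 pixel wide, 3 channel line has 300 operations.
// aligned_integral() handles 300 >> 6 = 4 full chunks (256 operations); this function then handles
// the remaining 300 & 0x3F = 44 operations with operation_mask = (1ULL << 44) - 1, so only the low
// 44 lanes are loaded, computed, and stored.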
static CV_ALWAYS_INLINE
void integral_64_operations_avx512(const __m512i *srcs,
__m512i *sums, const __m512i *sums_above,
__m512i *sqsums, const __m512i *sqsums_above,
__mmask64 data_mask,
__m512i &sum_accumulator, __m512i &sqsum_accumulator)
{
__m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);
while (data_mask) {
__m128i src_16bytes = extract_lower_16bytes(src_64byte_chunk);
__m512i src_longs_lo = convert_lower_8bytes_to_longs(src_16bytes);
__m512i src_longs_hi = convert_lower_8bytes_to_longs(shift_right_8_bytes(src_16bytes));
// Calculate the integral for the sums, 8 lanes at a time
integral_8_operations(src_longs_lo, sums_above, data_mask, sums, sum_accumulator);
integral_8_operations(src_longs_hi, sums_above+1, data_mask>>8, sums+1, sum_accumulator);
if (sqsums) {
__m512i squared_source_lo = square_m512(src_longs_lo);
__m512i squared_source_hi = square_m512(src_longs_hi);
integral_8_operations(squared_source_lo, sqsums_above, data_mask, sqsums, sqsum_accumulator);
integral_8_operations(squared_source_hi, sqsums_above+1, data_mask>>8, sqsums+1, sqsum_accumulator);
sqsums += 2;
sqsums_above+=2;
}
// Prepare for next iteration of loop
// shift source to align next 16 bytes to lane 0, shift the mask, and advance the pointers
sums += 2;
sums_above += 2;
data_mask = data_mask >> 16;
src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);
}
}
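// Note on the loop above (added for clarity): each pass consumes 16 source bytes as two groups of
// 8 lanes, so shifting data_mask right by 16 drives at most four passes per 64 byte chunk, and a
// partial tail mask simply ends the loop early once all of its set bits have been shifted out.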
static CV_ALWAYS_INLINE
void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask,
__m512i *results_ptr, __m512i &accumulator)
{
// NOTE: the calculate_integral function referenced here must be implemented by the per-channel
// template specializations below, because the algorithm depends heavily on the number of channels in the image
//
_mm512_mask_storeu_pd(
results_ptr, // Store the result here
(__mmask8)data_mask, // Using the data mask to avoid overrunning the line
calculate_integral( // Writing the value of the integral derived from:
src_longs, // input data
_mm512_maskz_loadu_pd((__mmask8)data_mask, above_values_ptr), // and the results from line above
accumulator // keeping track of the accumulator
)
);
}
static CV_ALWAYS_INLINE
__m512i read_64_bytes(const __m512i *srcs, const __mmask64 data_mask) {
return _mm512_maskz_loadu_epi8(data_mask, srcs);
}
static CV_ALWAYS_INLINE
__m128i extract_lower_16bytes(const __m512i src_64byte_chunk) {
return _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0);
}
static CV_ALWAYS_INLINE
__m512i convert_lower_8bytes_to_longs(const __m128i src_16bytes) {
return _mm512_cvtepu8_epi64(src_16bytes);
}
static CV_ALWAYS_INLINE
__m512i square_m512(const __m512i src_longs) {
return _mm512_mullo_epi64(src_longs, src_longs);
}
static CV_ALWAYS_INLINE
__m128i shift_right_8_bytes(const __m128i src_16bytes) {
return _mm_maskz_compress_epi64(2, src_16bytes);
}
static CV_ALWAYS_INLINE
__m512i shift_right_16_bytes(const __m512i src_64byte_chunk) {
return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
}
static CV_ALWAYS_INLINE
__m512i m512_hadd(const __m512i a){
return _mm512_add_epi64(_mm512_maskz_compress_epi64(0xAA, a), _mm512_maskz_compress_epi64(0x55, a));
}
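// Illustrative notes on the three compress-based helpers above (added, not from the original comments):
//  - shift_right_8_bytes:  _mm_maskz_compress_epi64(2, x) keeps qword lane 1 and packs it into lane 0,
//    i.e. an 8 byte right shift of the 128 bit value with zero fill.
//  - shift_right_16_bytes: mask 0xFC keeps qword lanes 2..7 and packs them into lanes 0..5,
//    i.e. a 16 byte right shift of the 512 bit value with zero fill.
//  - m512_hadd: for lanes (7..0) = H,G,F,E,D,C,B,A the 0xAA compress packs B,D,F,H into lanes 0..3 and
//    the 0x55 compress packs A,C,E,G into lanes 0..3, so the add leaves the adjacent pair sums
//    A+B, C+D, E+F, G+H in lanes 0..3 (upper lanes are zero).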
// The calculate_integral function declared here must be implemented by the per-channel template
// specializations below because the algorithm depends heavily on the number of channels in the image.
// Only the declaration (prototype) appears here.
//
static CV_ALWAYS_INLINE
__m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator);
};
//============================================================================================================
// This is the only section whose algorithm needs to change based on the number of channels
// It is responsible for calculating 8 lanes' worth of the integral and returning the accumulated
// sums through the accumulator parameter (NOTE: accumulator is an input and output parameter)
//
// The function prototype that needs to be implemented is:
//
// __m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
//
// Description of parameters:
// INPUTS:
// src_longs : 8 lanes worth of the source bytes converted to 64 bit integers
// above_values: 8 lanes worth of the result values from the line above (See the integral spec)
// accumulator : 8 lanes worth of sums from the previous iteration
// IMPORTANT NOTE: This parameter is both an INPUT AND OUTPUT parameter to this function
//
// OUTPUTS:
// return value: The actual integral value for all 8 lanes which is defined by the spec as
// the sum of all channel values to the left of a given pixel plus the result
// written to the line directly above the current line.
// accumulator: This is an input and an output. This parameter should be left with the accumulated
// sums for the current 8 lanes, keeping all entries in the proper lane (do not shuffle it)
//
// Below here is the channel specific implementation
//
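// A minimal scalar sketch (illustrative only; the helper name and signature are this sketch's own and
// nothing below calls it) of the single-channel contract described above. The ZMM registers are replaced
// by plain arrays and the 8-lane accumulator is collapsed to the single carried value (its lane 7), which
// is equivalent for one channel. The multi-channel specializations differ only in carrying the last
// num_channels lanes instead.
static inline void calculate_integral_scalar_sketch_1ch(const uint64_t src_longs[8],
                                                        const double above_values[8],
                                                        uint64_t &accumulator, // in: carry-over, out: rolling sum
                                                        double integral[8])    // out: values stored to the sum line
{
    uint64_t rolling = accumulator;                            // total of everything left of these 8 lanes
    for (int lane = 0; lane < 8; lane++)
    {
        rolling += src_longs[lane];                            // rolling sum up to and including this lane
        integral[lane] = (double)rolling + above_values[lane]; // plus the result from the line above
    }
    accumulator = rolling;                                     // carried into the next 8 lanes
}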
//========================================
// 1 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 1 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
// One channel is implemented differently from the 2, 3, and 4 channel cases because it
// requires more horizontal operations that cannot be made vertical without losing performance
// The logical operations needed look like:
// Vertical LANES : |7|6|5|4|3|2|1|0|
// src_longs : |H|G|F|E|D|C|B|A|
// shift_by_1 : + |G|F|E|D|C|B|A| |
// shift_by_2 : + |F|E|D|C|B|A| | |
// shift_by_3 : + |E|D|C|B|A| | | |
// shift_by_4 : + |D|C|B|A| | | | |
// shift_by_5 : + |C|B|A| | | | | |
// shift_by_6 : + |B|A| | | | | | |
// shift_by_7 : + |A| | | | | | | |
// carry_over_idxs : + |7|7|7|7|7|7|7|7| (index position of result from previous iteration)
// = integral
//
// If we do this vertically we end up losing performance because of the number of operations. Instead we
// use a horizontal add tree to build the partial sums we need:
// Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// src_longs: | H | G | F | E | D | C | B | A |
// horiz_sum_1: | | | | | G+H | E+F | C+D | A+B |
// horiz_sum_2: | | | | | | | EFGH | ABCD |
//
const __m512i horiz_sum_1 = m512_hadd(src_longs); // indexes for the permutes below (3,2,1,0) = (GH, EF, CD, AB)
const __m512i horiz_sum_2 = m512_hadd(horiz_sum_1); // indexes for the permutes below (9, 8) = (EFGH, ABCD)
// Then we can use the partial sums by looking at the vertical stacks above and realizing that, for example,
// ABCD appears vertically in lanes 7, 6, 5, 4, and 3. We permute the values so that all partial sums
// appear in the right lanes and sum them up along with the carry-over value from the accumulator. So we set up
// the lanes like:
// Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// s1 : | 0 | G | 0 | E | 0 | C | 0 | A |
// s2 : | ABCD | ABCD | ABCD | ABCD | ABCD | AB | AB | 0 |
// s3 : | EFGH | EF | EF | 0 | 0 | 0 | 0 | 0 |
// +------+------+------+------+------+------+------+------+
// sum : | A..H | A..G | A..F | A..E | A..D | A..C | A..B | A | Integral :-)
//
const __m512i s1 = _mm512_maskz_mov_epi64(0x55, src_longs); // 0 G 0 E 0 C 0 A
const __m512i s2 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(8,8,8,8,8,0,0,4), horiz_sum_2);
const __m512i s3 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(9,2,2,4,4,4,4,4), horiz_sum_2);
// Now we use the rolling sum from the previous iteration from accumulator and replicate it into carry_over
// And sum everything up into the accumulator
//
const __m512i carry_over = _mm512_permutex2var_epi64(accumulator, _mm512_set_epi64(7,7,7,7,7,7,7,7), accumulator);
accumulator = _mm512_add_epi64(_mm512_add_epi64(s2, s3), _mm512_add_epi64(carry_over, s1));
// Convert to double precision and store
//
__m512d integral_pd = _mm512_add_pd(_mm512_cvtepu64_pd(accumulator), above_values);
return integral_pd;
}
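// Worked example (added for illustration, with a zero incoming accumulator): take src_longs
// lanes (7..0) = 8,7,6,5,4,3,2,1 (so A=1 ... H=8). Then:
//   horiz_sum_1 lanes (3..0) = 15, 11,  7,  3                  (G+H, E+F, C+D, A+B)
//   horiz_sum_2 lanes (1..0) = 26, 10                          (E+F+G+H, A+B+C+D)
//   s1          lanes (7..0) =  0,  7,  0,  5,  0,  3,  0,  1
//   s2          lanes (7..0) = 10, 10, 10, 10, 10,  3,  3,  0
//   s3          lanes (7..0) = 26, 11, 11,  0,  0,  0,  0,  0
//   accumulator lanes (7..0) = 36, 28, 21, 15, 10,  6,  3,  1  <- the prefix sums of 1..8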
//========================================
// 2 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 2 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
__m512i carryover_idxs = _mm512_set_epi64(7, 6, 7, 6, 7, 6, 7, 6);
// Align data to prepare for the adds:
// shifts data left by 2, 4, and 6 qwords (lanes) and gets the rolling sum in all lanes
// Vertical LANES : 76543210
// src_longs : HGFEDCBA
// shift2lanes : + FEDCBA
// shift4lanes : + DCBA
// shift6lanes : + BA
// carry_over_idxs : + 76767676 (index position of result from previous iteration)
// = integral
__m512i shift2lanes = _mm512_maskz_expand_epi64(0xFC, src_longs);
__m512i shift4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs);
__m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
__m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
// Add all values in tree form for perf ((0+2) + (4+6))
__m512i sum_shift_02 = _mm512_add_epi64(src_longs, shift2lanes);
__m512i sum_shift_46 = _mm512_add_epi64(shift4lanes, shift6lanes);
__m512i sum_all = _mm512_add_epi64(sum_shift_02, sum_shift_46);
accumulator = _mm512_add_epi64(sum_all, carry_over);
// Convert to packed double and add to the line above to get the true integral value
__m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
__m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values);
return integral_pd;
}
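// Illustrative note (added): the 8 lanes hold 4 interleaved 2-channel pixels, lanes (7..0) =
// b3,a3,b2,a2,b1,a1,b0,a0. The three expands add the same-channel value from 1, 2, and 3 pixels
// to the left, so after the adds lane 6 holds a0+a1+a2+a3 and lane 7 holds b0+b1+b2+b3. The
// carryover_idxs pattern (7,6 repeated) then broadcasts the previous iteration's two per-channel
// totals (accumulator lanes 6 and 7) into every pixel position. The 3 and 4 channel versions
// below follow the same scheme with their own shift amounts and carry-over lanes.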
//========================================
// 3 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 3 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
__m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
// Align data to prepare for the adds:
// shifts data left by 3 and 6 qwords (lanes) and gets the rolling sum in all lanes
// Vertical LANES: 76543210
// src_longs : HGFEDCBA
// shift3lanes : + EDCBA
// shift6lanes : + BA
// carry_over_idxs : + 65765765 (index position of result from previous iteration)
// = integral
__m512i shift3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
__m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
__m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
// Do the adds in tree form
__m512i sum_shift_03 = _mm512_add_epi64(src_longs, shift3lanes);
__m512i sum_rest = _mm512_add_epi64(shift6lanes, carry_over);
accumulator = _mm512_add_epi64(sum_shift_03, sum_rest);
// Convert to packed double and add to the line above to get the true integral value
__m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
__m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values);
return integral_pd;
}
//========================================
// 4 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 4 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
__m512i carryover_idxs = _mm512_set_epi64(7, 6, 5, 4, 7, 6, 5, 4);
// Align data to prepare for the adds:
// shifts data left by 4 qwords (lanes) and gets the rolling sum in all lanes
// Vertical LANES: 76543210
// src_longs : HGFEDCBA
// shift4lanes : + DCBA
// carry_over_idxs : + 76547654 (index position of result from previous iteration)
// = integral
__m512i shifted4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs);
__m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
// Add data pixels and carry over from last iteration
__m512i sum_shift_04 = _mm512_add_epi64(src_longs, shifted4lanes);
accumulator = _mm512_add_epi64(sum_shift_04, carry_over);
// Convert to packed double and add to the line above to get the true integral value
__m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
__m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values);
return integral_pd;
}
} // end of anonymous namespace
static
void calculate_integral_avx512(const uchar *src, size_t _srcstep,
double *sum, size_t _sumstep,
double *sqsum, size_t _sqsumstep,
int width, int height, int cn)
{
CV_INSTRUMENT_REGION();
switch(cn){
case 1: {
IntegralCalculator< 1 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
case 2: {
IntegralCalculator< 2 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
case 3: {
IntegralCalculator< 3 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
case 4: {
IntegralCalculator< 4 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
}
}
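// Usage sketch (illustrative; the real dispatcher lives in the generic sumpixels code and may differ).
// Assuming hypothetical cv::Mat variables src (CV_8UC(cn)) and sum/sqsum (CV_64F, one pixel wider and
// taller than src), plus a hypothetical want_sqsum flag, a caller would invoke the entry point roughly as:
//
//   calculate_integral_avx512(src.ptr<uchar>(), src.step,
//                             sum.ptr<double>(), sum.step,
//                             want_sqsum ? sqsum.ptr<double>() : nullptr,
//                             want_sqsum ? sqsum.step : 0,
//                             src.cols, src.rows, src.channels());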
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // end namespace cv::hal