// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2019-2020, Intel Corporation, all rights reserved.

#include "opencv2/core/hal/intrin.hpp"

namespace cv { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

namespace { // Anonymous namespace to avoid exposing the implementation classes

//
// NOTE: Look at the bottom of the file for the entry-point function for external callers
//

template<size_t num_channels> class IntegralCalculator;

template<size_t num_channels>
class IntegralCalculator  {
public:
    IntegralCalculator() {}


    void calculate_integral_avx512(const uchar *src, size_t _srcstep,
                                   double *sum,      size_t _sumstep,
                                   double *sqsum,    size_t _sqsumstep,
                                   int width, int height)
    {
        const int srcstep = (int)(_srcstep/sizeof(uchar));
        const int sumstep = (int)(_sumstep/sizeof(double));
        const int sqsumstep = (int)(_sqsumstep/sizeof(double));
        const int ops_per_line = width * num_channels;

        // Clear the first line of the sum as per spec (see integral documentation)
        // Also adjust the index of sum and sqsum to be at the real 0th element
        // and not point to the border pixel so it stays in sync with the src pointer
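        // (per the integral spec the sum/sqsum images are one pixel wider and taller than src,
        //  so this first border row is (width + 1) * num_channels doubles long)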
        memset( sum, 0, (ops_per_line+num_channels)*sizeof(double));
        sum += num_channels;

        if (sqsum) {
            memset( sqsum, 0, (ops_per_line+num_channels)*sizeof(double));
            sqsum += num_channels;
        }

        // Now calculate the integral over the whole image one line at a time
        for(int y = 0; y < height; y++) {
            const uchar * src_line    = &src[y*srcstep];
            double      * sum_above   = &sum[y*sumstep];
            double      * sum_line    = &sum_above[sumstep];
            double      * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep]     : NULL;
            double      * sqsum_line  = (sqsum) ? &sqsum_above[sqsumstep] : NULL;

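            // Row y+1 of the integral image receives the sums for source row y;
            // row y supplies the "line above" values (row 0 is the zeroed border row)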
            calculate_integral_for_line(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);

        }
    }

    static CV_ALWAYS_INLINE
    void calculate_integral_for_line(const uchar *srcs,
                                     double *sums, double *sums_above,
                                     double *sqsums, double *sqsums_above,
                                     int num_ops_in_line)
    {
        __m512i sum_accumulator   = _mm512_setzero_si512();  // holds rolling sums for the line
        __m512i sqsum_accumulator = _mm512_setzero_si512();  // holds rolling sqsums for the line

        // The first element on each line must be zeroes as per spec (see integral documentation)
        zero_out_border_pixel(sums, sqsums);

        // Process all the full 64 byte chunks, then handle the tail that doesn't fill a 64 byte chunk
        aligned_integral(     srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
        post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);

    }


    static CV_ALWAYS_INLINE
    void zero_out_border_pixel(double *sums, double *sqsums)
    {
        // Note the negative index is because the sums/sqsums pointers point to the first real pixel
        // after the border pixel so we have to look backwards
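        // The mask (1 << num_channels) - 1 selects exactly the first num_channels qword lanes to store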
        _mm512_mask_storeu_epi64(&sums[-(ptrdiff_t)num_channels], (1<<num_channels)-1, _mm512_setzero_si512());
        if (sqsums)
            _mm512_mask_storeu_epi64(&sqsums[-(ptrdiff_t)num_channels], (1<<num_channels)-1, _mm512_setzero_si512());
    }


    static CV_ALWAYS_INLINE
    void aligned_integral(const uchar *&srcs,
                          double *&sums,  double *&sums_above,
                          double *&sqsum, double *&sqsum_above,
                          __m512i &sum_accumulator, __m512i &sqsum_accumulator,
                          int num_ops_in_line)
    {
        // This function handles full 64 byte chunks of the source data at a time until it gets to the part of
        // the line that no longer contains a full 64 byte chunk.  Other code will handle the last part.

        const int num_chunks = num_ops_in_line >> 6;  // quick int divide by 64

        for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){
            integral_64_operations_avx512((__m512i *) srcs,
                                          (__m512i *) sums,  (__m512i *) sums_above,
                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
                                          0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator);
            srcs += 64; sums += 64; sums_above += 64;
            if (sqsum) { sqsum += 64; sqsum_above += 64; }
        }
    }


    static CV_ALWAYS_INLINE
    void post_aligned_integral(const uchar *srcs,
                               const double *sums,   const double *sums_above,
                               const double *sqsum,  const double *sqsum_above,
                               __m512i &sum_accumulator, __m512i &sqsum_accumulator,
                               int num_ops_in_line)
    {
        // This function handles the last few straggling operations that are not a full chunk of 64 operations
        // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64).

        const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F;  // Quick int modulo 64

        if (num_operations > 0) {
            __mmask64 operation_mask = (1ULL << num_operations) - 1ULL;

            integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above,
                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
                                          operation_mask, sum_accumulator, sqsum_accumulator);
        }
    }


    static CV_ALWAYS_INLINE
    void integral_64_operations_avx512(const __m512i *srcs,
                                       __m512i *sums,       const __m512i *sums_above,
                                       __m512i *sqsums,     const __m512i *sqsums_above,
                                       __mmask64 data_mask,
                                       __m512i &sum_accumulator, __m512i &sqsum_accumulator)
    {
        __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);

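        // Each pass consumes the lowest 16 source bytes as two groups of 8 lanes and retires
        // 16 bits of the mask, so the loop ends once every operation bit has been used up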
        while (data_mask) {
            __m128i src_16bytes = extract_lower_16bytes(src_64byte_chunk);

            __m512i src_longs_lo = convert_lower_8bytes_to_longs(src_16bytes);
            __m512i src_longs_hi = convert_lower_8bytes_to_longs(shift_right_8_bytes(src_16bytes));

            // Calculate the integral for the sums, 8 lanes at a time
            integral_8_operations(src_longs_lo, sums_above, data_mask, sums, sum_accumulator);
            integral_8_operations(src_longs_hi, sums_above+1, data_mask>>8, sums+1, sum_accumulator);

            if (sqsums) {
                __m512i squared_source_lo = square_m512(src_longs_lo);
                __m512i squared_source_hi = square_m512(src_longs_hi);

                integral_8_operations(squared_source_lo, sqsums_above, data_mask, sqsums, sqsum_accumulator);
                integral_8_operations(squared_source_hi, sqsums_above+1, data_mask>>8, sqsums+1, sqsum_accumulator);
                sqsums += 2;
                sqsums_above+=2;
            }

            // Prepare for next iteration of loop
            // shift source to align next 16 bytes to lane 0, shift the mask, and advance the pointers
            sums += 2;
            sums_above += 2;
            data_mask = data_mask >> 16;
            src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);

        }

    }


    static CV_ALWAYS_INLINE
    void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask,
                               __m512i *results_ptr, __m512i &accumulator)
     {
        // NOTE that the calculate_integral function called here must be implemented in the template
        // specializations because the algorithm depends heavily on the number of channels in the image
        //
        _mm512_mask_storeu_pd(
                results_ptr,   // Store the result here
                (__mmask8)data_mask,     // Using the data mask to avoid overrunning the line
                calculate_integral( // Writing the value of the integral derived from:
                        src_longs,                                                    // input data
                        _mm512_maskz_loadu_pd((__mmask8)data_mask, above_values_ptr), // and the results from line above
                        accumulator                                                   // keeping track of the accumulator
                )
        );
    }


    static CV_ALWAYS_INLINE
    __m512i read_64_bytes(const __m512i *srcs, const __mmask64 data_mask)  {
        return _mm512_maskz_loadu_epi8(data_mask, srcs);
    }


    static CV_ALWAYS_INLINE
    __m128i extract_lower_16bytes(const __m512i src_64byte_chunk) {
        return _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0);
    }


    static CV_ALWAYS_INLINE
    __m512i convert_lower_8bytes_to_longs(const __m128i src_16bytes)  {
        return _mm512_cvtepu8_epi64(src_16bytes);
    }


    static CV_ALWAYS_INLINE
    __m512i square_m512(const __m512i src_longs) {
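        // Note: _mm512_mullo_epi64 requires AVX512DQ, which the SKX target of this file guarantees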
        return _mm512_mullo_epi64(src_longs, src_longs);
    }


    static CV_ALWAYS_INLINE
    __m128i shift_right_8_bytes(const __m128i src_16bytes)  {
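        // Compressing with mask 0b10 keeps only qword lane 1 and moves it down to lane 0,
        // i.e. an 8 byte right shift of the 16 byte vector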
        return _mm_maskz_compress_epi64(2, src_16bytes);
    }


    static CV_ALWAYS_INLINE
    __m512i shift_right_16_bytes(const __m512i src_64byte_chunk)  {
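        // Compressing with mask 0xFC keeps qword lanes 2-7 and packs them down to lanes 0-5,
        // shifting the whole 64 byte register right by 16 bytes (the upper lanes become zero)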
        return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
    }


    static CV_ALWAYS_INLINE
    __m512i  m512_hadd(const __m512i a){
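        // Compressing with mask 0xAA gathers the odd lanes (1,3,5,7) into lanes 0-3 while mask 0x55
        // gathers the even lanes (0,2,4,6), so the add leaves the pairwise sums
        // (a0+a1, a2+a3, a4+a5, a6+a7) in lanes 0-3 and zeroes in lanes 4-7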
        return _mm512_add_epi64(_mm512_maskz_compress_epi64(0xAA, a), _mm512_maskz_compress_epi64(0x55, a));
    }


    // The calculate_integral function referenced here must be implemented in the template specializations
    // because the algorithm depends heavily on the number of channels in the image.
    // Only the prototype appears here; the specializations follow the class definition.
    //
    static CV_ALWAYS_INLINE
    __m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator);

};


//============================================================================================================
// This is the only section whose algorithm changes based on the number of channels in the image.
// It is responsible for calculating 8 lanes worth of the integral and for returning the accumulated
// sums in the accumulator parameter (NOTE: accumulator is an input and output parameter)
//
// The function prototype that needs to be implemented is:
//
//     __m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
//
// Description of parameters:
//   INPUTS:
//      src_longs   : 8 lanes worth of the source bytes converted to 64 bit integers
//      above_values: 8 lanes worth of the result values from the line above (See the integral spec)
//      accumulator : 8 lanes worth of sums from the previous iteration
//                    IMPORTANT NOTE: This parameter is both an INPUT AND OUTPUT parameter to this function
//
//   OUTPUTS:
//      return value: The actual integral value for all 8 lanes which is defined by the spec as
//                    the sum of all channel values to the left of a given pixel plus the result
//                    written to the line directly above the current line.
//      accumulator:  This is both an input and an output.  This parameter should be left holding the
//                    accumulated sums for the current 8 lanes, keeping all entries in their proper lanes (do not shuffle it)
//
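// As a rough scalar mental model (illustrative only, not how the specializations are written),
// each call computes, for its group of 8 lanes:
//
//     for (int lane = 0; lane < 8; lane++) {
//         // rolling sum of this lane's channel: everything to the left plus this value
//         acc[lane] = src_longs[lane] + (lane >= num_channels ? acc[lane - num_channels]
//                                                             : carry_over[lane]);
//         result[lane] = (double)acc[lane] + above_values[lane];
//     }
//
// where carry_over[i] is the previous iteration's accumulator lane holding the same channel
// (lane 8 - num_channels + i).  The specializations achieve this with vertical adds and
// permutes only, so no value ever has to leave the register file.
//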
// Below here is the channel specific implementation
//

//========================================
//   1 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 1 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
    // One channel support is implemented differently from the 2, 3, and 4 channel versions
    // because it requires more horizontal operations that cannot be made vertical without losing performance.
    // The logical operations needed look like:
    //   Vertical LANES  :   |7|6|5|4|3|2|1|0|
    //   src_longs       :   |H|G|F|E|D|C|B|A|
    //   shift_by_1      : + |G|F|E|D|C|B|A| |
    //   shift_by_2      : + |F|E|D|C|B|A| | |
    //   shift_by_3      : + |E|D|C|B|A| | | |
    //   shift_by_4      : + |D|C|B|A| | | | |
    //   shift_by_5      : + |C|B|A| | | | | |
    //   shift_by_6      : + |B|A| | | | | | |
    //   shift_by_7      : + |A| | | | | | | |
    //   carry_over_idxs : + |7|7|7|7|7|7|7|7|  (index position of result from previous iteration)
    //                     = integral
    //
    // If we do this vertically we end up losing performance because of the number of operations.  We will instead
    // use a horizontal add tree to create the partial sums we need:
    // Vertical Lanes: |   7  |   6  |   5  |   4  |   3  |   2  |   1  |   0  |
    //      src_longs: |   H  |   G  |   F  |   E  |   D  |   C  |   B  |   A  |
    //    horiz_sum_1: |      |      |      |      |  G+H |  E+F |  C+D |  A+B |
    //    horiz_sum_2: |      |      |      |      |      |      | EFGH | ABCD |
    //
    const __m512i horiz_sum_1 = m512_hadd(src_longs);   // indexes for the permutes below (3,2,1,0) = (GH, EF, CD, AB)
    const __m512i horiz_sum_2 = m512_hadd(horiz_sum_1); // indexes for the permutes below (9, 8)    = (EFGH, ABCD)

    // Then we can use the partial sums: looking at the vertical stacks above, notice that, for example,
    // ABCD appears vertically in lanes 7, 6, 5, 4, and 3.  So we permute the values so that all partial
    // sums appear in the right lanes and sum them up, along with the carry over value from the accumulator.
    // We set up the lanes like:
    // Vertical Lanes: |   7  |   6  |   5  |   4  |   3  |   2  |   1  |   0  |
    //            s1 : |   0  |   G  |   0  |   E  |   0  |   C  |   0  |   A  |
    //            s2 : | ABCD | ABCD | ABCD | ABCD | ABCD |  AB  |  AB  |   0  |
    //            s3 : | EFGH |  EF  |  EF  |   0  |   0  |   0  |   0  |   0  |
    //                 +------+------+------+------+------+------+------+------+
    //           sum : | A..H | A..G | A..F | A..E | A..D | A..C | A..B |   A  | Integral :-)
    //
    const __m512i s1 = _mm512_maskz_mov_epi64(0x55, src_longs); // |0|G|0|E|0|C|0|A|
    const __m512i s2 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(8,8,8,8,8,0,0,4), horiz_sum_2);
    const __m512i s3 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(9,2,2,4,4,4,4,4), horiz_sum_2);
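    // (permutex2var indices 0-7 select lanes of the first operand and 8-15 lanes of the second;
    //  lane 4 of horiz_sum_1 is zero after the hadds, which supplies the blank lanes above)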

    // Now we use the rolling sum from the previous iteration from accumulator and replicate it into carry_over
    // And sum everything up into the accumulator
    //
    const __m512i carry_over  = _mm512_permutex2var_epi64(accumulator, _mm512_set_epi64(7,7,7,7,7,7,7,7), accumulator);
    accumulator = _mm512_add_epi64(_mm512_add_epi64(s2, s3), _mm512_add_epi64(carry_over, s1));

    // Convert to packed double and add to the line above to get the true integral value
    //
    __m512d integral_pd = _mm512_add_pd(_mm512_cvtepu64_pd(accumulator), above_values);
    return integral_pd;
}


//========================================
//   2 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 2 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
    __m512i carryover_idxs = _mm512_set_epi64(7, 6, 7, 6, 7, 6, 7, 6);

    // Align data to prepare for the adds:
    //    shifts data left by 2, 4, and 6 qwords (lanes) and gets the rolling sum in all lanes
    //   Vertical LANES  :   76543210
    //   src_longs       :   HGFEDCBA
    //   shift2lanes     : + FEDCBA
    //   shift4lanes     : + DCBA
    //   shift6lanes     : + BA
    //   carry_over_idxs : + 76767676  (index position of result from previous iteration)
    //                     = integral
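    // maskz_expand here acts as a variable left shift: with mask 0xFC, for example, it writes
    // src lanes 0-5 into lanes 2-7 and zeroes lanes 0-1, i.e. a shift left by two qword lanes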
    __m512i shift2lanes = _mm512_maskz_expand_epi64(0xFC, src_longs);
    __m512i shift4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs);
    __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
    __m512i carry_over  = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);

    // Add all values in tree form for perf ((0+2) + (4+6))
    __m512i sum_shift_02  = _mm512_add_epi64(src_longs,    shift2lanes);
    __m512i sum_shift_46  = _mm512_add_epi64(shift4lanes,  shift6lanes);
    __m512i sum_all       = _mm512_add_epi64(sum_shift_02, sum_shift_46);
    accumulator           = _mm512_add_epi64(sum_all,      carry_over);

    // Convert to packed double and add to the line above to get the true integral value
    __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
    __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
    return integral_pd;
}


//========================================
//   3 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 3 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
    __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);

    // Align data to prepare for the adds:
    //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
    //   Vertical LANES  :   76543210
    //   src_longs       :   HGFEDCBA
    //   shift3lanes     : + EDCBA
    //   shift6lanes     : + BA
    //   carry_over_idxs : + 65765765  (index position of result from previous iteration)
    //                     = integral
    __m512i shift3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
    __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
    __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);

    // Do the adds in tree form
    __m512i sum_shift_03 = _mm512_add_epi64(src_longs,    shift3lanes);
    __m512i sum_rest     = _mm512_add_epi64(shift6lanes,  carry_over);
    accumulator          = _mm512_add_epi64(sum_shift_03, sum_rest);

    // Convert to packed double and add to the line above to get the true integral value
    __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
    __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
    return integral_pd;
}


//========================================
//   4 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 4 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
    __m512i carryover_idxs = _mm512_set_epi64(7, 6, 5, 4, 7, 6, 5, 4);

    // Align data to prepare for the adds:
    //    shifts data left by 4 qwords (lanes) and gets the rolling sum in all lanes
    //   Vertical LANES  :   76543210
    //   src_longs       :   HGFEDCBA
    //   shift4lanes     : + DCBA
    //   carry_over_idxs : + 76547654  (index position of result from previous iteration)
    //                     = integral
    __m512i shifted4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs);
    __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);

    // Add data pixels and carry over from last iteration
    __m512i sum_shift_04 = _mm512_add_epi64(src_longs,    shifted4lanes);
    accumulator          = _mm512_add_epi64(sum_shift_04, carry_over);

    // Convert to packed double and add to the line above to get the true integral value
    __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
    __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
    return integral_pd;
}


} // end of anonymous namespace

static
void calculate_integral_avx512(const uchar *src,   size_t _srcstep,
                               double      *sum,   size_t _sumstep,
                               double      *sqsum, size_t _sqsumstep,
                               int width, int height, int cn)
{
    CV_INSTRUMENT_REGION();

    switch(cn){
        case 1: {
            IntegralCalculator< 1 > calculator;
            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
            break;
        }
        case 2: {
            IntegralCalculator< 2 > calculator;
            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
            break;
        }
        case 3: {
            IntegralCalculator< 3 > calculator;
            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
            break;
        }
        case 4: {
            IntegralCalculator< 4 > calculator;
            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
            break;
        }
    }
}
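
// A hypothetical call site for the entry point above (illustrative only; the actual caller is
// OpenCV's generic integral() implementation, which dispatches here when AVX512-SKX is available):
//
//     cv::Mat src(h, w, CV_8UC3);
//     cv::Mat sum(h + 1, w + 1, CV_64FC3), sqsum(h + 1, w + 1, CV_64FC3);
//     calculate_integral_avx512(src.ptr<uchar>(),    src.step,
//                               sum.ptr<double>(),   sum.step,
//                               sqsum.ptr<double>(), sqsum.step,
//                               src.cols, src.rows, src.channels());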


CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // end namespace cv::hal