Blame view

3rdparty/ffmpeg-4.4.4/libavcodec/aarch64/hevcdsp_sao_neon.S 3.27 KB
09c2d08c   Hu Chunming   arm交付版
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  /* -*-arm64-*-
   * vim: syntax=arm64asm
   *
   * AArch64 NEON optimised SAO functions for HEVC decoding
   *
   * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
  #include "libavutil/aarch64/asm.S"
  
  // void sao_band_filter(uint8_t *_dst, uint8_t *_src,
  //                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
  //                      int16_t *sao_offset_val, int sao_left_class,
  //                      int width, int height)
  //
  // SAO band filter for 8-bit pixels. For every pixel:
  //     dst[x] = clip_to_u8(src[x] + offset_table[src[x] >> 3])
  // where offset_table has 32 int16_t entries, all zero except
  //     offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1], k = 0..3
  //
  // In:    x0 = _dst, x1 = _src, x2 = stride_dst, x3 = stride_src,
  //        x4 = sao_offset_val, w5 = sao_left_class, w6 = width, w7 = height
  // Clobb: x0-x3, x6-x10, v0-v4, v16-v20, flags
  // Stack: 64 bytes of scratch for the table, released before the pixel loop.
  //
  // Pixels are processed 8 at a time and width is rounded up to a multiple
  // of 8, so up to 7 bytes past `width` are read from src and written to dst
  // on each row.
  // NOTE(review): assumes callers provide that much padding in both buffers
  // - confirm against the call sites.
  // Both loops test their counter after the body, so width and height must
  // be greater than zero.
  function ff_hevc_sao_band_filter_8x8_8_neon, export=1
          // Build the zero-initialised 32 x int16_t offset table on the stack.
          sub             sp,  sp, #64
          stp            xzr, xzr, [sp]
          stp            xzr, xzr, [sp, #16]
          stp            xzr, xzr, [sp, #32]
          stp            xzr, xzr, [sp, #48]
          mov             w8,  #4
  0:
          // w8 runs 4..1 here; after the decrement it is k = 3..0.
          ldrsh           x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
          subs            w8,  w8,  #1
          add            w10,  w8,  w5 // x10 = k + sao_left_class
          and            w10, w10, #0x1F // wrap into the 32-entry table
          strh            w9, [sp, x10, lsl #1]
          bne             0b // flags still come from the subs above
          // Round width up to a whole number of 8-pixel groups. Writing w6
          // also clears the upper half of x6 so it can be used as a 64-bit
          // operand below.
          add             w6,  w6,  #7
          // Pull the whole table into v16-v19 (used as a 64-byte LUT) and
          // release the stack scratch with the post-increment.
          ld1            {v16.16b-v19.16b}, [sp], #64
          and             w6,  w6,  #0xFFFFFFF8
          // The column loop advances both pointers by the rounded width, so
          // remove that amount from the per-row increments once, up front.
          sub             x2,  x2,  x6 // stride_dst - rounded width
          sub             x3,  x3,  x6 // stride_src - rounded width
          movi           v20.8h,   #1
  1:      // beginning of line
          mov             w8,  w6 // w8 = pixels left in this row
  2:
          // The table holds 16-bit values but tbx looks up single bytes,
          // so every 16-bit lane needs two byte indices: 2*i for the low
          // byte of entry i and 2*i+1 for its high byte.
          //
          //   00  01  02  03  04  05  06  07    <- byte index
          // +----------------------------------->
          // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
          // +----------------------------------->
          //    i-0     i-1     i-2     i-3      <- 16-bit entry
          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
          // load src[x] and step to the next 8 pixels
          ld1            {v2.8b}, [x1], #8
          // widen pixels to 16 bit
          uxtl            v0.8h,  v2.8b
          // >> shift: the top 5 bits of the 8-bit pixel select the band
          ushr            v2.8h,  v0.8h, #3 // BIT_DEPTH - 5
          // x2 (access lower short)
          shl             v1.8h,  v2.8h, #1 // low (x2, accessing short)
          // +1 access upper short
          add             v3.8h,  v1.8h, v20.8h
          // shift insert index to upper byte
          sli             v1.8h,  v3.8h, #8
          // table lookup; every index is <= 63, so all bytes of v2 are
          // replaced by table data
          tbx            v2.16b, {v16.16b-v19.16b}, v1.16b
          // src[x] + table
          add             v1.8h,  v0.8h, v2.8h
          // clip to 0..255 + narrow
          sqxtun          v4.8b,  v1.8h
          // store and step to the next 8 pixels
          st1            {v4.8b}, [x0], #8
          // done 8 pixels
          subs            w8, w8,  #8
          bne             2b
          // finished line
          subs            w7, w7,  #1
          add             x0, x0,  x2 // dst += stride_dst - rounded width
          add             x1, x1,  x3 // src += stride_src - rounded width
          bne             1b
          ret
  endfunc