Blame view

ffmpeg-4.2.2/libavcodec/x86/mlpdsp.asm 6.98 KB
aac5773f   hucm   功能基本完成,接口待打磨
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
  ;******************************************************************************
  ;* SIMD-optimized MLP DSP functions
  ;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  
  %include "libavutil/x86/x86util.asm"
  
  SECTION .text
  
  %if ARCH_X86_64
  
  ; SHLX reg, count
  ;   %1 = full name of the register to shift left in place (e.g. noiseq)
  ;   %2 = size-less name of the register holding the shift count (e.g. mns)
  ; Without BMI2 the only variable-count form is "shl reg, cl", so the byte view
  ; of %2 must resolve to cl there; with BMI2, shlx takes the count from any GPR
  ; and leaves the flags untouched.
  %macro SHLX 2
  %if notcpuflag(bmi2)
     shl  %1, %2b
  %else
     shlx %1, %1, %2q
  %endif
  %endmacro
  
  ; REMATRIX
  ; Multiplies eight 32-bit values at [samplesq] element-wise with the eight
  ; 32-bit values at [coeffsq] and accumulates the signed 64-bit products.
  ; On exit xm0 holds two 64-bit partial sums; the caller still has to add the
  ; high qword to the low qword to obtain the full dot product.
  ;
  ; pmuldq only multiplies the even-numbered dwords of each register, so a copy
  ; with every dword pair swapped (pshufd q2301) is multiplied as well to cover
  ; the odd-numbered elements.
  ;
  ; Uses aligned loads (movdqa) on both pointers.
  ; Clobbers m0-m3, plus m4 in the non-AVX2 build.
  %macro REMATRIX 0
      movdqa        m0, [samplesq]
      movdqa        m1, [coeffsq ]
      pshufd        m2, m0, q2301
      pshufd        m3, m1, q2301
      pmuldq        m0, m1
      pmuldq        m3, m2
      paddq         m0, m3
  %if notcpuflag(avx2)
      ; 128-bit registers: the loads above covered only the first 16 bytes, so
      ; repeat the same multiply/accumulate for the second 16 bytes.
      movdqa        m1, [samplesq + 16]
      movdqa        m2, [coeffsq  + 16]
      pshufd        m3, m1, q2301
      pshufd        m4, m2, q2301
      pmuldq        m1, m2
      pmuldq        m4, m3
      paddq         m0, m1
      paddq         m0, m4
  %else
      ; 256-bit registers: all 32 bytes were loaded at once; fold the upper
      ; 128-bit lane onto the lower one.
      vextracti128 xm1, m0, 1
      paddq        xm0, xm1
  %endif
  %endmacro
  
  ; LOOP_END
  ; Tail of one loop iteration for the paths taken when matrix_noise_shift is 0.
  ; Expects the two 64-bit partial sums in xm0. Reduces them to one value,
  ; applies the >> 14 / mask / bypassed-LSB steps, stores the 32-bit result at
  ; [samplesq + dest_chq] (dest_ch is already a byte offset here) and advances
  ; both row pointers.
  ; The final cmp is left for the conditional jump that follows the macro:
  ; ZF is set once blsbs_ptr has reached cnt.
  ; Clobbers xm1, accum, blsbs.
  %macro LOOP_END 0
      pshufd       xm1, xm0, q0032                    ; bring the high qword of xm0 down to the low qword
      paddq        xm0, xm1                           ; low qword = sum of both partial sums
      movq      accumq, xm0
      movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
      sar       accumq, 14                            ; accum >>= 14
      and       accumd, maskd                         ; accum &= mask
      add       accumd, blsbsd                        ; accum += *bypassed_lsbs
      mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
      add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
      add     samplesq, 32                            ; samples += MAX_CHANNELS;
      cmp   blsbs_ptrq, cntq                          ; reached the end pointer?
  %endmacro
  
  ; LOOP_SHIFT_END
  ; Tail of one loop iteration for the paths taken when matrix_noise_shift is
  ; non-zero. Same as LOOP_END, but first adds a sign-extended, left-shifted
  ; byte from noise_buffer to the 64-bit accumulator and steps the noise index.
  ;
  ; Expects, as set up at the .shift label before the loops are entered:
  ;   ausp   = access_unit_size_pow2 - 1 (used as an index mask)
  ;   index2 = 2 * index + 1             (per-iteration index increment)
  ;   mns    = matrix_noise_shift + 7    (shift count; must be cl without BMI2)
  ; The final cmp is left for the conditional jump that follows the macro.
  ; Clobbers xm1, accum, noise; updates index.
  %macro LOOP_SHIFT_END 0
      pshufd       xm1, xm0, q0032                    ; bring the high qword of xm0 down to the low qword
      paddq        xm0, xm1                           ; low qword = sum of both partial sums
      movq      accumq, xm0
      and       indexd, auspd                         ; index &= access_unit_size_pow2;
      movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
      add       indexd, index2d                       ; index += index2
      SHLX      noiseq, mns                           ; noise_buffer[index] <<= matrix_noise_shift
      add       accumq, noiseq                        ; accum += noise_buffer[index]
      movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
      sar       accumq, 14                            ; accum >>= 14
      and       accumd, maskd                         ; accum &= mask
      add       accumd, noised                        ; accum += *bypassed_lsbs
      mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
      add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
      add     samplesq, 32                            ; samples += MAX_CHANNELS;
      cmp   blsbs_ptrq, cntq                          ; reached the end pointer?
  %endmacro
  
  ;------------------------------------------------------------------------------
  ; void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
  ;                              const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
  ;                              int index, unsigned int dest_ch, uint16_t blockpos,
  ;                              unsigned int maxchan, int matrix_noise_shift,
  ;                              int access_unit_size_pow2, int32_t mask)
  ;
  ; For each of the blockpos rows (samples advances 32 bytes and bypassed_lsbs
  ; 8 bytes per row) this computes:
  ;
  ;     accum = sum(samples[i] * coeffs[i])       ; 64-bit, i = 0..3 or 0..7
  ;     if (matrix_noise_shift) {
  ;         index &= access_unit_size_pow2 - 1;
  ;         accum += noise_buffer[index] << (matrix_noise_shift + 7);
  ;         index += index2;                      ; index2 = 2 * initial index + 1
  ;     }
  ;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
  ;
  ; The dot product always covers 4 elements when maxchan < 4 and 8 elements
  ; otherwise, whatever the exact value of maxchan.
  ; NOTE(review): this presumably relies on coeffs[] being zero past maxchan -
  ; confirm against the caller.
  ;
  ; samples and coeffs are read with aligned loads (movdqa).
  ; Returns immediately when blockpos is 0.
  ;
  ; r6, r7, r8, r9 below are x86inc register numbers, not the native registers of
  ; the same name; which native register each maps to depends on the ABI
  ; (see the "r0 = rcx" / "r3 = rcx" notes in the SSE4 setup).
  ;------------------------------------------------------------------------------
  %macro MLP_REMATRIX_CHANNEL 0
  cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                          index, dest_ch, blockpos, maxchan, mns, \
                                          accum, mask, cnt
      mov         mnsd, mnsm                          ; load matrix_noise_shift
      movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
      test   blockposd, blockposd                     ; every loop below is post-tested and exits on
      jz .done                                        ; pointer equality, so skip them all for 0 rows
      mov     maxchand, maxchanm                      ; load maxchan
      mov        maskd, maskm                         ; load mask
  %if WIN64
      mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
  %endif
      shl     dest_chd, 2                             ; dest_ch *= 4: byte offset of the int32 inside a row
      lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = end pointer = bypassed_lsbs + blockpos * 8
      test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
      jne .shift                                      ; jump if true
      cmp     maxchand, 4                             ; is maxchan < 4? (maxchan is unsigned)
      jb .loop4                                       ; jump if true

  align 16
  .loop8:
      ; Process 5 or more channels
      REMATRIX
      LOOP_END
      jne .loop8
  .done:
      RET

  align 16
  .loop4:
      ; Process up to 4 channels
      ; Same multiply/accumulate as the first half of REMATRIX, on 16 bytes only.
      movdqa       xm0, [samplesq]
      movdqa       xm1, [coeffsq ]
      pshufd       xm2, xm0, q2301
      pshufd       xm3, xm1, q2301
      pmuldq       xm0, xm1
      pmuldq       xm3, xm2
      paddq        xm0, xm3
      LOOP_END
      jne .loop4
      RET

  .shift:
      ; matrix_noise_shift != 0: load the remaining arguments and rename the
      ; registers for the noise path. blockpos is no longer needed (cnt holds the
      ; end pointer) and maxchan is only needed for the compare below, so their
      ; registers are recycled.
  %if WIN64
      mov       indexd, indexm         ; load index (not needed on UNIX64)
  %endif
      mov          r9d, r9m            ; load access_unit_size_pow2
  %if cpuflag(bmi2)
      ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
      DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                  index, dest_ch, accum, index2, mns, \
                  ausp, mask, cnt, noise
      add         mnsd, 7              ; matrix_noise_shift += 7
  %else ; sse4
      mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
  %if WIN64
      ; r0 = rcx
      DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                  index2, accum, ausp, mask, cnt, noise
  %else ; UNIX64
      ; r3 = rcx
      DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                  index2, accum, ausp, mask, cnt, noise
  %endif
      lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
  %endif ; cpuflag
      sub        auspd, 1              ; access_unit_size_pow2 -= 1
      cmp          r7d, 4              ; is maxchan < 4? (r7 still holds maxchan; unsigned)
      lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; (lea keeps the flags of the cmp)
      jb .loop4_shift                  ; jump if maxchan < 4

  align 16
  .loop8_shift:
      ; Process 5 or more channels
      REMATRIX
      LOOP_SHIFT_END
      jne .loop8_shift
      RET

  align 16
  .loop4_shift:
      ; Process up to 4 channels
      ; Same multiply/accumulate as the first half of REMATRIX, on 16 bytes only.
      movdqa       xm0, [samplesq]
      movdqa       xm1, [coeffsq ]
      pshufd       xm2, xm0, q2301
      pshufd       xm3, xm1, q2301
      pmuldq       xm0, xm1
      pmuldq       xm3, xm2
      paddq        xm0, xm3
      LOOP_SHIFT_END
      jne .loop4_shift
      RET
  %endmacro
  
  ; Instantiate the function once per instruction set.
  ; SSE4 build: 128-bit registers; variable shifts go through cl.
  INIT_XMM sse4
  MLP_REMATRIX_CHANNEL
  ; AVX2 + BMI2 build: 256-bit registers in REMATRIX and shlx for the noise
  ; shift. Only assembled when the build configuration sets HAVE_AVX2_EXTERNAL.
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2, bmi2
  MLP_REMATRIX_CHANNEL
  %endif
  
  %endif ; ARCH_X86_64