ffmpeg-4.2.2/libswscale/arm/yuv2rgb_neon.S

  /*
   * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
   * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
  #include "libavutil/arm/asm.S"
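
  /*
   * A scalar sketch of the arithmetic, reconstructed from the register
   * comments in the macros below: Y, U and V are pre-scaled by 1 << 3 so
   * that vqdmulh.s16, which computes (a * b * 2) >> 16 = (a * b) >> 15
   * with saturation, can apply the signed 16-bit coefficients.  One
   * output component is then
   *
   *   R = clip_u8(((((Y << 3) - y_offset) * y_coeff >> 15)
   *             + (((V << 3) - (128 << 3)) * v2r    >> 15) + 1) >> 1)
   *
   * with G using u2g and v2g, and B using u2b.  The coefficient table is
   * loaded into d1 as { v2r, u2g, v2g, u2b }.
   */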
  
  
  .macro compute_premult
      vsub.u16            q14,q11                                        @ q14 = U * (1 << 3) - 128 * (1 << 3)
      vsub.u16            q15,q11                                        @ q15 = V * (1 << 3) - 128 * (1 << 3)
      vqdmulh.s16         q8, q15, d1[0]                                 @ q8  = V * v2r
      vqdmulh.s16         q9, q14, d1[1]                                 @ q9  = U * u2g
      vqdmulh.s16         q5, q15, d1[2]                                 @ q5  = V * v2g
      vadd.s16            q9, q5                                         @ q9  = U * u2g + V * v2g
      vqdmulh.s16         q10,q14, d1[3]                                 @ q10 = U * u2b
  .endm
  
  .macro compute_color dst_comp1 dst_comp2 pre
      vadd.s16            q1, q14, \pre
      vadd.s16            q2, q15, \pre
      vqrshrun.s16        \dst_comp1, q1, #1
      vqrshrun.s16        \dst_comp2, q2, #1
  .endm
  
  .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
      compute_color       \r1, \r2, q8
      compute_color       \g1, \g2, q9
      compute_color       \b1, \b2, q10
      vmov.u8             \a1, #255
      vmov.u8             \a2, #255
  .endm
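
  /*
   * The compute macro turns 16 Y samples (d14/d15, de-interleaved) plus
   * the chroma premultiplies into 16 RGBA pixels.  The four .ifc variants
   * only permute which d registers receive R, G, B and A, so that after
   * the vzip pass the two vst4.8 stores emit bytes in the requested
   * channel order (argb/rgba/abgr/bgra).
   */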
  
  .macro compute dst ofmt
      vshll.u8            q14, d14, #3                                   @ q14 = Y * (1 << 3)
      vshll.u8            q15, d15, #3                                   @ q15 = Y * (1 << 3)
      vsub.s16            q14, q12                                       @ q14 = (Y - y_offset)
      vsub.s16            q15, q12                                       @ q15 = (Y - y_offset)
      vqdmulh.s16         q14, q13                                       @ q14 = (Y - y_offset) * y_coeff
      vqdmulh.s16         q15, q13                                       @ q15 = (Y - y_offset) * y_coeff
  
  .ifc \ofmt,argb
      compute_rgba        d7, d8, d9, d6, d11, d12, d13, d10
  .endif
  
  .ifc \ofmt,rgba
      compute_rgba        d6, d7, d8, d9, d10, d11, d12, d13
  .endif
  
  .ifc \ofmt,abgr
      compute_rgba        d9, d8, d7, d6, d13, d12, d11, d10
  .endif
  
  .ifc \ofmt,bgra
      compute_rgba        d8, d7, d6, d9, d12, d11, d10, d13
  .endif
  
      vzip.8              d6, d10                                        @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
      vzip.8              d7, d11                                        @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
      vzip.8              d8, d12                                        @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
      vzip.8              d9, d13                                        @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
      vst4.8              {q3, q4}, [\dst,:128]!
      vst4.8              {q5, q6}, [\dst,:128]!
  .endm
  
  .macro process_1l_internal dst src ofmt
      vld2.8              {d14, d15}, [\src]!                            @ q7 = Y, de-interleaved (d14 = even, d15 = odd samples)
      compute             \dst, \ofmt
  .endm
  
  .macro process_1l ofmt
      compute_premult
      process_1l_internal r2, r4, \ofmt
  .endm
  
  .macro process_2l ofmt
      compute_premult
      process_1l_internal r2, r4, \ofmt
      process_1l_internal r11,r12,\ofmt
  .endm
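
  /*
   * Argument loading: per the AAPCS, r0-r3 carry the first four integer
   * arguments (width, height, dst and the dst linesize, as used below);
   * the remaining arguments live on the stack.  push {r4-r12, lr} saves
   * 10 * 4 = 40 bytes and vpush {q4-q7} another 4 * 16 = 64, so the first
   * stack argument is found at [sp, #104].
   */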
  
  .macro load_args_nv12
      push                {r4-r12, lr}
      vpush               {q4-q7}
      ldr                 r4, [sp, #104]                                 @ r4  = srcY
      ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
      ldr                 r6, [sp, #112]                                 @ r6  = srcC
      ldr                 r7, [sp, #116]                                 @ r7  = linesizeC
      ldr                 r8, [sp, #120]                                 @ r8  = table
      ldr                 r9, [sp, #124]                                 @ r9  = y_offset
      ldr                 r10,[sp, #128]                                 @ r10 = y_coeff
      vdup.16             d0, r10                                        @ d0  = y_coeff
      vld1.16             {d1}, [r8]                                     @ d1  = *table
      add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
      add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
      lsl                 r3, r3, #1                                     @ r3 = linesize  * 2
      lsl                 r5, r5, #1                                     @ r5 = linesizeY * 2
      sub                 r3, r3, r0, lsl #2                             @ r3 = linesize  * 2 - width * 4 (padding)
      sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
      sub                 r7, r7, r0                                     @ r7 = linesizeC     - width     (paddingC)
  .endm
  
  .macro load_args_nv21
      load_args_nv12
  .endm
  
  .macro load_args_yuv420p
      push                {r4-r12, lr}
      vpush               {q4-q7}
      ldr                 r4, [sp, #104]                                 @ r4  = srcY
      ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
      ldr                 r6, [sp, #112]                                 @ r6  = srcU
      ldr                 r8, [sp, #128]                                 @ r8  = table
      ldr                 r9, [sp, #132]                                 @ r9  = y_offset
      ldr                 r10,[sp, #136]                                 @ r10 = y_coeff
      vdup.16             d0, r10                                        @ d0  = y_coeff
      vld1.16             {d1}, [r8]                                     @ d1  = *table
      add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
      add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
      lsl                 r3, r3, #1                                     @ r3 = linesize  * 2
      lsl                 r5, r5, #1                                     @ r5 = linesizeY * 2
      sub                 r3, r3, r0, lsl #2                             @ r3 = linesize  * 2 - width * 4 (padding)
      sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
      ldr                 r10,[sp, #120]                                 @ r10 = srcV
  .endm
  
  .macro load_args_yuv422p
      push                {r4-r12, lr}
      vpush               {q4-q7}
      ldr                 r4, [sp, #104]                                 @ r4  = srcY
      ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
      ldr                 r6, [sp, #112]                                 @ r6  = srcU
      ldr                 r7, [sp, #116]                                 @ r7  = linesizeU
      ldr                 r12,[sp, #124]                                 @ r12 = linesizeV
      ldr                 r8, [sp, #128]                                 @ r8  = table
      ldr                 r9, [sp, #132]                                 @ r9  = y_offset
      ldr                 r10,[sp, #136]                                 @ r10 = y_coeff
      vdup.16             d0, r10                                        @ d0  = y_coeff
      vld1.16             {d1}, [r8]                                     @ d1  = *table
      sub                 r3, r3, r0, lsl #2                             @ r3  = linesize  - width * 4 (padding)
      sub                 r5, r5, r0                                     @ r5  = linesizeY - width     (paddingY)
      sub                 r7, r7, r0, lsr #1                             @ r7  = linesizeU - width / 2 (paddingU)
      sub                 r12,r12,r0, lsr #1                             @ r12 = linesizeV - width / 2 (paddingV)
      ldr                 r10,[sp, #120]                                 @ r10 = srcV
  .endm
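
  /*
   * Chroma loading: NV12 carries chroma interleaved as UVUV... and NV21
   * as VUVU..., so both use vld2.8 to de-interleave and differ only in
   * which of d2/d3 is taken as U.  The planar formats read U and V with
   * separate vld1.8 loads from r6 (srcU) and r10 (srcV).
   */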
  
  .macro load_chroma_nv12
      pld [r12, #64*3]
  
      vld2.8              {d2, d3}, [r6]!                                @ d2 = U, d3 = V (de-interleaved)
      vshll.u8            q14, d2, #3                                    @ q14 = U * (1 << 3)
      vshll.u8            q15, d3, #3                                    @ q15 = V * (1 << 3)
  .endm
  
  .macro load_chroma_nv21
      pld [r12, #64*3]
  
      vld2.8              {d2, d3}, [r6]!                                @ d2 = V, d3 = U (de-interleaved)
      vshll.u8            q14, d3, #3                                    @ q14 = U * (1 << 3)
      vshll.u8            q15, d2, #3                                    @ q15 = V * (1 << 3)
  .endm
  
  .macro load_chroma_yuv420p
      pld [r10, #64*3]
      pld [r12, #64*3]
  
      vld1.8              d2, [r6]!                                      @ d2 = U (Cb) line
      vld1.8              d3, [r10]!                                     @ d3 = V (Cr) line
      vshll.u8            q14, d2, #3                                    @ q14 = U * (1 << 3)
      vshll.u8            q15, d3, #3                                    @ q15 = V * (1 << 3)
  .endm
  
  .macro load_chroma_yuv422p
      pld [r10, #64*3]
  
      vld1.8              d2, [r6]!                                      @ d2 = U (Cb) line
      vld1.8              d3, [r10]!                                     @ d3 = V (Cr) line
      vshll.u8            q14, d2, #3                                    @ q14 = U * (1 << 3)
      vshll.u8            q15, d3, #3                                    @ q15 = V * (1 << 3)
  .endm
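
  /*
   * Row advance: the 4:2:0 inputs (nv12, nv21, yuv420p) convert two luma
   * rows per chroma row, so their variants also step dst2/srcY2 and
   * subtract 2 from the height; yuv422p has chroma on every row and
   * steps by 1.
   */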
  
  .macro increment_and_test_nv12
      add                 r11, r11, r3                                   @ dst2  += padding
      add                 r12, r12, r5                                   @ srcY2 += paddingY
      add                 r6, r6, r7                                     @ srcC  += paddingC
      subs                r1, r1, #2                                     @ height -= 2
  .endm
  
  .macro increment_and_test_nv21
      increment_and_test_nv12
  .endm
  
  .macro increment_and_test_yuv420p
      add                 r11, r11, r3                                   @ dst2  += padding
      add                 r12, r12, r5                                   @ srcY2 += paddingY
      ldr                 r7, [sp, #116]                                 @ r7     = linesizeU
      sub                 r7, r7, r0, lsr #1                             @ r7     = linesizeU - width / 2 (paddingU)
      add                 r6, r6, r7                                     @ srcU  += paddingU
      ldr                 r7, [sp, #124]                                 @ r7     = linesizeV
      sub                 r7, r7, r0, lsr #1                             @ r7     = linesizeV - width / 2 (paddingV)
      add                 r10, r10, r7                                   @ srcV  += paddingV
      subs                r1, r1, #2                                     @ height -= 2
  .endm
  
  .macro increment_and_test_yuv422p
      add                 r6, r6, r7                                     @ srcU  += paddingU
      add                 r10,r10,r12                                    @ srcV  += paddingV
      subs                r1, r1, #1                                     @ height -= 1
  .endm
  
  .macro process_nv12 ofmt
      process_2l \ofmt
  .endm
  
  .macro process_nv21 ofmt
      process_2l \ofmt
  .endm
  
  .macro process_yuv420p ofmt
      process_2l \ofmt
  .endm
  
  .macro process_yuv422p ofmt
      process_1l \ofmt
  .endm
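
  /*
   * declare_func expands to one ff_<ifmt>_to_<ofmt>_neon entry point per
   * format pair.  A plausible C-side prototype matching the register and
   * stack layout above (a sketch only; the authoritative declaration
   * lives with the caller in libswscale/arm):
   *
   *   int ff_nv12_to_rgba_neon(int width, int height,
   *                            uint8_t *dst, int linesize,
   *                            const uint8_t *srcY, int linesizeY,
   *                            const uint8_t *srcC, int linesizeC,
   *                            const int16_t *table,
   *                            int y_offset, int y_coeff);
   *
   * with the planar variants taking srcU/linesizeU and srcV/linesizeV in
   * place of srcC/linesizeC.
   */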
  
  .macro declare_func ifmt ofmt
  function ff_\ifmt\()_to_\ofmt\()_neon, export=1
      load_args_\ifmt
      vmov.u16            q11, #1024                                     @ q11 = 128 * (1 << 3)
      vdup.16             q12, r9                                        @ q12 = y_offset
      vmov                d26, d0                                        @ q13 = y_coeff
      vmov                d27, d0                                        @ q13 = y_coeff
  1:
      mov                 r8, r0                                         @ r8 = width
  2:
      pld [r6, #64*3]
      pld [r4, #64*3]
      vmov.i8             d10, #128
      load_chroma_\ifmt
      process_\ifmt \ofmt
      subs                r8, r8, #16                                    @ width -= 16
      bgt                 2b
      add                 r2, r2, r3                                     @ dst   += padding
      add                 r4, r4, r5                                     @ srcY  += paddingY
      increment_and_test_\ifmt
      bgt                 1b
      vpop                {q4-q7}
      pop                 {r4-r12, lr}
      mov                 pc, lr
  endfunc
  .endm
  
  .macro declare_rgb_funcs ifmt
      declare_func \ifmt, argb
      declare_func \ifmt, rgba
      declare_func \ifmt, abgr
      declare_func \ifmt, bgra
  .endm
  
  declare_rgb_funcs nv12
  declare_rgb_funcs nv21
  declare_rgb_funcs yuv420p
  declare_rgb_funcs yuv422p