Blame view

ffmpeg-4.2.2/libavcodec/x86/fmtconvert.asm 3.77 KB
aac5773f   hucm   功能基本完成,接口待打磨
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  ;******************************************************************************
  ;* x86 optimized Format Conversion Utils
  ;* Copyright (c) 2008 Loren Merritt
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  
  %include "libavutil/x86/x86util.asm"
  
  SECTION .text
  
  ;------------------------------------------------------------------------------
  ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
  ;                                    int len);
  ;------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 1
  ; Emits int32_to_float_fmul_scalar for the instruction set chosen by the
  ; preceding INIT_XMM:  dst[i] = (float)src[i] * mul
  ;
  ; Macro argument: forwarded unchanged as the fourth cglobal parameter
  ; (the xmm-register-count slot of x86inc's cglobal).
  ;
  ; One loop iteration converts 32 bytes (8 int32 values) and the loop condition
  ; is tested at the bottom, so the body always executes at least once.
  ; NOTE(review): presumably callers guarantee that len is a positive multiple
  ; of 8 and that src/dst are 16-byte aligned (aligned mova stores, and the
  ; SSE2 path uses cvtdq2ps with a 16-byte memory operand) - confirm against
  ; the C callers.
  %if UNIX64
  ; SysV x86-64: the float argument travels in xmm0, so only dst, src and len
  ; occupy integer argument registers and m0 already holds mul on entry.
  cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
  %else
  ; Win64 and x86-32: mul takes the third argument position, so it is declared
  ; to keep len in the fourth position.
  cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
  %endif
  %if WIN64
  ; Win64 arguments are slot-positional: the third argument, being a float,
  ; arrives in xmm2. SWAP makes m0 refer to that register.
      SWAP 0, 2
  %elif ARCH_X86_32
  ; x86-32: fetch mul from its memory argument slot.
      movss   m0, mulm
  %endif
  ; Replicate the scalar multiplier so one mulps scales four samples.
      SPLATD  m0
  ; len (elements) -> byte count. The 32-bit write also clears the upper half
  ; of lenq on x86-64 before lenq is used in 64-bit address arithmetic.
      shl     lend, 2
  ; Point src and dst at the ends of their buffers and walk a negative byte
  ; offset up towards zero; the flags from the final add then drive the loop.
      add     srcq, lenq
      add     dstq, lenq
      neg     lenq
  .loop:
  %if cpuflag(sse2)
  ; SSE2: convert four int32 values per instruction.
      cvtdq2ps  m1, [srcq+lenq   ]
      cvtdq2ps  m2, [srcq+lenq+16]
  %else
  ; SSE only: cvtpi2ps converts two int32 values into the low half of the
  ; destination, so each 4-float vector is assembled from two conversions;
  ; movlhps copies the second pair into the high half.
      cvtpi2ps  m1, [srcq+lenq   ]
      cvtpi2ps  m3, [srcq+lenq+ 8]
      cvtpi2ps  m2, [srcq+lenq+16]
      cvtpi2ps  m4, [srcq+lenq+24]
      movlhps   m1, m3
      movlhps   m2, m4
  %endif
      mulps     m1, m0
      mulps     m2, m0
      mova  [dstq+lenq   ], m1
      mova  [dstq+lenq+16], m2
  ; Advance by the 32 bytes just processed; keep looping while the offset is
  ; still negative.
      add     lenq, 32
      jl .loop
  %if notcpuflag(sse2)
      ;; cvtpi2ps switches to MMX even if the source is a memory location;
      ;; possibly an error in the documentation, since every tested CPU
      ;; disagrees with that. Use emms anyway, since the vast majority of
      ;; machines will use the SSE2 variant.
      emms
  %endif
      RET
  %endmacro
  
  ; SSE build: the cvtpi2ps path of the macro touches m0-m4, hence 5 registers.
  INIT_XMM sse
  INT32_TO_FLOAT_FMUL_SCALAR 5
  ; SSE2 build: the cvtdq2ps path of the macro touches only m0-m2, hence 3.
  INIT_XMM sse2
  INT32_TO_FLOAT_FMUL_SCALAR 3
  
  ;------------------------------------------------------------------------------
  ; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
  ;                                    const float *mul, int len);
  ;------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_ARRAY8 0
  ; Emits int32_to_float_fmul_array8 for the instruction set chosen by the
  ; preceding INIT_XMM. Each group of 8 consecutive samples is scaled by its
  ; own factor:
  ;     dst[8*k + j] = (float)src[8*k + j] * mul[k],   j = 0..7
  ;
  ; The context pointer c is declared to keep the argument positions of the
  ; prototype but is never referenced in the body.
  ; The same xmm register count (5) is declared for both builds, although the
  ; SSE2 path only touches m0-m2.
  ;
  ; One loop iteration handles 32 bytes (8 int32 values) and the loop condition
  ; is tested at the bottom, so the body always executes at least once.
  ; NOTE(review): presumably callers guarantee that len is a positive multiple
  ; of 8 and that src/dst are 16-byte aligned (aligned mova stores, and the
  ; SSE2 path uses cvtdq2ps with a 16-byte memory operand) - confirm against
  ; the C callers.
  cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
  ; len (elements) -> byte count. The 32-bit write also clears the upper half
  ; of lenq on x86-64 before lenq is used in 64-bit address arithmetic.
      shl     lend, 2
  ; Point src and dst at the ends of their buffers and walk a negative byte
  ; offset up towards zero; the flags from the final add then drive the loop.
      add     srcq, lenq
      add     dstq, lenq
      neg     lenq
  .loop:
  ; Load the scale factor for this group of 8 samples and replicate it so one
  ; mulps scales four samples.
      movss     m0, [mulq]
      SPLATD    m0
  %if cpuflag(sse2)
  ; SSE2: convert four int32 values per instruction.
      cvtdq2ps  m1, [srcq+lenq   ]
      cvtdq2ps  m2, [srcq+lenq+16]
  %else
  ; SSE only: cvtpi2ps converts two int32 values into the low half of the
  ; destination, so each 4-float vector is assembled from two conversions;
  ; movlhps copies the second pair into the high half.
      cvtpi2ps  m1, [srcq+lenq   ]
      cvtpi2ps  m3, [srcq+lenq+ 8]
      cvtpi2ps  m2, [srcq+lenq+16]
      cvtpi2ps  m4, [srcq+lenq+24]
      movlhps   m1, m3
      movlhps   m2, m4
  %endif
      mulps     m1, m0
      mulps     m2, m0
      mova  [dstq+lenq   ], m1
      mova  [dstq+lenq+16], m2
  ; Step to the next float multiplier (4 bytes) and the next 8 samples
  ; (32 bytes); keep looping while the sample offset is still negative.
      add     mulq, 4
      add     lenq, 32
      jl .loop
  %if notcpuflag(sse2)
      ;; cvtpi2ps switches to MMX even if the source is a memory location;
      ;; possibly an error in the documentation, since every tested CPU
      ;; disagrees with that. Use emms anyway, since the vast majority of
      ;; machines will use the SSE2 variant.
      emms
  %endif
      RET
  %endmacro
  
  ; SSE build: expands the macro with the cvtpi2ps conversion path.
  INIT_XMM sse
  INT32_TO_FLOAT_FMUL_ARRAY8
  ; SSE2 build: expands the macro with the cvtdq2ps conversion path.
  INIT_XMM sse2
  INT32_TO_FLOAT_FMUL_ARRAY8