Blame view

ffmpeg-4.2.2/libavcodec/x86/bswapdsp.asm 3.57 KB
aac5773f   hucm   功能基本完成,接口待打磨
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
  ;******************************************************************************
  ;* optimized bswap buffer functions
  ;* Copyright (c) 2008 Loren Merritt
  ;* Copyright (c) 2003-2013 Michael Niedermayer
  ;* Copyright (c) 2013 Daniel Kang
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  
  %include "libavutil/x86/x86util.asm"
  
  SECTION_RODATA
  ; pshufb control mask: within each group of four bytes the source byte
  ; indices are listed in reverse order (3,2,1,0 / 7,6,5,4 / ...), so a
  ; shuffle with this mask reverses the bytes of every 32-bit element.
  pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  
  ; NOTE(review): pb_80 is declared external here but is not referenced
  ; anywhere in the visible part of this file - check whether the
  ; declaration is still needed.
  cextern pb_80
  
  SECTION .text
  
  ; BSWAP_LOOPS: vector part of the 32-bit byte-swap copy.
  ;
  ; %1 = a (aligned) or u (unaligned). It is pasted onto "mov" to select the
  ;      load/store form and onto the local label names, so that one aligned
  ;      and one unaligned expansion can coexist inside the same function.
  ;
  ; Register use, as read from the code below:
  ;   r0  = destination pointer, advanced past every block that is stored
  ;   r1  = source pointer, advanced in step with r0
  ;   r2d = count of 32-bit elements on entry; used as the loop counter and
  ;         reloaded from r3d before the macro is left
  ;   r3d = saved copy of the original count
  ;   m2  = pshufb mask in ssse3/avx2 builds (must already be loaded by the
  ;         code that expands this macro); scratch in the sse2 build
  ;   m0, m1 (plus m3 in the sse2 build) = scratch
  ;
  ; Stages: a main loop handling 2*mmsize bytes per iteration, then (avx2
  ; only) one optional mmsize-byte block, then one optional 16-byte block.
  ; Control leaves either by falling off the end of the macro or by jumping
  ; to .left, a label this macro does not define; the enclosing function has
  ; to provide it. In both cases r2d holds the original count again.
  %macro BSWAP_LOOPS  1
      ; Save the full count in r3d; r2d becomes the number of 8-element groups.
      ; NOTE(review): sar is an arithmetic shift, so a negative count is not
      ; turned into zero here - presumably callers never pass one; confirm.
      mov      r3d, r2d
      sar      r2d, 3
      ; No complete 8-element group: only the 16-byte and scalar tails remain.
      jz       .left4_%1
  %if cpuflag(avx2)
      ; With 32-byte registers one loop iteration covers 16 elements, so the
      ; group count is halved again. If that gives zero, the count was 8..15
      ; and a single 8-element block is handled at .left8 instead.
      sar      r2d, 1
      jz       .left8_%1
  %endif
  ; Main loop: two vector registers (2*mmsize bytes) per iteration.
  .loop8_%1:
      mov%1    m0, [r1 +  0]
      mov%1    m1, [r1 + mmsize]
  %if cpuflag(ssse3)||cpuflag(avx2)
      ; One byte shuffle per register, using the mask held in m2.
      pshufb   m0, m2
      pshufb   m1, m2
      mov%1    [r0 +  0], m0
      mov%1    [r0 + mmsize], m1
  %else
      ; Without pshufb the swap is done in two steps. First pshuflw/pshufhw
      ; with selector 10110001b exchange the two 16-bit halves of every
      ; 32-bit element in the low and high 64 bits respectively.
      pshuflw  m0, m0, 10110001b
      pshuflw  m1, m1, 10110001b
      pshufhw  m0, m0, 10110001b
      pshufhw  m1, m1, 10110001b
      ; Then the two bytes inside every 16-bit word are exchanged by
      ; combining a left shift by 8 with a right shift by 8. m2 and m3 are
      ; free for use as temporaries because this path has no shuffle mask.
      mova     m2, m0
      mova     m3, m1
      psllw    m0, 8
      psllw    m1, 8
      psrlw    m2, 8
      psrlw    m3, 8
      por      m2, m0
      por      m3, m1
      mov%1    [r0 +  0], m2
      ; The literal 16 matches the mmsize offset used for the second load
      ; above as long as this branch is only expanded for 16-byte registers,
      ; which is the case for the expansions visible in this file.
      mov%1    [r0 + 16], m3
  %endif
      add      r0, mmsize*2
      add      r1, mmsize*2
      dec      r2d
      jnz      .loop8_%1
  %if cpuflag(avx2)
  .left8_%1:
      ; Bit 3 of the original count set: one more full register (8 elements).
      ; NOTE(review): r2d is not read between this reload and the identical
      ; reload at .left4, so this copy looks redundant.
      mov      r2d, r3d
      test     r3d, 8
      jz       .left4_%1
      mov%1    m0, [r1]
      pshufb   m0, m2
      mov%1    [r0 +  0], m0
      add r1, mmsize
      add r0, mmsize
  %endif
  .left4_%1:
      ; Restore the count for the code that follows the macro, then handle
      ; one 16-byte block (4 elements) if bit 2 of the count is set.
      mov      r2d, r3d
      test     r3d, 4
      jz       .left
      ; Always a 16-byte (xmm-sized) access, independent of mmsize.
      mov%1    xm0, [r1]
  %if cpuflag(ssse3)
      pshufb   xm0, xm2
      mov%1    [r0], xm0
  %else
      ; Same two-step swap as in the main loop, for a single register.
      pshuflw  m0, m0, 10110001b
      pshufhw  m0, m0, 10110001b
      mova     m2, m0
      psllw    m0, 8
      psrlw    m2, 8
      por      m2, m0
      mov%1    [r0], m2
  %endif
      ; Step both pointers past the 16 bytes just written.
      add      r1, 16
      add      r0, 16
  %endmacro
  
  ; BSWAP32_BUF: emits one bswap32_buf function for the currently selected
  ; instruction set. The function copies w 32-bit elements from src to dst
  ; with the bytes of every element reversed.
  ;
  ; Prototype as given by the original comment:
  ;   void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
  ; NOTE(review): the cglobal name below is bswap32_buf, so the exported
  ; symbol is presumably ff_bswap32_buf_<cpu> rather than ff_bswap_buf -
  ; confirm against the C declarations.
  ;
  ; Register use, as read from the code below:
  ;   r0 = dst, r1 = src, r2d = w (the three declared arguments)
  ;   r3 = alignment test value first, then owned by BSWAP_LOOPS
  ;   m2 = pshufb mask (ssse3/avx2 builds)
  %macro BSWAP32_BUF 0
  %if cpuflag(ssse3)||cpuflag(avx2)
  ; 3 arguments, 4 general registers, 3 vector registers (m0-m2), following
  ; the usual x86inc cglobal argument order.
  cglobal bswap32_buf, 3,4,3
      mov      r3, r1
      ; m2 = shuffle mask consumed by every pshufb in this function.
      ; Presumably a plain 16-byte load for xmm builds and a broadcast to
      ; both 128-bit lanes for ymm builds - see the helper macro definition.
      VBROADCASTI128  m2, [pb_bswap32]
  %else
  ; The sse2 build has no mask register but needs m2/m3 as temporaries;
  ; 5 vector registers are requested here although only m0-m3 are used in
  ; the visible code.
  cglobal bswap32_buf, 3,4,5
      mov      r3, r1
  %endif
      ; r3 = src | dst: any of the low bits set means that at least one of
      ; the two pointers is not aligned to mmsize.
      or       r3, r0
      test     r3, mmsize - 1
      jz       .start_align
      ; Unaligned expansion, then skip over the aligned one.
      BSWAP_LOOPS  u
      jmp      .left
  .start_align:
      ; Aligned expansion; falls through into the tail handling.
      BSWAP_LOOPS  a
  ; Tail: at this point r2d holds the original count again and r0/r1 point
  ; past everything the vector stages have processed.
  .left:
  %if cpuflag(ssse3)
      ; Bit 1 of the count: two elements (8 bytes) through the low half of
      ; an xmm register, reusing the shuffle mask.
      test     r2d, 2
      jz       .left1
      movq     xm0, [r1]
      pshufb   xm0, xm2
      movq     [r0], xm0
      add      r1, 8
      add      r0, 8
  .left1:
      ; Bit 0 of the count: one last element with the scalar bswap
      ; instruction. r2d doubles as scratch since the count is not needed
      ; after this test.
      test     r2d, 1
      jz       .end
      mov      r2d, [r1]
      bswap    r2d
      mov      [r0], r2d
  %else
      ; Remaining count modulo 4, handled one element per iteration with the
      ; scalar bswap instruction; r3d is the scratch register.
      and      r2d, 3
      jz       .end
  .loop2:
      mov      r3d, [r1]
      bswap    r3d
      mov      [r0], r3d
      add      r1, 4
      add      r0, 4
      dec      r2d
      jnz      .loop2
  %endif
  .end:
      RET
  %endmacro
  
  ; Expand BSWAP32_BUF once per instruction set. INIT_XMM / INIT_YMM are
  ; presumably provided through the %include at the top of the file; they
  ; select the register width (mmsize) and the cpuflag set that the
  ; conditional blocks inside BSWAP_LOOPS and BSWAP32_BUF test.

  ; 16-byte registers, shift/or based swap (no pshufb).
  INIT_XMM sse2
  BSWAP32_BUF
  
  ; 16-byte registers, pshufb based swap.
  INIT_XMM ssse3
  BSWAP32_BUF
  
  ; 32-byte registers, pshufb based swap. Only assembled when the build
  ; configuration defines HAVE_AVX2_EXTERNAL to a non-zero value.
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
  BSWAP32_BUF
  %endif