ffmpeg-4.2.2/libavcodec/x86/huffyuvdsp.asm
  ;******************************************************************************
  ;* SIMD-optimized HuffYUV functions
  ;* Copyright (c) 2008 Loren Merritt
  ;* Copyright (c) 2014 Christophe Gisquet
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  
  %include "libavutil/x86/x86util.asm"
  
  SECTION .text
  
  %include "libavcodec/x86/huffyuvdsp_template.asm"
  
  ;------------------------------------------------------------------------------
  ; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
  ;------------------------------------------------------------------------------
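  ;
  ; For reference, a minimal C sketch of what this routine computes (a
  ; hand-written illustration, not FFmpeg's own code; the _ref name is
  ; hypothetical):
  ;
  ;     static void add_int16_ref(uint16_t *dst, const uint16_t *src,
  ;                               unsigned mask, int w)
  ;     {
  ;         /* dst[i] += src[i], keeping only the active sample bits */
  ;         for (int i = 0; i < w; i++)
  ;             dst[i] = (dst[i] + src[i]) & mask;
  ;     }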
  
  %macro ADD_INT16 0
  cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
  %if mmsize > 8
      test srcq, mmsize-1
      jnz .unaligned
      test dstq, mmsize-1
      jnz .unaligned ; use the aligned loop only if both pointers allow it
  %endif
      INT16_LOOP a, add
  %if mmsize > 8
  .unaligned:
      INT16_LOOP u, add
  %endif
  %endmacro
  
  %if ARCH_X86_32
  INIT_MMX mmx
  ADD_INT16
  %endif
  
  INIT_XMM sse2
  ADD_INT16
  
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
  ADD_INT16
  %endif
  
  ; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
  ;                               intptr_t w, uint8_t *left)
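  ;
  ; For reference, a minimal C sketch of the left prediction performed here
  ; (a hand-written illustration, not FFmpeg's own code; the _ref name is
  ; hypothetical):
  ;
  ;     static void add_hfyu_left_pred_bgr32_ref(uint8_t *dst, const uint8_t *src,
  ;                                              intptr_t w, uint8_t *left)
  ;     {
  ;         /* running per-channel byte sum over 4-byte pixels; left[] holds
  ;          * the previous pixel on entry and the last pixel on return */
  ;         for (intptr_t i = 0; i < w; i++)
  ;             for (int c = 0; c < 4; c++)
  ;                 dst[4 * i + c] = left[c] += src[4 * i + c];
  ;     }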
  %macro LEFT_BGR32 0
  cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
      shl           wq, 2
      movd          m0, [leftq]
      lea         dstq, [dstq + wq]
      lea         srcq, [srcq + wq]
      LSHIFT        m0, mmsize-4
      neg           wq
  .loop:
      movu          m1, [srcq+wq]
      mova          m2, m1
  %if mmsize == 8
      punpckhdq     m0, m0         ; broadcast the running left pixel
  %endif
      LSHIFT        m1, 4
      paddb         m1, m2         ; prefix-sum step: pixel i += pixel i-1
  %if mmsize == 16
      pshufd        m0, m0, q3333  ; broadcast the running left pixel
      mova          m2, m1
      LSHIFT        m1, 8
      paddb         m1, m2         ; prefix-sum step: pixel i += pixel i-2
  %endif
      paddb         m0, m1         ; add the running left value to all pixels
      movu   [dstq+wq], m0
      add           wq, mmsize
      jl         .loop
      movd          m0, [dstq-4]
      movd     [leftq], m0
      REP_RET
  %endmacro
  
  %if ARCH_X86_32
  INIT_MMX mmx
  LEFT_BGR32
  %endif
  INIT_XMM sse2
  LEFT_BGR32
  
  ; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff,
  ;                                 unsigned mask, int w, int *left, int *left_top)
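  ;
  ; For reference, a minimal C sketch of the median prediction performed here
  ; (a hand-written illustration, not FFmpeg's own code; the _ref names are
  ; hypothetical):
  ;
  ;     static int mid_pred_ref(int a, int b, int c)
  ;     {
  ;         /* median of three: order a <= b, then clamp c into [a, b] */
  ;         if (a > b) { int t = a; a = b; b = t; }
  ;         return c < a ? a : c > b ? b : c;
  ;     }
  ;
  ;     static void add_hfyu_median_pred_int16_ref(uint16_t *dst, const uint16_t *top,
  ;                                                const uint16_t *diff, unsigned mask,
  ;                                                int w, int *left, int *left_top)
  ;     {
  ;         int l = *left, lt = *left_top;
  ;         for (int i = 0; i < w; i++) {
  ;             /* predict from left, top and top-left, then add the residual */
  ;             l      = (mid_pred_ref(l, top[i], (l + top[i] - lt) & mask) + diff[i]) & mask;
  ;             lt     = top[i];
  ;             dst[i] = l;
  ;         }
  ;         *left     = l;
  ;         *left_top = lt;
  ;     }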
  INIT_MMX mmxext
  cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
      add      wd, wd ; width in samples -> width in bytes (2 bytes per sample)
      movd    mm6, maskd
      SPLATW  mm6, mm6
      movq    mm0, [topq]
      movq    mm2, mm0
      movd    mm4, [left_topq]
      psllq   mm2, 16
      movq    mm1, mm0
      por     mm4, mm2
      movd    mm3, [leftq]
      psubw   mm0, mm4 ; t-tl
      add    dstq, wq
      add    topq, wq
      add   diffq, wq
      neg      wq
      jmp .skip ; the first vector of t and t-tl was computed above
  .loop:
      movq    mm4, [topq+wq]
      movq    mm0, mm4
      psllq   mm4, 16
      por     mm4, mm1
      movq    mm1, mm0 ; t
      psubw   mm0, mm4 ; t-tl
  .skip:
      movq    mm2, [diffq+wq]
  %assign i 0
  %rep 4 ; process the 4 words serially: each sample's left predictor is the previous result
      movq    mm4, mm0
      paddw   mm4, mm3 ; t-tl+l
      pand    mm4, mm6
      movq    mm5, mm3
      pmaxsw  mm3, mm1
      pminsw  mm5, mm1
      pminsw  mm3, mm4
      pmaxsw  mm3, mm5 ; median
      paddw   mm3, mm2 ; +residual
      pand    mm3, mm6
  %if i==0
      movq    mm7, mm3
      psllq   mm7, 48
  %else
      movq    mm4, mm3
      psrlq   mm7, 16
      psllq   mm4, 48
      por     mm7, mm4
  %endif
  %if i<3
      psrlq   mm0, 16
      psrlq   mm1, 16
      psrlq   mm2, 16
  %endif
  %assign i i+1
  %endrep
      movq [dstq+wq], mm7
      add      wq, 8
      jl .loop
      movzx   r2d, word [dstq-2]
      mov [leftq], r2d ; *left = last output sample
      movzx   r2d, word [topq-2]
      mov [left_topq], r2d ; *left_top = last sample of the top row
      RET