Blame view

ffmpeg-4.2.2/libavcodec/x86/lossless_videodsp_init.c 4.86 KB
aac5773f   hucm   功能基本完成,接口待打磨
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
  /*
   * Lossless video DSP utils
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
  #include "config.h"
  #include "libavutil/x86/asm.h"
  #include "../lossless_videodsp.h"
  #include "libavutil/x86/cpu.h"
  
  /* Prototypes for the hand-written assembly implementations (defined
   * outside this file). ff_llviddsp_init_x86() below selects among them
   * at runtime based on the detected CPU flags. */

  /* dst[i] += src[i] for w bytes. */
  void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
  void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
  void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);

  /* Median prediction over a row; left/left_top carry state in and out
   * across calls (see the cmov fallback below for the per-byte formula). */
  void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
                                 const uint8_t *diff, ptrdiff_t w,
                                 int *left, int *left_top);
  void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
                               const uint8_t *diff, ptrdiff_t w,
                               int *left, int *left_top);

  /* Left (previous-pixel) prediction; returns the updated running left
   * value. The "unaligned" variants do not require aligned src/dst. */
  int  ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
                              ptrdiff_t w, int left);
  int  ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t w, int left);
  int  ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t w, int left);

  /* 16-bit left prediction with a bit-depth mask; returns the updated
   * running accumulator. */
  int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
  int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);

  /* Gradient prediction applied in place to one row of src. */
  void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
  void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
  
  #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
  /* Scalar x86-32 median predictor using CMOVcc instead of branches.
   * Per byte: dst[i] = diff[i] + median(l, t, l + t - tl), where l is the
   * previous output byte, t = top[i] and tl = top[i-1]; l and tl are
   * carried across the row through *left / *left_top.
   * Needs 7 general-purpose registers, hence the HAVE_7REGS guard; on
   * x86-32 the pointers are rotated through %3 to stay within that budget. */
  static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                   const uint8_t *diff, ptrdiff_t w,
                                   int *left, int *left_top)
  {
      x86_reg w2 = -w;            /* negative index: count up toward 0 */
      x86_reg x;
      int l  = *left     & 0xff;  /* running left neighbour */
      int tl = *left_top & 0xff;  /* running top-left neighbour */
      int t;
      __asm__ volatile (
          "mov          %7, %3            \n" /* %3 = top pointer */
          "1:                             \n"
          "movzbl (%3, %4), %2            \n" /* t = top[x] */
          "mov          %2, %k3           \n" /* %k3 = t            */
          "sub         %b1, %b3           \n" /* %b3 = t - tl       */
          "add         %b0, %b3           \n" /* %b3 = t - tl + l   */
          "mov          %2, %1            \n" /* tl (next iter) = t */
          "cmp          %0, %2            \n" /* sort l and t via cmov so  */
          "cmovg        %0, %2            \n" /* %2 = min(l, t),           */
          "cmovg        %1, %0            \n" /* %0 = max(l, t)            */
          "cmp         %k3, %0            \n" /* clamp against t - tl + l: */
          "cmovg       %k3, %0            \n" /* %0 = min(max(l,t), l+t-tl) */
          "mov          %7, %3            \n" /* reload top ptr (%3 was clobbered) */
          "cmp          %2, %0            \n"
          "cmovl        %2, %0            \n" /* %0 = median of the three  */
          "add    (%6, %4), %b0           \n" /* l = median + diff[x]      */
          "mov         %b0, (%5, %4)      \n" /* dst[x] = l                */
          "inc          %4                \n"
          "jl           1b                \n" /* loop while index < 0      */
          : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
          : "r"(dst + w), "r"(diff + w), "rm"(top + w)
      );
      /* Write the carried predictor state back for the next row. */
      *left     = l;
      *left_top = tl;
  }
  #endif
  
  /**
   * Fill the LLVidDSPContext function-pointer table with the fastest
   * implementations available on the running CPU.
   *
   * Ordering is significant: checks run from older to newer instruction
   * sets, so a later (faster) version overwrites an earlier assignment
   * when the CPU supports it. Do not reorder the blocks below.
   */
  void ff_llviddsp_init_x86(LLVidDSPContext *c)
  {
      int cpu_flags = av_get_cpu_flags();
  
  #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
      /* Branch-free scalar fallback for old 32-bit CPUs with CMOV. */
      if (cpu_flags & AV_CPU_FLAG_CMOV)
          c->add_median_pred = add_median_pred_cmov;
  #endif
  
      /* MMX/MMXEXT variants are only useful on 32-bit; 64-bit always
       * reaches at least the SSE2 assignments below. */
      if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
          c->add_bytes = ff_add_bytes_mmx;
      }
  
      if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
          /* slower than cmov version on AMD */
          if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
              c->add_median_pred = ff_add_median_pred_mmxext;
      }
  
      if (EXTERNAL_SSE2(cpu_flags)) {
          c->add_bytes       = ff_add_bytes_sse2;
          c->add_median_pred = ff_add_median_pred_sse2;
      }
  
      if (EXTERNAL_SSSE3(cpu_flags)) {
          c->add_left_pred = ff_add_left_pred_ssse3;
          c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
          c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
      }
  
      /* "FAST" variants: only on CPUs where the macro reports these
       * instruction sets are fast, overriding the plain SSSE3 picks. */
      if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
          c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
          c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
      }
  
      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
          c->add_bytes       = ff_add_bytes_avx2;
          c->add_left_pred   = ff_add_left_pred_unaligned_avx2;
          c->add_gradient_pred = ff_add_gradient_pred_avx2;
      }
  }