trellis-64.asm
25.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
;*****************************************************************************
;* trellis-64.asm: x86_64 trellis quantization
;*****************************************************************************
;* Copyright (C) 2012-2024 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
; This is a pretty straight-forward translation of the C code, except:
; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those
; nodes are invalid).
; * Interprocedural register allocation. Eliminates argument-passing overhead
; to trellis_coef* subroutines. Also reduces codesize.
; Optimizations that I tried, and rejected because they were not faster:
; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
; Costs too much icache compared to the negligible speedup.
; * There are only 21 possible sets of live node_ctxs; we could keep track of
; exactly which set we're in and feed that (along with abs_level) into a jump
; table instead of the switch to select a trellis_coef subroutine. This would
; eliminate all branches about which node_ctxs are live, but costs either a
; bunch of icache or a bunch of call/ret, and the jump table itself is
; unpredictable.
; * Separate versions of trellis_coef* depending on whether we're doing the 1st
; or the 2nd of the two abs_level candidates. This would eliminate some
; branches about if(score is better).
; * Special case more values of coef. I had a coef2 at some intermediate point
; in the optimization process, but it didn't end up worthwhile in conjunction
; with all the other optimizations.
; * Unroll or simd writeback. I don't know why this didn't help.
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
pd_m16: times 4 dd -16
sq_1: dq 1, 0
pq_128: times 2 dq 128
pq_ffffffff: times 2 dq 0xffffffff
cextern pd_8
cextern pd_0123
cextern pd_4567
cextern_common cabac_entropy
cextern_common cabac_transition
cextern cabac_size_unary
cextern cabac_transition_unary
cextern_common dct4_weight_tab
cextern_common dct8_weight_tab
cextern_common dct4_weight2_tab
cextern_common dct8_weight2_tab
cextern_common last_coeff_flag_offset_8x8
cextern_common significant_coeff_flag_offset_8x8
cextern_common coeff_flag_offset_chroma_422_dc
SECTION .text
%define TRELLIS_SCORE_BIAS 1<<60
%define SIZEOF_NODE 16
%define CABAC_SIZE_BITS 8
%define LAMBDA_BITS 4
%macro SQUARE 2 ; dst, tmp
; could use pmuldq here, to eliminate the abs. but that would involve
; templating a sse4 version of all of trellis, for negligible speedup.
%if cpuflag(ssse3)
pabsd m%1, m%1
pmuludq m%1, m%1
%elif HIGH_BIT_DEPTH
ABSD m%2, m%1
SWAP %1, %2
pmuludq m%1, m%1
%else
pmuludq m%1, m%1
pand m%1, [pq_ffffffff]
%endif
%endmacro
%macro LOAD_DUP 2 ; dst, src
%if cpuflag(ssse3)
movddup %1, %2
%else
movd %1, %2
punpcklqdq %1, %1
%endif
%endmacro
;-----------------------------------------------------------------------------
; int trellis_cabac_4x4_psy(
; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
; uint64_t level_state0, uint16_t level_state1,
; int b_ac, dctcoef *fenc_dct, int psy_trellis )
;-----------------------------------------------------------------------------
%macro TRELLIS 4
%define num_coefs %2
%define dc %3
%define psy %4
cglobal %1, 4,15,9
%assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
%assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
SUB rsp, pad
DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
%if WIN64
%define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
%else
%define level_statem rsp+stack_offset+32
%endif
%define b_acm r11m ; 4x4 only
%define b_interlacedm r11m ; 8x8 only
%define i_coefsm1 r11m ; dc only
%define fenc_dctm r12m
%define psy_trellism r13m
%if num_coefs == 64
shl dword b_interlacedm, 6
%define dct_weight1_tab dct8_weight_tab
%define dct_weight2_tab dct8_weight2_tab
%else
%define dct_weight1_tab dct4_weight_tab
%define dct_weight2_tab dct4_weight2_tab
%endif
%define stack rsp
%define last_nnzm [stack+0]
%define zigzagm [stack+8]
mov last_nnzm, iid
mov zigzagm, zigzagq
%if WIN64 == 0
%define orig_coefsm [stack+16]
%define quant_coefsm [stack+24]
mov orig_coefsm, orig_coefsq
mov quant_coefsm, quant_coefsq
%endif
%define unquant_mfm [stack+32]
%define levelgt1_ctxm [stack+40]
%define ssd stack+48
%define cost_siglast stack+80
%define level_tree stack+96
; trellis_node_t is laid out differently than C.
; struct-of-arrays rather than array-of-structs, for simd.
%define nodes_curq r7
%define nodes_prevq r8
%define node_score(x) x*8
%define node_level_idx(x) 64+x*4
%define node_cabac_state(x) 96+x*4
lea nodes_curq, [level_tree + level_tree_size]
lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
mov r6, TRELLIS_SCORE_BIAS
mov [nodes_curq + node_score(0)], r6
mov dword [nodes_curq + node_level_idx(0)], 0
movd mm0, [level_statem + 0]
punpcklbw mm0, [level_statem + 4]
punpcklwd mm0, [level_statem + 8]
%define level_state_packed mm0 ; version for copying into node.cabac_state
pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
movq [nodes_curq + node_score(1)], m7
mova [nodes_curq + node_score(2)], m7
%define levels_usedq r4
%define levels_usedd r4d
mov dword [level_tree], 0
mov levels_usedd, 1
%define abs_levelq r9
%define abs_leveld r9d
%define abs_coefq r14
%define zigzagiq r5
%define zigzagid r5d
%if num_coefs == 8
mov dword levelgt1_ctxm, 8
%else
mov dword levelgt1_ctxm, 9
%endif
%if psy
LOAD_DUP m6, psy_trellism
%define psy_trellis m6
%elif dc
LOAD_DUP m6, [unquant_mfq]
paddd m6, m6
%define unquant_mf m6
%endif
%if dc == 0
mov unquant_mfm, unquant_mfq
%endif
; Keep a single offset register to PICify all global constants.
; They're all relative to "beginning of this asm file's .text section",
; even tables that aren't in this file.
; (Any address in .text would work, this one was just convenient.)
lea r0, [$$]
%define GLOBAL +r0-$$
TRELLIS_LOOP 0 ; node_ctx 0..3
TRELLIS_LOOP 1 ; node_ctx 1..7
.writeback:
; int level = bnode->level_idx;
; for( int i = b_ac; i <= last_nnz; i++ )
; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
; level = level_tree[level].next;
mov iid, last_nnzm
add zigzagq, iiq
neg iiq
%if num_coefs == 16 && dc == 0
mov r2d, b_acm
add iiq, r2
%endif
%define dctq r10
mov r0d, [nodes_curq + node_level_idx(0) + rax*4]
.writeback_loop:
movzx r2, byte [zigzagq + iiq]
%if cpuflag(ssse3)
movd m0, [level_tree + r0*4]
movzx r0, word [level_tree + r0*4]
psrld m0, 16
movd m1, [dctq + r2*SIZEOF_DCTCOEF]
%if HIGH_BIT_DEPTH
psignd m0, m1
movd [dctq + r2*SIZEOF_DCTCOEF], m0
%else
psignw m0, m1
movd r4d, m0
mov [dctq + r2*SIZEOF_DCTCOEF], r4w
%endif
%else
mov r5d, [level_tree + r0*4]
%if HIGH_BIT_DEPTH
mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
%else
movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
%endif
movzx r0d, r5w
sar r4d, 31
shr r5d, 16
xor r5d, r4d
sub r5d, r4d
%if HIGH_BIT_DEPTH
mov [dctq + r2*SIZEOF_DCTCOEF], r5d
%else
mov [dctq + r2*SIZEOF_DCTCOEF], r5w
%endif
%endif
inc iiq
jle .writeback_loop
mov eax, 1
.return:
ADD rsp, pad
RET
%if num_coefs == 16 && dc == 0
.return_zero:
pxor m0, m0
mova [r10+ 0], m0
mova [r10+16], m0
%if HIGH_BIT_DEPTH
mova [r10+32], m0
mova [r10+48], m0
%endif
jmp .return
%endif
%endmacro ; TRELLIS
%macro TRELLIS_LOOP 1 ; ctx_hi
.i_loop%1:
; if( !quant_coefs[i] )
mov r6, quant_coefsm
%if HIGH_BIT_DEPTH
mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
%else
movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
%endif
; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
mov r10, cabac_state_sigm
%if num_coefs == 64
mov r6d, b_interlacedm
add r6d, iid
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
movzx r10, byte [r10 + r13]
%else
movzx r10, byte [r10 + iiq]
%endif
test abs_leveld, abs_leveld
jnz %%.nonzero_quant_coef
%if %1 == 0
; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
; nodes_cur[0].score -= cost_sig0;
movzx r10, word [cabac_entropy + r10*2 GLOBAL]
imul r10, lambda2q
shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
sub [nodes_curq + node_score(0)], r10
%endif
ZERO_LEVEL_IDX %1, cur
jmp .i_continue%1
%%.nonzero_quant_coef:
; int sign_coef = orig_coefs[zigzag[i]];
; int abs_coef = abs( sign_coef );
; int q = abs( quant_coefs[i] );
movzx zigzagid, byte [zigzagq+iiq]
movd m0, abs_leveld
mov r6, orig_coefsm
%if HIGH_BIT_DEPTH
LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
psrad m1, 16 ; sign_coef
%endif
punpcklqdq m0, m0 ; quant_coef
%if cpuflag(ssse3)
pabsd m0, m0
pabsd m2, m1 ; abs_coef
%else
pxor m8, m8
pcmpgtd m8, m1 ; sign_mask
pxor m0, m8
pxor m2, m1, m8
psubd m0, m8
psubd m2, m8
%endif
psubd m0, [sq_1] ; abs_level
movd abs_leveld, m0
xchg nodes_curq, nodes_prevq
; if( i < num_coefs-1 )
; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i
; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
%if %1 == 0
%if dc && num_coefs != 8
cmp iid, i_coefsm1
%else
cmp iid, num_coefs-1
%endif
je %%.zero_siglast
%endif
movzx r11, word [cabac_entropy + r10*2 GLOBAL]
xor r10, 1
movzx r12, word [cabac_entropy + r10*2 GLOBAL]
mov [cost_siglast+0], r11d
mov r10, cabac_state_lastm
%if num_coefs == 64
movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
movzx r10, byte [r10 + r13]
%else
movzx r10, byte [r10 + iiq]
%endif
movzx r11, word [cabac_entropy + r10*2 GLOBAL]
add r11, r12
mov [cost_siglast+4], r11d
%if %1 == 0
xor r10, 1
movzx r10, word [cabac_entropy + r10*2 GLOBAL]
add r10, r12
mov [cost_siglast+8], r10d
%endif
%%.skip_siglast:
; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
; int d = abs_coef - unquant_abs_level;
; uint64_t ssd = (int64_t)d*d * coef_weight[i];
%if dc
pmuludq m0, unquant_mf
%else
mov r10, unquant_mfm
LOAD_DUP m3, [r10 + zigzagiq*4]
pmuludq m0, m3
%endif
paddd m0, [pq_128]
psrld m0, 8 ; unquant_abs_level
%if psy || dc == 0
mova m4, m0
%endif
psubd m0, m2
SQUARE 0, 3
%if dc
psllq m0, 8
%else
LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
pmuludq m0, m5
%endif
%if psy
test iid, iid
jz %%.dc_rounding
; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
; ssd1[k] -= psy_weight * psy_value;
mov r6, fenc_dctm
%if HIGH_BIT_DEPTH
LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
psrad m3, 16 ; orig_coef
%endif
%if cpuflag(ssse3)
psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
%else
PSIGN d, m4, m8
%endif
psubd m3, m1 ; predicted_coef
paddd m4, m3
%if cpuflag(ssse3)
pabsd m4, m4
%else
ABSD m3, m4
SWAP 4, 3
%endif
LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
pmuludq m1, psy_trellis
pmuludq m4, m1
psubq m0, m4
%if %1
%%.dc_rounding:
%endif
%endif
%if %1 == 0
mova [ssd], m0
%endif
%if dc == 0 && %1 == 0
test iid, iid
jnz %%.skip_dc_rounding
%%.dc_rounding:
; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
; uint64_t ssd = (int64_t)d*d * coef_weight[i];
psrad m1, 31 ; sign_coef>>31
paddd m4, [pd_8]
paddd m4, m1
pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
psubd m4, m2 ; d
SQUARE 4, 3
pmuludq m4, m5
mova [ssd], m4
%%.skip_dc_rounding:
%endif
mova [ssd+16], m0
%assign stack_offset_bak stack_offset
cmp abs_leveld, 1
jl %%.switch_coef0
%if %1 == 0
mov r10, [ssd] ; trellis_coef* args
%endif
movq r12, m0
; for( int j = 0; j < 8; j++ )
; nodes_cur[j].score = TRELLIS_SCORE_MAX;
%if cpuflag(ssse3)
mova [nodes_curq + node_score(0)], m7
mova [nodes_curq + node_score(2)], m7
%else ; avoid store-forwarding stalls on k8/k10
%if %1 == 0
movq [nodes_curq + node_score(0)], m7
%endif
movq [nodes_curq + node_score(1)], m7
movq [nodes_curq + node_score(2)], m7
movq [nodes_curq + node_score(3)], m7
%endif
mova [nodes_curq + node_score(4)], m7
mova [nodes_curq + node_score(6)], m7
je %%.switch_coef1
%%.switch_coefn:
call trellis_coefn.entry%1
call trellis_coefn.entry%1b
jmp .i_continue1
%%.switch_coef1:
call trellis_coef1.entry%1
call trellis_coefn.entry%1b
jmp .i_continue1
%%.switch_coef0:
call trellis_coef0_%1
call trellis_coef1.entry%1b
.i_continue%1:
dec iid
%if num_coefs == 16 && dc == 0
cmp iid, b_acm
%endif
jge .i_loop%1
call trellis_bnode_%1
%if %1 == 0
%if num_coefs == 16 && dc == 0
jz .return_zero
%else
jz .return
%endif
jmp .writeback
%%.zero_siglast:
xor r6d, r6d
mov [cost_siglast+0], r6
mov [cost_siglast+8], r6d
jmp %%.skip_siglast
%endif
%endmacro ; TRELLIS_LOOP
; just a synonym for %if
%macro IF0 1+
%endmacro
%macro IF1 1+
%1
%endmacro
%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
; for( int j = 0; j < 8; j++ )
; nodes_cur[j].level_idx = levels_used;
; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
; levels_used++;
add levels_usedd, 3
and levels_usedd, ~3 ; allow aligned stores
movd m0, levels_usedd
pshufd m0, m0, 0
IF%1 mova m1, m0
paddd m0, [pd_0123]
IF%1 paddd m1, [pd_4567]
mova m2, [nodes_%2q + node_level_idx(0)]
IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
mova [nodes_curq + node_level_idx(0)], m0
IF%1 mova [nodes_curq + node_level_idx(4)], m1
mova [level_tree + (levels_usedq+0)*4], m2
IF%1 mova [level_tree + (levels_usedq+4)*4], m3
add levels_usedd, (1+%1)*4
%endmacro
INIT_XMM sse2
TRELLIS trellis_cabac_4x4, 16, 0, 0
TRELLIS trellis_cabac_8x8, 64, 0, 0
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
TRELLIS trellis_cabac_dc, 16, 1, 0
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
INIT_XMM ssse3
TRELLIS trellis_cabac_4x4, 16, 0, 0
TRELLIS trellis_cabac_8x8, 64, 0, 0
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
TRELLIS trellis_cabac_dc, 16, 1, 0
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
%define stack rsp+gprsize
%define scoreq r14
%define bitsq r13
%define bitsd r13d
INIT_XMM
%macro clocal 1
ALIGN 16
global mangle(private_prefix %+ _%1)
mangle(private_prefix %+ _%1):
%1:
%assign stack_offset stack_offset_bak+gprsize
%endmacro
%macro TRELLIS_BNODE 1 ; ctx_hi
clocal trellis_bnode_%1
; int j = ctx_hi?1:0;
; trellis_node_t *bnode = &nodes_cur[j];
; while( ++j < (ctx_hi?8:4) )
; if( nodes_cur[j].score < bnode->score )
; bnode = &nodes_cur[j];
%assign j %1
mov rax, [nodes_curq + node_score(j)]
lea rax, [rax*8 + j]
%rep 3+3*%1
%assign j j+1
mov r11, [nodes_curq + node_score(j)]
lea r11, [r11*8 + j]
cmp rax, r11
cmova rax, r11
%endrep
mov r10, dctm
and eax, 7
ret
%endmacro ; TRELLIS_BNODE
TRELLIS_BNODE 0
TRELLIS_BNODE 1
%macro TRELLIS_COEF0 1 ; ctx_hi
clocal trellis_coef0_%1
; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
mov r11d, [cost_siglast+0]
imul r11, lambda2q
shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
add r11, [ssd+16]
%if %1 == 0
; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
mov scoreq, [nodes_prevq + node_score(0)]
add scoreq, [ssd]
sub scoreq, r11
mov [nodes_curq + node_score(0)], scoreq
%endif
; memcpy
mov scoreq, [nodes_prevq + node_score(1)]
mov [nodes_curq + node_score(1)], scoreq
mova m1, [nodes_prevq + node_score(2)]
mova [nodes_curq + node_score(2)], m1
%if %1
mova m1, [nodes_prevq + node_score(4)]
mova [nodes_curq + node_score(4)], m1
mova m1, [nodes_prevq + node_score(6)]
mova [nodes_curq + node_score(6)], m1
%endif
mov r6d, [nodes_prevq + node_cabac_state(3)]
mov [nodes_curq + node_cabac_state(3)], r6d
%if %1
mova m1, [nodes_prevq + node_cabac_state(4)]
mova [nodes_curq + node_cabac_state(4)], m1
%endif
ZERO_LEVEL_IDX %1, prev
ret
%endmacro ; TRELLIS_COEF0
TRELLIS_COEF0 0
TRELLIS_COEF0 1
%macro START_COEF 1 ; gt1
; if( (int64_t)nodes_prev[0].score < 0 ) continue;
mov scoreq, [nodes_prevq + node_score(j)]
%if j > 0
test scoreq, scoreq
js .ctx %+ nextj_if_invalid
%endif
; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
%if j >= 3
movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
%else
movzx r6d, byte [level_statem + coeff_abs_level1_offs]
%endif
%if %1
xor r6d, 1
%endif
movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
; n.score += ssd;
; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
%if j == 0
add scoreq, r10
add bitsd, [cost_siglast+8]
%else
add scoreq, r12
add bitsd, [cost_siglast+4]
%endif
%endmacro ; START_COEF
%macro END_COEF 1
; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
imul bitsq, lambda2q
shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
add scoreq, bitsq
; if( n.score < nodes_cur[node_ctx].score )
; SET_LEVEL( n, abs_level );
; nodes_cur[node_ctx] = n;
cmp scoreq, [nodes_curq + node_score(node_ctx)]
jae .ctx %+ nextj_if_valid
mov [nodes_curq + node_score(node_ctx)], scoreq
%if j == 2 || (j <= 3 && node_ctx == 4)
; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
%elif j >= 3
; if we have updated before, then copy cabac_state from the parent node
mov r6d, [nodes_prevq + node_cabac_state(j)]
mov [nodes_curq + node_cabac_state(node_ctx)], r6d
%endif
%if j >= 3 ; skip the transition if we're not going to reuse the context
mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
%endif
%if %1 && node_ctx == 7
mov r6d, levelgt1_ctxm
mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
%endif
mov r6d, [nodes_prevq + node_level_idx(j)]
%if %1
mov r11d, abs_leveld
shl r11d, 16
or r6d, r11d
%else
or r6d, 1<<16
%endif
mov [level_tree + levels_usedq*4], r6d
mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
inc levels_usedd
%endmacro ; END_COEF
%macro COEF1 2
%assign j %1
%assign nextj_if_valid %1+1
%assign nextj_if_invalid %2
%if j < 4
%assign coeff_abs_level1_offs j+1
%else
%assign coeff_abs_level1_offs 0
%endif
%if j < 3
%assign node_ctx j+1
%else
%assign node_ctx j
%endif
.ctx %+ j:
START_COEF 0
add bitsd, 1 << CABAC_SIZE_BITS
END_COEF 0
%endmacro ; COEF1
%macro COEFN 2
%assign j %1
%assign nextj_if_valid %2
%assign nextj_if_invalid %2
%if j < 4
%assign coeff_abs_level1_offs j+1
%assign coeff_abs_levelgt1_offs 5
%else
%assign coeff_abs_level1_offs 0
%assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
%endif
%if j < 4
%assign node_ctx 4
%elif j < 7
%assign node_ctx j+1
%else
%assign node_ctx 7
%endif
.ctx %+ j:
START_COEF 1
; if( abs_level >= 15 )
; bits += bs_size_ue_big(...)
add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
; n.cabac_state[levelgt1_ctx]
%if j == 7 ; && compiling support for 4:2:2
mov r6d, levelgt1_ctxm
%define coeff_abs_levelgt1_offs r6
%endif
%if j == 7
movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
%else
movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
%endif
; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
add r10d, r1d
movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
add bitsd, r6d
%if node_ctx == 7
movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL]
%endif
END_COEF 1
%endmacro ; COEFN
clocal trellis_coef1
.entry0b: ; ctx_lo, larger of the two abs_level candidates
mov r10, [ssd+8]
sub r10, r11
mov r12, [ssd+24]
sub r12, r11
.entry0: ; ctx_lo, smaller of the two abs_level candidates
COEF1 0, 4
COEF1 1, 4
COEF1 2, 4
COEF1 3, 4
.ctx4:
rep ret
.entry1b: ; ctx_hi, larger of the two abs_level candidates
mov r12, [ssd+24]
sub r12, r11
.entry1: ; ctx_hi, smaller of the two abs_level candidates
trellis_coef1_hi:
COEF1 1, 2
COEF1 2, 3
COEF1 3, 4
COEF1 4, 5
COEF1 5, 6
COEF1 6, 7
COEF1 7, 8
.ctx8:
rep ret
%macro COEFN_PREFIX 1
; int prefix = X264_MIN( abs_level - 1, 14 );
mov r1d, abs_leveld
cmp abs_leveld, 15
jge .level_suffix%1
xor r5d, r5d
.skip_level_suffix%1:
shl r1d, 7
%endmacro
%macro COEFN_SUFFIX 1
.level_suffix%1:
; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
lea r5d, [abs_levelq-14]
bsr r5d, r5d
shl r5d, CABAC_SIZE_BITS+1
add r5d, 1<<CABAC_SIZE_BITS
; int prefix = X264_MIN( abs_level - 1, 14 );
mov r1d, 15
jmp .skip_level_suffix%1
%endmacro
clocal trellis_coefn
.entry0b:
mov r10, [ssd+8]
mov r12, [ssd+24]
inc abs_leveld
.entry0:
; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
; The C version has to be fully separate since C doesn't support multiple
; entrypoints. But return-on-first-failure isn't very important here (as
; opposed to coef1), so I might as well reduce codesize.
COEFN_PREFIX 0
COEFN 0, 1
COEFN 1, 2
COEFN 2, 3
COEFN 3, 8
.ctx8:
mov zigzagq, zigzagm ; unspill since r1 was clobbered
ret
.entry1b:
mov r12, [ssd+24]
inc abs_leveld
.entry1:
COEFN_PREFIX 1
COEFN 4, 5
COEFN 5, 6
COEFN 6, 7
COEFN 7, 1
jmp .ctx1
COEFN_SUFFIX 0
COEFN_SUFFIX 1