bidir.cl
9.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* Mode selection routines, select the least SATD cost mode for each lowres
* macroblock. When measuring B slices, this includes measuring the cost of
* three bidir modes. */
/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
int2 fencpos,
read_only image2d_t fref0_planes,
int2 qpos0,
read_only image2d_t fref1_planes,
int2 qpos1,
int weight,
local sum2_t *tmpp,
int idx )
{
volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
sum2_t b0, b1, b2, b3;
sum2_t sum = 0;
// fencpos is full-pel position of original MB
// qpos0 is qpel position within reference frame 0
// qpos1 is qpel position within reference frame 1
int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
uint vA, vB;
uint enc, ref0, ref1;
uint a0, a1;
const int weight2 = 64 - weight;
#define READ_BIDIR_DIFF( OUT, X )\
enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
ref0 = rhadd( vA, vB );\
vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
ref1 = rhadd( vA, vB );\
OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
#define READ_DIFF_EX( OUT, a, b )\
READ_BIDIR_DIFF( a0, a );\
READ_BIDIR_DIFF( a1, b );\
OUT = a0 + (a1<<BITS_PER_SUM);
#define ROW_8x4_SATD( a, b, c )\
fencpos.y += a;\
fref0Apos.y += b;\
fref0Bpos.y += b;\
fref1Apos.y += c;\
fref1Bpos.y += c;\
READ_DIFF_EX( b0, 0, 4 );\
READ_DIFF_EX( b1, 1, 5 );\
READ_DIFF_EX( b2, 2, 6 );\
READ_DIFF_EX( b3, 3, 7 );\
HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
ROW_8x4_SATD( 0, 0, 0 );
ROW_8x4_SATD( 4, 4, 4 );
#undef READ_BIDIR_DIFF
#undef READ_DIFF_EX
#undef ROW_8x4_SATD
return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
/*
* mode selection - pick the least cost partition type for each 8x8 macroblock.
* Intra, list0 or list1. When measuring a B slice, also test three bidir
* possibilities.
*
* fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
* hold many frames worth of motion vectors. We must offset into the correct
* location for this frame's vectors:
*
* CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
* GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
*
* global launch dimensions for P slice estimate: [mb_width, mb_height]
* global launch dimensions for B slice estimate: [mb_width * 4, mb_height]
*/
kernel void mode_selection( read_only image2d_t fenc_lowres,
read_only image2d_t fref0_planes,
read_only image2d_t fref1_planes,
const global short2 *fenc_lowres_mvs0,
const global short2 *fenc_lowres_mvs1,
const global short2 *fref1_lowres_mvs0,
const global int16_t *fenc_lowres_mv_costs0,
const global int16_t *fenc_lowres_mv_costs1,
const global uint16_t *fenc_intra_cost,
global uint16_t *lowres_costs,
global int *frame_stats,
local int16_t *cost_local,
local sum2_t *satd_local,
int mb_width,
int bipred_weight,
int dist_scale_factor,
int b,
int p0,
int p1,
int lambda )
{
int mb_x = get_global_id( 0 );
int b_bidir = b < p1;
if( b_bidir )
{
/* when mode_selection is run for B frames, it must perform BIDIR SATD
* measurements, so it is launched with four times as many threads in
* order to spread the work around more of the GPU. And it can add
* padding threads in the X direction. */
mb_x >>= 2;
if( mb_x >= mb_width )
return;
}
int mb_y = get_global_id( 1 );
int mb_height = get_global_size( 1 );
int mb_count = mb_width * mb_height;
int mb_xy = mb_x + mb_y * mb_width;
/* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
if( mb_x < 4 && mb_y == 0 )
frame_stats[mb_x] = 0;
int bcost = COST_MAX;
int list_used = 0;
if( !b_bidir )
{
int icost = fenc_intra_cost[mb_xy];
COPY2_IF_LT( bcost, icost, list_used, 0 );
}
if( b != p0 )
{
int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
}
if( b != p1 )
{
int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
}
if( b_bidir )
{
int2 coord = (int2)(mb_x, mb_y) << 3;
int mb_i = get_global_id( 0 ) & 3;
int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
cost_local += mb_in_group * 4;
satd_local += mb_in_group * 16;
#define TRY_BIDIR( mv0, mv1, penalty )\
{\
int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
}
/* temporal prediction */
short2 dmv0, dmv1;
short2 mvr = fref1_lowres_mvs0[mb_xy];
dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
dmv1 = dmv0 - mvr;
TRY_BIDIR( dmv0, dmv1, 0 )
if( as_uint( dmv0 ) || as_uint( dmv1 ) )
{
/* B-direct prediction */
dmv0 = 0; dmv1 = 0;
TRY_BIDIR( dmv0, dmv1, 0 );
}
/* L0+L1 prediction */
dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
TRY_BIDIR( dmv0, dmv1, 5 );
#undef TRY_BIDIR
}
lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
}
/*
* parallel sum inter costs
*
* global launch dimensions: [256, mb_height]
*/
kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
const global uint16_t *inv_qscale_factor,
global int *fenc_row_satds,
global int *frame_stats,
int mb_width,
int bframe_bias,
int b,
int p0,
int p1 )
{
int y = get_global_id( 1 );
int mb_height = get_global_size( 1 );
int row_satds = 0;
int cost_est = 0;
int cost_est_aq = 0;
int intra_mbs = 0;
for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
{
int mb_xy = x + y * mb_width;
int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
if( list == 0 && b_frame_score_mb )
intra_mbs++;
int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
row_satds += cost_aq;
if( b_frame_score_mb )
{
cost_est += cost;
cost_est_aq += cost_aq;
}
}
local int buffer[256];
int x = get_global_id( 0 );
row_satds = parallel_sum( row_satds, x, buffer );
cost_est = parallel_sum( cost_est, x, buffer );
cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
intra_mbs = parallel_sum( intra_mbs, x, buffer );
if( b != p1 )
// Use floating point math to avoid 32bit integer overflow conditions
cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
if( get_global_id( 0 ) == 0 )
{
fenc_row_satds[y] = row_satds;
atomic_add( frame_stats + COST_EST, cost_est );
atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
atomic_add( frame_stats + INTRA_MBS, intra_mbs );
}
}