/* motionsearch.cl */

/* Hierarchical (iterative) OpenCL lowres motion search */
inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
{
/* edge macroblocks might not have a direct descendant, use nearest */
x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
return (mb_width>>1) * y + x;
}
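
/* Worked example (illustrative): with mb_width = 9, the coarser level is
 * 9 >> 1 = 4 MBs wide (columns 0..3).  An interior column such as x = 6 maps
 * to 6 >> 1 = 3; the edge column x = 8 would naively map to 4, which is out
 * of range, so the select() redirects it to (8 - (9&1)) >> 1 = 3, the nearest
 * valid descendant. */
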
/* Four threads calculate an 8x8 SAD. Each does two rows */
int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
{
frefpos.y += idx << 1;
fencpos.y += idx << 1;
int cost = 0;
if( frefpos.x < 0 )
{
        /* slow path when the MV goes past the left edge.  The GPU clamps reads
         * from (-1, 0) to (0, 0), so you get pixels [0, 1, 2, 3] when what you
         * really want is [0, 0, 1, 2]. */
for( int y = 0; y < 2; y++ )
{
for( int x = 0; x < 8; x++ )
{
pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
cost += abs_diff( enc, ref );
}
}
}
else
{
        /* fast path: each image read returns four horizontally adjacent
         * pixels, so two vector reads cover one 8-pixel row */
        uint4 enc, ref, sum = 0;
        enc = read_imageui( fenc, sampler, fencpos );
        ref = read_imageui( fref, sampler, frefpos );
        sum += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
        sum += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
        sum += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
        sum += abs_diff( enc, ref );
        cost = sum.s0 + sum.s1 + sum.s2 + sum.s3;
}
costs[idx] = cost;
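    /* no barrier here: the four cooperating work-items are assumed to execute
     * in lockstep within one SIMD unit, so each of them sees the partial sums
     * the others just wrote to local memory */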
return costs[0] + costs[1] + costs[2] + costs[3];
}
/* One thread performs 8x8 SAD */
int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
{
if( frefpos.x < 0 )
{
/* slow path when MV goes past left edge */
int cost = 0;
for( int y = 0; y < 8; y++ )
{
for( int x = 0; x < 8; x++ )
{
uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
cost += abs_diff( enc, ref );
}
}
return cost;
}
else
{
uint4 enc, ref, cost = 0;
for( int y = 0; y < 8; y++ )
{
for( int x = 0; x < 8; x += 4 )
{
enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
cost += abs_diff( enc, ref );
}
}
return cost.s0 + cost.s1 + cost.s2 + cost.s3;
}
}
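
/* Note: sad_8x8_ii() is the single-thread variant.  In the diamond refinement
 * below, each of the four cooperating threads evaluates a different candidate
 * MV, so the rows of a single SAD cannot be split between them there. */
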
/*
 * hierarchical motion estimation
 *
 * Each kernel launch performs a single iteration of the search.
 *
 * MBs per work-group = (local_size_x / 4) * local_size_y, since four
 * work-items cooperate on each MB.
 *
 * global launch dimensions: [mb_width * 4, mb_height]
 */
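
/* Host-side launch sketch (illustrative only; round_up() and every variable
 * name here are assumptions, not x264's actual host code).  gdim[0] is padded
 * up to the work-group width (the mb_x bounds check at the top of the kernel
 * discards the padding), while gdim[1] must equal mb_height exactly, because
 * the kernel reads it back via get_global_size( 1 ); mb_height is assumed to
 * be a multiple of ldim[1]:
 *
 *     size_t ldim[2] = { 32, 4 };   // (32/4) * 4 = 32 MBs per work-group
 *     size_t gdim[2] = { round_up( mb_width * 4, ldim[0] ), mb_height };
 *     clEnqueueNDRangeKernel( q, kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
 */
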
kernel void hierarchical_motion( read_only image2d_t fenc,
read_only image2d_t fref,
const global short2 *in_mvs,
global short2 *out_mvs,
global int16_t *out_mv_costs,
global short2 *mvp_buffer,
local int16_t *cost_local,
local short2 *mvc_local,
int mb_width,
int lambda,
int me_range,
int scale,
int b_shift_index,
int b_first_iteration,
int b_reverse_references )
{
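    /* four consecutive work-items along dimension 0 cooperate on one MB;
     * mb_i identifies this work-item's lane within its quad */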
int mb_x = get_global_id( 0 ) >> 2;
if( mb_x >= mb_width )
return;
int mb_height = get_global_size( 1 );
int mb_i = get_global_id( 0 ) & 3;
int mb_y = get_global_id( 1 );
int mb_xy = mb_y * mb_width + mb_x;
const int mb_size = 8;
int2 coord = (int2)(mb_x, mb_y) * mb_size;
const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
cost_local += 4 * mb_in_group;
int i_mvc = 0;
mvc_local += 4 * mb_in_group;
mvc_local[mb_i] = 0;
    int2 mvp = 0;
if( !b_first_iteration )
{
#define MVC( DX, DY )\
{\
int px = mb_x + DX;\
int py = mb_y + DY;\
mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
in_mvs[mb_width * py + px];\
mvc_local[i_mvc] >>= (short) scale;\
i_mvc++;\
}
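        /* Each MVC( DX, DY ) pulls the neighbour MB's MV from the previous
         * iteration (or, via find_downscale_mb_xy(), from the coarser pyramid
         * level when the level just changed) and rescales it to this level's
         * units. */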
/* Find MVP from median of MVCs */
if( b_reverse_references )
{
/* odd iterations: derive MVP from down and right */
if( mb_x < mb_width - 1 )
MVC( 1, 0 );
if( mb_y < mb_height - 1 )
{
MVC( 0, 1 );
if( mb_x > b_shift_index )
MVC( -1, 1 );
if( mb_x < mb_width - 1 )
MVC( 1, 1 );
}
}
else
{
/* even iterations: derive MVP from up and left */
if( mb_x > 0 )
MVC( -1, 0 );
if( mb_y > 0 )
{
MVC( 0, -1 );
if( mb_x < mb_width - 1 )
MVC( 1, -1 );
if( mb_x > b_shift_index )
MVC( -1, -1 );
}
}
#undef MVC
mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
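        /* x264_median_mv() (defined elsewhere in the OpenCL sources) takes the
         * component-wise median, e.g. the median of (4,-2), (6,0) and (5,8) is
         * (5,0) */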
}
    /* If the current MVP matches the previous iteration's MVP and the scale
     * has not changed, we know the search will arrive at the same MV again,
     * so just copy the previous result to the output. */
if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
{
out_mvs[mb_xy] = in_mvs[mb_xy];
return;
}
mvp_buffer[mb_xy] = convert_short2_sat(mvp);
int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
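    /* clamp the search to the frame, allowing the 8x8 block to spill at most
     * 4 pixels past each edge */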
int2 bestmv = clamp(mvp, mv_min, mv_max);
int2 refcrd = coord + bestmv;
/* measure cost at bestmv */
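    /* total cost = SAD + lambda * mv_cost( distance from the predictor ); the
     * shift by (2 + scale) expresses that distance in full-resolution
     * quarter-pel units before it is priced */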
int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
do
{
/* measure costs at offsets from bestmv */
refcrd = coord + bestmv + dia_offs[mb_i];
int2 trymv = bestmv + dia_offs[mb_i];
int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
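        /* pack each candidate's cost with its direction index in the low two
         * bits; the min() reduction below then yields both the best cost and
         * which diamond point produced it */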
cost_local[mb_i] = (cost<<2) | mb_i;
cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
if( (cost >> 2) >= bcost )
break;
bestmv += dia_offs[cost&3];
bcost = cost>>2;
if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
break;
}
while( --me_range > 0 );
int2 trymv = 0, diff = 0;
#define COST_MV_NO_PAD( L )\
trymv = clamp( trymv, mv_min, mv_max );\
diff = convert_int2_sat(abs_diff( mvp, trymv ));\
if( diff.x > 1 || diff.y > 1 ) {\
int2 refcrd = coord + trymv;\
int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
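    /* Candidates within one pel of the MVP are skipped (the diff test above),
     * since the diamond refinement already covered that neighbourhood.  The L
     * argument lets the zero-MV probe below run without a lambda penalty. */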
COST_MV_NO_PAD( 0 );
if( !b_first_iteration )
{
/* try cost at previous iteration's MV, if MVP was too far away */
        int2 prevmv = b_shift_index ?
            convert_int2_sat( in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )] ) :
            convert_int2_sat( in_mvs[mb_xy] );
prevmv >>= scale;
trymv = prevmv;
COST_MV_NO_PAD( lambda );
}
for( int i = 0; i < i_mvc; i++ )
{
/* try cost at each candidate MV, if MVP was too far away */
trymv = convert_int2_sat( mvc_local[i] );
COST_MV_NO_PAD( lambda );
}
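    /* bestmv and bcost are uniform across the four cooperating work-items by
     * this point, so only the first one writes the result */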
if( mb_i == 0 )
{
bestmv <<= scale;
out_mvs[mb_xy] = convert_short2_sat(bestmv);
out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
}
}