#version 450 #define LOCAL_SZ_X 256 layout(push_constant) uniform pushBlock { int channels; int in_h; int in_w; int out_h; int out_w; int padding_h; int padding_w; int filter_h; int filter_w; int stride_h; int stride_w; int total; int padded_area; } p; layout(binding = 0) readonly buffer Input0{ float in_buffer[]; }; layout(binding = 1) writeonly buffer Output{ float out_buffer[]; }; layout(local_size_x = LOCAL_SZ_X, local_size_y = 1, local_size_z = 1) in; void main() { int global_size = int(gl_WorkGroupSize.x * gl_NumWorkGroups.x); int gid = int(gl_GlobalInvocationID.x); for (int index = gid; index < p.total; index += global_size) { const int pw = index % p.out_w; const int ph = (index / p.out_w) % p.out_h; const int c = (index / p.out_w / p.out_h) % p.channels; const int n = index / p.out_w / p.out_h / p.channels; int hstart = ph * p.stride_h - p.padding_h; int wstart = pw * p.stride_w - p.padding_w; int hend = min(hstart + p.filter_h, p.in_h + p.padding_h); int wend = min(wstart + p.filter_w, p.in_w + p.padding_w); int pool_size; if (p.padded_area == 1) { pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, p.in_h); wend = min(wend, p.in_w); } else { hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, p.in_h); wend = min(wend, p.in_w); pool_size = (hend - hstart) * (wend - wstart); } float aveval = 0; int off = (n * p.channels + c) * p.in_h * p.in_w; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { aveval += in_buffer[off + h * p.in_w + w]; } } out_buffer[index] = aveval / pool_size; } }