Hi, today I have a C and OpenCL code fine-tuning problem to think about. This problem is quite complex to solve and truth be told I do not know how and why it does not work. The example is from real project at https://github.com/sowson/darknet.

Let me first show you slow to compute version of example method in OpenCL.

__kernel void mean_kernel(int N, __global float *x, int batch, int filters, int spatial, __global float *mean) { float scale = 1.f/(batch * spatial); int id = (get_group_id(0) + get_group_id(1)*get_num_groups(0)) * get_local_size(0) + get_local_id(0); if (id >= N) return; int i = id; mean[i] = 0; int j, k; for (j = 0; j < batch; ++j) { for (k = 0; k < spatial; ++k) { int index = j * filters * spatial + i * spatial + k; mean[i] += x[index]; } } mean[i] *= scale; }

To give you all the aspects of computation you need to see also C code that invokes this one GPU kernel.

void mean_gpu(cl_mem_ext x, int batch, int filters, int spatial, cl_mem_ext mean) { size_t N = filters; dim2 dimGrid; dimGrid = dim2_create(N, 1); opencl_kernel(opencl_mean_kernel[opencl_device_id_t], dimGrid, 12, &N, sizeof(cl_int), &x.mem, sizeof(cl_mem), &batch, sizeof(cl_int), &filters, sizeof(cl_int), &spatial, sizeof(cl_int), &mean.mem, sizeof(cl_mem)); }

Now let me show you fine-tuned ready code in OpenCL.

__kernel void fast_mean_kernel(int tuning, __local float *sums, int filters, int batch, int spatial, __global float *x, __global float *mean) { int i = get_global_id(0); int t = get_local_id(0); sums[t] = 0; int j,k; for (j = 0; j < batch; ++j) { for (k = t; k < spatial; k += tuning) { int index = j * filters * spatial + i * spatial + k; sums[t] += x[index]; } } barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); if (t == 0) { mean[i] = 0; int s; for(s = 0; s < tuning; ++s) { mean[i] += sums[s]; } mean[i] /= (spatial * batch); } }

To give you all the aspects of computation you need to see also C code that invokes this one GPU kernel.

void fast_mean_gpu(cl_mem_ext x, int batch, int filters, int spatial, cl_mem_ext mean) { int tuning = 16; dim2 dimGridG1; dimGridG1 = dim2_create(filters, 1); dim2 dimGridL1; dimGridL1 = dim2_create(tuning, 1); opencl_kernel_local(opencl_fast_mean_kernel[opencl_device_id_t], dimGridG1, dimGridL1, 14, &tuning, sizeof(cl_int), NULL, tuning*sizeof(cl_float), &filters, sizeof(cl_int), &batch, sizeof(cl_int), &spatial, sizeof(cl_int), &x.mem, sizeof(cl_mem), &mean.mem, sizeof(cl_mem)); }

Now it is time to show you code that works but only if tuning parapeter is equal 1.

__kernel void fast_variance_kernel(int tuning, __local float *sums, int filters, int batch, int spatial, __global float *x, __global float *mean, __global float *variance) { int i = get_global_id(0); int t = get_local_id(0); sums[t] = 0; int j,k; for (j = 0; j < batch; ++j) { for (k = t; k < spatial; k += tuning) { int index = j * filters * spatial + i * spatial + k; sums[t] += pow((x[index] - mean[i]), 2); } } barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); if (t == 0) { variance[i] = 0; int s; for(s = 0; s < tuning; ++s) { variance[i] += sums[s]; } variance[i] /= (spatial * batch - 1); } }

To give you all the aspects of computation you need to see also C code that invokes this one GPU kernel.

void fast_variance_gpu(cl_mem_ext x, cl_mem_ext mean, int batch, int filters, int spatial, cl_mem_ext variance) { int tuning = 1; // It cannot be more than 1, but why? :D dim2 dimGridG1; dimGridG1 = dim2_create(filters, 1); dim2 dimGridL1; dimGridL1 = dim2_create(tuning, 1); opencl_kernel_local(opencl_fast_variance_kernel[opencl_device_id_t], dimGridG1, dimGridL1, 16, &tuning, sizeof(cl_int), NULL, tuning*sizeof(cl_float), &filters, sizeof(cl_int), &batch, sizeof(cl_int), &spatial, sizeof(cl_int), &x.mem, sizeof(cl_mem), &mean.mem, sizeof(cl_mem), &variance.mem, sizeof(cl_mem)); }

To be honest I do not know why this code does not work correctly when tuning parameter is more than 1.

In the end I can show you how I tested this code in the sandbox simulation of indexes for all code changes.

#include <stdio.h> #include <stdlib.h> int main() { int filters = 5; int batch = 7; int spatial = 11; int tuning = 4; int i, t, j, k; int ix = 0; int *x = calloc(filters*batch*spatial, sizeof(int)); for (i = 0; i < filters; ++i) { for (j = 0; j < batch; ++j) { for (k = 0; k < spatial; ++k) { int index = j * filters * spatial + i * spatial + k; x[index] = i; } } } for (i = 0; i < filters; ++i) { for (t = 0; t < tuning; ++t) { for (j = 0; j < batch; ++j) { for (k = t; k < spatial; k += tuning) { int index = j * filters * spatial + i * spatial + k; if(x[index] != i) { printf("FAIL!\n"); printf("c:%i i:%i (was:%i) t:%i, j:%i k:%i\n", ix, i, x[index], t, j, k); goto END; } else { printf("c:%i i:%i index:%i\n", ix, i, index); } } } } } printf("PASS!\n"); END: if (!x) free(x); return 0; }

Seams to be fine, but it does not work correctly from the math perspective, appreciate any viewpoint on this. Thanks!

p ;).

Pingback: GPU OpenCL Fine-Tuning Problem Solution – iblog.isowa.io

It is solved at: https://iblog.isowa.io/2020/06/22/gpu-opencl-fine-tuning-problem-solution/ Thanks!