GPU OpenCL Fine-Tuning Problem

Hi, today I have a C and OpenCL code fine-tuning problem to think about. This problem is quite complex to solve and truth be told I do not know how and why it does not work. The example is from real project at https://github.com/sowson/darknet.
OpenCL
Let me first show you slow to compute version of example method in OpenCL.

__kernel void  mean_kernel(int N, __global float *x, int batch, int filters, int spatial, __global float *mean)
{
    float scale = 1.f/(batch * spatial);
    int id = (get_group_id(0) + get_group_id(1)*get_num_groups(0)) * get_local_size(0) + get_local_id(0);
    if (id >= N) return;
    int i = id;
    mean[i] = 0;
    int j, k;
    for (j = 0; j < batch; ++j) {
        for (k = 0; k < spatial; ++k) {
            int index = j * filters * spatial + i * spatial + k;
            mean[i] += x[index];
        }
    }
    mean[i] *= scale;
}

To give you all the aspects of computation you need to see also C code that invokes this one GPU kernel.

void mean_gpu(cl_mem_ext x, int batch, int filters, int spatial, cl_mem_ext mean)
{
    size_t N = filters;
    dim2 dimGrid;
    dimGrid = dim2_create(N, 1);
    opencl_kernel(opencl_mean_kernel[opencl_device_id_t], dimGrid, 12, &N, sizeof(cl_int), &x.mem, sizeof(cl_mem), &batch, sizeof(cl_int), &filters, sizeof(cl_int), &spatial, sizeof(cl_int), &mean.mem, sizeof(cl_mem));
}

Now let me show you fine-tuned ready code in OpenCL.

__kernel void fast_mean_kernel(int tuning, __local float *sums, int filters, int batch, int spatial, __global float *x, __global float *mean)
{
    int i = get_global_id(0);
    int t = get_local_id(0);
    sums[t] = 0;
    int j,k;
    for (j = 0; j < batch; ++j) {
        for (k = t; k < spatial; k += tuning) {
            int index = j * filters * spatial + i * spatial + k;
            sums[t] += x[index];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    if (t == 0) {
        mean[i] = 0;
        int s;
        for(s = 0; s < tuning; ++s) {
            mean[i] += sums[s];
        }
        mean[i] /= (spatial * batch);
    }
}

To give you all the aspects of computation you need to see also C code that invokes this one GPU kernel.

void fast_mean_gpu(cl_mem_ext x, int batch, int filters, int spatial, cl_mem_ext mean)
{
    int tuning = 16;
    dim2 dimGridG1;
    dimGridG1 = dim2_create(filters, 1);
    dim2 dimGridL1;
    dimGridL1 = dim2_create(tuning, 1);
    opencl_kernel_local(opencl_fast_mean_kernel[opencl_device_id_t], dimGridG1, dimGridL1, 14, &tuning, sizeof(cl_int), NULL, tuning*sizeof(cl_float), &filters, sizeof(cl_int), &batch, sizeof(cl_int), &spatial, sizeof(cl_int), &x.mem, sizeof(cl_mem), &mean.mem, sizeof(cl_mem));
}

Now it is time to show you code that works but only if tuning parapeter is equal 1.

__kernel void fast_variance_kernel(int tuning, __local float *sums, int filters, int batch, int spatial, __global float *x, __global float *mean, __global float *variance)
{
    int i = get_global_id(0);
    int t = get_local_id(0);
    sums[t] = 0;
    int j,k;
    for (j = 0; j < batch; ++j) {
        for (k = t; k < spatial; k += tuning) {
            int index = j * filters * spatial + i * spatial + k;
            sums[t] += pow((x[index] - mean[i]), 2);
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    if (t == 0) {
        variance[i] = 0;
        int s;
        for(s = 0; s < tuning; ++s) {
            variance[i] += sums[s];
        }
        variance[i] /= (spatial * batch - 1);
    }
}

To give you all the aspects of computation you need to see also C code that invokes this one GPU kernel.

void fast_variance_gpu(cl_mem_ext x, cl_mem_ext mean, int batch, int filters, int spatial, cl_mem_ext variance)
{
    int tuning = 1; // It cannot be more than 1, but why? :D
    dim2 dimGridG1;
    dimGridG1 = dim2_create(filters, 1);
    dim2 dimGridL1;
    dimGridL1 = dim2_create(tuning, 1);
    opencl_kernel_local(opencl_fast_variance_kernel[opencl_device_id_t], dimGridG1, dimGridL1, 16, &tuning, sizeof(cl_int), NULL, tuning*sizeof(cl_float), &filters, sizeof(cl_int), &batch, sizeof(cl_int), &spatial, sizeof(cl_int), &x.mem, sizeof(cl_mem), &mean.mem, sizeof(cl_mem), &variance.mem, sizeof(cl_mem));
}

To be honest I do not know why this code does not work correctly when tuning parameter is more than 1.
In the end I can show you how I tested this code in the sandbox simulation of indexes for all code changes.

#include <stdio.h>
#include <stdlib.h>
int main() {
    int filters = 5;
    int batch = 7;
    int spatial = 11;
    int tuning = 4;
    int i, t, j, k;
    int ix = 0;
    int *x = calloc(filters*batch*spatial, sizeof(int));
    for (i = 0; i < filters; ++i) {
        for (j = 0; j < batch; ++j) {
            for (k = 0; k < spatial; ++k) {
                int index = j * filters * spatial + i * spatial + k;
                x[index] = i;
            }
        }
    }
    for (i = 0; i < filters; ++i) {
        for (t = 0; t < tuning; ++t) {
            for (j = 0; j < batch; ++j) {
                for (k = t; k < spatial; k += tuning) {
                    int index = j * filters * spatial + i * spatial + k;
                    if(x[index] != i) {
                        printf("FAIL!\n");
                        printf("c:%i i:%i (was:%i) t:%i, j:%i k:%i\n",
                                ix, i, x[index], t, j, k);
                        goto END;
                    }
                    else {
                        printf("c:%i i:%i index:%i\n",
                                ix, i, index);
                    }
                }
            }
        }
    }
    printf("PASS!\n");
    END:
    if (!x) free(x);
    return 0;
}

Seams to be fine, but it does not work correctly from the math perspective, appreciate any viewpoint on this. Thanks!
p ;).

One Reply to “GPU OpenCL Fine-Tuning Problem”

  1. The solution of this problem is simple… the condition that is “if (t == 0)” is never equal true… it needs to be removed. As simple as that. Thanks!

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.