Floating Pointers on OpenCL cl_mem

Hi, first of all, welcome to 2018 on my blog. I hope you all feel as excited about the new year as I do. Sorry for being silent for a while, but I was absorbed in my first GitHub fork, “sowson/darknet”, and I was coding like crazy. I still have several to-dos in this code, but I really like it. Regarding this journey, I would like to share with you something awesome that I had been searching for all over the Internet and GitHub. The problem I was trying to solve is a well-known limitation of OpenCL that CUDA does not have when you use the C language for low-level development. In CUDA, a variable_gpu of type float * supports pointer arithmetic. What does that mean? It means you can use the +, -, += or -= operators to change the address and access different parts of memory.

variable_gpu += 20; /* CUDA: move the float * forward by 20 elements */

or

variable_pointed_gpu = variable_gpu + 20; /* CUDA: second pointer, 20 elements past variable_gpu */

The above examples can be easily achieved with CUDA, but OpenCL uses cl_mem abstraction, and only in kernels code can you use float*. In C implementation, you do not have too many choices, but fortunately, I found one excellent tip: a function named clCreateSubBuffer. I decided to use this function, but first, I extended the classical cl_mem as follows in the header file.

/* Forward typedef so the struct can use its own name in its function-pointer members. */
typedef struct _cl_mem_ext cl_mem_ext;
/*
 * Extended cl_mem: the originally created buffer plus a movable view into it.
 * Values of this type are passed and returned by value by every function
 * in this file.
 */
typedef struct _cl_mem_ext {
    cl_mem mem; /* handle used for transfers; equals org after creation, replaced by a sub-buffer in mov()/upd() */
    cl_mem org; /* buffer returned by clCreateBuffer; parent of every sub-buffer; released by opencl_free() */
    size_t len; /* element count the buffer was created with */
    size_t off; /* offset of the current view, in elements (multiplied by obs to get bytes) */
    size_t obs; /* size of one element in bytes (sizeof(cl_float) or sizeof(cl_int)) */
    size_t cnt; /* counter: +1 in inc(), -1 in dec(), set to 1 by add() and rem() */
    cl_mem_ext (*inc) (cl_mem_ext dat, int inc, size_t len); /* assigned inc() by the make functions */
    cl_mem_ext (*dec) (cl_mem_ext dat, int dec, size_t len); /* assigned dec() by the make functions */
    cl_mem_ext (*add) (cl_mem_ext dat, int add, size_t len); /* assigned add() by the make functions */
    cl_mem_ext (*rem) (cl_mem_ext dat, int rem, size_t len); /* assigned rem() by the make functions */
} cl_mem_ext;
/* Adds inc to buf.off, increments buf.cnt and rebuilds buf.mem through mov(). */
cl_mem_ext inc(cl_mem_ext buf, int inc, size_t len);
/* Subtracts dec from buf.off, decrements buf.cnt and rebuilds buf.mem through mov(). */
cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len);
/* Creates a sub-buffer of buf.org at buf.off; len == 0 means "up to the end of the buffer". */
cl_mem_ext mov(cl_mem_ext buf, size_t len);
/* Sets buf.off to inc and buf.cnt to 1, then returns the result of upd(). */
cl_mem_ext add(cl_mem_ext buf, int inc, size_t len);
/* Sets buf.off to dec and buf.cnt to 1, then returns the result of upd(). */
cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len);
/* Builds a separate cl_mem_ext whose mem is a sub-buffer of buf.org at buf.off. */
cl_mem_ext upd(cl_mem_ext buf, size_t len);

There are many elements in the extension I would like to propose, but I thought about it very carefully, and I need all of them. Now I would like to share with you the possible usages of that abstraction, including the creation and release of cl_mem_ext. All you need to do is use that implementation in the C file and replace cl_mem with cl_mem_ext everywhere it is used. The implementation is as follows.

cl_mem_ext opencl_make_array(float *x, size_t n)
{
    cl_mem_ext buf;
    buf.len = n;
    buf.obs = sizeof(cl_float);
    buf.off = 0;
    buf.cnt = 0;
    buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE,
                             buf.len * buf.obs, NULL, NULL);
    buf.mem = buf.org;
    buf.inc = inc;
    buf.dec = dec;
    buf.add = add;
    buf.rem = rem;
    if (x != NULL)
        opencl_push_array(buf, x, n);
    else
    {
        float *cptr = (float*) calloc(n * sizeof(float), 1);
        if (cptr != NULL)
            opencl_push_array(buf, cptr, n);
        free(cptr);
    }
    return buf;
}
cl_mem_ext opencl_make_int_array(size_t n)
{
    cl_mem_ext buf;
    buf.len = n;
    buf.obs = sizeof(cl_int);
    buf.off = 0;
    buf.cnt = 0;
    buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE,
                             buf.len * buf.obs, NULL, NULL);
    buf.mem = buf.org;
    buf.inc = inc;
    buf.dec = dec;
    buf.add = add;
    buf.rem = rem;
    return buf;
}
/*
 * Copy host data x into the device view x_gpu.mem with a blocking write.
 * The transfer starts at byte 0 of the view and covers
 * (n - x_gpu.off) * x_gpu.obs bytes. A failure is only printed.
 */
void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n)
{
    const size_t bytes = (n - x_gpu.off) * x_gpu.obs;
    const cl_int status = clEnqueueWriteBuffer(opencl_queue, x_gpu.mem,
                                               CL_TRUE, 0, bytes, x,
                                               0, NULL, NULL);

    if (status == CL_SUCCESS)
    {
        return;
    }
    printf("Could not push array to device. Error code %d\n", status);
}
/*
 * Copy the device view x_gpu.mem back into host memory x with a blocking read.
 * The transfer starts at byte 0 of the view and covers
 * (n - x_gpu.off) * x_gpu.obs bytes. A failure is only printed.
 */
void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n)
{
    const size_t bytes = (n - x_gpu.off) * x_gpu.obs;
    const cl_int status = clEnqueueReadBuffer(opencl_queue, x_gpu.mem,
                                              CL_TRUE, 0, bytes, x,
                                              0, NULL, NULL);

    if (status == CL_SUCCESS)
    {
        return;
    }
    printf("Could not pull array from device. Error code %d\n", status);
}
/*
 * Release the device memory referenced by x_gpu.
 *
 * x_gpu is received by value, so the caller's copy cannot be reset from here;
 * the caller must simply stop using it (and any other copy of it) afterwards.
 *
 * If x_gpu.mem is a sub-buffer (it differs from x_gpu.org) it is released
 * first: per the OpenCL specification a parent buffer is not deleted while
 * sub-buffers created from it still exist. Release failures are printed.
 */
void opencl_free(cl_mem_ext x_gpu)
{
    cl_int clErr;

    if (x_gpu.org == NULL)
        return;

    if (x_gpu.mem != NULL && x_gpu.mem != x_gpu.org)
    {
        clErr = clReleaseMemObject(x_gpu.mem);
        if (clErr != CL_SUCCESS)
            printf("Could not release sub buffer on device. Error code %d\n", clErr);
    }

    clErr = clReleaseMemObject(x_gpu.org);
    if (clErr != CL_SUCCESS)
        printf("Could not release array on device. Error code %d\n", clErr);
}
/*
 * Shift the view forward: add inc to buf.off, increment buf.cnt and let
 * mov() create the matching sub-buffer of len elements (0 = to the end).
 * A buffer with len == 0 is returned untouched.
 */
cl_mem_ext inc(cl_mem_ext buf, int inc, size_t len) {
    if (buf.len != 0) {
        buf.cnt = buf.cnt + 1;
        buf.off = buf.off + inc;
        buf = mov(buf, len);
    }
    return buf;
}
/*
 * Shift the view backward: subtract dec from buf.off, decrement buf.cnt and
 * let mov() create the matching sub-buffer of len elements (0 = to the end).
 * A buffer with len == 0 is returned untouched.
 */
cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len) {
    if (buf.len != 0) {
        buf.cnt = buf.cnt - 1;
        buf.off = buf.off - dec;
        buf = mov(buf, len);
    }
    return buf;
}
/*
 * Point buf.mem at a sub-buffer of buf.org that starts at element buf.off.
 * The sub-buffer spans len elements, or everything from buf.off to the end
 * of the buffer when len is 0. A buffer with len == 0 is returned untouched.
 * On failure the error code is printed and buf.mem holds whatever
 * clCreateSubBuffer returned.
 */
cl_mem_ext mov(cl_mem_ext buf, size_t len) {
    cl_buffer_region view;
    cl_int status = 0;
    size_t elems;

    if (buf.len == 0) {
        return buf;
    }

    elems = (len == 0) ? buf.len - buf.off : len;
    view.origin = buf.off * buf.obs;
    view.size = elems * buf.obs;

    buf.mem = clCreateSubBuffer(buf.org, CL_MEM_READ_WRITE,
                                CL_BUFFER_CREATE_TYPE_REGION, &view, &status);
    if (status != CL_SUCCESS) {
        printf("Could not sub buffer from device. Error code %d\n", status);
    }
    return buf;
}
/*
 * Produce a view at an absolute position: buf.off is set to inc (it is not
 * added to the previous offset) and buf.cnt is set to 1, then upd() builds
 * the sub-buffer of len elements (0 = to the end).
 * A buffer with len == 0 is returned untouched.
 */
cl_mem_ext add(cl_mem_ext buf, int inc, size_t len) {
    if (buf.len != 0) {
        buf.cnt = 1;
        buf.off = inc;
        buf = upd(buf, len);
    }
    return buf;
}
/*
 * Produce a view at an absolute position: buf.off is set to dec and buf.cnt
 * is set to 1, then upd() builds the sub-buffer of len elements (0 = to the
 * end). A buffer with len == 0 is returned untouched.
 * NOTE(review): this performs the same assignments as add() - confirm
 * whether a subtraction from the current offset was intended.
 */
cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len) {
    if (buf.len != 0) {
        buf.cnt = 1;
        buf.off = dec;
        buf = upd(buf, len);
    }
    return buf;
}
/*
 * Build a cl_mem_ext whose mem is a sub-buffer of buf.org starting at element
 * buf.off. The sub-buffer spans len elements, or everything from buf.off to
 * the end of the buffer when len is 0.
 *
 * The result starts as a full copy of buf so that every member - including
 * the inc/dec/add/rem function pointers - is initialized; only mem is
 * replaced. A buffer with len == 0 is returned untouched. On failure the
 * error code is printed and mem holds whatever clCreateSubBuffer returned.
 */
cl_mem_ext upd(cl_mem_ext buf, size_t len) {
    if (buf.len == 0) return buf;

    /* Copy everything; previously the function pointers were left indeterminate. */
    cl_mem_ext ret = buf;

    cl_buffer_region region;
    region.origin = ret.off * ret.obs;
    region.size = len != 0 ? len * ret.obs : (ret.len - ret.off) * ret.obs;

    cl_int err = 0;
    ret.mem = clCreateSubBuffer(
        ret.org, CL_MEM_READ_WRITE,
        CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
    if (err != CL_SUCCESS)
    {
        printf("Could not sub buffer from device. Error code %d\n", err);
    }
    return ret;
}

And now, in place of the CUDA examples above, you can write the following in OpenCL.

variable_gpu.inc(variable_gpu, 20, 20);

or

/* View of 20 elements starting at element 20 of variable_gpu's original buffer. */
variable_pointed_gpu = variable_gpu.add(variable_gpu, 20, 20);

Thanks for reading,

p ;).

Leave a Reply

Your email address will not be published. Required fields are marked *

*

This site uses Akismet to reduce spam. Learn how your comment data is processed.