Hi, first of all, welcome to 2018 on my blog! I hope you all feel as excited about the new year as I do. Sorry for being silent for a while, but I was absorbed in my first GitHub fork, “sowson/darknet”, and I was coding like crazy. I still have several to-dos on this code, but I really like it. Regarding this journey, I would like to share with you something awesome that I had been looking for all over the Internet and GitHub. The problem I was trying to solve is a well-known limitation of OpenCL that CUDA does not have when you use the C language for low-level development. In CUDA, you can have a variable_gpu of type float * that supports pointer arithmetic. What does that mean? It means you can use the +, -, += or -= operators to change the address and access different parts of memory.
variable_gpu += 20; /* move the float * device pointer forward by 20 elements */
or
variable_pointed_gpu = variable_gpu + 20; /* second pointer, 20 elements past variable_gpu */
The above examples can be easily achieved with CUDA, but OpenCL uses the cl_mem abstraction, and float * can only be used inside kernel code. In a C implementation, you do not have many choices, but fortunately, I found one excellent tip: a function named clCreateSubBuffer. I decided to use this function, but first, I extended the classical cl_mem as follows in the header file.
/*
 * cl_mem_ext: a cl_mem handle plus the bookkeeping needed to emulate
 * pointer arithmetic on device buffers through sub-buffers.
 */

/* Forward typedef so the struct's own function-pointer members can name the
 * type. The struct below is defined by tag only: repeating the typedef is a
 * constraint violation in C99 and earlier. */
typedef struct _cl_mem_ext cl_mem_ext;

struct _cl_mem_ext {
    cl_mem mem;  /* handle used for transfers: org itself or a sub-buffer of org */
    cl_mem org;  /* the buffer originally returned by clCreateBuffer */
    size_t len;  /* number of elements in org */
    size_t off;  /* offset of mem inside org, in elements */
    size_t obs;  /* size of one element, in bytes */
    size_t cnt;  /* bumped by inc, lowered by dec, set to 1 by add and rem */

    /* The same operations as the free functions declared below, reachable
     * through the value itself: buf = buf.inc(buf, 20, 20); */
    cl_mem_ext (*inc) (cl_mem_ext dat, int inc, size_t len);
    cl_mem_ext (*dec) (cl_mem_ext dat, int dec, size_t len);
    cl_mem_ext (*add) (cl_mem_ext dat, int add, size_t len);
    cl_mem_ext (*rem) (cl_mem_ext dat, int rem, size_t len);
};

/* Every function takes the descriptor by value and returns the updated
 * descriptor; the caller has to keep the returned value. */
cl_mem_ext inc(cl_mem_ext buf, int inc, size_t len);  /* off += inc, then mov */
cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len);  /* off -= dec, then mov */
cl_mem_ext mov(cl_mem_ext buf, size_t len);           /* rebuild mem at the current off */
cl_mem_ext add(cl_mem_ext buf, int inc, size_t len);  /* off = inc, then upd */
cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len);  /* off = dec, then upd */
cl_mem_ext upd(cl_mem_ext buf, size_t len);           /* new descriptor at the current off */
There are many elements in the extension I would like to propose, but I thought about it very carefully, and I need all of them. Now I would like to share with you the possible usages of this abstraction, which include the creation and release of cl_mem_ext. All you need to do is use this implementation in the .c file and replace cl_mem with cl_mem_ext everywhere it appears. The implementation is as follows.
cl_mem_ext opencl_make_array(float *x, size_t n) { cl_mem_ext buf; buf.len = n; buf.obs = sizeof(cl_float); buf.off = 0; buf.cnt = 0; buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE, buf.len * buf.obs, NULL, NULL); buf.mem = buf.org; buf.inc = inc; buf.dec = dec; buf.add = add; buf.rem = rem; if (x != NULL) opencl_push_array(buf, x, n); else { float *cptr = (float*) calloc(n * sizeof(float), 1); if (cptr != NULL) opencl_push_array(buf, cptr, n); free(cptr); } return buf; } cl_mem_ext opencl_make_int_array(size_t n) { cl_mem_ext buf; buf.len = n; buf.obs = sizeof(cl_int); buf.off = 0; buf.cnt = 0; buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE, buf.len * buf.obs, NULL, NULL); buf.mem = buf.org; buf.inc = inc; buf.dec = dec; buf.add = add; buf.rem = rem; return buf; } void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n) { cl_int clErr = clEnqueueWriteBuffer(opencl_queue, x_gpu.mem, CL_TRUE, 0, (n - x_gpu.off) * x_gpu.obs, x, 0, NULL, NULL); if (clErr != CL_SUCCESS) printf("Could not push array to device. Error code %d\n", clErr); } void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n) { cl_int clErr = clEnqueueReadBuffer(opencl_queue, x_gpu.mem, CL_TRUE, 0, (n - x_gpu.off) * x_gpu.obs, x, 0, NULL, NULL); if (clErr != CL_SUCCESS) printf("Could not pull array from device. Error code %d\n", clErr); } void opencl_free(cl_mem_ext x_gpu) { x_gpu.len = 0; x_gpu.obs = 0; clReleaseMemObject(x_gpu.org); x_gpu.mem = 0; x_gpu.off = 0; x_gpu.cnt = 0; x_gpu.inc = 0; x_gpu.dec = 0; x_gpu.add = 0; x_gpu.rem = 0; } cl_mem_ext inc(cl_mem_ext buf, int inc, size_t len) { if (buf.len == 0) return buf; buf.off += inc; buf.cnt += 1; return mov(buf, len); } cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len) { if (buf.len == 0) return buf; buf.off -= dec; buf.cnt -= 1; return mov(buf, len); } cl_mem_ext mov(cl_mem_ext buf, size_t len) { if (buf.len == 0) return buf; cl_buffer_region region; region.origin = buf.off * buf.obs; region.size = len != 0 ? 
len * buf.obs : (buf.len - buf.off) * buf.obs; cl_int err = 0; buf.mem = clCreateSubBuffer( buf.org, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); if (err != CL_SUCCESS) { printf("Could not sub buffer from device. Error code %d\n", err); } return buf; } cl_mem_ext add(cl_mem_ext buf, int inc, size_t len) { if (buf.len == 0) return buf; buf.off = inc; buf.cnt = 1; return upd(buf, len); } cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len) { if (buf.len == 0) return buf; buf.off = dec; buf.cnt = 1; return upd(buf, len); } cl_mem_ext upd(cl_mem_ext buf, size_t len) { if (buf.len == 0) return buf; cl_mem_ext ret; ret.org = buf.org; ret.len = buf.len; ret.obs = buf.obs; ret.org = buf.org; ret.off = buf.off; ret.cnt = buf.cnt; cl_buffer_region region; region.origin = ret.off * ret.obs; region.size = len != 0 ? len * ret.obs : (ret.len - ret.off) * ret.obs; cl_int err = 0; ret.mem = clCreateSubBuffer( ret.org, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); if (err != CL_SUCCESS) { printf("Could not sub buffer from device. Error code %d\n", err); } return ret; }
And now, in place of the CUDA examples above, you can write the following in OpenCL.
variable_gpu.inc(variable_gpu, 20, 20);
or
variable_pointed_gpu = variable_gpu.add(variable_gpu, 20, 20); /* view starting at element 20, 20 elements long */
Thanks for reading,
p ;).