Hi, I want to show you code samples for OpenCL that may be important when you start with this great GPU-computing framework. The goal is to make the code run as well as possible on any platform that supports OpenCL: on GPUs from AMD, Intel, NVidia, and Mali, and on CPUs from AMD, Intel, and ARM. All examples are from the Darknet on OpenCL port I did some time ago.
The first example addresses the memory-transfer issue between system memory (RAM) and graphics card video memory (VRAM). The abstraction below presents the concept of cl_mem (the OpenCL VRAM abstraction) wrapped in a handy C pattern. The goal is to hold the RAM ptr and the VRAM cl_mem together as a pair. The other members are helpers for tasks described later.
// https://github.com/sowson/darknet // src/opencl.h typedef struct _cl_mem_ext cl_mem_ext; typedef struct _cl_mem_ext { cl_mem mem; cl_mem org; size_t len; size_t off; size_t obs; size_t cnt; cl_mem_ext (*cln) (cl_mem_ext buf); cl_mem_ext (*inc) (cl_mem_ext buf, int inc, size_t len); cl_mem_ext (*dec) (cl_mem_ext buf, int dec, size_t len); cl_mem_ext (*add) (cl_mem_ext buf, int add, size_t len); cl_mem_ext (*rem) (cl_mem_ext buf, int rem, size_t len); void* ptr; void* map; cl_command_queue que; } cl_mem_ext;
The second example shows how to create the VRAM and RAM parts in the most portable multi-platform way. It is similar to the calloc C function.
// https://github.com/sowson/darknet // src/opencl.c cl_mem_ext opencl_make_array(float *x, size_t n) { cl_mem_ext buf; buf.len = n; buf.obs = sizeof(cl_float); buf.off = 0; buf.cnt = 0; buf.ptr = x; cl_int clErr; buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, buf.len * buf.obs, buf.ptr, &clErr); if (clErr != CL_SUCCESS) printf("could create buffer on device. error: %s\n", clCheckError(clErr)); buf.mem = buf.org; buf.cln = cln; buf.inc = inc; buf.dec = dec; buf.add = add; buf.rem = rem; buf.que = opencl_queues[opencl_device_id_t]; return buf; }
The third technique shows how to push data from RAM to VRAM in the most portable multi-platform manner.
// https://github.com/sowson/darknet // src/opencl.c void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n) { if (x_gpu.ptr == (void*)x) { cl_int clErr = clEnqueueWriteBuffer(x_gpu.que, x_gpu.mem, CL_TRUE, 0, (n - x_gpu.off) * x_gpu.obs, x, 0, NULL, NULL); if (clErr != CL_SUCCESS) printf("could not push array to device. error: %s\n", clCheckError(clErr)); } else { cl_int clErr; x_gpu.map = clEnqueueMapBuffer(x_gpu.que, x_gpu.org, CL_TRUE, CL_MAP_WRITE, 0, (n - x_gpu.off) * x_gpu.obs, 0, NULL, NULL, &clErr); if (clErr != CL_SUCCESS) { printf("could not map array to device. error: %s\n", clCheckError(clErr)); exit(1); } memcpy(x_gpu.map, x, (n - x_gpu.off) * x_gpu.obs); clErr = clEnqueueUnmapMemObject(x_gpu.que, x_gpu.org, x_gpu.map, 0, NULL, NULL); if (clErr != CL_SUCCESS) printf("could not unmap array from device. error: %s\n", clCheckError(clErr)); } }
The fourth technique shows how to pull data from VRAM back to RAM, again in the most portable multi-platform manner.
// https://github.com/sowson/darknet // src/opencl.c void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n) { if (x_gpu.ptr == (void*)x) { cl_int clErr = clEnqueueReadBuffer(x_gpu.que, x_gpu.mem, CL_TRUE, 0, (n - x_gpu.off) * x_gpu.obs, x, 0, NULL, NULL); if (clErr != CL_SUCCESS) printf("could not pull array from device. error: %s\n", clCheckError(clErr)); } else { cl_int clErr; x_gpu.map = clEnqueueMapBuffer(x_gpu.que, x_gpu.org, CL_TRUE, CL_MAP_READ, 0, (n - x_gpu.off) * x_gpu.obs, 0, NULL, NULL, &clErr); if (clErr != CL_SUCCESS) { printf("could not map array to device. error: %s\n", clCheckError(clErr)); exit(1); } memcpy(x, x_gpu.map, (n - x_gpu.off) * x_gpu.obs); clErr = clEnqueueUnmapMemObject(x_gpu.que, x_gpu.org, x_gpu.map, 0, NULL, NULL); if (clErr != CL_SUCCESS) printf("could not unmap array from device. error: %s\n", clCheckError(clErr)); } }
And the fifth technique solves the "offset problem": moving from one place to another in VRAM and getting only the required part. Usage is similar to the "+=" and "-=" operators possible in C and CUDA, where VRAM is (most often) addressed through a float*.
// https://github.com/sowson/darknet // src/opencl.c cl_mem_ext mov(cl_mem_ext buf, size_t len) { cl_buffer_region region; region.origin = buf.off * buf.obs; region.size = len != 0 ? len * buf.obs : (buf.len - buf.off) * buf.obs; cl_int clErr = 0; buf.mem = clCreateSubBuffer(buf.org, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &clErr); if (clErr != CL_SUCCESS) { printf("could not create sub-buffer on device. error: %s\n", clCheckError(clErr)); } return buf; }
Ultimately, I made a cherry-on-the-cake example in the clBLAS library, which I patched for multi-threaded and multi-queue computing. It is in C++ and covers SGEMM (matrix multiplication). If you want to see all the changes, please look at my clBLAS fork on GitHub — the first and so far the only commit. The only issue is that it works only on macOS (well done, Operating System Threading Model). Why not on GNU/Linux? Well, to be honest, I am not 100% sure, but I assume it is because of the POSIX threading model, where threads are processes with shared memory; like I said, I tested it, and it does not work… the root cause is still under my investigation. The last thing is that NVidia's OpenCL also does not support this solution, because it assumes that each OpenCL queue in the same OpenCL context will use a separately compiled in-memory copy of the OpenCL kernel, which is not possible with NVidia OpenCL, where kernel sources have to be unique. I think this is an over-engineered optimization.
// https://github.com/sowson/clBLAS // src/library/blas/xgemm.cc typedef struct kernel_map_key_ { cl_command_queue queue; const char *kernelSource; // address of kernel source } kernel_map_key; bool operator<(const kernel_map_key & l, const kernel_map_key & r) { if (l.queue < r.queue) { return true; } else if (r.queue < l.queue) { return false; } if (l.kernelSource < r.kernelSource) { return true; } else if (r.kernelSource < l.kernelSource) { return false; } return false; }
Of course, you may wonder how to use this patched clBLAS library in a multi-threading-ready way.
// https://github.com/sowson/darknet // src/gemm.c #if !defined(GPU_MULTI) && !defined(ARM) void gemm_offset_gpu(int TA, int TB, int M, int N, int K, float ALPHA, cl_mem_ext A_gpu, int offset_A, int lda, cl_mem_ext B_gpu, int offset_B, int ldb, float BETA, cl_mem_ext C_gpu, int offset_C, int ldc) { cl_int clErr; clErr = clblasSgemm(clblasRowMajor, (TA ? clblasTrans : clblasNoTrans), (TB ? clblasTrans : clblasNoTrans), M, N, K, ALPHA, A_gpu.mem, offset_A, lda, B_gpu.mem, offset_B, ldb, BETA, C_gpu.mem, offset_C, ldc, 1, &opencl_queues[opencl_device_id_t], 0, NULL, NULL); if (clErr != CL_SUCCESS) { printf("gemm_gpu: clblasSgemm failed. Errorcode: %d\n", clErr); } }
I hope you like these C/C++ code examples. Next time, I plan to explain a way of making multi-GPU-ready OpenCL code… so stay tuned.
p ;).