PhD OpenCL Challenges

Hi, today I want to show you code samples for OpenCL that may be important when you start with this amazing GPU-computing framework. The goal is to make code run as well as possible on any platform that supports OpenCL: on GPUs from AMD, Intel, NVidia and Mali, and on CPUs from AMD, Intel and ARM. All examples are from the Darknet on OpenCL port I did some time ago.

The first one addresses the memory-transfer issue between system memory (RAM) and graphics-card video memory (VRAM). The structure shown below wraps cl_mem (OpenCL's VRAM abstraction) in a very useful C pattern: it holds the RAM pointer and the VRAM cl_mem handle together as a pair. The other members support the tasks described later.


// https://github.com/sowson/darknet
// src/opencl.h

/* Paired RAM/VRAM buffer abstraction: keeps the host pointer and the
 * device cl_mem handle together, plus bookkeeping for sub-buffer
 * "offset" views (see mov/inc/dec/add/rem below). */
typedef struct _cl_mem_ext cl_mem_ext;

typedef struct _cl_mem_ext {
    cl_mem mem;   /* current view of the buffer (may be a sub-buffer of org) */
    cl_mem org;   /* original full-size device allocation */
    size_t len;   /* element count */
    size_t off;   /* current element offset into org */
    size_t obs;   /* object size in bytes (sizeof one element) */
    size_t cnt;   /* usage counter — semantics set by the helpers below */
    /* Helper function pointers; each takes the struct BY VALUE and
     * returns the modified copy (a C stand-in for +=/-= pointer math). */
    cl_mem_ext (*cln) (cl_mem_ext buf);
    cl_mem_ext (*inc) (cl_mem_ext buf, int inc, size_t len);
    cl_mem_ext (*dec) (cl_mem_ext buf, int dec, size_t len);
    cl_mem_ext (*add) (cl_mem_ext buf, int add, size_t len);
    cl_mem_ext (*rem) (cl_mem_ext buf, int rem, size_t len);
    void* ptr;    /* host (RAM) pointer paired with the device buffer */
    void* map;    /* scratch pointer for clEnqueueMapBuffer round-trips */
    cl_command_queue que; /* queue used for transfers on this buffer */
} cl_mem_ext;

The second example shows how to create the VRAM and RAM parts in the best possible multi-platform way. It is similar to the C calloc function.


// https://github.com/sowson/darknet
// src/opencl.c

cl_mem_ext opencl_make_array(float *x, size_t n)
{
    cl_mem_ext buf;

    buf.len = n;
    buf.obs = sizeof(cl_float);
    buf.off = 0;
    buf.cnt = 0;

    buf.ptr = x;

    cl_int clErr;
    buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
                             buf.len * buf.obs, buf.ptr, &clErr);
    if (clErr != CL_SUCCESS)
        printf("could create buffer on device. error: %s\n", clCheckError(clErr));

    buf.mem = buf.org;

    buf.cln = cln;
    buf.inc = inc;
    buf.dec = dec;
    buf.add = add;
    buf.rem = rem;

    buf.que = opencl_queues[opencl_device_id_t];

    return buf;
}

The third technique shows how to push data from RAM to VRAM, again in the best possible manner for multi-platform support.


// https://github.com/sowson/darknet
// src/opencl.c

/*
 * Push n floats from host array x into device buffer x_gpu (blocking).
 *
 * Fast path: when x is the host pointer originally paired with the
 * buffer, a plain blocking write is issued. Otherwise the buffer is
 * mapped into host address space, memcpy'd, and unmapped — the
 * portable route on platforms where map/unmap is the cheap transfer
 * (iGPUs with shared memory).
 *
 * NOTE(review): the transfer size is (n - off) elements; presumably
 * callers pass n as the logical end index — confirm against callers.
 */
void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n)
{
    const size_t size = (n - x_gpu.off) * x_gpu.obs; /* hoisted: was computed three times */

    if (x_gpu.ptr == (void*)x) {
        cl_int clErr = clEnqueueWriteBuffer(x_gpu.que, x_gpu.mem, CL_TRUE, 0, size, x, 0, NULL, NULL);
        if (clErr != CL_SUCCESS)
            printf("could not push array to device. error: %s\n", clCheckError(clErr));
    } else {
        cl_int clErr;
        x_gpu.map = clEnqueueMapBuffer(x_gpu.que, x_gpu.org, CL_TRUE, CL_MAP_WRITE,
                                       0, size, 0, NULL, NULL, &clErr);
        if (clErr != CL_SUCCESS) {
            printf("could not map array to device. error: %s\n", clCheckError(clErr));
            exit(1);
        }
        memcpy(x_gpu.map, x, size);
        clErr = clEnqueueUnmapMemObject(x_gpu.que, x_gpu.org, x_gpu.map, 0, NULL, NULL);
        if (clErr != CL_SUCCESS)
            printf("could not unmap array from device. error: %s\n", clCheckError(clErr));
    }
}

The fourth technique shows how to pull data from VRAM to RAM, also in the best possible manner for multi-platform support.


// https://github.com/sowson/darknet
// src/opencl.c

/*
 * Pull n floats from device buffer x_gpu into host array x (blocking).
 *
 * Mirror of opencl_push_array: a blocking read when x is the paired
 * host pointer, otherwise a map/memcpy/unmap round-trip for platforms
 * where mapping is the cheap path.
 */
void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n)
{
    const size_t size = (n - x_gpu.off) * x_gpu.obs; /* hoisted: was computed three times */

    if (x_gpu.ptr == (void*)x) {
        cl_int clErr = clEnqueueReadBuffer(x_gpu.que, x_gpu.mem, CL_TRUE, 0, size, x, 0, NULL, NULL);
        if (clErr != CL_SUCCESS)
            printf("could not pull array from device. error: %s\n", clCheckError(clErr));
    } else {
        cl_int clErr;
        x_gpu.map = clEnqueueMapBuffer(x_gpu.que, x_gpu.org, CL_TRUE, CL_MAP_READ,
                                       0, size, 0, NULL, NULL, &clErr);
        if (clErr != CL_SUCCESS) {
            printf("could not map array from device. error: %s\n", clCheckError(clErr)); /* fix: message said "to device" in the pull path */
            exit(1);
        }
        memcpy(x, x_gpu.map, size);
        clErr = clEnqueueUnmapMemObject(x_gpu.que, x_gpu.org, x_gpu.map, 0, NULL, NULL);
        if (clErr != CL_SUCCESS)
            printf("could not unmap array from device. error: %s\n", clCheckError(clErr));
    }
}

And the fifth technique solves the “offset problem”: moving from one place to another in VRAM and getting only the needed part. Usage is similar to the C “+=” and “-=” pointer operators, as in CUDA where VRAM is (most often) a “float*”.


// https://github.com/sowson/darknet
// src/opencl.c

/*
 * Re-point buf.mem at a sub-buffer of the original allocation,
 * starting at the current element offset buf.off.
 *
 * len == 0 means "to the end of the buffer". The struct is taken and
 * returned by value, so the caller's copy is untouched until assigned.
 */
cl_mem_ext mov(cl_mem_ext buf, size_t len) {
    const size_t count = (len != 0) ? len : buf.len - buf.off;

    cl_buffer_region region;
    region.origin = buf.off * buf.obs;
    region.size = count * buf.obs;

    cl_int clErr = 0;
    buf.mem = clCreateSubBuffer(buf.org, CL_MEM_READ_WRITE,
                                CL_BUFFER_CREATE_TYPE_REGION, &region, &clErr);
    if (clErr != CL_SUCCESS) {
        printf("could not create sub-buffer on device. error: %s\n", clCheckError(clErr));
    }

    return buf;
}

In the end I saved the cherry on the cake for last: the clBLAS library, which I patched for multi-threaded and multi-queue compute. It is in C++ and covers SGEMM (matrix multiplication). If you wonder about all the changes, please look at my clBLAS fork on GitHub — the first and, so far, only commit. The only issue is that it works only on macOS (a well-done operating-system threading model). Why not on GNU/Linux? Well, to be honest, I am not 100% sure, but I assume it is because of the POSIX threading model, where threads are processes with shared memory; in any case, I tested it and it does not work… the root cause is still under investigation. The last thing is that NVidia's OpenCL also does not support this solution, because it assumes each OpenCL queue uses its own in-memory compile of the kernel within the same OpenCL context, and that is not possible with NVidia OpenCL, where kernels have to be unique. I think this is an over-engineered optimization.


// https://github.com/sowson/clBLAS
// src/library/blas/xgemm.cc

/* Key for the per-queue kernel cache: a (queue, source) pair, so each
 * command queue gets its own compiled copy of a given kernel source. */
typedef struct kernel_map_key_ {
  cl_command_queue queue;
  const char *kernelSource; // address of kernel source
} kernel_map_key;

// Strict weak ordering over (queue, kernelSource) so kernel_map_key can
// serve as a std::map key: order by queue handle first, then by the
// kernel-source address. Two keys are equivalent only when both fields
// are equal.
bool operator<(const kernel_map_key & l, const kernel_map_key & r) {
  if (l.queue != r.queue) {
    return l.queue < r.queue;
  }
  return l.kernelSource < r.kernelSource;
}

Of course, you may wonder how to use this patched clBLAS library in a multi-threading-ready way.


// https://github.com/sowson/darknet
// src/gemm.c

#if !defined(GPU_MULTI) && !defined(ARM)
/*
 * Row-major SGEMM on device buffers via clBLAS:
 *   C = ALPHA * op(A) * op(B) + BETA * C
 * where op(X) is X transposed when TA/TB is non-zero.
 *
 * offset_A/B/C are element offsets into each cl_mem; lda/ldb/ldc are
 * leading dimensions. The call is enqueued on the current device's
 * queue with no events, so completion is governed by later queue
 * synchronization by the caller.
 *
 * NOTE(review): clErr holds a clblasStatus, compared against
 * CL_SUCCESS — valid since clblasSuccess == CL_SUCCESS, but worth
 * confirming for extended clBLAS error codes.
 */
void gemm_offset_gpu(int TA, int TB, int M, int N, int K,
              float ALPHA,
              cl_mem_ext A_gpu, int offset_A, int lda,
              cl_mem_ext B_gpu, int offset_B, int ldb,
              float BETA,
              cl_mem_ext C_gpu, int offset_C, int ldc)
{
    cl_int clErr;

    clErr = clblasSgemm(clblasRowMajor,
                        (TA ? clblasTrans : clblasNoTrans),
                        (TB ? clblasTrans : clblasNoTrans),
                        M, N, K, ALPHA,
                        A_gpu.mem, offset_A, lda,
                        B_gpu.mem, offset_B, ldb,
                        BETA,
                        C_gpu.mem, offset_C, ldc,
                        1, &opencl_queues[opencl_device_id_t], 0, NULL, NULL);

    if (clErr != CL_SUCCESS)
    {
        printf("gemm_gpu: clblasSgemm failed. Errorcode: %d\n", clErr);
    }
}

I hope you liked these C/C++ code examples. Next time I plan to explain a way of making multi-GPU-ready OpenCL code… so stay tuned.
p ;).

Leave a Reply

Your email address will not be published. Required fields are marked *

*

This site uses Akismet to reduce spam. Learn how your comment data is processed.