Examples

Image Rendering

CPU

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>

// Width and height, in pixels, shared by the window, the image and the bitmap.
const unsigned DIMX = 512, DIMY = 512;

// Host-side pixel buffer. Storage is obtained in createBitmap() and
// released in destroyBitmap(); the struct itself does no memory management.
struct Bitmap {
    unsigned char* ptr;      // raw bytes, 4 per pixel (see createBitmap)
    unsigned width, height;  // extents in pixels
};

// Forward declarations for the helpers defined below main().
int julia(int x, int y, int width, int height);  // 1 if the pixel never escapes
void kernel(Bitmap& bmp);                        // renders the fractal into bmp
Bitmap createBitmap(unsigned w, unsigned h);     // allocates a w x h bitmap
void destroyBitmap(Bitmap& bmp);                 // frees the bitmap's storage

int main(void) {
    // Host-side buffer that the fractal is rendered into.
    Bitmap bmp = createBitmap(DIMX, DIMY);

    /*
     * The window has to be the first Forge object created: it sets up
     * the OpenGL context that every other forge::* object relies on.
     */
    forge::Window wnd(DIMX, DIMY, "Fractal Demo");
    wnd.makeCurrent();

    /*
     * Create a font object, load a system font into it and hand it to the
     * window so that it can be used for rendering text.
     *
     * NOTE: this step is optional, by default the window already has a
     * font set up.
     */
    forge::Font fnt;
#if defined(OS_WIN)
    const char* const fontName = "Calibri";
#else
    const char* const fontName = "Vera";
#endif
    fnt.loadSystemFont(fontName);
    wnd.setFont(&fnt);

    /* The image object creates the textures and pixel buffer objects
     * that hold the picture on the graphics side. */
    forge::Image img(DIMX, DIMY, FG_RGBA, forge::u8);

    // Compute the picture on the host.
    kernel(bmp);

    // Create the GL-CPU interop handle for the image's pixel buffer ...
    GfxHandle* handle = 0;
    createGLBuffer(&handle, img.pixels(), FORGE_IMAGE_BUFFER);

    // ... and use Forge's copy helper to move the host data into it.
    copyToGLBuffer(handle, (ComputeResourceHandle)bmp.ptr, img.size());

    // Draw at least once, then keep drawing until the window is closed.
    for (;;) {
        wnd.draw(img);
        if (wnd.close()) break;
    }

    // Release the interop handle first, then the host bitmap.
    releaseGLBuffer(handle);
    destroyBitmap(bmp);
    return 0;
}

/**
 * Allocates an uninitialised bitmap with 4 bytes per pixel.
 *
 * @param w width in pixels
 * @param h height in pixels
 * @return a Bitmap whose `ptr` must later be released with destroyBitmap()
 */
Bitmap createBitmap(unsigned w, unsigned h) {
    // Widen before multiplying: evaluated in `unsigned`, 4 * w * h wraps
    // around for images of 2^30 pixels or more and would under-allocate.
    const unsigned long long numBytes = 4ULL * w * h;

    Bitmap retVal;
    retVal.width  = w;
    retVal.height = h;
    retVal.ptr    = new unsigned char[numBytes];
    return retVal;
}

/**
 * Releases the pixel storage owned by `bmp`.
 *
 * The bitmap is reset to an empty state afterwards, so calling this twice
 * on the same object is a harmless no-op instead of a double free.
 */
void destroyBitmap(Bitmap& bmp) {
    delete[] bmp.ptr;  // delete[] on a null pointer is well defined
    bmp.ptr    = 0;
    bmp.width  = 0;
    bmp.height = 0;
}

/**
 * Renders the fractal into `bmp`, one 4-byte pixel at a time in row-major
 * order. Byte 0 of a pixel is 255 when julia() returns 1 and 0 otherwise,
 * bytes 1 and 2 are always 0 and byte 3 is always 255.
 */
void kernel(Bitmap& bmp) {
    unsigned char* px = bmp.ptr;  // walks the buffer pixel by pixel
    for (unsigned row = 0; row < bmp.height; ++row) {
        for (unsigned col = 0; col < bmp.width; ++col, px += 4) {
            const int inSet = julia(col, row, bmp.width, bmp.height);
            px[0] = 255 * inSet;
            px[1] = 0;
            px[2] = 0;
            px[3] = 255;
        }
    }
}

/**
 * Escape-time test for one pixel of the Julia set with c = -0.8 + 0.156i.
 *
 * The pixel (x, y) is mapped to a complex start value whose components lie
 * in [-1.5, 1.5] across the image, then z = z*z + c is iterated.
 *
 * @return 0 as soon as |z| exceeds 1000, 1 if that never happens within
 *         200 iterations
 */
int julia(int x, int y, int width, int height) {
    const float zoom  = 1.5f;
    const float halfW = width / 2.0f;
    const float halfH = height / 2.0f;

    const std::complex<float> c(-0.8f, 0.156f);
    std::complex<float> z(zoom * (halfW - x) / halfW,
                          zoom * (halfH - y) / halfH);

    int step = 0;
    while (step < 200) {
        z = z * z + c;
        if (std::abs(z) > 1000) return 0;
        ++step;
    }

    return 1;
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cstdio>

// Image extents in pixels, and the byte size of one frame at 4 bytes/pixel.
const unsigned DIMX = 512, DIMY = 512;
const size_t TOT_SIZE = 4 * DIMX * DIMY;

// Host-side launcher for the fractal kernel; defined at the end of the file.
void kernel(unsigned char* dev_out);

int main(void) {
    /*
     * The window has to be the first Forge object created: it sets up
     * the OpenGL context that every other forge::* object relies on.
     */
    forge::Window wnd(DIMX, DIMY, "Fractal Demo");
    wnd.makeCurrent();

    /* The image object creates the textures and pixel buffer objects
     * that hold the picture on the graphics side. */
    forge::Image img(DIMX, DIMY, FG_RGBA, forge::u8);

    // Create the GL-CUDA interop handle for the image's pixel buffer.
    GfxHandle* handle = 0;
    createGLBuffer(&handle, img.pixels(), FORGE_IMAGE_BUFFER);

    // Allocate the device buffer and let the GPU compute the picture.
    unsigned char* dev_out = 0;
    FORGE_CUDA_CHECK(cudaMalloc((void**)&dev_out, TOT_SIZE));
    kernel(dev_out);

    // Use Forge's copy helper to move the result from compute memory
    // into the graphics buffer.
    copyToGLBuffer(handle, (ComputeResourceHandle)dev_out, img.size());

    // Draw at least once, then keep drawing until the window is closed.
    for (;;) {
        wnd.draw(img);
        if (wnd.close()) break;
    }

    // Release the interop handle, then the device buffer.
    releaseGLBuffer(handle);
    FORGE_CUDA_CHECK(cudaFree(dev_out));
    return 0;
}

/**
 * Device-side escape-time test for one pixel of the Julia set with
 * c = -0.8 + 0.156i. Pixel (x, y) of a DIMX x DIMY image is mapped to a
 * complex start value, then z = z*z + c is iterated.
 *
 * @return 0 as soon as |z| exceeds 1000, 1 if that never happens within
 *         200 iterations
 */
__device__ int julia(int x, int y) {
    const float zoom  = 1.5f;
    const float halfW = DIMX / 2.0f;
    const float halfH = DIMY / 2.0f;

    const cuFloatComplex c = make_cuFloatComplex(-0.8f, 0.156f);
    cuFloatComplex z = make_cuFloatComplex(zoom * (halfW - x) / halfW,
                                           zoom * (halfH - y) / halfH);

    for (int remaining = 200; remaining > 0; --remaining) {
        z = cuCaddf(cuCmulf(z, z), c);
        if (cuCabsf(z) > 1000.0f) return 0;
    }

    return 1;
}

/**
 * One thread per pixel of a DIMX x DIMY image laid out row-major with
 * 4 bytes per pixel. Expects a 2D launch; threads that fall outside the
 * image do nothing. Byte 2 of the pixel receives 255 when the device-side
 * julia(x, y) returns 1 and 0 otherwise, bytes 0 and 1 are cleared and
 * byte 3 is set to 255.
 */
__global__ void julia(unsigned char* out) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against the partial blocks at the right/bottom edge.
    if (x >= DIMX || y >= DIMY) return;

    unsigned char* px = out + 4 * (x + y * DIMX);

    px[0] = 0;
    px[1] = 0;
    px[2] = 255 * julia(x, y);
    px[3] = 255;
}

// Integer division that rounds up (for non-negative a and positive b).
inline int divup(int a, int b) {
    const int padded = a + b - 1;
    return padded / b;
}

/**
 * Launches the julia kernel over the whole DIMX x DIMY image.
 *
 * Uses 8x8 thread blocks and enough blocks to cover the image; the kernel
 * itself discards threads that land outside the image. The launch is
 * asynchronous: only launch-time errors are reported here.
 *
 * @param dev_out device buffer the kernel writes 4 bytes per pixel into
 */
void kernel(unsigned char* dev_out) {
    static const dim3 threads(8, 8);
    dim3 blocks(divup(DIMX, threads.x), divup(DIMY, threads.y));

    // clang-format off
    julia<<<blocks, threads>>>(dev_out);
    // clang-format on

    // A kernel launch returns no status by itself; pick up configuration
    // errors explicitly so they are not silently carried into later calls.
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <iostream>
#include <mutex>
#include <sstream>

using namespace cl;
using namespace std;

// Image extents in pixels, and the byte size of one frame at 4 bytes/pixel.
const unsigned DIMX = 512, DIMY = 512;
const unsigned IMG_SIZE = 4 * DIMX * DIMY;

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
// OpenCL C source, compiled at run time by kernel() below.
//   magnitude/mul/add - treat a float2 as a complex number (s0 real, s1 imag)
//   pixel()           - maps (x, y) to a start value, iterates a = a*a + c
//                       with c = (-0.8, 0.156) up to 200 times and returns 0
//                       once |a| exceeds 1000, otherwise 1
//   julia             - one work-item per pixel; inside the w x h image it
//                       writes 255 * pixel(...) to byte 1 of the 4-byte
//                       pixel, 0 to bytes 0 and 2 and 255 to byte 3
static const std::string fractal_ocl_kernel =
R"EOK(
float magnitude(float2 a) {
    return sqrt(a.s0*a.s0+a.s1*a.s1);
}
float2 mul(float2 a, float2 b) {
    return (float2)(a.s0*b.s0-a.s1*b.s1, a.s1*b.s0+a.s0*b.s1);
}
float2 add(float2 a, float2 b) {
    return (float2)(a.s0+b.s0, a.s1+b.s1);
}
int pixel(int x, int y, int width, int height) {
    const float scale = 1.5;
    float jx = scale * (float)(width/2.0f - x)/(width/2.0f);
    float jy = scale * (float)(height/2.0f - y)/(height/2.0f);
    float2 c = (float2)(-0.8f, 0.156f);
    float2 a = (float2)(jx, jy);

    for (int i=0; i<200; i++) {
        a = add(mul(a, a), c);
        if (magnitude(a) > 1000.0f)
            return 0;
    }
    return 1;
}

kernel
void julia(global unsigned char* out, const unsigned w, const unsigned h) {
    int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
    int y = get_group_id(1) * get_local_size(1) + get_local_id(1);
    if (x<w && y<h) {
        int offset        = x + y * w;
        int juliaValue    = pixel(x, y, w, h);
        out[offset*4 + 1] = 255 * juliaValue;
        out[offset*4 + 0] = 0;
        out[offset*4 + 2] = 0;
        out[offset*4 + 3] = 255;
    }
}
)EOK";
// clang-format on

// Integer division that rounds up (for non-negative a and positive b).
inline int divup(int a, int b) {
    const int padded = a + b - 1;
    return padded / b;
}

/**
 * Enqueues the "julia" OpenCL kernel so that it fills `devOut` with a
 * DIMX x DIMY image.
 *
 * The program is built from fractal_ocl_kernel on the first call only
 * (guarded by std::call_once) for the context that `queue` belongs to.
 *
 * @param devOut device buffer the kernel writes into
 * @param queue  command queue the kernel is enqueued on
 */
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue) {
    static std::once_flag compileFlag;
    static cl::Program prog;
    static cl::Kernel kern;

    std::call_once(compileFlag, [queue]() {
        cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
        // The trailing `true` asks the constructor to build the program.
        prog = cl::Program(ctx, fractal_ocl_kernel, true);
        kern = cl::Kernel(prog, "julia");
    });

    // 8x8 work-groups; the global range is rounded up to a whole number of
    // groups and the kernel discards work-items outside the image.
    static const NDRange local(8, 8);
    const int groupsX = divup(DIMX, (int)(local[0]));
    const int groupsY = divup(DIMY, (int)(local[1]));
    NDRange global(local[0] * groupsX, local[1] * groupsY);

    cl::KernelFunctor<Buffer, unsigned, unsigned> juliaOp(kern);
    juliaOp(EnqueueArgs(queue, global, local), devOut, DIMX, DIMY);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "Fractal Demo");
        wnd.makeCurrent();

        /* Create an image object which creates the necessary
         * textures and pixel buffer objects to hold the image
         * */
        forge::Image img(DIMX, DIMY, FG_RGBA, forge::u8);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        /* copy your data into the pixel buffer object exposed by
         * forge::Image class and then proceed to rendering.
         * To help the users with copying the data from compute
         * memory to display memory, Forge provides copy headers
         * along with the library to help with this task
         */
        cl::Buffer devOut(context, CL_MEM_READ_WRITE, IMG_SIZE);

        kernel(devOut, queue);

        GfxHandle* handle = 0;

        // create GL-CPU interop buffer
        createGLBuffer(&handle, img.pixels(), FORGE_IMAGE_BUFFER);

        // copy the data from compute buffer to graphics buffer
        copyToGLBuffer(handle, (ComputeResourceHandle)devOut(), img.size());

        do { wnd.draw(img); } while (!wnd.close());

        // destroy GL-CPU Interop buffer
        releaseGLBuffer(handle);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

Bubblechart with Transparency

CPU

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <algorithm>
#include <cmath>
#include <complex>
#include <functional>
#include <iostream>
#include <iterator>
#include <random>
#include <vector>

// Window extents in pixels.
const unsigned DIMX = 1000, DIMY = 800;

// Range of x values that is sampled and plotted: [0, 2*pi).
const float FRANGE_START = 0.0f;
const float FRANGE_END   = 2.0f * 3.1415926f;

using namespace std;
/**
 * Samples `map` on [range_start, range_end) in steps of `dx` and appends
 * the samples to `vec` as interleaved pairs: x0, map(x0), x1, map(x1), ...
 *
 * Nothing is appended when the range is empty or reversed, or when `dx`
 * is not a positive number (a zero or negative step could never reach
 * `range_end` and would otherwise grow `vec` without bound).
 *
 * @param range_start first x value (inclusive)
 * @param range_end   end of the range (exclusive)
 * @param dx          step between consecutive x values, must be > 0
 * @param vec         output vector; existing contents are kept
 * @param map         function evaluated at every x
 */
void map_range_to_vec_vbo(float range_start, float range_end, float dx,
                          std::vector<float>& vec, float (*map)(float)) {
    // Written with negations so that NaN arguments are rejected as well.
    if (!(dx > 0.0f) || !(range_start < range_end)) return;

    float i = range_start;
    while (i < range_end) {
        vec.push_back(i);
        vec.push_back((*map)(i));

        // Stop if the step is too small to change i in float precision;
        // otherwise the loop would never terminate.
        const float next = i + dx;
        if (!(next > i)) break;
        i = next;
    }
}

/**
 * Plots cosine as a line with fixed-size markers and tangent as a line
 * whose markers get random per-point colours, alphas and radii.
 */
int main(void) {
    std::vector<float> cosData;
    std::vector<float> tanData;

    map_range_to_vec_vbo(FRANGE_START, FRANGE_END, 0.1f, cosData, &cosf);
    map_range_to_vec_vbo(FRANGE_START, FRANGE_END, 0.1f, tanData, &tanf);

    // Every vertex occupies two floats (x, y) in the data vectors, so the
    // number of plotted points is half the vector length.
    const unsigned cosPoints = (unsigned)(cosData.size() / 2);
    const unsigned tanPoints = (unsigned)(tanData.size() / 2);

    std::random_device r;

    std::default_random_engine e1(r());
    std::mt19937_64 gen(r());

    std::uniform_real_distribution<float> nDist(0.0f, 1.0f);
    std::uniform_real_distribution<float> cDist(0.2f, 0.6f);
    std::uniform_real_distribution<float> fDist(0.4f, 0.6f);

    auto clr = std::bind(cDist, gen);
    auto rnd = std::bind(nDist, e1);
    auto alp = std::bind(fDist, gen);

    // Per-point attributes: three colour components, one alpha and one
    // radius per plotted point (not per float of vertex data).
    std::vector<float> colors(3 * tanPoints);
    std::vector<float> alphas(tanPoints);
    std::vector<float> radii(tanPoints);

    std::generate(colors.begin(), colors.end(), clr);
    std::generate(radii.begin(), radii.end(),
                  [&] { return 20.0f + 60.0f * rnd(); });
    std::generate(alphas.begin(), alphas.end(), alp);

    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Bubble chart with Transparency Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_2D);
    chart.setAxesLimits(FRANGE_START, FRANGE_END, -1.0f, 1.0f);

    /* Create several plot objects which creates the necessary
     * vertex buffer objects to hold the different plot types
     */
    forge::Plot plt1 =
        chart.plot(cosPoints, forge::f32, FG_PLOT_LINE,
                   FG_MARKER_TRIANGLE);  // or specify a specific plot type
    forge::Plot plt2 =
        chart.plot(tanPoints, forge::f32, FG_PLOT_LINE,
                   FG_MARKER_CIRCLE);  // last parameter specifies marker shape

    /* Set plot colors */
    plt1.setColor(FG_RED);
    plt2.setColor(FG_GREEN);  // use a forge predefined color
    /* Set plot legends */
    plt1.setLegend("Cosine");
    plt2.setLegend("Tangent");
    /* set plot global marker size */
    plt1.setMarkerSize(20);
    /* copy your data into the opengl buffer object exposed by
     * forge::Plot class and then proceed to rendering.
     * To help the users with copying the data from compute
     * memory to display memory, Forge provides copy headers
     * along with the library to help with this task
     */

    GfxHandle* handles[5];

    // create GL-CPU interop buffers
    createGLBuffer(&handles[0], plt1.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[1], plt2.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], plt2.colors(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[3], plt2.alphas(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[4], plt2.radii(), FORGE_VERTEX_BUFFER);

    // copy the data from compute buffer to graphics buffer
    copyToGLBuffer(handles[0], (ComputeResourceHandle)cosData.data(),
                   plt1.verticesSize());
    copyToGLBuffer(handles[1], (ComputeResourceHandle)tanData.data(),
                   plt2.verticesSize());

    /* update color value for tan graph */
    copyToGLBuffer(handles[2], (ComputeResourceHandle)colors.data(),
                   plt2.colorsSize());
    /* update alpha values for tan graph */
    copyToGLBuffer(handles[3], (ComputeResourceHandle)alphas.data(),
                   plt2.alphasSize());
    /* update marker sizes for tan graph markers */
    copyToGLBuffer(handles[4], (ComputeResourceHandle)radii.data(),
                   plt2.radiiSize());

    do { wnd.draw(chart); } while (!wnd.close());

    // destroy GL-CPU interop buffers
    for (int i = 0; i < 5; ++i) releaseGLBuffer(handles[i]);
    return 0;
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cstdio>
#include <iostream>

// Window extents in pixels.
const unsigned DIMX = 1000, DIMY = 800;

// Sampling of the x axis: step, range, and the resulting number of points
// (the quotient is truncated towards zero).
static const float DX           = 0.1f;
static const float FRANGE_START = 0.0f;
static const float FRANGE_END   = 2 * 3.141592f;
static const size_t DATA_SIZE =
    static_cast<size_t>((FRANGE_END - FRANGE_START) / DX);

// Device array of cuRAND generator states, DATA_SIZE entries; allocated and
// initialised at the start of main(). Null until then.
curandState_t* state = 0;

// Host-side launcher: fills dev_out with (x, f(x)) pairs and, for every
// non-null attribute pointer, fills that buffer with random values.
void kernel(float* dev_out, int functionCode, float* colors, float* alphas,
            float* radii);

// Integer division that rounds up (for non-negative a and positive b).
inline int divup(int a, int b) {
    const int padded = a + b - 1;
    return padded / b;
}

/**
 * Initialises one cuRAND state per thread, using the thread's global index
 * as the sequence number so that every state yields a different stream.
 *
 * Expects a 1D launch. The grid is rounded up to whole blocks, so threads
 * whose index is >= count must not touch `states`.
 *
 * @param states device array with at least `count` elements
 * @param seed   seed shared by all states
 * @param count  number of states to initialise (defaults to DATA_SIZE)
 */
__global__ void setupRandomKernel(curandState* states,
                                  unsigned long long seed,
                                  size_t count = DATA_SIZE) {
    unsigned tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= count) return;  // surplus thread in the last block
    curand_init(seed, tid, 0, &states[tid]);
}

/**
 * Plots cosine and tangent computed on the GPU; the tangent plot gets
 * random per-point colours, alphas and marker radii generated with cuRAND.
 */
int main(void) {
    // One cuRAND state per plotted point, initialised on the device.
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&state, DATA_SIZE * sizeof(curandState_t)));
    // clang-format off
    setupRandomKernel<<<divup(DATA_SIZE, 32), 32>>>(state, 314567);
    // clang-format on
    // A launch returns no status by itself; pick up launch errors here.
    FORGE_CUDA_CHECK(cudaGetLastError());

    float* cos_out;
    float* tan_out;
    float* colors_out;
    float* alphas_out;
    float* radii_out;

    // Two floats (x, y) per vertex, three per colour, one per alpha/radius.
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&cos_out, sizeof(float) * DATA_SIZE * 2));
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&tan_out, sizeof(float) * DATA_SIZE * 2));
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&colors_out, sizeof(float) * DATA_SIZE * 3));
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&alphas_out, sizeof(float) * DATA_SIZE));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&radii_out, sizeof(float) * DATA_SIZE));

    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Bubble chart with Transparency Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_2D);
    chart.setAxesLimits(FRANGE_START, FRANGE_END, -1.0f, 1.0f);

    /* Create several plot objects which creates the necessary
     * vertex buffer objects to hold the different plot types
     */
    forge::Plot plt1 =
        chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE, FG_MARKER_TRIANGLE);
    forge::Plot plt2 =
        chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE, FG_MARKER_CIRCLE);

    /* Set plot colors */
    plt1.setColor(FG_RED);
    plt2.setColor(FG_GREEN);  // use a forge predefined color
    /* Set plot legends */
    plt1.setLegend("Cosine");
    plt2.setLegend("Tangent");
    /* set plot global marker size */
    plt1.setMarkerSize(20);
    /* copy your data into the opengl buffer object exposed by
     * forge::Plot class and then proceed to rendering.
     * To help the users with copying the data from compute
     * memory to display memory, Forge provides copy headers
     * along with the library to help with this task
     */

    GfxHandle* handles[5];

    // create GL-CUDA interop buffers
    createGLBuffer(&handles[0], plt1.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[1], plt2.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], plt2.colors(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[3], plt2.alphas(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[4], plt2.radii(), FORGE_VERTEX_BUFFER);

    // Function code 0 selects cosine, 1 selects tangent; only the tangent
    // plot gets per-point attributes.
    kernel(cos_out, 0, NULL, NULL, NULL);
    kernel(tan_out, 1, colors_out, alphas_out, radii_out);

    // copy the data from compute buffer to graphics buffer
    copyToGLBuffer(handles[0], (ComputeResourceHandle)cos_out,
                   plt1.verticesSize());
    copyToGLBuffer(handles[1], (ComputeResourceHandle)tan_out,
                   plt2.verticesSize());

    /* update color value for tan graph */
    copyToGLBuffer(handles[2], (ComputeResourceHandle)colors_out,
                   plt2.colorsSize());
    /* update alpha values for tan graph */
    copyToGLBuffer(handles[3], (ComputeResourceHandle)alphas_out,
                   plt2.alphasSize());
    /* update marker sizes for tan graph markers */
    copyToGLBuffer(handles[4], (ComputeResourceHandle)radii_out,
                   plt2.radiiSize());

    do { wnd.draw(chart); } while (!wnd.close());

    // destroy GL-CUDA interop buffers
    for (int i = 0; i < 5; ++i) releaseGLBuffer(handles[i]);

    // destroy CUDA buffers, including the generator states allocated above
    FORGE_CUDA_CHECK(cudaFree(cos_out));
    FORGE_CUDA_CHECK(cudaFree(tan_out));
    FORGE_CUDA_CHECK(cudaFree(colors_out));
    FORGE_CUDA_CHECK(cudaFree(alphas_out));
    FORGE_CUDA_CHECK(cudaFree(radii_out));
    FORGE_CUDA_CHECK(cudaFree(state));
    state = NULL;

    return 0;
}

/**
 * Writes one (x, y) pair per thread: x = frange_start + id * dx and
 * y = cos(x) for functionCode 0, tan(x) for 1, sin(x) for anything else.
 *
 * Expects a 1D launch. The grid is rounded up to whole blocks, so threads
 * whose index is >= count must not write.
 *
 * @param out          device array with at least 2 * count floats
 * @param functionCode selects the function to evaluate (see above)
 * @param frange_start x value of point 0
 * @param dx           spacing between consecutive x values
 * @param count        number of points to write (defaults to DATA_SIZE)
 */
__global__ void mapKernel(float* out, int functionCode, float frange_start,
                          float dx, size_t count = DATA_SIZE) {
    const unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= count) return;  // surplus thread in the last block

    const float x = frange_start + id * dx;
    float y;

    // Single-precision variants, matching the float data.
    switch (functionCode) {
        case 0: y = cosf(x); break;
        case 1: y = tanf(x); break;
        default: y = sinf(x); break;
    }

    out[2 * id + 0] = x;
    out[2 * id + 1] = y;
}

/**
 * Fills three consecutive floats per thread with samples drawn from that
 * thread's cuRAND state.
 *
 * Expects a 1D launch. The grid is rounded up to whole blocks, so threads
 * whose index is >= count must touch neither `colors` nor `states`.
 *
 * @param colors device array with at least 3 * count floats
 * @param states device array with at least count initialised states
 * @param count  number of points to colour (defaults to DATA_SIZE)
 */
__global__ void colorsKernel(float* colors, curandState* states,
                             size_t count = DATA_SIZE) {
    const unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= count) return;  // surplus thread in the last block

    colors[3 * id + 0] = curand_uniform(&states[id]);
    colors[3 * id + 1] = curand_uniform(&states[id]);
    colors[3 * id + 2] = curand_uniform(&states[id]);
}

/**
 * Writes one random value per thread: a sample from the thread's cuRAND
 * state multiplied by `scale` and shifted by `min`.
 *
 * Expects a 1D launch. The grid is rounded up to whole blocks, so threads
 * whose index is >= count must touch neither `out` nor `states`.
 *
 * @param out    device array with at least count floats
 * @param states device array with at least count initialised states
 * @param min    offset added to every sample
 * @param scale  factor applied to every sample
 * @param count  number of values to write (defaults to DATA_SIZE)
 */
__global__ void randKernel(float* out, curandState* states, float min,
                           float scale, size_t count = DATA_SIZE) {
    const unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= count) return;  // surplus thread in the last block
    out[id] = curand_uniform(&states[id]) * scale + min;
}

/**
 * Launches the kernels that produce the data for one plot.
 *
 * Always fills `dev_out` with DATA_SIZE (x, f(x)) pairs. Each of the
 * attribute buffers is filled only when its pointer is non-null. All
 * launches are asynchronous; only launch-time errors are reported here.
 *
 * @param dev_out      device buffer for the vertex data
 * @param functionCode function selector forwarded to mapKernel
 * @param colors       optional device buffer for per-point colours
 * @param alphas       optional device buffer for per-point alphas
 * @param radii        optional device buffer for per-point marker radii
 */
void kernel(float* dev_out, int functionCode, float* colors, float* alphas,
            float* radii) {
    static const dim3 threads(32);
    // Round up so that every point is covered by a thread.
    dim3 blocks(divup(DATA_SIZE, threads.x));

    // clang-format off
    mapKernel<<<blocks, threads>>>(dev_out, functionCode, FRANGE_START, DX);
    FORGE_CUDA_CHECK(cudaGetLastError());

    if (colors) {
        colorsKernel<<<blocks, threads>>>(colors, state);
        FORGE_CUDA_CHECK(cudaGetLastError());
    }

    if (alphas) {
        randKernel<<<blocks, threads>>>(alphas, state, 0, 1);
        FORGE_CUDA_CHECK(cudaGetLastError());
    }

    if (radii) {
        randKernel<<<blocks, threads>>>(radii, state, 20, 60);
        FORGE_CUDA_CHECK(cudaGetLastError());
    }
    // clang-format on
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <sstream>
#include <vector>

using namespace cl;
using namespace std;

// Window extents in pixels.
const unsigned DIMX = 1000, DIMY = 800;

// Sampling of the x axis: step, range, and the resulting number of points
// (the quotient is truncated towards zero).
static const float DX           = 0.1f;
static const float FRANGE_START = 0.0f;
static const float FRANGE_END   = 2 * 3.141592f;
static const int DATA_SIZE =
    static_cast<int>((FRANGE_END - FRANGE_START) / DX);

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
// OpenCL C source, compiled at run time by kernel() below.
//   rand()       - mixes the bits of x with integer arithmetic and maps the
//                  masked 31-bit result to a float in (-1, 1]
//   randKernel   - out[id] = scale * (1 + rand(seed * id)) / 2 + min
//   colorsKernel - three values (1 + rand(...)) / 2 per point, one per seed
//   mapKernel    - (x, y) pairs with x = FRANGE_START + id * DX and
//                  y = cos(x) for code 0, tan(x) for 1, sin(x) otherwise
// Every kernel writes only for work-items with id < DATA_SIZE.
static const std::string chartKernels =
R"EOK(
float rand(int x) {
    x = (x << 13) ^ x;
    return (1.0 - ((x * (x * x * 15731 + 789221) + 1376312589) & 0x7fffffff) /
                      1073741824.0);
}

kernel void randKernel(global float* out, unsigned seed, float min, float scale,
                       int DATA_SIZE) {
    int id = get_global_id(0);
    if (id < DATA_SIZE) out[id] = scale * (1 + rand(seed * id)) / 2.0f + min;
}

kernel void colorsKernel(global float* out, unsigned rseed, unsigned gseed,
                         unsigned bseed, int DATA_SIZE) {
    int id = get_global_id(0);
    if (id < DATA_SIZE) {
        out[3 * id + 0] = (1 + rand(rseed * id)) / 2.0f;
        out[3 * id + 1] = (1 + rand(gseed * id)) / 2.0f;
        out[3 * id + 2] = (1 + rand(bseed * id)) / 2.0f;
    }
}

kernel void mapKernel(global float* out, int functionCode, float FRANGE_START,
                      float DX, int DATA_SIZE) {
    int id  = get_global_id(0);
    float x = FRANGE_START + id * DX;
    float y;

    switch (functionCode) {
        case 0: y = cos(x); break;
        case 1: y = tan(x); break;
        default: y = sin(x); break;
    }

    if (id < DATA_SIZE) {
        out[2 * id + 0] = x;
        out[2 * id + 1] = y;
    }
}
)EOK";
// clang-format on

// Integer division that rounds up (for non-negative a and positive b).
inline int divup(int a, int b)
{
    const int padded = a + b - 1;
    return padded / b;
}

/**
 * Enqueues the OpenCL kernels that produce the data for one plot.
 *
 * On the first call the program in chartKernels is built for `device` and
 * the host random generator is seeded; a build failure prints the build
 * log and terminates the process with status 255.
 *
 * @param devOut    buffer that receives DATA_SIZE (x, f(x)) pairs
 * @param fnCode    function selector forwarded to mapKernel
 * @param outFlags  bit mask: 0x1 fill colorsOut, 0x2 fill alphasOut,
 *                  0x4 fill radiiOut
 * @param colorsOut buffer for per-point colours (used if bit 0x1 is set)
 * @param alphasOut buffer for per-point alphas (used if bit 0x2 is set)
 * @param radiiOut  buffer for per-point radii (used if bit 0x4 is set)
 * @param queue     command queue the kernels are enqueued on
 * @param device    device the program is built for
 */
void kernel(cl::Buffer& devOut, int fnCode, int outFlags, cl::Buffer& colorsOut,
            cl::Buffer& alphasOut, cl::Buffer& radiiOut,
            cl::CommandQueue& queue, cl::Device& device) {
    static bool compileFlag = true;

    static cl::Program prog;
    static cl::Kernel randKernel, colorsKernel, mapKernel;

    if (compileFlag) {
        // Seed only once: reseeding with the current time on every call
        // would hand identical seeds to calls made within the same second.
        std::srand((unsigned)(std::time(0)));

        try {
            prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), chartKernels,
                               false);

            std::vector<cl::Device> devs;
            devs.push_back(device);
            prog.build(devs);

            randKernel   = cl::Kernel(prog, "randKernel");
            colorsKernel = cl::Kernel(prog, "colorsKernel");
            mapKernel    = cl::Kernel(prog, "mapKernel");
        } catch (const cl::Error& err) {  // by reference: no copy, no slicing
            std::cout << "Compile Errors: " << std::endl;
            std::cout << err.what() << err.err() << std::endl;
            std::cout << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)
                      << std::endl;
            exit(255);
        }
        std::cout << "Kernels compiled successfully" << std::endl;
        compileFlag = false;
    }

    // Work-groups of 32; the global size is rounded up to a whole number of
    // groups and the kernels ignore work-items with id >= DATA_SIZE.
    static const NDRange local(32);
    NDRange global(local[0] * divup(DATA_SIZE, (int)(local[0])));

    mapKernel.setArg(0, devOut);
    mapKernel.setArg(1, fnCode);
    mapKernel.setArg(2, FRANGE_START);
    mapKernel.setArg(3, DX);
    mapKernel.setArg(4, DATA_SIZE);
    queue.enqueueNDRangeKernel(mapKernel, cl::NullRange, global, local);

    if (outFlags & 0x00000001) {
        // One independent seed per colour component.
        colorsKernel.setArg(0, colorsOut);
        colorsKernel.setArg(1, std::rand());
        colorsKernel.setArg(2, std::rand());
        colorsKernel.setArg(3, std::rand());
        colorsKernel.setArg(4, DATA_SIZE);
        queue.enqueueNDRangeKernel(colorsKernel, cl::NullRange, global, local);
    }

    if (outFlags & 0x00000002) {
        // min 0, scale 1
        randKernel.setArg(0, alphasOut);
        randKernel.setArg(1, std::rand());
        randKernel.setArg(2, 0.0f);
        randKernel.setArg(3, 1.0f);
        randKernel.setArg(4, DATA_SIZE);
        queue.enqueueNDRangeKernel(randKernel, cl::NullRange, global, local);
    }

    if (outFlags & 0x00000004) {
        // min 20, scale 60
        randKernel.setArg(0, radiiOut);
        randKernel.setArg(1, std::rand());
        randKernel.setArg(2, 20.0f);
        randKernel.setArg(3, 60.0f);
        randKernel.setArg(4, DATA_SIZE);
        queue.enqueueNDRangeKernel(randKernel, cl::NullRange, global, local);
    }
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "Bubble chart with Transparency Demo");
        wnd.makeCurrent();

        forge::Chart chart(FG_CHART_2D);
        chart.setAxesLimits(FRANGE_START, FRANGE_END, -1.0f, 1.0f);

        /* Create several plot objects which creates the necessary
         * vertex buffer objects to hold the different plot types
         */
        forge::Plot plt1 =
            chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE, FG_MARKER_TRIANGLE);
        forge::Plot plt2 =
            chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE, FG_MARKER_CIRCLE);

        /* Set plot colors */
        plt1.setColor(FG_RED);
        plt2.setColor(FG_GREEN);  // use a forge predefined color
        /* Set plot legends */
        plt1.setLegend("Cosine");
        plt2.setLegend("Tangent");
        /* set plot global marker size */
        plt1.setMarkerSize(20);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        GfxHandle* handles[5];

        // create GL-OpenCL interop buffers
        createGLBuffer(&handles[0], plt1.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[1], plt2.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[2], plt2.colors(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[3], plt2.alphas(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[4], plt2.radii(), FORGE_VERTEX_BUFFER);

        cl::Buffer cosOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * DATA_SIZE * 2);
        cl::Buffer tanOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * DATA_SIZE * 2);
        cl::Buffer colorsOut(context, CL_MEM_READ_WRITE,
                             sizeof(float) * DATA_SIZE * 3);
        cl::Buffer alphasOut(context, CL_MEM_READ_WRITE,
                             sizeof(float) * DATA_SIZE);
        cl::Buffer radiiOut(context, CL_MEM_READ_WRITE,
                            sizeof(float) * DATA_SIZE);
        cl::Buffer dummy;

        kernel(cosOut, 0, 0, dummy, dummy, dummy, queue, device);
        kernel(tanOut, 1, 0x00000007, colorsOut, alphasOut, radiiOut, queue,
               device);

        /* copy your data into the opengl buffer object exposed by
         * forge::Plot class and then proceed to rendering.
         * To help the users with copying the data from compute
         * memory to display memory, Forge provides copy headers
         * along with the library to help with this task
         */
        copyToGLBuffer(handles[0], (ComputeResourceHandle)cosOut(),
                       plt1.verticesSize());
        copyToGLBuffer(handles[1], (ComputeResourceHandle)tanOut(),
                       plt2.verticesSize());

        /* update color value for tan graph */
        copyToGLBuffer(handles[2], (ComputeResourceHandle)colorsOut(),
                       plt2.colorsSize());
        /* update alpha values for tan graph */
        copyToGLBuffer(handles[3], (ComputeResourceHandle)alphasOut(),
                       plt2.alphasSize());
        /* update marker sizes for tan graph markers */
        copyToGLBuffer(handles[4], (ComputeResourceHandle)radiiOut(),
                       plt2.radiiSize());

        do { wnd.draw(chart); } while (!wnd.close());

        // destroy GL-OpenCL Interop buffer
        releaseGLBuffer(handles[0]);
        releaseGLBuffer(handles[1]);
        releaseGLBuffer(handles[2]);
        releaseGLBuffer(handles[3]);
        releaseGLBuffer(handles[4]);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

2D Vector Field

CPU

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// Width and height handed to the forge::Window constructor.
const unsigned DIMX      = 640;
const unsigned DIMY      = 480;
const float PI           = 3.14159265359f;
// First sample coordinate, upper axis limit and spacing between samples.
const float MINIMUM      = 1.0f;
const float MAXIMUM      = 20.f;
const float STEP         = 2.0f;
// Number of samples per axis; evaluates to 10 with the values above. It is a
// float, so it is used directly as a loop bound and cast where a count is
// needed.
const float NELEMS       = (MAXIMUM - MINIMUM + 1) / STEP;
// Eight values uploaded as the vertex data of the 4-point
// "Convergence Points" scatter plot.
const unsigned DPOINTS[] = {5, 5, 5, 15, 15, 5, 15, 15};

using namespace std;

/*
 * Fills `points` with the (x, y) coordinates of a regular grid and `dirs`
 * with the matching direction vector (sin(2*PI*x/10), sin(2*PI*y/10)).
 *
 * Both vectors are emptied first, so after the call they hold the same number
 * of interleaved pairs even if the caller passed in non-empty vectors.
 *
 * points  receives x0, y0, x1, y1, ... in row-major order (y outer, x inner)
 * dirs    receives dx0, dy0, dx1, dy1, ... in the same order
 */
void generatePoints(std::vector<float> &points, std::vector<float> &dirs) {
    points.clear();
    // The original only cleared `points`; clear `dirs` as well so the two
    // outputs cannot get out of step when the function is called again.
    dirs.clear();

    // Two floats per grid node in each vector.
    const size_t expected = (size_t)(2 * NELEMS * NELEMS);
    points.reserve(expected);
    dirs.reserve(expected);

    for (int j = 0; j < NELEMS; ++j) {
        float y = MINIMUM + j * STEP;
        for (int i = 0; i < NELEMS; ++i) {
            float x = MINIMUM + i * STEP;
            points.push_back(x);
            points.push_back(y);
            dirs.push_back(sin(2 * PI * x / 10.f));
            dirs.push_back(sin(2 * PI * y / 10.f));
        }
    }
}

int main(void) {
    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Vector Field Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_2D);
    chart.setAxesLimits(MINIMUM - 1.0f, MAXIMUM, MINIMUM - 1.0f, MAXIMUM);
    chart.setAxesTitles("x-axis", "y-axis");

    forge::Plot divPoints =
        chart.plot(4, forge::u32, FG_PLOT_SCATTER, FG_MARKER_CIRCLE);
    divPoints.setColor(0.9f, 0.9f, 0.0f, 1.f);
    divPoints.setLegend("Convergence Points");
    divPoints.setMarkerSize(24);

    forge::VectorField field =
        chart.vectorField((unsigned)(NELEMS * NELEMS), forge::f32);
    field.setColor(0.f, 0.6f, 0.3f, 1.f);

    std::vector<float> points;
    std::vector<float> dirs;
    generatePoints(points, dirs);

    GfxHandle *handles[3];

    createGLBuffer(&handles[0], divPoints.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[1], field.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], field.directions(), FORGE_VERTEX_BUFFER);

    copyToGLBuffer(handles[0], (ComputeResourceHandle)DPOINTS,
                   divPoints.verticesSize());
    copyToGLBuffer(handles[1], (ComputeResourceHandle)points.data(),
                   field.verticesSize());
    copyToGLBuffer(handles[2], (ComputeResourceHandle)dirs.data(),
                   field.directionsSize());

    do { wnd.draw(chart); } while (!wnd.close());

    // destroy GL-cpu interop buffers
    releaseGLBuffer(handles[0]);
    releaseGLBuffer(handles[1]);
    releaseGLBuffer(handles[2]);

    return 0;
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuda_runtime.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>

// NOTE: the literal has no suffix, so PI expands to a double constant.
#define PI 3.14159265359

// Width and height handed to the forge::Window constructor.
const unsigned DIMX      = 640;
const unsigned DIMY      = 480;
// First sample coordinate, upper axis limit and spacing between samples.
const float MINIMUM      = 1.0f;
const float MAXIMUM      = 20.f;
const float STEP         = 2.0f;
// Number of samples per axis; evaluates to 10 with the values above. It is a
// float and is cast to an integer wherever a count is needed.
const float NELEMS       = (MAXIMUM - MINIMUM + 1) / STEP;
// Eight values copied to the device and then into the 4-point
// "Convergence Points" scatter plot.
const unsigned DPOINTS[] = {5, 5, 5, 15, 15, 5, 15, 15};

// Launches pointGenKernel to fill the two device buffers; defined below main.
void generatePoints(float* points, float* dirs);

// Integer division of a by b, rounded up (for positive operands).
inline int divup(int a, int b) {
    const int biased = a + b - 1;
    return biased / b;
}

/*
 * CUDA vector-field demo: builds a 2D chart with a 4-point scatter plot and a
 * vector field, produces the field data on the device, copies it into the
 * chart's OpenGL buffers and renders until the window is closed.
 */
int main(void) {
    /*
     * The window has to be the first Forge object: creating it sets up the
     * OpenGL context every other forge::* object relies on.
     */
    forge::Window window(DIMX, DIMY, "Vector Field Demo");
    window.makeCurrent();

    forge::Chart plotArea(FG_CHART_2D);
    plotArea.setAxesLimits(MINIMUM - 1.0f, MAXIMUM, MINIMUM - 1.0f, MAXIMUM);
    plotArea.setAxesTitles("x-axis", "y-axis");

    // Scatter plot showing the four highlighted locations.
    forge::Plot markers =
        plotArea.plot(4, forge::u32, FG_PLOT_SCATTER, FG_MARKER_CIRCLE);
    markers.setColor(0.9f, 0.9f, 0.0f, 1.f);
    markers.setLegend("Convergence Points");
    markers.setMarkerSize(24);

    // One arrow per grid node.
    size_t nodeCount = (size_t)(NELEMS * NELEMS);

    forge::VectorField arrows =
        plotArea.vectorField((unsigned)(nodeCount), forge::f32);
    arrows.setColor(0.f, 0.6f, 0.3f, 1.f);

    // Device buffers: marker coordinates, arrow origins, arrow directions.
    const size_t markerBytes = 8 * sizeof(unsigned);
    const size_t fieldBytes  = 2 * nodeCount * sizeof(float);

    unsigned* dMarkers;
    float* dOrigins;
    float* dDirections;
    FORGE_CUDA_CHECK(cudaMalloc((void**)&dMarkers, markerBytes));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&dOrigins, fieldBytes));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&dDirections, fieldBytes));

    const int handleCount = 3;
    GfxHandle* glHandles[handleCount];

    createGLBuffer(&glHandles[0], markers.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glHandles[1], arrows.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glHandles[2], arrows.directions(), FORGE_VERTEX_BUFFER);

    // Fill the device buffers: markers from the host table, field on the GPU.
    FORGE_CUDA_CHECK(
        cudaMemcpy(dMarkers, DPOINTS, markerBytes, cudaMemcpyHostToDevice));
    generatePoints(dOrigins, dDirections);

    copyToGLBuffer(glHandles[0], (ComputeResourceHandle)dMarkers,
                   markers.verticesSize());
    copyToGLBuffer(glHandles[1], (ComputeResourceHandle)dOrigins,
                   arrows.verticesSize());
    copyToGLBuffer(glHandles[2], (ComputeResourceHandle)dDirections,
                   arrows.directionsSize());

    // Draw at least once, then keep drawing until a close is requested.
    do {
        window.draw(plotArea);
    } while (!window.close());

    // Release the GL-CUDA interop buffers in creation order.
    for (int h = 0; h < handleCount; ++h) {
        releaseGLBuffer(glHandles[h]);
    }
    // Release the CUDA allocations.
    FORGE_CUDA_CHECK(cudaFree(dMarkers));
    FORGE_CUDA_CHECK(cudaFree(dOrigins));
    FORGE_CUDA_CHECK(cudaFree(dDirections));

    return 0;
}

/*
 * Fills a nelems x nelems grid of 2D positions and direction vectors.
 *
 * Each thread handles the node (i, j) derived from its 2D global index;
 * threads outside the grid do nothing, so the launch may overshoot.
 *
 * points   output, 2 floats per node: (x, y)
 * dirs     output, 2 floats per node: (sin(2*pi*x/10), sin(2*pi*y/10))
 * nelems   number of nodes along each axis
 * minimum  coordinate of the first node on both axes
 * step     spacing between neighbouring nodes
 */
__global__ void pointGenKernel(float* points, float* dirs, int nelems,
                               float minimum, float step) {
    // PI is a double macro; narrow it once so the whole expression below is
    // evaluated in single precision instead of being promoted to double.
    const float twoPi = 2.0f * (float)(PI);

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    if (i < nelems && j < nelems) {
        int id = i + j * nelems;

        float x = minimum + i * step;
        float y = minimum + j * step;

        points[2 * id + 0] = x;
        points[2 * id + 1] = y;

        dirs[2 * id + 0] = sinf(twoPi * x / 10.f);
        dirs[2 * id + 1] = sinf(twoPi * y / 10.f);
    }
}

/*
 * Launches pointGenKernel over a 2D grid of 8x8 thread blocks large enough to
 * cover (int)NELEMS nodes per axis.
 *
 * points  device buffer receiving 2 floats per node
 * dirs    device buffer receiving 2 floats per node
 */
void generatePoints(float* points, float* dirs) {
    static dim3 threads(8, 8);
    dim3 blocks(divup((int)(NELEMS), threads.x),
                divup((int)(NELEMS), threads.y));

    // clang-format off
    pointGenKernel<<<blocks, threads>>>(points, dirs, (int)(NELEMS), MINIMUM,
                                        STEP);
    // clang-format on

    // A kernel launch returns no status itself; query it explicitly so an
    // invalid launch configuration is reported here rather than at some
    // later, unrelated CUDA call.
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <iterator>
#include <sstream>
#include <vector>

using namespace cl;
using namespace std;

// Width and height handed to the forge::Window constructor.
const unsigned DIMX      = 640;
const unsigned DIMY      = 480;
// First sample coordinate, upper axis limit and spacing between samples.
const float MINIMUM      = 1.0f;
const float MAXIMUM      = 20.f;
const float STEP         = 2.0f;
// Number of samples per axis; evaluates to 10 with the values above. It is a
// float and is cast to an integer wherever a count is needed.
const float NELEMS       = (MAXIMUM - MINIMUM + 1) / STEP;
// Eight values written to an OpenCL buffer and then copied into the 4-point
// "Convergence Points" scatter plot.
const unsigned DPOINTS[] = {5, 5, 5, 15, 15, 5, 15, 15};

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
// OpenCL C source of pointGenKernel, compiled at run time by generatePoints().
// The kernel writes, for every node (i, j) of a NELEMS x NELEMS grid, its
// (x, y) position into `points` and (sin(2*PI*x/10), sin(2*PI*y/10)) into
// `dirs`. All floating-point literals carry an `f` suffix so the program does
// not depend on double-precision support in the device compiler.
static const std::string fieldKernel =
R"EOK(
constant float PI = 3.14159265359f;

kernel void pointGenKernel(global float* points, global float* dirs, int NELEMS,
                           float MINIMUM, float STEP) {
    int i = get_global_id(0);
    int j = get_global_id(1);

    if (i < NELEMS && j < NELEMS) {
        int id = i + j * NELEMS;

        float x = MINIMUM + i * STEP;
        float y = MINIMUM + j * STEP;

        points[2 * id + 0] = x;
        points[2 * id + 1] = y;

        dirs[2 * id + 0] = sin(2.0f * PI * x / 10.0f);
        dirs[2 * id + 1] = sin(2.0f * PI * y / 10.0f);
    }
}
)EOK";
// clang-format on

// Integer division of a by b, rounded up (for positive operands).
inline int divup(int a, int b)
{
    const int biased = a + b - 1;
    return biased / b;
}

/*
 * Builds pointGenKernel on first use and enqueues it to fill the grid
 * position and direction buffers.
 *
 * The program and kernel objects are function-local statics, so compilation
 * happens once per process. On a build failure the error and the build log
 * are printed and the process exits with status 255.
 *
 * points  buffer receiving 2 floats per grid node
 * dirs    buffer receiving 2 floats per grid node
 * queue   command queue the kernel is enqueued on; its context is used to
 *         create the program
 * device  device the program is built for
 */
void generatePoints(cl::Buffer& points, cl::Buffer& dirs,
                    cl::CommandQueue& queue, cl::Device& device) {
    static bool compileFlag = true;

    static cl::Program prog;
    static cl::Kernel pointGenKernel;

    if (compileFlag) {
        try {
            prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fieldKernel,
                               false);

            std::vector<cl::Device> devs;
            devs.push_back(device);
            prog.build(devs);

            pointGenKernel = cl::Kernel(prog, "pointGenKernel");
            // Catch by const reference: no copy of the exception object.
        } catch (const cl::Error& err) {
            std::cout << "Compile Errors: " << std::endl;
            std::cout << err.what() << err.err() << std::endl;
            std::cout << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)
                      << std::endl;
            exit(255);
        }
        std::cout << "Kernels compiled successfully" << std::endl;
        compileFlag = false;
    }

    // Round the global size up to a whole number of 8x8 work-groups; the
    // kernel itself ignores work-items outside the grid.
    static const NDRange local(8, 8);
    NDRange global(local[0] * divup((int)(NELEMS), (int)(local[0])),
                   local[1] * divup((int)(NELEMS), (int)(local[1])));

    pointGenKernel.setArg(0, points);
    pointGenKernel.setArg(1, dirs);
    pointGenKernel.setArg(2, (int)NELEMS);
    pointGenKernel.setArg(3, MINIMUM);
    pointGenKernel.setArg(4, STEP);
    queue.enqueueNDRangeKernel(pointGenKernel, cl::NullRange, global, local);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "Vector Field Demo");
        wnd.makeCurrent();

        forge::Chart chart(FG_CHART_2D);
        chart.setAxesLimits(MINIMUM - 1.0f, MAXIMUM, MINIMUM - 1.0f, MAXIMUM);
        chart.setAxesTitles("x-axis", "y-axis");

        forge::Plot divPoints =
            chart.plot(4, forge::u32, FG_PLOT_SCATTER, FG_MARKER_CIRCLE);
        divPoints.setColor(0.9f, 0.9f, 0.0f, 1.f);
        divPoints.setLegend("Convergence Points");
        divPoints.setMarkerSize(24);

        size_t npoints = (size_t)(NELEMS * NELEMS);

        forge::VectorField field =
            chart.vectorField((unsigned)(npoints), forge::f32);
        field.setColor(0.f, 0.6f, 0.3f, 1.f);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        GfxHandle* handles[3];

        createGLBuffer(&handles[0], divPoints.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[1], field.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[2], field.directions(), FORGE_VERTEX_BUFFER);

        cl::Buffer dpoints(context, CL_MEM_READ_WRITE, sizeof(unsigned) * 8);
        cl::Buffer points(context, CL_MEM_READ_WRITE,
                          sizeof(float) * 2 * npoints);
        cl::Buffer dirs(context, CL_MEM_READ_WRITE,
                        sizeof(float) * 2 * npoints);

        queue.enqueueWriteBuffer(dpoints, CL_TRUE, 0, sizeof(unsigned) * 8,
                                 DPOINTS);
        generatePoints(points, dirs, queue, device);

        copyToGLBuffer(handles[0], (ComputeResourceHandle)dpoints(),
                       divPoints.verticesSize());

        copyToGLBuffer(handles[1], (ComputeResourceHandle)points(),
                       field.verticesSize());
        copyToGLBuffer(handles[2], (ComputeResourceHandle)dirs(),
                       field.directionsSize());

        do { wnd.draw(chart); } while (!wnd.close());

        // destroy GL-CUDA interop buffers
        releaseGLBuffer(handles[0]);
        releaseGLBuffer(handles[1]);
        releaseGLBuffer(handles[2]);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

Histogram

CPU

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

// Dimensions of the generated noise image (also the PerlinNoise table size).
const unsigned IMGW  = 256;
const unsigned IMGH  = 256;
// Width and height handed to the forge::Window constructor.
const unsigned DIMX  = 1000;
const unsigned DIMY  = 800;
// Number of bins of the histogram plotted next to the image.
const unsigned NBINS = 256;

using namespace std;

// Host image with 4 bytes per pixel; created by createBitmap() and released
// by destroyBitmap().
struct Bitmap {
    unsigned char* ptr;  // pixel storage, 4 * width * height bytes
    unsigned width;      // pixels per row
    unsigned height;     // number of rows
};

// Precomputed IMGW x IMGH noise table. The constructor fills `base` with
// random values and accumulates several interpolated octaves of it into
// `perlin`; noise() then looks values up in `perlin`.
class PerlinNoise {
   private:
    float base[IMGW][IMGH];    // random values from std::rand, scaled by RAND_MAX
    float perlin[IMGW][IMGH];  // octave sum divided by the total amplitude

   public:
    // Seeds std::rand from the current time and builds both tables.
    PerlinNoise();
    // Returns the table entry at (IMGW * u, IMGH * v).
    float noise(float u, float v);
};

// Allocates a w x h bitmap with 4 bytes per pixel.
Bitmap createBitmap(unsigned w, unsigned h);

// Frees the pixel storage of a bitmap made by createBitmap().
void destroyBitmap(Bitmap& bmp);

// Fills the bitmap with a freshly generated grayscale noise image.
void kernel(Bitmap& bmp);

// Counts the first channel of every pixel into `hist_array` (nbins entries)
// and writes 3 random colour components per bin into `hist_cols`.
void populateBins(Bitmap& bmp, int* hist_array, const unsigned nbins,
                  float* hist_cols);

int main(int argc, char* argv[]) {
    Bitmap bmp = createBitmap(IMGW, IMGH);
    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Histogram Demo");
    wnd.makeCurrent();

    forge::Image img(IMGW, IMGH, FG_RGBA, forge::u8);

    forge::Chart chart(FG_CHART_2D);

    /* set x axis limits to maximum and minimum values of data
     * and y axis limits to range [0, number of pixels ideally]
     * but practically total number of pixels as y range will skew
     * the histogram graph vertically. Therefore setting it to
     * 25% of total number of pixels */
    chart.setAxesLimits(0, 1, 0, IMGW * IMGH / (float)(NBINS / 4.0));

    /*
     * Create histogram object specifying number of bins
     */
    forge::Histogram hist = chart.histogram(NBINS, forge::s32);
    /*
     * Set histogram colors
     */
    hist.setColor(FG_YELLOW);

    GfxHandle* handles[3];

    createGLBuffer(&handles[0], img.pixels(), FORGE_IMAGE_BUFFER);
    createGLBuffer(&handles[1], hist.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], hist.colors(), FORGE_VERTEX_BUFFER);

    wnd.setColorMap((fg_color_map)(argc == 2 ? atoi(argv[1]) : 1));

    do {
        /*
         * generate image, and prepare data to pass into
         * Histogram's underlying vertex buffer object
         */
        kernel(bmp);

        copyToGLBuffer(handles[0], (ComputeResourceHandle)bmp.ptr, img.size());

        // forge::copy(img, (const void*)bmp.ptr);

        /* copy your data into the vertex buffer object exposed by
         * forge::Histogram class and then proceed to rendering.
         * To help the users with copying the data from compute
         * memory to display memory, Forge provides copy headers
         * along with the library to help with this task
         */
        std::vector<int> histArray(NBINS, 0);
        std::vector<float> colArray(3 * NBINS, 0.0f);
        populateBins(bmp, histArray.data(), NBINS, colArray.data());

        copyToGLBuffer(handles[1], (ComputeResourceHandle)histArray.data(),
                       hist.verticesSize());
        copyToGLBuffer(handles[2], (ComputeResourceHandle)colArray.data(),
                       hist.colorsSize());

        /*
         * Split the window into grid regions
         */
        // wnd.draw(2, 2, 0, img,  "Dynamic Perlin Noise" );
        // wnd.draw(2, 2, 1, img,  "Dynamic Perlin Noise" );
        // wnd.draw(2, 1, 1, chart, "Histogram of Noisy Image");
        wnd.draw(2, 3, 0, img, "Dynamic Perlin Noise");
        wnd.draw(2, 3, 1, img, "Dynamic Perlin Noise");
        wnd.draw(2, 3, 2, img, "Dynamic Perlin Noise");
        wnd.draw(2, 2, 2, chart, "Histogram of Noisy Image");
        wnd.draw(2, 2, 3, chart, "Histogram of Noisy Image");

        wnd.swapBuffers();
    } while (!wnd.close());

    releaseGLBuffer(handles[0]);
    releaseGLBuffer(handles[1]);
    releaseGLBuffer(handles[2]);

    return 0;
}

// Linear blend between x0 (alpha == 0) and x1 (alpha == 1).
float interp(float x0, float x1, float alpha) {
    const float fromStart = x0 * (1 - alpha);
    const float fromEnd   = alpha * x1;
    return fromStart + fromEnd;
}

// Builds the noise table: seeds std::rand from the clock, fills `base` with
// random values in [0, 1], sums seven octaves (periods 64 down to 1) of
// bilinearly interpolated base samples into `perlin`, then divides by the
// accumulated amplitude.
PerlinNoise::PerlinNoise() {
    std::srand((unsigned)(std::time(0)));

    for (unsigned row = 0; row < IMGW; row++) {
        for (unsigned col = 0; col < IMGH; col++) {
            base[row][col]   = std::rand() / (float)(RAND_MAX);
            perlin[row][col] = 0;
        }
    }

    const float falloff = 0.5f;  // amplitude ratio between successive octaves
    float weight        = 1.0f;  // amplitude of the current octave
    float weightSum     = 0.0f;  // running total used for normalisation

    for (int level = 6; level >= 0; --level) {
        int cell       = 1 << level;
        float invCell  = 1.0f / cell;

        for (unsigned row = 0; row < IMGW; row++) {
            // Lattice points enclosing `row`, wrapping at the table edge.
            int r0       = (row / cell) * cell;
            int r1       = (r0 + cell) % IMGW;
            float rBlend = (row - r0) * invCell;

            for (unsigned col = 0; col < IMGH; col++) {
                int c0       = (col / cell) * cell;
                int c1       = (c0 + cell) % IMGH;
                float cBlend = (col - c0) * invCell;

                float nearEdge = interp(base[r0][c0], base[r1][c0], rBlend);
                float farEdge  = interp(base[r0][c1], base[r1][c1], rBlend);

                perlin[row][col] += (weight * interp(nearEdge, farEdge, cBlend));
            }
        }
        weightSum += weight;
        weight *= falloff;
    }

    for (unsigned row = 0; row < IMGW; row++) {
        for (unsigned col = 0; col < IMGH; col++) {
            perlin[row][col] /= weightSum;
        }
    }
}

// Looks up the precomputed noise value for normalised coordinates (u, v).
//
// u, v  expected in [0, 1); they are scaled by IMGW and IMGH to table
//       indices. Values outside that range (including exactly 1.0, negative
//       values and NaN) are clamped to the nearest valid index instead of
//       reading outside the table.
float PerlinNoise::noise(float u, float v) {
    // Converting a negative float to unsigned is undefined, so map anything
    // not strictly positive to index 0 before casting.
    unsigned i = (u > 0.0f) ? (unsigned)(IMGW * u) : 0u;
    unsigned j = (v > 0.0f) ? (unsigned)(IMGH * v) : 0u;

    if (i >= IMGW) i = IMGW - 1;
    if (j >= IMGH) j = IMGH - 1;

    return perlin[i][j];
}

// Allocates a w x h host bitmap with 4 bytes per pixel. The storage is not
// initialised; release it with destroyBitmap().
Bitmap createBitmap(unsigned w, unsigned h) {
    Bitmap image;
    image.ptr    = new unsigned char[4 * w * h];
    image.width  = w;
    image.height = h;
    return image;
}

// Frees the pixel storage allocated by createBitmap(); bmp.ptr itself is left
// unchanged.
void destroyBitmap(Bitmap& bmp) {
    unsigned char* pixels = bmp.ptr;
    delete[] pixels;
}

// Renders a new noise image into bmp: a fresh PerlinNoise table is built on
// every call and sampled once per pixel; the value is written to the first
// three channels and the fourth channel is set to 255.
void kernel(Bitmap& bmp) {
    PerlinNoise noiseField;

    for (unsigned row = 0; row < bmp.height; ++row) {
        // Normalised vertical coordinate of this row.
        const float v = row / (float)(bmp.height);

        for (unsigned col = 0; col < bmp.width; ++col) {
            const float u = col / (float)(bmp.width);

            const unsigned char gray =
                (unsigned char)(255 * noiseField.noise(u, v));

            int pixel         = col + row * bmp.width;
            unsigned char* px = bmp.ptr + pixel * 4;
            px[0]             = gray;
            px[1]             = gray;
            px[2]             = gray;
            px[3]             = 255;
        }
    }
}

// Computes the histogram of the bitmap's first channel and picks a random
// colour for every bin.
//
// bmp         image with 4 bytes per pixel; only byte 0 of each pixel is read
// hist_array  nbins counters, incremented in place (the caller zeroes them)
// nbins       number of histogram bins; nothing is done when it is 0
// hist_cols   receives 3 floats in [0, 1] per bin
void populateBins(Bitmap& bmp, int* hist_array, const unsigned nbins,
                  float* hist_cols) {
    if (nbins == 0) return;

    for (unsigned y = 0; y < bmp.height; ++y) {
        for (unsigned x = 0; x < bmp.width; ++x) {
            int offset             = x + y * bmp.width;
            unsigned char noiseVal = bmp.ptr[offset * 4];
            unsigned idx = (unsigned)((float)noiseVal / 255.f * nbins);
            // A pixel value of 255 maps to exactly nbins, one past the last
            // bin; fold it into the top bin instead of writing out of bounds.
            if (idx >= nbins) idx = nbins - 1;
            hist_array[idx]++;
        }
    }

    for (unsigned b = 0; b < nbins; ++b) {
        hist_cols[3 * b + 0] = std::rand() / (float)RAND_MAX;
        hist_cols[3 * b + 1] = std::rand() / (float)RAND_MAX;
        hist_cols[3 * b + 2] = std::rand() / (float)RAND_MAX;
    }
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cstdio>

// Dimensions of the generated noise image.
const unsigned IMGW  = 256;
const unsigned IMGH  = 256;
// Width and height handed to the forge::Window constructor.
const unsigned DIMX  = 1000;
const unsigned DIMY  = 800;
// Number of histogram bins; also the number of curand states allocated below.
const unsigned NBINS = 256;

// Device array of NBINS curand states; allocated and initialised in main().
curandState_t* state;

// Image with 4 bytes per pixel whose storage lives in device memory
// (createBitmap() allocates it with cudaMalloc).
struct Bitmap {
    unsigned char* ptr;  // device pointer, 4 * width * height bytes
    unsigned width;      // pixels per row
    unsigned height;     // number of rows
};

// Owns two device buffers of IMGW * IMGH floats used for noise generation.
// The constructor allocates them with cudaMalloc and the destructor frees
// them.
class PerlinNoise {
   public:
    float* base;    // device buffer; perlinInitKernel stores random samples here
    float* perlin;  // device buffer; octave contributions are accumulated here

    PerlinNoise();
    ~PerlinNoise();
    // Defined later in the file — presumably runs the perlin* kernels to
    // refresh both buffers; TODO confirm against the definition.
    void generateNoise();
};

// Allocates a w x h device bitmap with 4 bytes per pixel.
Bitmap createBitmap(unsigned w, unsigned h);

// Frees the device storage of a bitmap made by createBitmap().
void destroyBitmap(Bitmap& bmp);

// Defined later in the file; main() calls it to regenerate the image in bmp
// from the given noise generator.
void kernel(Bitmap& bmp, PerlinNoise& pn);

// Defined later in the file; main() passes device pointers for hist_array
// (NBINS ints) and hist_cols (3 * NBINS floats).
void populateBins(Bitmap& bmp, int* hist_array, const unsigned nbins,
                  float* hist_cols);

/*
 * Initialises one curand state per launched thread.
 *
 * Every thread uses its flat 1D global index both as the curand sequence
 * number and as the slot in `states`. There is no bounds check, so `states`
 * must hold at least as many entries as threads are launched.
 */
__global__ void setupRandomKernel(curandState* states,
                                  unsigned long long seed) {
    const unsigned slot = blockDim.x * blockIdx.x + threadIdx.x;
    curandState* mine   = &states[slot];
    curand_init(seed, slot, 0, mine);
}

/*
 * Entry point of the CUDA histogram demo.
 *
 * Allocates the device-side state (bitmap, curand states, histogram buffers),
 * then renders a noise image next to its histogram. Image and histogram are
 * regenerated every 8th frame; all device allocations made here are released
 * before returning.
 */
int main(void) {
    Bitmap bmp = createBitmap(IMGW, IMGH);

    FORGE_CUDA_CHECK(cudaMalloc((void**)&state, NBINS * sizeof(curandState_t)));
    // clang-format off
    setupRandomKernel<<<1, NBINS>>>(state, 314567);
    // clang-format on
    // A launch returns no status of its own; query it explicitly.
    FORGE_CUDA_CHECK(cudaGetLastError());

    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Histogram Demo");
    wnd.makeCurrent();

    forge::Image img(IMGW, IMGH, FG_RGBA, forge::u8);

    forge::Chart chart(FG_CHART_2D);

    chart.setAxesLabelFormat("%3.1f", "%.2e");

    /* set x axis limits to maximum and minimum values of data
     * and y axis limits to range [0, number of pixels ideally]
     * but practically total number of pixels as y range will skew
     * the histogram graph vertically. Therefore setting it to
     * 25% of total number of pixels */
    chart.setAxesLimits(0, 1, 0, IMGW * IMGH / (float)(NBINS / 4.0));

    /*
     * Create histogram object specifying number of bins
     */
    forge::Histogram hist = chart.histogram(NBINS, forge::s32);
    /*
     * Set histogram colors
     */
    hist.setColor(FG_YELLOW);

    PerlinNoise noiseGenerator;
    int* histOut;
    float* histColors;

    FORGE_CUDA_CHECK(cudaMalloc((void**)&histOut, NBINS * sizeof(int)));
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&histColors, 3 * NBINS * sizeof(float)));

    GfxHandle* handles[3];

    createGLBuffer(&handles[0], img.pixels(), FORGE_IMAGE_BUFFER);
    createGLBuffer(&handles[1], hist.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], hist.colors(), FORGE_VERTEX_BUFFER);

    unsigned frame = 0;
    do {
        // Regenerate the image and its histogram on the first frame and then
        // on every 8th frame (the counter is reset after each refresh).
        if (frame % 8 == 0) {
            kernel(bmp, noiseGenerator);
            copyToGLBuffer(handles[0], (ComputeResourceHandle)bmp.ptr,
                           img.size());

            populateBins(bmp, histOut, NBINS, histColors);

            copyToGLBuffer(handles[1], (ComputeResourceHandle)histOut,
                           hist.verticesSize());
            copyToGLBuffer(handles[2], (ComputeResourceHandle)histColors,
                           hist.colorsSize());

            frame = 0;
        }

        /*
         * Split the window into grid regions
         */
        wnd.draw(1, 2, 0, img, "Dynamic Perlin Noise");
        wnd.draw(1, 2, 1, chart, "Histogram of Noisy Image");

        wnd.swapBuffers();
        frame++;
    } while (!wnd.close());

    FORGE_CUDA_CHECK(cudaFree(histOut));
    FORGE_CUDA_CHECK(cudaFree(histColors));
    releaseGLBuffer(handles[0]);
    releaseGLBuffer(handles[1]);
    releaseGLBuffer(handles[2]);

    // Previously leaked: the curand states and the device bitmap.
    FORGE_CUDA_CHECK(cudaFree(state));
    destroyBitmap(bmp);

    return 0;
}

// Allocates a w x h bitmap with 4 bytes per pixel in device memory. The
// storage is not initialised; release it with destroyBitmap().
Bitmap createBitmap(unsigned w, unsigned h) {
    Bitmap image;
    image.width  = w;
    image.height = h;

    void** slot = (void**)&image.ptr;
    FORGE_CUDA_CHECK(cudaMalloc(slot, sizeof(unsigned char) * 4 * w * h));

    return image;
}

/*
 * Release the storage behind bmp.ptr with cudaFree. The pointer member is
 * left as-is, so it must not be used (or freed again) afterwards.
 */
void destroyBitmap(Bitmap& bmp) {
    FORGE_CUDA_CHECK(cudaFree(bmp.ptr));
}

/*
 * Allocate the `base` and `perlin` buffers with cudaMalloc; each holds
 * IMGW * IMGH floats.
 */
PerlinNoise::PerlinNoise() {
    const size_t bufferBytes = IMGW * IMGH * sizeof(float);

    FORGE_CUDA_CHECK(cudaMalloc((void**)&base, bufferBytes));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&perlin, bufferBytes));
}

/*
 * Free the `base` and `perlin` buffers with cudaFree. Each call is wrapped
 * in FORGE_CUDA_CHECK, so a failure is reported for the first buffer before
 * the second one is attempted.
 */
PerlinNoise::~PerlinNoise() {
    FORGE_CUDA_CHECK(cudaFree(base));
    FORGE_CUDA_CHECK(cudaFree(perlin));
}

/*
 * Integer division rounded up: how many chunks of size `b` are needed to
 * cover `a` items. Intended for a >= 0 and b > 0.
 */
inline int divup(int a, int b) {
    const int padded = a + b - 1;
    return padded / b;
}

/*
 * Linear blend between x0 and x1: alpha = 0 yields x0, alpha = 1 yields x1.
 */
__device__ float interp(float x0, float x1, float alpha) {
    const float keep = 1 - alpha;
    return x0 * keep + alpha * x1;
}

/*
 * Draw a fresh uniform random sample into every element of `base` and reset
 * the matching element of `perlin` to zero.
 *
 * Uses a 2D thread index (x -> column, y -> row); threads that fall outside
 * the IMGW x IMGH image do nothing.
 *
 * NOTE(review): the generator state is picked as `pixel % NBINS`, so all
 * pixels with the same residue use the same curandState entry - confirm that
 * sharing one state between concurrently running threads is acceptable here.
 */
__global__ void perlinInitKernel(float* base, float* perlin,
                                 curandState* state) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= IMGW || row >= IMGH) return;

    const int pixel = row * IMGW + col;
    perlin[pixel]   = 0.0f;
    base[pixel]     = curand_uniform(&state[pixel % NBINS]);
}

/*
 * Accumulate one octave of noise into `perlin`.
 *
 * For every pixel the four samples of `base` that sit on the surrounding
 * lattice points (spacing `period`, wrapping at the image border) are
 * blended bilinearly and added to `perlin`, scaled by `amp`.
 *
 * Uses a 2D thread index; threads outside the IMGW x IMGH image do nothing.
 */
__global__ void perlinComputeKernel(float* perlin, float* base, float amp,
                                    int period) {
    const unsigned col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= IMGW || row >= IMGH) return;

    const float invPeriod = 1.0f / period;

    // Lattice columns to the left/right of this pixel and the blend weight.
    const int left  = (col / period) * period;
    const int right = (left + period) % IMGW;
    const float tx  = (col - left) * invPeriod;

    // Lattice rows above/below this pixel and the blend weight.
    const int upper = (row / period) * period;
    const int lower = (upper + period) % IMGH;
    const float ty  = (row - upper) * invPeriod;

    const float topRow =
        interp(base[left + IMGW * upper], base[right + IMGW * upper], tx);
    const float bottomRow =
        interp(base[left + IMGW * lower], base[right + IMGW * lower], tx);

    const int pixel = row * IMGW + col;
    perlin[pixel] += (amp * interp(topRow, bottomRow, ty));
}

/*
 * Divide every element of `perlin` by `tamp` (the caller passes the sum of
 * the octave amplitudes). Uses a 2D thread index; threads outside the
 * IMGW x IMGH image do nothing.
 */
__global__ void perlinNormalize(float* perlin, float tamp) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= IMGW || row >= IMGH) return;

    const int pixel = row * IMGW + col;
    perlin[pixel] /= tamp;
}

/*
 * Regenerate the noise image held in `perlin`.
 *
 * Steps: reseed `base` with random samples and clear `perlin`, add seven
 * octaves (periods 64, 32, ..., 1) with the amplitude halved after each one,
 * then divide by the total amplitude.
 *
 * Every launch is followed by a cudaGetLastError() check so that an invalid
 * launch configuration is reported here rather than surfacing at some later,
 * unrelated CUDA call.
 */
void PerlinNoise::generateNoise() {
    static dim3 threads(32, 8);
    dim3 blocks(divup(IMGW, threads.x), divup(IMGH, threads.y));

    float persistence = 0.5f;
    float amp         = 1.0f;
    float tamp        = 0.0f;

    // clang-format off
    perlinInitKernel<<<blocks, threads>>>(base, perlin, state);
    // clang-format on
    FORGE_CUDA_CHECK(cudaGetLastError());

    for (int octave = 6; octave >= 0; --octave) {
        int period = 1 << octave;

        // clang-format off
        perlinComputeKernel<<<blocks, threads>>>(perlin, base, amp, period);
        // clang-format on
        FORGE_CUDA_CHECK(cudaGetLastError());

        // Track the total weight so the result can be normalized afterwards.
        tamp += amp;
        amp *= persistence;
    }

    // clang-format off
    perlinNormalize<<<blocks, threads>>>(perlin, tamp);
    // clang-format on
    FORGE_CUDA_CHECK(cudaGetLastError());
}

/*
 * Convert the noise field into a gray RGBA image.
 *
 * Each output pixel (width x height, four bytes per pixel in `ptr`) samples
 * the IMGW x IMGH `perlin` field at the proportionally scaled position,
 * writes 255 * noise to the first three channels and 255 to the fourth.
 *
 * Uses a 2D thread index; threads outside width x height do nothing.
 */
__global__ void fillImageKernel(unsigned char* ptr, unsigned width,
                                unsigned height, float* perlin) {
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;

    if (px >= width || py >= height) return;

    // Position of the noise sample that corresponds to this output pixel.
    const unsigned srcCol = (unsigned)(IMGW * px / (float)(width));
    const unsigned srcRow = (unsigned)(IMGH * py / (float)(height));
    const int srcIndex    = srcCol + srcRow * IMGW;

    const unsigned char gray = 255 * perlin[srcIndex];

    const int dst       = px + py * width;
    unsigned char* rgba = ptr + dst * 4;
    rgba[0]             = gray;
    rgba[1]             = gray;
    rgba[2]             = gray;
    rgba[3]             = 255;
}

/*
 * Produce a new frame: regenerate the noise in `pn`, then render it into
 * the RGBA pixels of `bmp`.
 *
 * The launch is followed by a cudaGetLastError() check so a bad launch
 * configuration is reported immediately instead of being silently dropped.
 */
void kernel(Bitmap& bmp, PerlinNoise& pn) {
    static dim3 threads(32, 8);

    pn.generateNoise();

    dim3 blocks(divup(bmp.width, threads.x), divup(bmp.height, threads.y));

    // clang-format off
    fillImageKernel<<<blocks, threads>>>(bmp.ptr, bmp.width, bmp.height,
                                         pn.perlin);
    // clang-format on
    FORGE_CUDA_CHECK(cudaGetLastError());
}

/*
 * Count the first channel of every RGBA pixel into `histOut`.
 *
 * perlinNoise : RGBA image, four bytes per pixel, IMGW x IMGH pixels
 * histOut     : `nbins` counters, expected to be zeroed by the caller
 * nbins       : number of histogram bins
 *
 * Uses a 2D thread index; threads outside the IMGW x IMGH image do nothing.
 */
__global__ void histogramKernel(const unsigned char* perlinNoise, int* histOut,
                                const unsigned nbins) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (nbins > 0 && x < IMGW && y < IMGH) {
        int offset             = y * IMGW + x;
        unsigned char noiseVal = perlinNoise[offset * 4 + 0];
        int bin = __float2int_rd(nbins * (__int2float_rd(noiseVal) / 255.f));
        // noiseVal == 255 scales to exactly `nbins`, one past the last
        // counter; fold it into the final bin instead of writing out of
        // bounds.
        bin = min(bin, (int)nbins - 1);
        atomicAdd(histOut + bin, 1);
    }
}

/*
 * Give every histogram bin a random RGB color.
 *
 * One thread handles one bin: it draws three consecutive uniform samples
 * from that bin's generator state and stores them as the bin's three color
 * components. There is no bounds check, so the launch must not contain more
 * threads than there are bins (populateBins launches one block of `nbins`
 * threads).
 */
__global__ void histColorsKernel(float* histColors, curandState* states) {
    const int bin = blockIdx.x * blockDim.x + threadIdx.x;

    // Work on a private copy of the state and store it back once at the end.
    curandState rng = states[bin];
    float* rgb      = histColors + 3 * bin;

    rgb[0] = curand_uniform(&rng);
    rgb[1] = curand_uniform(&rng);
    rgb[2] = curand_uniform(&rng);

    states[bin] = rng;
}

/*
 * Rebuild the histogram of `bmp` and pick new random bar colors.
 *
 * bmp        : RGBA bitmap whose first channel is histogrammed
 * histOut    : `nbins` integer counters; zeroed here before counting
 * nbins      : number of bins; also used as the thread count of the single
 *              block that colors the bins, so it has to fit in one block
 * histColors : 3 * nbins floats receiving one RGB triple per bin
 *
 * The memset result and both kernel launches are checked so a failure is
 * reported here instead of being silently ignored.
 */
void populateBins(Bitmap& bmp, int* histOut, const unsigned nbins,
                  float* histColors) {
    static const dim3 threads(8, 8);
    dim3 blocks(divup(bmp.width, threads.x), divup(bmp.height, threads.y));

    FORGE_CUDA_CHECK(cudaMemset(histOut, 0, nbins * sizeof(int)));

    // clang-format off
    histogramKernel<<<blocks, threads>>>(bmp.ptr, histOut, nbins);
    // clang-format on
    FORGE_CUDA_CHECK(cudaGetLastError());

    // clang-format off
    histColorsKernel<<<1, nbins>>>(histColors, state);
    // clang-format on
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <iterator>
#include <sstream>
#include <vector>
#include "cl_helpers.h"

using namespace cl;
using namespace std;

// Width and height, in pixels, of the noise image and the forge::Image.
const unsigned IMGW     = 256;
const unsigned IMGH     = 256;
// Width and height of the Forge window.
const unsigned DIMX     = 1000;
const unsigned DIMY     = 800;
// Byte size used for the image buffer and for both noise buffers in main()
// (four bytes per pixel).
const unsigned IMG_SIZE = IMGW * IMGH * 4;
// Number of histogram bins.
const unsigned NBINS    = 256;
// NOTE(review): not referenced in the visible code; kernel() uses its own
// local `persistence` with the same value.
const float PERSISTENCE = 0.5f;

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
/*
 * OpenCL C source for the demo's kernels: init / compute / normalizeNoise
 * build the noise field, fillImage renders it to RGBA, memSet + histogram
 * count pixel values and setColors picks random bar colors.
 *
 * Differences from the straightforward version:
 *  - histogram clamps the bin index: a pixel value of 255 scales to exactly
 *    `nbins`, which is one past the end of the counter buffer.
 *  - rand uses float literals so the program does not depend on
 *    double-precision support.
 *  - the stray ';' after the setColors body was dropped.
 */
static const std::string perlinKernels =
R"EOK(
float rand(int x) {
    x = (x << 13) ^ x;
    return (1.0f - ((x * (x * x * 15731 + 789221) + 1376312589) & 0x7fffffff) /
                       1073741824.0f);
}

float interp(float x0, float x1, float t) { return x0 + (x1 - x0) * t; }

kernel void init(global float* base, global float* perlin, int IMGW, int IMGH,
                 int randSeed) {
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < IMGW && y < IMGH) {
        int i     = x + y * IMGW;
        base[i]   = (1 + rand(randSeed * i)) / 2.0f;
        perlin[i] = 0.0f;
    }
}

kernel void compute(global float* perlin, global float* base, unsigned IMGW,
                    unsigned IMGH, float amp, int period) {
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < IMGW && y < IMGH) {
        int index = y * IMGW + x;

        float freq = 1.0f / period;

        int si0      = (x / period) * period;
        int si1      = (si0 + period) % IMGW;
        float hblend = (x - si0) * freq;

        int sj0      = (y / period) * period;
        int sj1      = (sj0 + period) % IMGH;
        float vblend = (y - sj0) * freq;

        float top =
            interp(base[si0 + IMGW * sj0], base[si1 + IMGW * sj0], hblend);
        float bot =
            interp(base[si0 + IMGW * sj1], base[si1 + IMGW * sj1], hblend);

        perlin[index] += (amp * interp(top, bot, vblend));
    }
}

kernel void normalizeNoise(global float* perlin, unsigned IMGW, unsigned IMGH,
                           float tamp) {
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < IMGW && y < IMGH) {
        int index     = y * IMGW + x;
        perlin[index] = perlin[index] / tamp;
    }
}

kernel void fillImage(global unsigned char* ptr, unsigned width,
                      unsigned height, global float* perlin, unsigned IMGW,
                      unsigned IMGH) {
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < width && y < height) {
        int offset = x + y * width;

        unsigned u = (unsigned)(IMGW * x / (float)(width));
        unsigned v = (unsigned)(IMGH * y / (float)(height));
        int idx    = u + v * IMGW;

        unsigned char val   = 255 * perlin[idx];
        ptr[offset * 4 + 0] = val;
        ptr[offset * 4 + 1] = val;
        ptr[offset * 4 + 2] = val;
        ptr[offset * 4 + 3] = 255;
    }
}

kernel void memSet(global int* out, unsigned len) {
    if (get_global_id(0) < len) out[get_global_id(0)] = 0;
}

kernel void histogram(const global unsigned char* perlinNoise,
                      global int* histOut, const unsigned w, const unsigned h,
                      const unsigned nbins) {
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (nbins > 0 && x < w && y < h) {
        int offset             = y * w + x;
        unsigned char noiseVal = perlinNoise[offset * 4 + 0];
        /* noiseVal == 255 would otherwise index one past the last bin */
        offset = min((int)(nbins * (noiseVal / 255.f)), (int)nbins - 1);
        atomic_add(histOut + offset, 1);
    }
}

kernel void setColors(global float* out, unsigned rseed, unsigned gseed,
                      unsigned bseed) {
    int i          = get_global_id(0);
    out[3 * i + 0] = (1 + rand(rseed * i)) / 2.0f;
    out[3 * i + 1] = (1 + rand(gseed * i)) / 2.0f;
    out[3 * i + 2] = (1 + rand(bseed * i)) / 2.0f;
}
)EOK";
// clang-format on

// Integer division rounded up: the number of chunks of size `b` needed to
// cover `a` items. Intended for a >= 0 and b > 0.
inline int divup(int a, int b) {
    const int roundedUp = a + b - 1;
    return roundedUp / b;
}

/*
 * Enqueue one full update of the demo: new noise, the rendered image, its
 * histogram and a fresh set of bar colors.
 *
 * image   : RGBA output image (IMGW x IMGH, four bytes per pixel)
 * base    : scratch buffer receiving the random lattice values
 * perlin  : buffer receiving the normalized noise
 * histOut : NBINS integer counters
 * colors  : 3 * NBINS floats, one RGB triple per bin
 * queue   : command queue all kernels are enqueued on
 * device  : device the program is built for on the first call
 *
 * On the first call the program is built and the kernels are created; a
 * build failure prints the build log and terminates with exit(255).
 */
void kernel(cl::Buffer& image, cl::Buffer& base, cl::Buffer& perlin,
            cl::Buffer& histOut, cl::Buffer& colors, cl::CommandQueue& queue,
            cl::Device& device) {
    static bool compileFlag = true;
    static cl::Program prog;
    static cl::Kernel initKernel, computeKernel, normKernel, fillKernel;
    static cl::Kernel memSetKernel, genHistogram, genHistColors;

    if (compileFlag) {
        // Seed the host RNG exactly once. Reseeding from time() on every
        // call restarts the same sequence for all calls made within one
        // second, so consecutive frames would get identical seeds.
        std::srand((unsigned)(std::time(0)));

        try {
            prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), perlinKernels,
                               false);

            std::vector<cl::Device> devs;
            devs.push_back(device);
            prog.build(devs);

            initKernel    = cl::Kernel(prog, "init");
            computeKernel = cl::Kernel(prog, "compute");
            normKernel    = cl::Kernel(prog, "normalizeNoise");
            fillKernel    = cl::Kernel(prog, "fillImage");
            memSetKernel  = cl::Kernel(prog, "memSet");
            genHistogram  = cl::Kernel(prog, "histogram");
            genHistColors = cl::Kernel(prog, "setColors");
        } catch (const cl::Error& err) {
            std::cout << "Compile Errors: " << std::endl;
            std::cout << err.what() << err.err() << std::endl;
            std::cout << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)
                      << std::endl;
            exit(255);
        }
        std::cout << "Kernels compiled successfully" << std::endl;
        compileFlag = false;
    }

    // Global size is the image size rounded up to whole 16x16 work-groups.
    static const NDRange local(16, 16);
    NDRange global(local[0] * divup(IMGW, (int)(local[0])),
                   local[1] * divup(IMGH, (int)(local[1])));

    float persistence = 0.5f;
    float amp         = 1.0f;
    float tamp        = 0.0f;

    initKernel.setArg(0, base);
    initKernel.setArg(1, perlin);
    initKernel.setArg(2, IMGW);
    initKernel.setArg(3, IMGH);
    initKernel.setArg(4, std::rand());
    queue.enqueueNDRangeKernel(initKernel, cl::NullRange, global, local);

    // Seven octaves with periods 64, 32, ..., 1; the amplitude is halved
    // after each one and the total is kept for normalization.
    for (int octave = 6; octave >= 0; --octave) {
        int period = 1 << octave;
        computeKernel.setArg(0, perlin);
        computeKernel.setArg(1, base);
        computeKernel.setArg(2, IMGW);
        computeKernel.setArg(3, IMGH);
        computeKernel.setArg(4, amp);
        computeKernel.setArg(5, period);
        queue.enqueueNDRangeKernel(computeKernel, cl::NullRange, global, local);
        tamp += amp;
        amp *= persistence;
    }

    normKernel.setArg(0, perlin);
    normKernel.setArg(1, IMGW);
    normKernel.setArg(2, IMGH);
    normKernel.setArg(3, tamp);
    queue.enqueueNDRangeKernel(normKernel, cl::NullRange, global, local);

    // The output image has the same dimensions as the noise field.
    fillKernel.setArg(0, image);
    fillKernel.setArg(1, IMGW);
    fillKernel.setArg(2, IMGH);
    fillKernel.setArg(3, perlin);
    fillKernel.setArg(4, IMGW);
    fillKernel.setArg(5, IMGH);
    queue.enqueueNDRangeKernel(fillKernel, cl::NullRange, global, local);

    static const NDRange global_hist(NBINS);

    // Counters have to be zeroed before every histogram pass.
    memSetKernel.setArg(0, histOut);
    memSetKernel.setArg(1, NBINS);
    queue.enqueueNDRangeKernel(memSetKernel, cl::NullRange, global_hist);

    genHistogram.setArg(0, image);
    genHistogram.setArg(1, histOut);
    genHistogram.setArg(2, IMGW);
    genHistogram.setArg(3, IMGH);
    genHistogram.setArg(4, NBINS);
    queue.enqueueNDRangeKernel(genHistogram, cl::NullRange, global, local);

    genHistColors.setArg(0, colors);
    genHistColors.setArg(1, std::rand());
    genHistColors.setArg(2, std::rand());
    genHistColors.setArg(3, std::rand());
    queue.enqueueNDRangeKernel(genHistColors, cl::NullRange, global_hist);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "Histogram Demo");
        wnd.makeCurrent();

        forge::Image img(IMGW, IMGH, FG_RGBA, forge::u8);

        forge::Chart chart(FG_CHART_2D);

        chart.setAxesLabelFormat("%3.1f", "%.2e");

        /* set x axis limits to maximum and minimum values of data
         * and y axis limits to range [0, number of pixels ideally]
         * but practically total number of pixels as y range will skew
         * the histogram graph vertically. Therefore setting it to
         * 25% of total number of pixels */
        chart.setAxesLimits(0, 1, 0, IMGW * IMGH / (float)(NBINS / 4.0));

        /*
         * Create histogram object specifying number of bins
         */
        forge::Histogram hist = chart.histogram(NBINS, forge::s32);
        /*
         * Set histogram colors
         */
        hist.setColor(FG_YELLOW);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        cl::Buffer image(context, CL_MEM_READ_WRITE, IMG_SIZE);
        cl::Buffer baseNoise(context, CL_MEM_READ_WRITE, IMG_SIZE);
        cl::Buffer perlinNoise(context, CL_MEM_READ_WRITE, IMG_SIZE);
        cl::Buffer histOut(context, CL_MEM_READ_WRITE, NBINS * sizeof(int));
        cl::Buffer colors(context, CL_MEM_READ_WRITE,
                          3 * NBINS * sizeof(float));

        GfxHandle* handles[3];

        createGLBuffer(&handles[0], img.pixels(), FORGE_IMAGE_BUFFER);
        createGLBuffer(&handles[1], hist.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[2], hist.colors(), FORGE_VERTEX_BUFFER);

        unsigned frame = 0;
        do {
            if (frame % 8 == 0) {
                kernel(image, baseNoise, perlinNoise, histOut, colors, queue,
                       device);

                copyToGLBuffer(handles[0], (ComputeResourceHandle)image(),
                               img.size());
                copyToGLBuffer(handles[1], (ComputeResourceHandle)histOut(),
                               hist.verticesSize());
                copyToGLBuffer(handles[2], (ComputeResourceHandle)colors(),
                               hist.colorsSize());

                frame = 0;
            }

            /*
             * Split the window into grid regions
             */
            wnd.draw(1, 2, 0, img, "Dynamic Perlin Noise");
            wnd.draw(1, 2, 1, chart, "Histogram of Noisy Image");

            wnd.swapBuffers();
            frame++;
        } while (!wnd.close());

        releaseGLBuffer(handles[0]);
        releaseGLBuffer(handles[1]);
        releaseGLBuffer(handles[2]);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

3D Line Plot

CPU

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// Width and height of the Forge window.
const unsigned DIMX = 1000;
const unsigned DIMY = 800;

// z range over which the curve is sampled.
static const float ZMIN = 0.1f;
static const float ZMAX = 10.f;

// Step between consecutive z samples.
const float DX     = 0.005f;
// Number of curve samples: (ZMAX - ZMIN) / DX + 1, truncated to an integer.
const size_t ZSIZE = (size_t)((ZMAX - ZMIN) / DX + 1);

using namespace std;

/*
 * Fill `vec` with ZSIZE points of the curve at time `t`.
 *
 * The previous contents of `vec` are discarded. Sample k uses
 * z = ZMIN + k * dx and appends the triple
 *   (cos(z*t + t) / z, sin(z*t + t) / z, z + 0.1 * sin(t)).
 */
void generateCurve(float t, float dx, std::vector<float>& vec) {
    const int sampleCount = (int)ZSIZE;

    vec.clear();
    for (int k = 0; k < sampleCount; ++k) {
        const float z = ZMIN + k * dx;

        const float xCoord = (float)(cos(z * t + t) / z);
        const float yCoord = (float)(sin(z * t + t) / z);
        const float zCoord = (float)(z + 0.1 * sin(t));

        vec.push_back(xCoord);
        vec.push_back(yCoord);
        vec.push_back(zCoord);
    }
}

int main(void) {
    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Three dimensional line plot demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_3D);

    chart.setAxesLabelFormat("%3.1f", "%3.1f", "%.2e");

    chart.setAxesLimits(-1.1f, 1.1f, -1.1f, 1.1f, 0.f, 10.f);

    chart.setAxesTitles("x-axis", "y-axis", "z-axis");

    forge::Plot plot3 = chart.plot(ZSIZE, forge::f32);

    // generate a surface
    std::vector<float> function;
    static float t = 0;
    generateCurve(t, DX, function);

    GfxHandle* handle;
    createGLBuffer(&handle, plot3.vertices(), FORGE_VERTEX_BUFFER);

    /* copy your data into the pixel buffer object exposed by
     * forge::Plot class and then proceed to rendering.
     * To help the users with copying the data from compute
     * memory to display memory, Forge provides copy headers
     * along with the library to help with this task
     */
    copyToGLBuffer(handle, (ComputeResourceHandle)function.data(),
                   plot3.verticesSize());

    do {
        t += 0.01f;
        generateCurve(t, DX, function);
        copyToGLBuffer(handle, (ComputeResourceHandle)function.data(),
                       plot3.verticesSize());
        wnd.draw(chart);
    } while (!wnd.close());

    releaseGLBuffer(handle);

    return 0;
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cstdio>
#include <iostream>

// Width and height of the Forge window.
const unsigned DIMX = 1000;
const unsigned DIMY = 800;

// z range over which the curve is sampled.
static const float ZMIN = 0.1f;
static const float ZMAX = 10.f;

// Step between consecutive z samples.
const float DX     = 0.005f;
// Number of curve samples: (ZMAX - ZMIN) / DX + 1, truncated to an integer.
const size_t ZSIZE = (size_t)((ZMAX - ZMIN) / DX + 1);

// Host wrapper, defined further down, that launches the generateCurve
// kernel writing into `dev_out`.
void kernel(float t, float dx, float* dev_out);

/*
 * 3D line plot demo (CUDA): the curve is recomputed by a kernel every frame
 * and copied into the plot's vertex buffer.
 */
int main(void) {
    /*
     * The window must be the first Forge object created: it provides the
     * OpenGL context that every other forge::* object needs.
     */
    forge::Window wnd(DIMX, DIMY, "Three dimensional line plot demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_3D);
    chart.setAxesLabelFormat("%3.1f", "%3.1f", "%.2e");
    chart.setAxesLimits(-1.1f, 1.1f, -1.1f, 1.1f, 0.f, 10.f);
    chart.setAxesTitles("x-axis", "y-axis", "z-axis");

    forge::Plot plot3 = chart.plot(ZSIZE, forge::f32);

    // Storage for ZSIZE points with three floats each, filled once for the
    // initial time.
    static float t = 0;
    float* devCurve;
    FORGE_CUDA_CHECK(cudaMalloc((void**)&devCurve, ZSIZE * 3 * sizeof(float)));
    kernel(t, DX, devCurve);

    GfxHandle* vbo;
    createGLBuffer(&vbo, plot3.vertices(), FORGE_VERTEX_BUFFER);

    /*
     * Copy the points into the vertex buffer object exposed by forge::Plot.
     * The copy helpers shipped with Forge take care of moving data from
     * compute memory to display memory.
     */
    copyToGLBuffer(vbo, (ComputeResourceHandle)devCurve, plot3.verticesSize());

    // Advance time, rebuild the curve and redraw until the window closes.
    do {
        t += 0.01f;
        kernel(t, DX, devCurve);
        copyToGLBuffer(vbo, (ComputeResourceHandle)devCurve,
                       plot3.verticesSize());
        wnd.draw(chart);
    } while (!wnd.close());

    FORGE_CUDA_CHECK(cudaFree(devCurve));
    releaseGLBuffer(vbo);
    return 0;
}

/*
 * Write ZSIZE points of the curve at time `t` into `out` (three floats per
 * point).
 *
 * Thread k handles sample k with z = ZMIN + k * dx and stores
 *   (cos(z*t + t) / z, sin(z*t + t) / z, z + 0.1 * sin(t)).
 * Uses a 1D thread index; threads with k >= ZSIZE do nothing.
 *
 * All math stays in single precision: the literal is written as 0.1f so the
 * z component is not silently promoted to double.
 */
__global__ void generateCurve(float t, float dx, float* out, const float ZMIN,
                              const size_t ZSIZE) {
    int offset = blockIdx.x * blockDim.x + threadIdx.x;

    if (offset < ZSIZE) {
        const float z     = ZMIN + offset * dx;
        const float phase = z * t + t;

        out[3 * offset]     = cosf(phase) / z;
        out[3 * offset + 1] = sinf(phase) / z;
        out[3 * offset + 2] = z + 0.1f * sinf(t);
    }
}

// Integer division rounded up: the number of chunks of size `b` needed to
// cover `a` items. Intended for a >= 0 and b > 0.
inline int divup(int a, int b) {
    const int numerator = a + b - 1;
    return numerator / b;
}

/*
 * Launch generateCurve so that `dev_out` receives ZSIZE points for time `t`.
 *
 * t       : current time parameter of the curve
 * dx      : step between consecutive z samples
 * dev_out : buffer with room for 3 * ZSIZE floats
 *
 * The block count is derived from the block size actually used (instead of
 * a repeated literal), and the launch is followed by a cudaGetLastError()
 * check so a bad configuration is reported immediately.
 */
void kernel(float t, float dx, float* dev_out) {
    static const dim3 threads(1024);
    dim3 blocks(divup((int)ZSIZE, (int)threads.x));

    // clang-format off
    generateCurve<<<blocks, threads>>>(t, dx, dev_out, ZMIN, ZSIZE);
    // clang-format on
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <cmath>
#include <complex>
#include <iostream>
#include <iterator>
#include <mutex>
#include <vector>

// Width and height of the Forge window.
const unsigned DIMX = 1000;
const unsigned DIMY = 800;

// z range over which the curve is sampled.
static const float ZMIN = 0.1f;
static const float ZMAX = 10.f;

// Step between consecutive z samples.
const float DX              = 0.005f;
// Number of curve samples: (ZMAX - ZMIN) / DX + 1, truncated to an integer.
static const unsigned ZSIZE = (unsigned)((ZMAX - ZMIN) / DX + 1);

using namespace std;

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
/*
 * OpenCL C source of the generateCurve kernel: work-item k writes the point
 *   (cos(z*t + t) / z, sin(z*t + t) / z, z + 0.1 * sin(t)),  z = zmin + k*dx
 * to out[3k .. 3k+2]; work-items with k >= SIZE do nothing.
 *
 * The constant is spelled 0.1f so the kernel uses single precision only and
 * does not depend on double-precision support.
 */
static const std::string sincos_surf_kernel =
R"EOK(
kernel
void generateCurve(global float* out, const float t,
                   const float dx, const float zmin,
                   const unsigned SIZE) {
    int offset = get_global_id(0);
    float z = zmin + offset * dx;
    if (offset < SIZE) {
       out[offset*3 + 0] = cos(z*t+t)/z;
       out[offset*3 + 1] = sin(z*t+t)/z;
       out[offset*3 + 2] = z + 0.1f*sin(t);
    }
}
)EOK";
// clang-format on

// Integer division rounded up: the number of chunks of size `b` needed to
// cover `a` items. Intended for a >= 0 and b > 0.
inline int divup(int a, int b) {
    const int total = a + b - 1;
    return total / b;
}

/*
 * Enqueue the generateCurve kernel so that `devOut` receives ZSIZE points
 * for time `t`.
 *
 * The program and kernel objects are created once, on the first call, from
 * the context of `queue`; later calls only update the arguments and enqueue
 * one work-item per curve sample.
 */
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue, float t) {
    static std::once_flag buildOnce;
    static cl::Program program;
    static cl::Kernel curveKernel;

    std::call_once(buildOnce, [queue]() {
        // The program is constructed with the build flag set to true.
        program     = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(),
                                  sincos_surf_kernel, true);
        curveKernel = cl::Kernel(program, "generateCurve");
    });

    curveKernel.setArg(0, devOut);
    curveKernel.setArg(1, t);
    curveKernel.setArg(2, DX);
    curveKernel.setArg(3, ZMIN);
    curveKernel.setArg(4, ZSIZE);

    NDRange workItems(ZSIZE);
    queue.enqueueNDRangeKernel(curveKernel, cl::NullRange, workItems);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "Three dimensional line plot demo");
        wnd.makeCurrent();

        forge::Chart chart(FG_CHART_3D);

        chart.setAxesLabelFormat("%3.1f", "%3.1f", "%.2e");

        chart.setAxesLimits(-1.1f, 1.1f, -1.1f, 1.1f, 0.f, 10.f);

        chart.setAxesTitles("x-axis", "y-axis", "z-axis");

        forge::Plot plot3 = chart.plot(ZSIZE, forge::f32);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        cl::Buffer devOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * ZSIZE * 3);
        static float t = 0;
        kernel(devOut, queue, t);

        GfxHandle* handle;
        createGLBuffer(&handle, plot3.vertices(), FORGE_VERTEX_BUFFER);
        /* copy your data into the pixel buffer object exposed by
         * forge::Surface class and then proceed to rendering.
         * To help the users with copying the data from compute
         * memory to display memory, Forge provides copy headers
         * along with the library to help with this task
         */
        copyToGLBuffer(handle, (ComputeResourceHandle)devOut(),
                       plot3.verticesSize());

        do {
            t += 0.01f;
            kernel(devOut, queue, t);
            copyToGLBuffer(handle, (ComputeResourceHandle)devOut(),
                           plot3.verticesSize());
            wnd.draw(chart);
        } while (!wnd.close());

        releaseGLBuffer(handle);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }
    return 0;
}

Multiple line plots

CPU

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// Width and height of the Forge window.
const unsigned DIMX = 1000;
const unsigned DIMY = 800;

// Range over which the plotted functions are sampled, and the x-axis limits
// of the chart: [0, 2 * pi).
const float FRANGE_START = 0.f;
const float FRANGE_END   = 2.f * 3.1415926f;

using namespace std;
/*
 * Append (x, map(x)) pairs to `vec` for x = range_start, range_start + dx,
 * ... while x < range_end.
 *
 * range_start, range_end : half-open sampling interval
 * dx                     : step between samples; must be positive
 * vec                    : receives two floats per sample, appended to any
 *                          existing contents
 * map                    : function evaluated at every sample
 *
 * Nothing is appended when the interval is empty, when `map` is null, or
 * when `dx` is not positive. The last case matters: a zero or negative step
 * can never reach range_end, so the loop would otherwise run forever.
 */
void map_range_to_vec_vbo(float range_start, float range_end, float dx,
                          std::vector<float>& vec, float (*map)(float)) {
    // `!(dx > 0)` also rejects NaN steps.
    if (!(dx > 0) || !map) return;

    for (float i = range_start; i < range_end; i += dx) {
        vec.push_back(i);
        vec.push_back((*map)(i));
    }
}

int main(void) {
    std::vector<float> sinData;
    std::vector<float> cosData;
    std::vector<float> tanData;
    std::vector<float> logData;
    map_range_to_vec_vbo(FRANGE_START, FRANGE_END, 0.1f, sinData, &sinf);
    map_range_to_vec_vbo(FRANGE_START, FRANGE_END, 0.1f, cosData, &cosf);
    map_range_to_vec_vbo(FRANGE_START, FRANGE_END, 0.1f, tanData, &tanf);
    map_range_to_vec_vbo(FRANGE_START, FRANGE_END, 0.1f, logData, &log10f);

    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "Plotting Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_2D);
    chart.setAxesLimits(FRANGE_START, FRANGE_END, -1.0f, 1.0f);

    /* Create several plot objects which creates the necessary
     * vertex buffer objects to hold the different plot types
     */
    forge::Plot plt0 = chart.plot((unsigned)(sinData.size() / 2),
                                  forge::f32);  // create a default plot
    forge::Plot plt1 =
        chart.plot((unsigned)(cosData.size() / 2), forge::f32, FG_PLOT_LINE,
                   FG_MARKER_NONE);  // or specify a specific plot type
    forge::Plot plt2 = chart.plot(
        (unsigned)(tanData.size() / 2), forge::f32, FG_PLOT_LINE,
        FG_MARKER_TRIANGLE);  // last parameter specifies marker shape
    forge::Plot plt3 = chart.plot((unsigned)(logData.size() / 2), forge::f32,
                                  FG_PLOT_SCATTER, FG_MARKER_CROSS);

    /*
     * Set plot colors
     */
    plt0.setColor(FG_RED);
    plt1.setColor(FG_BLUE);
    plt2.setColor(FG_YELLOW);                 // use a forge predefined color
    plt3.setColor((forge::Color)0x257973FF);  // or any hex-valued color
    /*
     * Set plot legends
     */
    plt0.setLegend("Sine");
    plt1.setLegend("Cosine");
    plt2.setLegend("Tangent");
    plt3.setLegend("Log base 10");

    GfxHandle* handles[4];
    createGLBuffer(&handles[0], plt0.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[1], plt1.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], plt2.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[3], plt3.vertices(), FORGE_VERTEX_BUFFER);

    /* copy your data into the pixel buffer object exposed by
     * forge::Plot class and then proceed to rendering.
     * To help the users with copying the data from compute
     * memory to display memory, Forge provides copy headers
     * along with the library to help with this task
     */
    copyToGLBuffer(handles[0], (ComputeResourceHandle)sinData.data(),
                   plt0.verticesSize());
    copyToGLBuffer(handles[1], (ComputeResourceHandle)cosData.data(),
                   plt1.verticesSize());
    copyToGLBuffer(handles[2], (ComputeResourceHandle)tanData.data(),
                   plt2.verticesSize());
    copyToGLBuffer(handles[3], (ComputeResourceHandle)logData.data(),
                   plt3.verticesSize());

    do { wnd.draw(chart); } while (!wnd.close());

    releaseGLBuffer(handles[0]);
    releaseGLBuffer(handles[1]);
    releaseGLBuffer(handles[2]);
    releaseGLBuffer(handles[3]);

    return 0;
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cstdio>
#include <iostream>

// Window size in pixels, passed to forge::Window in main().
const unsigned DIMX = 1000;
const unsigned DIMY = 800;

// Sampling setup shared by main() and the kernel launcher: points are spaced
// dx apart starting at FRANGE_START; FRANGE_END is approximately 2 * pi.
// DATA_SIZE is the number of sample points (the quotient is truncated).
static const float dx           = 0.1f;
static const float FRANGE_START = 0.f;
static const float FRANGE_END   = 2 * 3.141592f;
static const size_t DATA_SIZE   = (size_t)((FRANGE_END - FRANGE_START) / dx);

// Host-side launcher, defined after main(): fills dev_out with the curve
// selected by functionCode.
void kernel(float* dev_out, int functionCode);

int main(void) {
    /* One device buffer per curve; allocated once the chart is set up. */
    float* d_sine;
    float* d_cosine;
    float* d_tangent;
    float* d_log;

    /*
     * The window must be the first Forge object created: it sets up the
     * OpenGL context that every other forge::* object depends on.
     */
    forge::Window wnd(DIMX, DIMY, "Plotting Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_2D);
    chart.setAxesLimits(FRANGE_START, FRANGE_END, -1.0f, 1.0f);

    /*
     * One plot per curve; each plot owns the vertex buffer that is filled
     * further below.
     */
    // default plot type and marker
    forge::Plot sinePlot = chart.plot(DATA_SIZE, forge::f32);
    // explicit plot type, no markers
    forge::Plot cosinePlot =
        chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE, FG_MARKER_NONE);
    // the last argument picks the marker shape
    forge::Plot tangentPlot =
        chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE, FG_MARKER_TRIANGLE);
    forge::Plot logPlot =
        chart.plot(DATA_SIZE, forge::f32, FG_PLOT_SCATTER, FG_MARKER_CROSS);

    /* Colour and legend entry of every curve. */
    sinePlot.setColor(FG_RED);
    sinePlot.setLegend("Sine");
    cosinePlot.setColor(FG_BLUE);
    cosinePlot.setLegend("Cosine");
    tangentPlot.setColor(FG_YELLOW);  // a Forge predefined colour
    tangentPlot.setLegend("Tangent");
    logPlot.setColor((forge::Color)0x257973FF);  // any hex-valued colour
    logPlot.setLegend("Log base 10");

    /* Two floats (x and y) per sample point. */
    const size_t curveBytes = sizeof(float) * DATA_SIZE * 2;
    FORGE_CUDA_CHECK(cudaMalloc((void**)&d_sine, curveBytes));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&d_cosine, curveBytes));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&d_tangent, curveBytes));
    FORGE_CUDA_CHECK(cudaMalloc((void**)&d_log, curveBytes));

    /* Function codes: 0 = sine, 1 = cosine, 2 = tangent, 3 = log base 10. */
    kernel(d_sine, 0);
    kernel(d_cosine, 1);
    kernel(d_tangent, 2);
    kernel(d_log, 3);

    /* Wrap each plot's vertex buffer in a handle the copy helpers accept. */
    GfxHandle* glBuffers[4];
    createGLBuffer(&glBuffers[0], sinePlot.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glBuffers[1], cosinePlot.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glBuffers[2], tangentPlot.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glBuffers[3], logPlot.vertices(), FORGE_VERTEX_BUFFER);

    /*
     * Move the computed samples into the vertex buffers exposed by
     * forge::Plot using the copy helpers from fg/compute_copy.h.
     */
    copyToGLBuffer(glBuffers[0], (ComputeResourceHandle)d_sine,
                   sinePlot.verticesSize());
    copyToGLBuffer(glBuffers[1], (ComputeResourceHandle)d_cosine,
                   cosinePlot.verticesSize());
    copyToGLBuffer(glBuffers[2], (ComputeResourceHandle)d_tangent,
                   tangentPlot.verticesSize());
    copyToGLBuffer(glBuffers[3], (ComputeResourceHandle)d_log,
                   logPlot.verticesSize());

    /* Draw at least once, then keep drawing until the window is closed. */
    do {
        wnd.draw(chart);
    } while (!wnd.close());

    /* Device memory first, then the graphics handles. */
    float* const curveBuffers[4] = {d_sine, d_cosine, d_tangent, d_log};
    for (float* buffer : curveBuffers) {
        FORGE_CUDA_CHECK(cudaFree(buffer));
    }
    for (GfxHandle* buffer : glBuffers) {
        releaseGLBuffer(buffer);
    }

    return 0;
}

/*
 * Fills out with interleaved (x, y) samples of one of four functions.
 *
 * The thread with flat 1D index i handles sample i: x = _frange_start + i * _dx
 * goes to out[2 * i] and the function value to out[2 * i + 1]. Threads with
 * i >= _data_size do nothing, so the grid may be larger than the data.
 *
 * out           - buffer with room for 2 * _data_size floats
 * _data_size    - number of samples to produce
 * fnCode        - 0: sinf, 1: cosf, 2: tanf, 3: log10f; any other code stores
 *                 0 so the y slot is never left unwritten
 * _dx           - spacing between consecutive x values
 * _frange_start - x value of sample 0
 */
__global__ void simple_sinf(float* out, const size_t _data_size, int fnCode,
                            const float _dx, const float _frange_start) {
    // Index in size_t, matching _data_size, so 2 * i cannot overflow an int.
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (i < _data_size) {
        const float x    = _frange_start + i * _dx;
        const size_t idx = 2 * i;

        float y;
        switch (fnCode) {
            case 0: y = sinf(x); break;
            case 1: y = cosf(x); break;
            case 2: y = tanf(x); break;
            case 3: y = log10f(x); break;
            // Unknown code: write a defined value instead of leaving
            // whatever the allocation happened to contain.
            default: y = 0.0f; break;
        }

        out[idx]     = x;
        out[idx + 1] = y;
    }
}

/* Number of chunks of size b needed to cover a items (a / b rounded up for
 * non-negative a and positive b). */
inline int divup(int a, int b) {
    const int biased = a + b - 1;
    return biased / b;
}

/*
 * Launches simple_sinf so that dev_out receives DATA_SIZE interleaved (x, y)
 * samples of the function selected by functionCode.
 *
 * dev_out      - buffer with room for 2 * DATA_SIZE floats
 * functionCode - forwarded to the kernel as its fnCode argument
 *
 * The launch is asynchronous; only launch errors are checked here.
 */
void kernel(float* dev_out, int functionCode) {
    static const dim3 threads(1024);
    // Derive the grid from threads.x so block size and grid size cannot
    // drift apart.
    const dim3 blocks(divup((int)DATA_SIZE, (int)threads.x));

    // clang-format off
    simple_sinf<<<blocks, threads>>>(dev_out, DATA_SIZE, functionCode, dx,
                                     FRANGE_START);
    // clang-format on

    // A kernel launch does not return a status; an invalid configuration is
    // only visible through the runtime's last-error state.
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <iostream>
#include <iterator>
#include <mutex>
#include <sstream>

using namespace cl;
using namespace std;

// Window size in pixels, passed to forge::Window in main().
const unsigned DIMX = 1000;
const unsigned DIMY = 800;

// Sampling setup: points are spaced dx apart; FRANGE_START and FRANGE_END
// (approximately 2 * pi) are also used as the chart's x-axis limits.
// DATA_SIZE is the number of sample points (the quotient is truncated).
const float dx           = 0.1f;
const float FRANGE_START = 0.f;
const float FRANGE_END   = 2 * 3.141592f;
const unsigned DATA_SIZE = (unsigned)((FRANGE_END - FRANGE_START) / dx);

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
// OpenCL C source of the "sinf" kernel; kernel() builds a cl::Program from
// this string. Every work-item x below DATA_SIZE writes x * dx to out[2 * x]
// and the value of the function selected by fnCode (0: sin, 1: cos, 2: tan,
// 3: log10) to out[2 * x + 1]. Work-items at or above DATA_SIZE do nothing.
static const std::string sinf_ocl_kernel =
R"EOK(
kernel void sinf(global float* out, const float dx, const unsigned DATA_SIZE,
                 int fnCode) {
    unsigned x = get_global_id(0);
    if (x < DATA_SIZE) {
        out[2 * x] = x * dx;
        switch (fnCode) {
            case 0: out[2 * x + 1] = sin(x * dx); break;
            case 1: out[2 * x + 1] = cos(x * dx); break;
            case 2: out[2 * x + 1] = tan(x * dx); break;
            case 3: out[2 * x + 1] = log10(x * dx); break;
        }
    }
}
)EOK";
// clang-format on

/*
 * Enqueues the "sinf" OpenCL kernel so that devOut receives DATA_SIZE
 * interleaved (x, y) samples of the function selected by fnCode.
 *
 * devOut - buffer with room for 2 * DATA_SIZE floats
 * queue  - command queue to enqueue on; its context is used to build the
 *          program on the first call
 * fnCode - forwarded to the kernel (0: sin, 1: cos, 2: tan, 3: log10)
 *
 * The program and kernel objects are created once and reused by later calls.
 */
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue, int fnCode)
{
    static std::once_flag compileFlag;
    static cl::Program prog;
    static cl::Kernel kern;

    // Build the program exactly once; the third constructor argument asks
    // for an immediate build.
    std::call_once(compileFlag, [queue]() {
        prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), sinf_ocl_kernel,
                           true);
        kern = cl::Kernel(prog, "sinf");
    });

    // One work-item per sample: each work-item writes both the x and the y
    // value of its point, so DATA_SIZE items cover the whole buffer.
    static const NDRange global(DATA_SIZE);

    kern.setArg(0, devOut);
    kern.setArg(1, dx);
    kern.setArg(2, DATA_SIZE);
    kern.setArg(3, fnCode);
    queue.enqueueNDRangeKernel(kern, cl::NullRange, global);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "Plotting Demo");
        wnd.makeCurrent();

        forge::Chart chart(FG_CHART_2D);
        chart.setAxesLimits(FRANGE_START, FRANGE_END, -1.0f, 1.0f);

        /* Create several plot objects which creates the necessary
         * vertex buffer objects to hold the different plot types
         */
        forge::Plot plt0 =
            chart.plot(DATA_SIZE, forge::f32);  // create a default plot
        forge::Plot plt1 =
            chart.plot(DATA_SIZE, forge::f32, FG_PLOT_LINE,
                       FG_MARKER_NONE);  // or specify a specific plot type
        forge::Plot plt2 = chart.plot(
            DATA_SIZE, forge::f32, FG_PLOT_LINE,
            FG_MARKER_TRIANGLE);  // last parameter specifies marker shape
        forge::Plot plt3 =
            chart.plot(DATA_SIZE, forge::f32, FG_PLOT_SCATTER, FG_MARKER_CROSS);

        /*
         * Set plot colors
         */
        plt0.setColor(FG_RED);
        plt1.setColor(FG_BLUE);
        plt2.setColor(FG_YELLOW);  // use a forge predefined color
        plt3.setColor((forge::Color)0x257973FF);  // or any hex-valued color
        /*
         * Set plot legends
         */
        plt0.setLegend("Sine");
        plt1.setLegend("Cosine");
        plt2.setLegend("Tangent");
        plt3.setLegend("Log base 10");

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        cl::Buffer sinOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * DATA_SIZE * 2);
        cl::Buffer cosOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * DATA_SIZE * 2);
        cl::Buffer tanOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * DATA_SIZE * 2);
        cl::Buffer logOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * DATA_SIZE * 2);
        kernel(sinOut, queue, 0);
        kernel(cosOut, queue, 1);
        kernel(tanOut, queue, 2);
        kernel(logOut, queue, 3);

        GfxHandle* handles[4];
        createGLBuffer(&handles[0], plt0.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[1], plt1.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[2], plt2.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[3], plt3.vertices(), FORGE_VERTEX_BUFFER);
        /* copy your data into the vertex buffer object exposed by
         * forge::Plot class and then proceed to rendering.
         * To help the users with copying the data from compute
         * memory to display memory, Forge provides copy headers
         * along with the library to help with this task
         */
        copyToGLBuffer(handles[0], (ComputeResourceHandle)sinOut(),
                       plt0.verticesSize());
        copyToGLBuffer(handles[1], (ComputeResourceHandle)cosOut(),
                       plt1.verticesSize());
        copyToGLBuffer(handles[2], (ComputeResourceHandle)tanOut(),
                       plt2.verticesSize());
        copyToGLBuffer(handles[3], (ComputeResourceHandle)logOut(),
                       plt3.verticesSize());

        do { wnd.draw(chart); } while (!wnd.close());

        releaseGLBuffer(handles[0]);
        releaseGLBuffer(handles[1]);
        releaseGLBuffer(handles[2]);
        releaseGLBuffer(handles[3]);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }
    return 0;
}

3D Vector Stream

CPU

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// Window size in pixels, passed to forge::Window in main().
const unsigned DIMX = 640;
const unsigned DIMY = 480;
// NOTE(review): PI is not referenced by the code visible in this example.
const float PI      = 3.14159265359f;
// The vector field is sampled on a cubic grid: coordinates start at MINIMUM
// and advance by STEP; MAXIMUM is also the upper axis limit of the chart.
const float MINIMUM = 1.0f;
const float MAXIMUM = 20.f;
const float STEP    = 2.0f;
// Grid points per axis; with the values above this evaluates to 10.
const int NELEMS    = (int)((MAXIMUM - MINIMUM + 1) / STEP);

using namespace std;

/*
 * Replaces the content of colors with one RGB triple per grid element
 * (NELEMS^3 elements): orange for even element indices, blue for odd ones.
 * Only the first three components of each palette entry are emitted.
 */
void generateColors(std::vector<float>& colors) {
    static const float AF_BLUE[]   = {0.0588f, 0.1137f, 0.2745f, 1.0f};
    static const float AF_ORANGE[] = {0.8588f, 0.6137f, 0.0745f, 1.0f};

    const int total = NELEMS * NELEMS * NELEMS;

    colors.clear();
    for (int n = 0; n < total; ++n) {
        // Even elements take the orange entry, odd ones the blue entry.
        const float* rgb = ((n % 2) == 0) ? AF_ORANGE : AF_BLUE;
        colors.insert(colors.end(), rgb, rgb + 3);
    }
}

/*
 * Fills points with the coordinates of an NELEMS x NELEMS x NELEMS grid and
 * dirs with one direction vector per grid point.
 *
 * points - receives x, y, z triples; x varies fastest, then y, then z
 * dirs   - receives, for every point, its coordinates shifted by -10 on
 *          each axis, in the same order as points
 *
 * Both vectors are cleared first, so they always describe the same set of
 * grid points even when the function is called more than once.
 */
void generatePoints(std::vector<float>& points, std::vector<float>& dirs) {
    points.clear();
    // Clearing only points would leave stale directions at the front of dirs
    // on a repeated call and put the two vectors out of step.
    dirs.clear();

    for (int k = 0; k < NELEMS; ++k) {
        float z = MINIMUM + k * STEP;
        for (int j = 0; j < NELEMS; ++j) {
            float y = MINIMUM + j * STEP;
            for (int i = 0; i < NELEMS; ++i) {
                float x = MINIMUM + i * STEP;
                points.push_back(x);
                points.push_back(y);
                points.push_back(z);
                dirs.push_back(x - 10.0f);
                dirs.push_back(y - 10.0f);
                dirs.push_back(z - 10.0f);
            }
        }
    }
}

int main(void) {
    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(DIMX, DIMY, "3D Vector Field Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_3D);
    chart.setAxesLimits(MINIMUM - 1.0f, MAXIMUM, MINIMUM - 1.0f, MAXIMUM,
                        MINIMUM - 1.0f, MAXIMUM);
    chart.setAxesTitles("x-axis", "y-axis", "z-axis");

    int numElems             = NELEMS * NELEMS * NELEMS;
    forge::VectorField field = chart.vectorField(numElems, forge::f32);
    field.setColor(0.f, 1.f, 0.f, 1.f);

    std::vector<float> points;
    std::vector<float> colors;
    std::vector<float> dirs;
    generatePoints(points, dirs);
    generateColors(colors);

    GfxHandle* handles[3];
    createGLBuffer(&handles[0], field.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[1], field.colors(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&handles[2], field.directions(), FORGE_VERTEX_BUFFER);

    copyToGLBuffer(handles[0], (ComputeResourceHandle)points.data(),
                   field.verticesSize());
    copyToGLBuffer(handles[1], (ComputeResourceHandle)colors.data(),
                   field.colorsSize());
    copyToGLBuffer(handles[2], (ComputeResourceHandle)dirs.data(),
                   field.directionsSize());

    do { wnd.draw(chart); } while (!wnd.close());

    releaseGLBuffer(handles[0]);
    releaseGLBuffer(handles[1]);
    releaseGLBuffer(handles[2]);

    return 0;
}

CUDA

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuda_runtime.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>

// Window size in pixels, passed to forge::Window in main().
const unsigned DIMX = 640;
const unsigned DIMY = 480;
// The vector field is sampled on a cubic grid: coordinates start at MINIMUM
// and advance by STEP; MAXIMUM is also the upper axis limit of the chart.
const float MINIMUM = 1.0f;
const float MAXIMUM = 20.f;
const float STEP    = 2.0f;
// Grid points per axis; with the values above this evaluates to 10.
const int NELEMS    = (int)((MAXIMUM - MINIMUM + 1) / STEP);

// Host-side launcher: writes one RGB triple per grid element into colors.
void generateColors(float* colors);

// Host-side launcher: writes the grid coordinates into points and one
// direction vector per grid point into dirs.
void generatePoints(float* points, float* dirs);

/* Number of chunks of size b needed to cover a items (a / b rounded up for
 * non-negative a and positive b). */
inline int divup(int a, int b) {
    const int biased = a + b - 1;
    return biased / b;
}

int main(void) {
    /*
     * The window must be the first Forge object created: it sets up the
     * OpenGL context that every other forge::* object depends on.
     */
    forge::Window wnd(DIMX, DIMY, "3D Vector Field Demo");
    wnd.makeCurrent();

    /* All three axes share the same limits. */
    const float axisLow = MINIMUM - 1.0f;
    forge::Chart chart(FG_CHART_3D);
    chart.setAxesLimits(axisLow, MAXIMUM, axisLow, MAXIMUM, axisLow, MAXIMUM);
    chart.setAxesTitles("x-axis", "y-axis", "z-axis");

    /* One vector per point of the NELEMS^3 grid. */
    int numElems             = NELEMS * NELEMS * NELEMS;
    forge::VectorField field = chart.vectorField(numElems, forge::f32);
    field.setColor(0.f, 1.f, 0.f, 1.f);

    /* Device buffers: three floats per grid element in each of them. */
    float* d_points;
    float* d_colors;
    float* d_dirs;

    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&d_points, 3 * numElems * sizeof(float)));
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&d_colors, 3 * numElems * sizeof(float)));
    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&d_dirs, 3 * numElems * sizeof(float)));

    generatePoints(d_points, d_dirs);
    generateColors(d_colors);

    /* Handles: 0 = vertices, 1 = colours, 2 = directions. */
    GfxHandle* glBuffers[3];
    createGLBuffer(&glBuffers[0], field.vertices(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glBuffers[1], field.colors(), FORGE_VERTEX_BUFFER);
    createGLBuffer(&glBuffers[2], field.directions(), FORGE_VERTEX_BUFFER);

    copyToGLBuffer(glBuffers[0], (ComputeResourceHandle)d_points,
                   field.verticesSize());
    copyToGLBuffer(glBuffers[1], (ComputeResourceHandle)d_colors,
                   field.colorsSize());
    copyToGLBuffer(glBuffers[2], (ComputeResourceHandle)d_dirs,
                   field.directionsSize());

    /* Draw at least once, then keep drawing until the window is closed. */
    do {
        wnd.draw(chart);
    } while (!wnd.close());

    /* Graphics handles first, then the device memory. */
    for (GfxHandle* buffer : glBuffers) {
        releaseGLBuffer(buffer);
    }

    float* const deviceBuffers[3] = {d_points, d_colors, d_dirs};
    for (float* buffer : deviceBuffers) {
        FORGE_CUDA_CHECK(cudaFree(buffer));
    }

    return 0;
}

/*
 * Writes one RGB triple per element into colors.
 *
 * The thread with flat 1D index i fills colors[3 * i .. 3 * i + 2]: orange
 * for even i, blue for odd i. Threads with i >= nelems do nothing, so the
 * grid may be larger than the data.
 *
 * colors - buffer with room for 3 * nelems floats
 * nelems - number of elements to colour
 */
__global__ void genColorsKernel(float* colors, int nelems) {
    const float AF_BLUE[4]   = {0.0588f, 0.1137f, 0.2745f, 1.0f};
    const float AF_ORANGE[4] = {0.8588f, 0.6137f, 0.0745f, 1.0f};

    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= nelems) return;

    const bool useOrange = (i % 2 == 0);
    float* rgb           = colors + 3 * i;

    // Only the first three components of each palette entry are stored.
    rgb[0] = useOrange ? AF_ORANGE[0] : AF_BLUE[0];
    rgb[1] = useOrange ? AF_ORANGE[1] : AF_BLUE[1];
    rgb[2] = useOrange ? AF_ORANGE[2] : AF_BLUE[2];
}

/*
 * Launches genColorsKernel over all NELEMS^3 grid elements.
 *
 * colors - buffer with room for 3 * NELEMS^3 floats
 *
 * The launch is asynchronous; only launch errors are checked here.
 */
void generateColors(float* colors) {
    const int numElems = NELEMS * NELEMS * NELEMS;
    static const dim3 threads(512);
    const dim3 blocks(divup(numElems, threads.x));

    // clang-format off
    genColorsKernel<<<blocks, threads>>>(colors, numElems);
    // clang-format on

    // A kernel launch does not return a status; an invalid configuration is
    // only visible through the runtime's last-error state.
    FORGE_CUDA_CHECK(cudaGetLastError());
}

/*
 * Writes the coordinates of an nelems^3 grid into points and one direction
 * vector per grid point into dirs.
 *
 * The z index k is folded into blockIdx.x: every run of nBBS0 consecutive
 * blocks along x belongs to one k, and the position inside the run together
 * with threadIdx.x gives the x index i. The y index j comes from the y
 * dimension of the launch. Threads outside the grid do nothing.
 *
 * points  - receives x, y, z for every grid point (3 floats each)
 * dirs    - receives the same coordinates shifted by -10 on each axis
 * nBBS0   - number of blocks along x that cover one k slice
 * nelems  - grid points per axis
 * minimum - coordinate of index 0 on every axis
 * step    - spacing between neighbouring grid points
 */
__global__ void pointGenKernel(float* points, float* dirs, int nBBS0,
                               int nelems, float minimum, float step) {
    const int k            = blockIdx.x / nBBS0;
    const int blockInSlice = blockIdx.x - k * nBBS0;
    const int i            = blockDim.x * blockInSlice + threadIdx.x;
    const int j            = blockDim.y * blockIdx.y + threadIdx.y;

    if (i >= nelems || j >= nelems || k >= nelems) return;

    const float x = minimum + i * step;
    const float y = minimum + j * step;
    const float z = minimum + k * step;

    // Linear element index with i varying fastest, then j, then k.
    const int id = i + j * nelems + k * nelems * nelems;

    float* position = points + 3 * id;
    position[0]     = x;
    position[1]     = y;
    position[2]     = z;

    float* direction = dirs + 3 * id;
    direction[0]     = x - 10.f;
    direction[1]     = y - 10.f;
    direction[2]     = z - 10.f;
}

/*
 * Launches pointGenKernel over the NELEMS^3 grid.
 *
 * points - buffer with room for 3 * NELEMS^3 floats (grid coordinates)
 * dirs   - buffer with room for 3 * NELEMS^3 floats (direction vectors)
 *
 * Blocks are 8 x 8 threads covering (i, j). The k index is folded into the
 * x dimension of the grid: blk_x blocks per k slice, NELEMS slices.
 *
 * The launch is asynchronous; only launch errors are checked here.
 */
void generatePoints(float* points, float* dirs) {
    static const dim3 threads(8, 8);

    const int blk_x = divup(NELEMS, threads.x);
    const int blk_y = divup(NELEMS, threads.y);

    const dim3 blocks(blk_x * NELEMS, blk_y);

    // clang-format off
    pointGenKernel<<<blocks, threads>>>(points, dirs, blk_x, NELEMS, MINIMUM,
                                        STEP);
    // clang-format on

    // A kernel launch does not return a status; an invalid configuration is
    // only visible through the runtime's last-error state.
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <iterator>
#include <sstream>
#include <vector>

using namespace cl;
using namespace std;

// Window size in pixels, passed to forge::Window in main().
const unsigned DIMX = 640;
const unsigned DIMY = 480;
// The vector field is sampled on a cubic grid: coordinates start at MINIMUM
// and advance by STEP; MAXIMUM is also the upper axis limit of the chart.
const float MINIMUM = 1.0f;
const float MAXIMUM = 20.f;
const float STEP    = 2.0f;
// Grid points per axis; with the values above this evaluates to 10.
const int NELEMS    = (int)((MAXIMUM - MINIMUM + 1) / STEP);

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// clang-format off
// OpenCL C source holding the two kernels that generatePoints() builds and
// enqueues:
//  - genColorsKernel writes one RGB triple per grid element (orange for even
//    element indices, blue for odd ones);
//  - pointGenKernel writes the grid coordinates into points and the same
//    coordinates shifted by -10 on each axis into dirs. The k index is
//    derived from the group id along dimension 0 (nBBS0 groups per k slice).
static const std::string streamKernel =
R"EOK(
constant float AF_BLUE[4]         = {0.0588f, 0.1137f, 0.2745f, 1.0f};
constant float AF_ORANGE[4]           = {0.8588f, 0.6137f, 0.0745f, 1.0f};

kernel void genColorsKernel(global float* colors, int NELEMS) {
    const size_t nelems = NELEMS * NELEMS * NELEMS;

    int i = get_global_id(0);

    if (i < nelems) {
        if (i % 2 == 0) {
            colors[3 * i + 0] = AF_ORANGE[0];
            colors[3 * i + 1] = AF_ORANGE[1];
            colors[3 * i + 2] = AF_ORANGE[2];
        } else {
            colors[3 * i + 0] = AF_BLUE[0];
            colors[3 * i + 1] = AF_BLUE[1];
            colors[3 * i + 2] = AF_BLUE[2];
        }
    }
}

kernel void pointGenKernel(global float* points, global float* dirs, int nBBS0,
                           int NELEMS, float MINIMUM, float STEP) {
    int k = get_group_id(0) / nBBS0;
    int i = get_local_size(0) * (get_group_id(0) - k * nBBS0) + get_local_id(0);
    int j = get_global_id(1);

    if (i < NELEMS && j < NELEMS && k < NELEMS) {
        float x = MINIMUM + i * STEP;
        float y = MINIMUM + j * STEP;
        float z = MINIMUM + k * STEP;

        int id = i + j * NELEMS + k * NELEMS * NELEMS;

        points[3 * id + 0] = x;
        points[3 * id + 1] = y;
        points[3 * id + 2] = z;

        dirs[3 * id + 0] = x - 10.f;
        dirs[3 * id + 1] = y - 10.f;
        dirs[3 * id + 2] = z - 10.f;
    }
}
)EOK";
// clang-format on

/* Number of chunks of size b needed to cover a items (a / b rounded up for
 * non-negative a and positive b). */
inline int divup(int a, int b)
{
    const int biased = a + b - 1;
    return biased / b;
}

/*
 * Builds the stream kernels on the first call, then enqueues pointGenKernel
 * followed by genColorsKernel on queue.
 *
 * points - receives the grid coordinates (3 floats per grid element)
 * dirs   - receives one direction vector per grid element
 * colors - receives one RGB triple per grid element
 * queue  - command queue to enqueue on; its context is used for the build
 * device - device the program is built for
 *
 * A build failure is reported on stdout together with the build log and the
 * process exits with status 255.
 */
void generatePoints(Buffer& points, Buffer& dirs, Buffer& colors,
                    CommandQueue& queue, Device& device) {
    static bool needsBuild = true;

    static cl::Program prog;
    static cl::Kernel pointGenKernel;
    static cl::Kernel colorsKernel;

    if (needsBuild) {
        try {
            prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), streamKernel,
                               false);

            // Build for exactly the one device handed in by the caller.
            std::vector<cl::Device> targets(1, device);
            prog.build(targets);

            pointGenKernel = cl::Kernel(prog, "pointGenKernel");
            colorsKernel   = cl::Kernel(prog, "genColorsKernel");
        } catch (cl::Error err) {
            std::cout << "Compile Errors: " << std::endl;
            std::cout << err.what() << err.err() << std::endl;
            std::cout << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)
                      << std::endl;
            exit(255);
        }
        std::cout << "Kernels compiled successfully" << std::endl;
        needsBuild = false;
    }

    /*
     * Point generation: 8 x 8 work-groups cover (i, j); the k index is
     * folded into dimension 0, so that dimension spans NELEMS slices of
     * blk_x groups each.
     */
    static const NDRange local(8, 8);
    const int blk_x = divup(NELEMS, (int)(local[0]));
    const int blk_y = divup(NELEMS, (int)(local[1]));

    NDRange global(NELEMS * local[0] * blk_x, local[1] * blk_y);

    pointGenKernel.setArg(0, points);
    pointGenKernel.setArg(1, dirs);
    pointGenKernel.setArg(2, blk_x);
    pointGenKernel.setArg(3, NELEMS);
    pointGenKernel.setArg(4, MINIMUM);
    pointGenKernel.setArg(5, STEP);
    queue.enqueueNDRangeKernel(pointGenKernel, cl::NullRange, global, local);

    /*
     * Colour generation: a 1D problem launched with 64 x 1 work-groups, the
     * global size rounded up to a whole number of groups.
     */
    const int totalElems = NELEMS * NELEMS * NELEMS;
    static const NDRange thrds(64, 1);
    NDRange glob(thrds[0] * divup(totalElems, (int)(thrds[0])),
                 (int)(thrds[1]));

    colorsKernel.setArg(0, colors);
    colorsKernel.setArg(1, NELEMS);
    queue.enqueueNDRangeKernel(colorsKernel, cl::NullRange, glob, thrds);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(DIMX, DIMY, "3D Vector Field Demo");
        wnd.makeCurrent();

        forge::Chart chart(FG_CHART_3D);
        chart.setAxesLimits(MINIMUM - 1.0f, MAXIMUM, MINIMUM - 1.0f, MAXIMUM,
                            MINIMUM - 1.0f, MAXIMUM);
        chart.setAxesTitles("x-axis", "y-axis", "z-axis");

        int numElems             = NELEMS * NELEMS * NELEMS;
        forge::VectorField field = chart.vectorField(numElems, forge::f32);
        field.setColor(0.f, 1.f, 0.f, 1.f);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        cl::Buffer points(context, CL_MEM_READ_WRITE,
                          sizeof(float) * 3 * numElems);
        cl::Buffer colors(context, CL_MEM_READ_WRITE,
                          sizeof(float) * 3 * numElems);
        cl::Buffer dirs(context, CL_MEM_READ_WRITE,
                        sizeof(float) * 3 * numElems);

        GfxHandle* handles[3];
        createGLBuffer(&handles[0], field.vertices(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[1], field.colors(), FORGE_VERTEX_BUFFER);
        createGLBuffer(&handles[2], field.directions(), FORGE_VERTEX_BUFFER);

        generatePoints(points, dirs, colors, queue, device);

        copyToGLBuffer(handles[0], (ComputeResourceHandle)points(),
                       field.verticesSize());
        copyToGLBuffer(handles[1], (ComputeResourceHandle)colors(),
                       field.colorsSize());
        copyToGLBuffer(handles[2], (ComputeResourceHandle)dirs(),
                       field.directionsSize());

        do { wnd.draw(chart); } while (!wnd.close());

        releaseGLBuffer(handles[0]);
        releaseGLBuffer(handles[1]);
        releaseGLBuffer(handles[2]);

    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}

Surfaces

CPU

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>
#define USE_FORGE_CPU_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

using namespace std;

// Extent of the sampled domain along x and y.
static const float XMIN = -8.0f;
static const float XMAX = 8.0f;
static const float YMIN = -8.0f;
static const float YMAX = 8.0f;

// Grid spacing and the number of samples it yields along each axis.
const float DX     = 0.5f;
const size_t XSIZE = (size_t)((XMAX - XMIN) / DX);
const size_t YSIZE = (size_t)((YMAX - YMIN) / DX);

/*
 * Fills `vec` with interleaved (x, y, z) triples sampling z = sin(r) / r,
 * r = sqrt(x^2 + y^2), on a regular grid with spacing `dx` that starts at
 * (XMIN, YMIN). y varies fastest.
 *
 * The number of samples per axis is computed with the same expression that
 * defines XSIZE / YSIZE, and every coordinate is derived from its integer
 * index, so the amount of data produced always matches that count. Stepping
 * a float accumulator instead can yield one row or column more or fewer when
 * `dx` is not exactly representable.
 *
 * dx  - grid spacing; when it is not greater than zero `vec` is left empty
 * vec - output; any previous contents are discarded
 */
void genSurface(float dx, std::vector<float>& vec) {
    vec.clear();
    if (!(dx > 0.0f)) {
        // Zero, negative or NaN spacing describes no grid at all.
        return;
    }

    const size_t nx = (size_t)((XMAX - XMIN) / dx);
    const size_t ny = (size_t)((YMAX - YMIN) / dx);
    vec.reserve(3 * nx * ny);

    for (size_t i = 0; i < nx; ++i) {
        const float x = XMIN + i * dx;
        for (size_t j = 0; j < ny; ++j) {
            const float y = YMIN + j * dx;
            // The tiny offset keeps sin(z) / z defined at the origin.
            const float z = std::sqrt(x * x + y * y) + 2.2204e-16f;
            vec.push_back(x);
            vec.push_back(y);
            vec.push_back(std::sin(z) / z);
        }
    }
}

int main(void) {
    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(1024, 768, "3d Surface Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_3D);
    chart.setAxesLimits(XMIN - 2.0f, XMAX + 2.0f, YMIN - 2.0f, YMAX + 2.0f,
                        -0.5f, 1.f);
    chart.setAxesTitles("x-axis", "y-axis", "z-axis");

    forge::Surface surf = chart.surface(XSIZE, YSIZE, forge::f32);
    surf.setColor(FG_YELLOW);

    // generate a surface
    std::vector<float> function;

    genSurface(DX, function);

    GfxHandle* handle;
    createGLBuffer(&handle, surf.vertices(), FORGE_VERTEX_BUFFER);

    /* copy your data into the pixel buffer object exposed by
     * forge::Plot class and then proceed to rendering.
     * To help the users with copying the data from compute
     * memory to display memory, Forge provides copy headers
     * along with the library to help with this task
     */
    copyToGLBuffer(handle, (ComputeResourceHandle)function.data(),
                   surf.verticesSize());

    do { wnd.draw(chart); } while (!wnd.close());

    releaseGLBuffer(handle);

    return 0;
}

CUDA

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <forge.h>
#define USE_FORGE_CUDA_COPY_HELPERS
#include <fg/compute_copy.h>
#include <cstdio>
#include <iostream>

// Lower and upper bounds of the sampled domain along x and y.
const float XMIN = -8.0f;
const float XMAX = 8.0f;
const float YMIN = -8.0f;
const float YMAX = 8.0f;

// Grid spacing, and the sample count it gives along each axis.
const float DX     = 0.5f;
const size_t XSIZE = static_cast<size_t>((XMAX - XMIN) / DX);
const size_t YSIZE = static_cast<size_t>((YMAX - YMIN) / DX);

void kernel(float dx, float* dev_out);

int main(void) {
    float* dev_out;

    /*
     * First Forge call should be a window creation call
     * so that necessary OpenGL context is created for any
     * other forge::* object to be created successfully
     */
    forge::Window wnd(1024, 768, "3d Surface Demo");
    wnd.makeCurrent();

    forge::Chart chart(FG_CHART_3D);
    chart.setAxesLimits(-10.f, 10.f, -10.f, 10.f, -0.5f, 1.f);
    chart.setAxesTitles("x-axis", "y-axis", "z-axis");

    forge::Surface surf = chart.surface(XSIZE, YSIZE, forge::f32);
    surf.setColor(FG_YELLOW);

    FORGE_CUDA_CHECK(
        cudaMalloc((void**)&dev_out, XSIZE * YSIZE * 3 * sizeof(float)));
    kernel(DX, dev_out);

    GfxHandle* handle;
    createGLBuffer(&handle, surf.vertices(), FORGE_VERTEX_BUFFER);
    /* copy your data into the vertex buffer object exposed by
     * forge::Plot class and then proceed to rendering.
     * To help the users with copying the data from compute
     * memory to display memory, Forge provides copy headers
     * along with the library to help with this task
     */
    copyToGLBuffer(handle, (ComputeResourceHandle)dev_out, surf.verticesSize());

    do { wnd.draw(chart); } while (!wnd.close());

    FORGE_CUDA_CHECK(cudaFree(dev_out));
    releaseGLBuffer(handle);
    return 0;
}

/*
 * Writes one (x, y, z) vertex per grid point, z = sin(r) / r with
 * r = sqrt(x^2 + y^2).
 *
 * Launch layout: 2D grid of 2D blocks. The thread with global index (i, j)
 * handles the sample x = XMIN + i * dx, y = YMIN + j * dx and stores it at
 * float offset 3 * (j + i * YSIZE). Threads outside XSIZE x YSIZE do nothing,
 * so the grid may overshoot the data.
 *
 * dx           - grid spacing
 * out          - output buffer; must hold at least 3 * XSIZE * YSIZE floats
 * XMIN, YMIN   - coordinates of sample (0, 0)
 * XSIZE, YSIZE - number of samples along x and y
 *
 * All arithmetic stays in single precision: the epsilon has an `f` suffix
 * and sqrtf / sinf are used, so no double-precision operation is emitted.
 */
__global__ void sincos_surf(float dx, float* out, const float XMIN,
                            const float YMIN, const size_t XSIZE,
                            const size_t YSIZE) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < XSIZE && j < YSIZE) {
        const float x = XMIN + i * dx;
        const float y = YMIN + j * dx;

        // Kept in size_t so the index is not truncated to int.
        const size_t offset = j + i * YSIZE;

        // The tiny offset keeps sin(z) / z defined at the origin.
        const float z = sqrtf(x * x + y * y) + 2.2204e-16f;

        out[3 * offset]     = x;
        out[3 * offset + 1] = y;
        out[3 * offset + 2] = sinf(z) / z;
    }
}

// Divides `a` by `b` after padding the numerator with `b - 1`, which rounds
// the quotient up when both arguments are positive.
inline int divup(int a, int b) {
    const int padded = a + b - 1;
    return padded / b;
}

/*
 * Launches sincos_surf over the XSIZE x YSIZE grid using 8x8 thread blocks,
 * with the block count rounded up along each axis.
 *
 * dx      - grid spacing forwarded to the kernel
 * dev_out - device buffer the kernel writes to
 *
 * The launch is issued on the default stream and is not waited for here;
 * only launch errors are checked.
 */
void kernel(float dx, float* dev_out) {
    static const dim3 threads(8, 8);
    dim3 blocks(divup(XSIZE, threads.x), divup(YSIZE, threads.y));

    // clang-format off
    sincos_surf<<<blocks, threads>>>(dx, dev_out, XMIN, YMIN, XSIZE, YSIZE);
    // clang-format on

    // A kernel launch returns no status of its own; an invalid launch
    // configuration only becomes visible through cudaGetLastError().
    FORGE_CUDA_CHECK(cudaGetLastError());
}

OpenCL

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/*******************************************************
 * Copyright (c) 2015-2019, ArrayFire
 * All rights reserved.
 *
 * This file is distributed under 3-clause BSD license.
 * The complete license agreement can be obtained at:
 * http://arrayfire.com/licenses/BSD-3-Clause
 ********************************************************/

#include <forge.h>

#include "cl_helpers.h"

#include <algorithm>
#include <cmath>
#include <complex>
#include <iostream>
#include <iterator>
#include <mutex>
#include <vector>

// Lower and upper bounds of the sampled domain along x and y.
static const float XMIN = -8.0f;
static const float XMAX = 8.0f;
static const float YMIN = -8.0f;
static const float YMAX = 8.0f;

// Grid spacing, and the sample count it gives along each axis.
const float DX       = 0.5f;
const unsigned XSIZE = static_cast<unsigned>((XMAX - XMIN) / DX);
const unsigned YSIZE = static_cast<unsigned>((YMAX - YMIN) / DX);

using namespace std;

#define USE_FORGE_OPENCL_COPY_HELPERS
#include <fg/compute_copy.h>

// OpenCL C source of the "surf" kernel. Work-item (i, j) with i < w and
// j < h writes the vertex (x, y, sin(z) / z) at float offset 3 * (j + i * h),
// where x = xmin + i * dx, y = ymin + j * dx and z is sqrt(x*x + y*y) plus a
// tiny epsilon that keeps the division defined at the origin.
// The epsilon carries an `f` suffix so the source contains no
// double-precision constant and does not rely on fp64 support in the device
// compiler.
// clang-format off
static const std::string sin_surf_kernel =
R"EOK(
kernel void
surf(global float* out, const float dx, const float xmin, const float ymin,
     const unsigned w, const unsigned h) {
    int i = get_global_id(0);
    int j = get_global_id(1);

    float x = xmin + i * dx;
    float y = ymin + j * dx;

    if (i < w && j < h) {
        int offset          = j + i * h;
        out[3 * offset]     = x;
        out[3 * offset + 1] = y;
        float z             = sqrt(x * x + y * y) + 2.2204e-16f;
        out[3 * offset + 2] = sin(z) / z;
    }
}
)EOK";
// clang-format on

// Divides `a` by `b` after padding the numerator with `b - 1`, which rounds
// the quotient up when both arguments are positive.
inline int divup(int a, int b)
{
    const int padded = a + b - 1;
    return padded / b;
}

/*
 * Enqueues the "surf" kernel so that it fills `devOut` with the surface
 * vertices for the XSIZE x YSIZE grid.
 *
 * On the first call the program in `sin_surf_kernel` is built for `device`
 * and the kernel object is cached in a function-local static; later calls
 * reuse it. If building the program or creating the kernel throws, the error
 * and the build log are printed and the process exits with status 255.
 *
 * devOut - bound as kernel argument 0
 * queue  - command queue the kernel is enqueued on; its context is also used
 *          to create the program
 * device - device the program is built for
 *
 * The launch is only enqueued; this function does not wait for it.
 */
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue, cl::Device& device) {
    static bool compileFlag = true;
    static cl::Program prog;
    static cl::Kernel kern;

    if (compileFlag) {
        try {
            prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(),
                               sin_surf_kernel, false);

            std::vector<cl::Device> devs;
            devs.push_back(device);
            prog.build(devs);

            kern = cl::Kernel(prog, "surf");
        } catch (const cl::Error& err) {
            // Caught by const reference: no copy of the exception object and
            // no slicing should a derived error type be thrown.
            std::cout << "Compile Errors: " << std::endl;
            std::cout << err.what() << err.err() << std::endl;
            std::cout << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)
                      << std::endl;
            exit(255);
        }
        std::cout << "Kernels compiled successfully" << std::endl;
        compileFlag = false;
    }

    // 8x8 work-groups; the global size is rounded up to a whole number of
    // groups along each axis.
    NDRange local(8, 8);
    NDRange global(local[0] * divup(XSIZE, (int)(local[0])),
                   local[1] * divup(YSIZE, (int)(local[1])));

    kern.setArg(0, devOut);
    kern.setArg(1, DX);
    kern.setArg(2, XMIN);
    kern.setArg(3, YMIN);
    kern.setArg(4, XSIZE);
    kern.setArg(5, YSIZE);
    queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local);
}

int main(void) {
    try {
        /*
         * First Forge call should be a window creation call
         * so that necessary OpenGL context is created for any
         * other forge::* object to be created successfully
         */
        forge::Window wnd(1024, 768, "3d Surface Demo");
        wnd.makeCurrent();

        forge::Chart chart(FG_CHART_3D);
        chart.setAxesLimits(-10.f, 10.f, -10.f, 10.f, -0.5f, 1.f);
        chart.setAxesTitles("x-axis", "y-axis", "z-axis");

        forge::Surface surf = chart.surface(XSIZE, YSIZE, forge::f32);
        surf.setColor(FG_YELLOW);

        /*
         * Helper function to create a CLGL interop context.
         * This function checks for if the extension is available
         * and creates the context on the appropriate device.
         * Note: context and queue are defined in cl_helpers.h
         */
        context       = createCLGLContext(wnd);
        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
        queue         = CommandQueue(context, device);

        cl::Buffer devOut(context, CL_MEM_READ_WRITE,
                          sizeof(float) * XSIZE * YSIZE * 3);

        kernel(devOut, queue, device);

        GfxHandle* handle;
        createGLBuffer(&handle, surf.vertices(), FORGE_VERTEX_BUFFER);
        /* copy your data into the pixel buffer object exposed by
         * forge::Surface class and then proceed to rendering.
         * To help the users with copying the data from compute
         * memory to display memory, Forge provides copy headers
         * along with the library to help with this task
         */
        copyToGLBuffer(handle, (ComputeResourceHandle)devOut(),
                       surf.verticesSize());

        do { wnd.draw(chart); } while (!wnd.close());

        releaseGLBuffer(handle);
    } catch (forge::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    } catch (cl::Error err) {
        std::cout << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return 0;
}