Some huge machine-generated kernels

The good news is that output from the JIT/OpenCL back-end agrees with the interpreter. The bad news is that a lot more work is required. As I've never written anything like this before, I didn't anticipate the static analysis and code transformation problems involved. It's a process of discovery, not only of finding solutions but of learning what the (compiler) problems are and how they are interdependent.

A minute ago, the need for memoizing kernel code generation became obvious while testing with the PeakStream conjugate gradient example.

// Conjugate gradient solve of A x = b (PeakStream example)
Arrayf32 x = Arrayf32::zeros(N);
{
    Arrayf32 A = Arrayf32::make2(N, N, cpuA);   // N x N system matrix from host memory
    Arrayf32 b = Arrayf32::make1(N, cpub);      // right-hand side vector from host memory
    Arrayf32 residuals = b - matmul(A, x);      // initial residual r = b - A x
    Arrayf32 p = residuals;                     // initial search direction
    Arrayf32 newRR = dot_product(residuals, residuals);

    for (int iter = 0; iter < N; iter++) {
        Arrayf32 oldRR = newRR;
        Arrayf32 newX, newP, newResiduals;
        Arrayf32 Ap = matmul(A, p);
        Arrayf32 dp = dot_product(p, Ap);

        newX = x + p * oldRR / dp;
        newResiduals = residuals - Ap * oldRR / dp;
        newRR = dot_product(newResiduals, newResiduals);
        newP = newResiduals + p * newRR / oldRR;

        p = newP;
        residuals = newResiduals;

        float oldRRcpu = oldRR.read_scalar();   // reads back to the CPU, forcing evaluation
        if (oldRRcpu <= TOLERANCE) {
            break;
        }
        x = newX;
    }
}

The first iteration generates a simple kernel. It is just a dot product: the first read_scalar() only forces oldRR, which is the dot_product(residuals, residuals) computed before the loop, and since x starts out as zeros, the matmul(A, x) term reduces to the constant zeros a2 and a3 below.

__kernel void kx(
    __global float* a0xeaff80,
    __global float* a0x7fff6375c1a0,
    __global float* a0x7fff6375c7e0)
{
    const size_t x0 = get_global_id(0);

    float a1 = 0;
    for (size_t x1 = 0; x1 < 20; x1++) {
        const float a2 = 0;
        const float a3 = 0;
        a1 += ((a0x7fff6375c7e0[x1] - a2) * (a0x7fff6375c7e0[x1] - a3));
    }

    a0xeaff80[x0] = a1;
}

The second iteration generates a more complex kernel.

__kernel void kx(
    __global float* a0xf6a740,
    __global float* a0x7fff6375c1a0,
    __global float* a0x7fff6375c7e0)
{
    const size_t x0 = get_global_id(0);

    float a18 = 0;
    for (size_t x18 = 0; x18 < 20; x18++) {
        const float a19 = 0;
        float a20 = 0;
        for (size_t x20 = 0; x20 < 20; x20++) {
            const float a21 = 0;
            a20 += (a0x7fff6375c1a0[x20 + x18 * 20] * (a0x7fff6375c7e0[x20] - a21));
        }

        a18 += ((a0x7fff6375c7e0[x18] - a19) * a20);
    }

    float a15 = 0;
    for (size_t x15 = 0; x15 < 20; x15++) {
        const float a16 = 0;
        const float a17 = 0;
        a15 += ((a0x7fff6375c7e0[x15] - a16) * (a0x7fff6375c7e0[x15] - a17));
    }

    float a8 = 0;
    for (size_t x8 = 0; x8 < 20; x8++) {
        const float a9 = 0;
        float a10 = 0;
        for (size_t x10 = 0; x10 < 20; x10++) {
            const float a11 = 0;
            a10 += (a0x7fff6375c1a0[x10 + x8 * 20] * (a0x7fff6375c7e0[x10] - a11));
        }

        a8 += ((a0x7fff6375c7e0[x8] - a9) * a10);
    }

    float a5 = 0;
    for (size_t x5 = 0; x5 < 20; x5++) {
        const float a6 = 0;
        const float a7 = 0;
        a5 += ((a0x7fff6375c7e0[x5] - a6) * (a0x7fff6375c7e0[x5] - a7));
    }

    float a1 = 0;
    for (size_t x1 = 0; x1 < 20; x1++) {
        const float a2 = 0;
        float a3 = 0;
        for (size_t x3 = 0; x3 < 20; x3++) {
            const float a4 = 0;
            a3 += (a0x7fff6375c1a0[x3 + x1 * 20] * (a0x7fff6375c7e0[x3] - a4));
        }

        const float a12 = 0;
        float a13 = 0;
        for (size_t x13 = 0; x13 < 20; x13++) {
            const float a14 = 0;
            a13 += (a0x7fff6375c1a0[x13 + x1 * 20] * (a0x7fff6375c7e0[x13] - a14));
        }

        a1 += (((a0x7fff6375c7e0[x1] - a2) - ((a3 * a5) / a8)) * ((a0x7fff6375c7e0[x1] - a12) - ((a13 * a15) / a18)));
    }

    a0xf6a740[x0] = a1;
}

Many reductions are duplicated in the kernel because it is generated directly from a recursive traversal of the execution trace DAG. There's no transposition detection. That's why a memo is needed (although I'm not sure where it should go; generally, I try to avoid creating temporary stages by performing transformations as far upstream as possible). By the third iteration, the generated kernel is 756 lines. The fourth iteration produces a kernel of 9876 lines. The fifth iteration produces a kernel of 130506 lines, which is too much for the OpenCL compiler: it segfaults.
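Since the kernel body comes straight from a recursive walk of the trace DAG, the obvious fix is to memoize on the DAG node during code generation, so each subtree is emitted only once and later references reuse its variable. Below is a minimal sketch of that idea; AstNode, KernelEmitter, emit() and the naming scheme are made up for illustration and are not the actual code generator.

// Hypothetical sketch: memoized emission over trace-DAG nodes so each
// shared subtree is generated once. None of these names are the real code.
#include <map>
#include <sstream>
#include <string>
#include <vector>

struct AstNode {
    std::string op;                    // e.g. "dot_product", "matmul", "sub"
    std::vector<const AstNode*> args;  // children in the execution trace DAG
};

class KernelEmitter {
public:
    // Returns the variable holding this node's value, emitting its code
    // only on the first visit; repeat visits just reuse the name.
    std::string emit(const AstNode* node) {
        auto hit = memo_.find(node);
        if (hit != memo_.end())
            return hit->second;

        std::vector<std::string> argNames;
        for (const AstNode* a : node->args)
            argNames.push_back(emit(a));

        const std::string name = "a" + std::to_string(counter_++);
        body_ << "float " << name << " = " << node->op << "(";
        for (size_t i = 0; i < argNames.size(); ++i)
            body_ << (i ? ", " : "") << argNames[i];
        body_ << ");\n";               // stand-in for the real lowering of the op

        memo_.emplace(node, name);
        return name;
    }

    std::string body() const { return body_.str(); }

private:
    std::map<const AstNode*, std::string> memo_;  // DAG node -> emitted variable
    std::ostringstream body_;
    int counter_ = 1;
};

With something like this in place, the identical reduction loops in the second kernel above (a5/a15 and a8/a18, for example) would presumably collapse into one loop each whose result variable is simply reused, provided the trace really does share those nodes rather than copying them.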

As you might guess, there are other issues going on besides transpositions. I’m only showing off the skeletons I understand. There are many more in the closet.

Here’s another example of a crazy-looking kernel.

__kernel void kx(
    __global float* a0x1da56e0,
    __global float* a0x7fffc0ceeaa0)
{
    const size_t x0 = get_global_id(0);
    const size_t y0 = get_global_id(1);

    a0x1da56e0[x0 + y0 * 100] = ((((((((((((((((((((0 + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 0) * ((1 * y0) - 0)) * 1))) * 1))))) % 100) + (as_uint(floor((0 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 1) * ((1 * y0) - 1)) * 1))) * 1))))) % 100) + (as_uint(floor((1 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 2) * ((1 * y0) - 2)) * 1))) * 1))))) % 100) + (as_uint(floor((2 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 3) * ((1 * y0) - 3)) * 1))) * 1))))) % 100) + (as_uint(floor((3 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 4) * ((1 * y0) - 4)) * 1))) * 1))))) % 100) + (as_uint(floor((4 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 5) * ((1 * y0) - 5)) * 1))) * 1))))) % 100) + (as_uint(floor((5 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 6) * ((1 * y0) - 6)) * 1))) * 1))))) % 100) + (as_uint(floor((6 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 7) * ((1 * y0) - 7)) * 1))) * 1))))) % 100) + (as_uint(floor((7 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 8) * ((1 * y0) - 8)) * 1))) * 1))))) % 100) + (as_uint(floor((8 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 9) * ((1 * y0) - 9)) * 1))) * 1))))) % 100) + (as_uint(floor((9 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 10) * ((1 * y0) - 10)) * 1))) * 1))))) % 100) + (as_uint(floor((10 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 11) * ((1 * y0) - 11)) * 1))) * 1))))) % 100) + (as_uint(floor((11 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 12) * ((1 * y0) - 12)) * 1))) * 1))))) % 100) + (as_uint(floor((12 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 13) * ((1 * y0) - 13)) * 1))) * 1))))) % 100) + (as_uint(floor((13 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 14) * ((1 * y0) - 14)) * 1))) * 1))))) % 100) + (as_uint(floor((14 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 15) * ((1 * y0) - 15)) * 1))) * 1))))) % 100) + (as_uint(floor((15 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 16) * ((1 * y0) - 16)) * 1))) * 1))))) % 100) + (as_uint(floor((16 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 17) * ((1 * y0) - 17)) * 1))) * 1))))) % 100) + (as_uint(floor((17 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + (sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 18) * ((1 * y0) - 18)) * 1))) * 1))))) % 100) + (as_uint(floor((18 * 1))) % 100) * 100]) + a0x7fffc0ceeaa0[(as_uint(floor(as_float((0.5 + 
(sqrt((((1 * x0) * (1 * x0)) + ((((1 * y0) - 19) * ((1 * y0) - 19)) * 1))) * 1))))) % 100) + (as_uint(floor((19 * 1))) % 100) * 100]);
}

Death by parentheses is possible in OpenCL too? Anyway, here’s the source code that generates it. This is the PeakStream Kirchhoff migration (seismic/depth imaging) example.

Arrayf32 modl = zeros_f32(NT, N);                // migrated image, accumulated below
{
    Arrayf32 x = dx * index_f32(1, NT, N);       // lateral position along dimension 1
    Arrayf32 z = dt * index_f32(0, NT, N);       // time/depth along dimension 0
    Arrayf32 data = Arrayf32::make2(NT, N, datagpu);  // recorded traces from host memory

    for (int iy = 0; iy < N; iy++)
    {
        float y = float(iy) * dx;                // lateral position of this input trace
        Arrayf32 index1 = float(iy) * ones_f32(NT, N);  // trace index broadcast over the array
        Arrayf32 it = 0.5 + sqrt(z * z + (x - y) * (x - y) * factor) * idt;  // travel-time sample index (+0.5 so the floor rounds to nearest)
        modl += gather2_floor(data, it, index1); // gather along the diffraction curve and accumulate
    }
}
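For reference, here is how I read gather2_floor from the generated kernel above: each output element pulls one element of data at (floor(it), floor(index1)), with both indices wrapped modulo the array dimensions (the % 100 terms, since NT = N = 100 in this run). The scalar version below is only my interpretation reconstructed from the kernel, not the PeakStream definition, and gather2_floor_ref plus its parameters are made-up names.

#include <cmath>
#include <cstddef>
#include <vector>

// out(i, j) = data(floor(it(i, j)) mod NT, floor(idx(i, j)) mod N),
// matching the as_uint(floor(...)) % 100 index expressions in the kernel.
// Storage keeps the NT dimension contiguous (index x0 + y0 * 100), as in
// the generated code.
std::vector<float> gather2_floor_ref(const std::vector<float>& data,
                                     const std::vector<float>& it,
                                     const std::vector<float>& idx,
                                     std::size_t NT, std::size_t N)
{
    std::vector<float> out(NT * N);
    for (std::size_t j = 0; j < N; ++j)
        for (std::size_t i = 0; i < NT; ++i) {
            const std::size_t e = i + j * NT;
            const std::size_t r = static_cast<std::size_t>(std::floor(it[e]))  % NT;
            const std::size_t c = static_cast<std::size_t>(std::floor(idx[e])) % N;
            out[e] = data[r + c * NT];
        }
    return out;
}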