there are so many cores

Just another WordPress.com site

Monthly Archives: June 2011

Six months of development in one chart

Progress seems to evolve as big jumps followed by stabilizing periods.

Development progress as lines of code over time

Lines of code over the last six months

Ready for the back-end

It’s time to revisit GATLAS and integrate something like it as the back-end for OpenCL kernel generation. Everything else is done (although not done done – a former manager used to always ask me, “Is it done? Is it done done?”).

Here’s the conjugate gradient example again, this time using OpenMP in data-parallel style. Note there is one common value for matrix cpuA shared by all threads. The vector cpub has a different value in each thread. This represents a problem in which there is a cluster of “things” all of which share the same linear model. The linear model is cpuA. The “things” are in the vectors cpux and cpub. So each thread corresponds to one “thing” within the model.

#pragma omp parallel for
for (size_t i = 0; i < 256; i++)
{
    Conj_Grad_GPU_PS(N, cpuA, cpux[i], cpub[i]);
}

int Conj_Grad_GPU_PS(int N, float *cpuA, float *cpux, float *cpub)
{
    int iter;
    Arrayf32 x = Arrayf32::zeros(N);
    {
        Arrayf32 A = Arrayf32::make2(N, N, cpuA);
        Arrayf32 b = Arrayf32::make1(N, cpub);
        Arrayf32 residuals = b - matmul(A, x);
        Arrayf32 p = residuals;
        Arrayf32 newRR = dot_product(residuals, residuals);

        for (iter = 0; iter < N; iter++) {
            Arrayf32 oldRR = newRR;
            Arrayf32 newX, newP, newResiduals;
            Arrayf32 Ap = matmul(A, p);
            Arrayf32 dp = dot_product(p, Ap);
            newX = x + p * oldRR / dp;
            newResiduals = residuals - Ap * oldRR / dp;
            newRR = dot_product(newResiduals, newResiduals);
            newP = newResiduals + p * newRR / oldRR;

            p = newP;
            residuals = newResiduals;

            float oldRRcpu = oldRR.read_scalar();
            if (oldRRcpu <= TOLERANCE) {
                break;
            }
            x = newX;
        }
    }
    x.read1(cpux, N * sizeof(float));

    return iter;
}

On the first loop iteration, the bytecode looks like:

HASHCODE 7703275699473089935 NUMTRACES 2
0.0     
0.1     51 20 
1.0     
1.1     14 20 20 PTR ( 0xc13a90 0xc16f70 )
2.0     
2.1     12 20 1 PTR ( 0xc13f00 0xc172f0 )
3.0     
3.1     2 36 2.1 16 1.1 0.1 
4.0     
4.1     2 3.1 
5.0     
5.1     2 4 3.1 3.1 
6.0     
6.1     2 5.1 
7.0     
7.1     2 16 1.1 4.1 
8.0     
8.1     2 4 4.1 7.1 
9.0     
9.1     2 25 0.1 27 33 4.1 6.1 8.1 
10.0    
10.1    2 36 3.1 27 33 7.1 6.1 8.1 
5.2     2 4 10.1 10.1 
11.0    
11.1    2 25 10.1 27 33 4.1 5.2 6.1 
4.2     2 11.1 
3.2     2 10.1 
6.2     37 6.1 PTR ( 0xc15780 0xc18a90 )

This bytecode passes through the JIT until it looks like:

DATA MOVEMENT FOR 84 TRACES
SEND makeX_f32(20, 20, PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)PTR(0x7f15580367c0)) TO 20x20 VAR(1.1)
SEND makeX_f32(20, 1, PTR(0x7f1558037220)PTR(0x7f1558037270)PTR(0x7f15580372c0)PTR(0x7f1558037310)PTR(0x7f1558037360)PTR(0x7f15580373b0)PTR(0x7f1558037400)PTR(0x7f1558037450)PTR(0x7f15580374a0)PTR(0x7f15580374f0)PTR(0x7f1558037540)PTR(0x7f1558037590)PTR(0x7f15580375e0)PTR(0x7f1558037630)PTR(0x7f1558037680)PTR(0x7f15580376d0)PTR(0x7f1558037720)PTR(0x7f1558037770)PTR(0x7f15580377c0)PTR(0x7f1558037810)PTR(0x7f1558037860)PTR(0x7f15580378b0)PTR(0x7f1558037900)PTR(0x7f1558037950)PTR(0x7f15580379a0)PTR(0x7f15580379f0)PTR(0x7f1558037a40)PTR(0x7f1558037a90)PTR(0x7f1558037ae0)PTR(0x7f1558037b30)PTR(0x7f1558037b80)PTR(0x7f1558037bd0)PTR(0x7f1558037c20)PTR(0x7f1558037c70)PTR(0x7f1558037cc0)PTR(0x7f1558037d10)PTR(0x7f1558037d60)PTR(0x7f1558037db0)PTR(0x7f1558037e00)PTR(0x7f1558037e50)PTR(0x7f1558037ea0)PTR(0x7f1558037ef0)PTR(0x7f1558037f40)PTR(0x7f1558037f90)PTR(0x7f1558037fe0)PTR(0x7f1558038030)PTR(0x7f1558038080)PTR(0x7f15580380d0)PTR(0x7f1558038120)PTR(0x7f1558038170)PTR(0x7f15580381c0)PTR(0x7f1558038210)PTR(0x7f1558038260)PTR(0x7f15580382b0)PTR(0x7f1558038300)PTR(0x7f1558038350)PTR(0x7f15580383a0)PTR(0x7f15580383f0)PTR(0x7f1558038440)PTR(0x7f1558038490)PTR(0x7f15580384e0)PTR(0x7f1558038530)PTR(0x7f1558038580)PTR(0x7f15580385d0)PTR(0x7f1558038620)PTR(0x7f1558038670)PTR(0x7f15580386c0)PTR(0x7f1558038710)PTR(0x7f1558038760)PTR(0x7f15580387b0)PTR(0x7f1558038800)PTR(0x7f1558038850)PTR(0x7f15580388a0)PTR(0x7f15580388f0)PTR(0x7f1558038940)PTR(0x7f1558038990)PTR(0x7f15580389e0)PTR(0x7f1558038a30)PTR(0x7f1558038a80)PTR(0x7f1558038ad0)PTR(0x7f1558038b20)PTR(0x7f1558038b70)PTR(0x7f1558038bc0)PTR(0x7f1558038c10)) TO 20x1 VAR(2.1)

INDEX SPACE 20 x 1 FOR 84 TRACES
LITERAL zerosX_f32(20, 1) TO 20x1 VAR(0.1)

INDEX SPACE 20 x 1 FOR 84 TRACES
MATMUL matmulMV(VAR(1.1), VAR(0.1)) TO 20x1 var(0x7f1538010b80)

INDEX SPACE 20 x 1 FOR 84 TRACES
20x1 VAR(3.1) <- (VAR(2.1) - var(0x7f1538010b80))
20x1 VAR(4.1) <- VAR(3.1)

INDEX SPACE 20 x 1 FOR 84 TRACES
MATMUL matmulMV(VAR(1.1), VAR(4.1)) TO 20x1 VAR(7.1)

INDEX SPACE 1 x 1 FOR 84 TRACES
REDUCE dot_product(VAR(3.1), VAR(3.1)) TO 1x1 VAR(5.1)
1x1 VAR(6.1) <- VAR(5.1)
REDUCE dot_product(VAR(4.1), VAR(7.1)) TO 1x1 VAR(8.1)
20x1 VAR(9.1) <- (VAR(0.1) + ((VAR(4.1) * VAR(6.1)) / VAR(8.1)))
20x1 VAR(10.1) <- (VAR(3.1) - ((VAR(7.1) * VAR(6.1)) / VAR(8.1)))
REDUCE dot_product(VAR(10.1), VAR(10.1)) TO 1x1 VAR(5.2)
20x1 VAR(11.1) <- (VAR(10.1) + ((VAR(4.1) * VAR(5.2)) / VAR(6.1)))
20x1 VAR(4.2) <- VAR(11.1)
20x1 VAR(3.2) <- VAR(10.1)

DATA MOVEMENT FOR 84 TRACES
READ readout(PTR(0x7f1558030f00)PTR(0x7f1558030f04)PTR(0x7f1558030f08)PTR(0x7f1558030f0c)PTR(0x7f1558030f10)PTR(0x7f1558030f14)PTR(0x7f1558030f18)PTR(0x7f1558030f1c)PTR(0x7f1558030f20)PTR(0x7f1558030f24)PTR(0x7f1558030f28)PTR(0x7f1558030f2c)PTR(0x7f1558030f30)PTR(0x7f1558030f34)PTR(0x7f1558030f38)PTR(0x7f1558030f3c)PTR(0x7f1558030f40)PTR(0x7f1558030f44)PTR(0x7f1558030f48)PTR(0x7f1558030f4c)PTR(0x7f1558030f50)PTR(0x7f1558030f54)PTR(0x7f1558030f58)PTR(0x7f1558030f5c)PTR(0x7f1558030f60)PTR(0x7f1558030f64)PTR(0x7f1558030f68)PTR(0x7f1558030f6c)PTR(0x7f1558030f70)PTR(0x7f1558030f74)PTR(0x7f1558030f78)PTR(0x7f1558030f7c)PTR(0x7f1558030f80)PTR(0x7f1558030f84)PTR(0x7f1558030f88)PTR(0x7f1558030f8c)PTR(0x7f1558030f90)PTR(0x7f1558030f94)PTR(0x7f1558030f98)PTR(0x7f1558030f9c)PTR(0x7f1558030fa0)PTR(0x7f1558030fa4)PTR(0x7f1558030fa8)PTR(0x7f1558030fac)PTR(0x7f1558030fb0)PTR(0x7f1558030fb4)PTR(0x7f1558030fb8)PTR(0x7f1558030fbc)PTR(0x7f1558030fc0)PTR(0x7f1558030fc4)PTR(0x7f1558030fc8)PTR(0x7f1558030fcc)PTR(0x7f1558030fd0)PTR(0x7f1558030fd4)PTR(0x7f1558030fd8)PTR(0x7f1558030fdc)PTR(0x7f1558030fe0)PTR(0x7f1558030fe4)PTR(0x7f1558030fe8)PTR(0x7f1558030fec)PTR(0x7f1558030ff0)PTR(0x7f1558030ff4)PTR(0x7f1558030ff8)PTR(0x7f1558030ffc)PTR(0x7f1558031000)PTR(0x7f1558031004)PTR(0x7f1558031008)PTR(0x7f155803100c)PTR(0x7f1558031010)PTR(0x7f1558031014)PTR(0x7f1558031018)PTR(0x7f155803101c)PTR(0x7f1558031020)PTR(0x7f1558031024)PTR(0x7f1558031028)PTR(0x7f155803102c)PTR(0x7f1558031030)PTR(0x7f1558031034)PTR(0x7f1558031038)PTR(0x7f155803103c)PTR(0x7f1558031040)PTR(0x7f1558031044)PTR(0x7f1558031048)PTR(0x7f155803104c), VAR(6.1)) TO 1x20 VAR(6.2)

There are two enqueued data writes to the compute device followed by five kernels, two of which are the special case of matrix multiply (level 3 GEMM in this case) and the final data transfer back to the host.

Subsequent iterations have bytecode that looks like:

HASHCODE 6129876018811343629 NUMTRACES 18
0.7     2 39.1 
38.4294967295   
37.4294967295   
40.4294967295   
41.4294967295   
39.4294967295   
36.4294967295   
42.0    
42.1    2 5.7 
43.0    
43.1    2 16 1.1 4.7 
44.0    
44.1    2 4 4.7 43.1 
45.0    
45.1    2 25 0.7 27 33 4.7 42.1 44.1 
46.0    
46.1    2 36 3.7 27 33 43.1 42.1 44.1 
5.8     2 4 46.1 46.1 
47.0    
47.1    2 25 46.1 27 33 4.7 5.8 42.1 
4.8     2 47.1 
3.8     2 46.1 
42.2    37 42.1 PTR ( 0x1554cb0 0x15d2350 0x7f151c17ab40 0x7f1534047570 0x7f153c073030 0x7f15400e89c0 0x7f15440f51d0 0x7f15480b3500 0x7f154c045cc0 0x7f154c093e80 0x7f154c095fe0 0x7f15500300b0 0x7f1550080a90 0x7f15541e2c50 0x7f15540c3d20 0x7f15540c6c40 0x7f15540c9310 0x7f155809f8c0 )

As before, the JIT identifies data transfers and kernels:

DATA MOVEMENT FOR 3 TRACES
SEND PTR(0x7f15440bd910)PTR(0x7f15440bd960)PTR(0x7f15440bd9b0) FROM 33.1 TO 20x1 VAR(0.6)
SEND PTR(0x7f1544007760)PTR(0x7f1544007764)PTR(0x7f1544007768) FROM 5.6 TO 1x1 VAR(36.1)
SEND PTR(0x7f1538028ca0)PTR(0x7f1538028ca0)PTR(0x7f1538028ca0) FROM 1.1 TO 20x20 var(0x7f15440bc420)
SEND PTR(0x7f15440b88b0)PTR(0x7f15440b8900)PTR(0x7f15440b8950) FROM 4.6 TO 20x1 var(0x7f15440bb940)
SEND PTR(0x7f15440b7cd0)PTR(0x7f15440b7d20)PTR(0x7f15440b7d70) FROM 3.6 TO 20x1 var(0x7f15440b83d0)

INDEX SPACE 20 x 1 FOR 3 TRACES
MATMUL matmulMV(var(0x7f15440bc420), var(0x7f15440bb940)) TO 20x1 VAR(37.1)

INDEX SPACE 1 x 1 FOR 3 TRACES
REDUCE dot_product(var(0x7f15440bb940), VAR(37.1)) TO 1x1 VAR(38.1)
20x1 VAR(39.1) <- (VAR(0.6) + ((var(0x7f15440bb940) * VAR(36.1)) / VAR(38.1)))
20x1 VAR(40.1) <- (var(0x7f15440b83d0) - ((VAR(37.1) * VAR(36.1)) / VAR(38.1)))
REDUCE dot_product(VAR(40.1), VAR(40.1)) TO 1x1 VAR(5.7)
20x1 VAR(41.1) <- (VAR(40.1) + ((var(0x7f15440bb940) * VAR(5.7)) / VAR(36.1)))
20x1 VAR(4.7) <- VAR(41.1)
20x1 VAR(3.7) <- VAR(40.1)

DATA MOVEMENT FOR 3 TRACES
READ readout(PTR(0x7f1544080360)PTR(0x7f1544080364)PTR(0x7f1544080368), VAR(36.1)) TO 1x1 VAR(36.2)

You’ve probably noticed how the number of traces changes. The scheduling right now is primitive. It doesn’t know how many threads are in flight and has no concept of waiting a little longer to batch more together. This will be necessary for good performance.