/**
 * Runs {@link #benchmark(int)} for every power-of-two size from 64 (1 &lt;&lt; 6) to
 * 16384 (1 &lt;&lt; 14) and prints one result line per size.
 *
 * The local flag {@code smallToLarge} selects whether sizes are visited in ascending or
 * descending order. After each measurement the garbage collector is requested several
 * times and the thread pauses for two seconds before the next size is measured.
 *
 * @param args ignored
 */
public static final void main(String[] args)
{
    int max = 9;
    int start, stop, step;
    // this controls the order
    boolean smallToLarge = true;
    if (smallToLarge)
    {
        start = 0;
        stop = max;
        step = 1;
        System.out.println("ORDER: SMALL[64] -> LARGE[16k]");
    }
    else
    {
        start = max - 1;
        // Exclusive bound one step past the last index; with a bound of 0 the
        // smallest size (index 0, i.e. 64 elements) would never be measured.
        stop = -1;
        step = -1;
        System.out.println("ORDER: LARGE[16k] -> SMALL[64]");
    }
    // 'stop' is exclusive in both directions, so a single inequality test suffices.
    for (int i = start; i != stop; i += step)
    {
        int size = 1 << (i + 6);
        // 'elapsed' is whatever benchmark(size) reports, in nanoseconds.
        long elapsed = benchmark(size);
        System.out.println("size=" + size + ", \ttook: " + (elapsed/1000L) + "us \t(" + (elapsed / size) + " ns/element -> "+(size * 1000000000L / elapsed)+" elements/s)");
        // Request collection repeatedly so garbage from this size is less likely to
        // disturb the next measurement.
        for (int k = 0; k < 8; k++)
        {
            System.gc();
        }
        try
        {
            Thread.sleep(2 * 1000);
        }
        catch (InterruptedException exc)
        {
            // Keep benchmarking, but do not lose the interrupt request.
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Measures how long repeated element-wise additions over three direct float buffers take.
 *
 * Three native-order direct buffers holding {@code size} floats each are allocated. The
 * measurement consists of 32 timed runs; each run performs 1024 iterations of three
 * {@code faddN_buffer} calls that rotate the roles of the buffers. When the run index
 * reaches {@code runs / 2 - 1} the accumulated figures are cleared, so only that run and
 * the ones after it contribute to the result.
 *
 * @param size number of floats per buffer
 * @return average elapsed time, in nanoseconds, of the recorded runs
 */
public static final long benchmark(int size)
{
    final int bytesPerBuffer = size * 4; // four bytes per float
    final ByteOrder order = ByteOrder.nativeOrder();
    FloatBuffer bufferA = ByteBuffer.allocateDirect(bytesPerBuffer).order(order).asFloatBuffer();
    FloatBuffer bufferB = ByteBuffer.allocateDirect(bytesPerBuffer).order(order).asFloatBuffer();
    FloatBuffer bufferC = ByteBuffer.allocateDirect(bytesPerBuffer).order(order).asFloatBuffer();

    final int runs = 32;
    final int loops = 1024;
    // Run index at which everything measured so far is thrown away.
    final int resetRun = runs / 2 - 1;

    long totalNanos = 0L;
    int countedRuns = 0;
    int run = 0;
    while (run < runs)
    {
        if (run == resetRun)
        {
            // reset halfway
            totalNanos = 0L;
            countedRuns = 0;
        }

        final long begin = System.nanoTime();
        int remaining = loops;
        while (remaining-- > 0)
        {
            faddN_buffer(bufferA, bufferB, bufferC, size);
            faddN_buffer(bufferB, bufferC, bufferA, size);
            faddN_buffer(bufferC, bufferA, bufferB, size);
        }
        totalNanos += System.nanoTime() - begin;

        countedRuns++;
        run++;
    }
    return totalNanos / countedRuns;
}
/**
 * Adds the first {@code n} floats of {@code a} and {@code b} element-wise and stores the
 * sums in {@code c}, dispatching to the manually unrolled helpers.
 *
 * The range is covered by as many 8-wide steps as fit, then at most one 4-wide step,
 * then up to three single-element steps. All accesses use absolute indices starting at
 * 0; the buffers' positions are not consulted by this method.
 *
 * @param a first operand buffer
 * @param b second operand buffer
 * @param c destination buffer
 * @param n number of elements to process; values of zero or less do nothing
 */
private static final void faddN_buffer(FloatBuffer a, FloatBuffer b, FloatBuffer c, int n)
{
    if (n <= 0)
    {
        return;
    }

    int blocksOf8 = n >> 3;
    boolean hasBlockOf4 = (n & 4) != 0;
    int singles = n & 3;

    // One running offset shared by all three buffers. It always points at the first
    // element that has NOT been processed yet, so each stage continues exactly where
    // the previous one stopped instead of overlapping it.
    int pos = 0;

    for (int i = 0; i < blocksOf8; i++)
    {
        fadd8_buffer(a, b, c, pos, pos, pos);
        pos += 8;
    }

    if (hasBlockOf4)
    {
        fadd4_buffer(a, b, c, pos, pos, pos);
        pos += 4;
    }

    for (int i = 0; i < singles; i++)
    {
        fadd1_buffer(a, b, c, pos, pos, pos);
        pos += 1;
    }
}
/**
 * Manually unrolled addition of eight consecutive floats (unrolled on purpose: the
 * original author reports this as much faster than a loop).
 *
 * For k = 0..7, stores {@code a[pa + k] + b[pb + k]} at {@code c[pc + k]} using absolute
 * buffer access. Elements are handled strictly one after another, each sum being written
 * before the next pair is read.
 *
 * @param a  first operand buffer
 * @param b  second operand buffer
 * @param c  destination buffer
 * @param pa index of the first element read from {@code a}
 * @param pb index of the first element read from {@code b}
 * @param pc index of the first element written to {@code c}
 */
private static final void fadd8_buffer(FloatBuffer a, FloatBuffer b, FloatBuffer c, int pa, int pb, int pc)
{
    int ia = pa;
    int ib = pb;
    int ic = pc;
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic, a.get(ia) + b.get(ib));
}
/**
 * Manually unrolled addition of four consecutive floats.
 *
 * For k = 0..3, stores {@code a[pa + k] + b[pb + k]} at {@code c[pc + k]} using absolute
 * buffer access. Elements are handled strictly one after another, each sum being written
 * before the next pair is read.
 *
 * @param a  first operand buffer
 * @param b  second operand buffer
 * @param c  destination buffer
 * @param pa index of the first element read from {@code a}
 * @param pb index of the first element read from {@code b}
 * @param pc index of the first element written to {@code c}
 */
private static final void fadd4_buffer(FloatBuffer a, FloatBuffer b, FloatBuffer c, int pa, int pb, int pc)
{
    int ia = pa;
    int ib = pb;
    int ic = pc;
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic++, a.get(ia++) + b.get(ib++));
    c.put(ic, a.get(ia) + b.get(ib));
}
/**
 * Adds a single pair of floats: stores {@code a[pa] + b[pb]} at {@code c[pc]} using
 * absolute buffer access.
 *
 * @param a  first operand buffer
 * @param b  second operand buffer
 * @param c  destination buffer
 * @param pa index read from {@code a}
 * @param pb index read from {@code b}
 * @param pc index written in {@code c}
 */
private static final void fadd1_buffer(FloatBuffer a, FloatBuffer b, FloatBuffer c, int pa, int pb, int pc)
{
    final float sum = a.get(pa) + b.get(pb);
    c.put(pc, sum);
}
`
[b]Java 6.0 Server VM[/b]
ORDER: SMALL[64] -> LARGE[16k]
size=64, took: 3850us (60157 ns/element -> 16622 elements/s)
size=128, took: 1515us (11839 ns/element -> 84461 elements/s)
size=256, took: 2638us (10305 ns/element -> 97038 elements/s)
size=512, took: 5657us (11050 ns/element -> 90494 elements/s)
size=1024, took: 12177us (11892 ns/element -> 84087 elements/s)
size=2048, took: 24268us (11849 ns/element -> 84390 elements/s)
size=4096, took: 48544us (11851 ns/element -> 84376 elements/s)
size=8192, took: 97251us (11871 ns/element -> 84234 elements/s)
size=16384, took: 200368us (12229 ns/element -> 81769 elements/s)
ORDER: LARGE[16k] -> SMALL[64]
size=16384, took: 678194us (41393 ns/element -> 24158 elements/s)
size=8192, took: 310363us (37886 ns/element -> 26394 elements/s)
size=4096, took: 149891us (36594 ns/element -> 27326 elements/s)
size=2048, took: 74897us (36570 ns/element -> 27344 elements/s)
size=1024, took: 37610us (36728 ns/element -> 27226 elements/s)
size=512, took: 18856us (36829 ns/element -> 27152 elements/s)
size=256, took: 9516us (37174 ns/element -> 26900 elements/s)
size=128, took: 4693us (36668 ns/element -> 27271 elements/s)
size=64, took: 2405us (37581 ns/element -> 26608 elements/s)
Note: throughput differs by a factor of 3 or more depending on the order in which the sizes are run.
`
`
[b]Java 6.0 Client VM[/b]
ORDER: SMALL[64] -> LARGE[16k] (all 4.8k to 5.2k elements/s)
ORDER: LARGE[16k] -> SMALL[64] (all 5.3k to 5.4k elements/s)
Note: roughly 4x slower than the Java 5.0 Client VM and roughly 7x slower than the Java 5.0 Server VM (see below).
`
`
[b]Java 5.0 Server VM[/b]
ORDER: SMALL[64] -> LARGE[16k] (all 33k to 37k elements/s)
ORDER: LARGE[16k] -> SMALL[64] (all 33k to 37k elements/s)
[b]Java 5.0 Client VM[/b]
ORDER: SMALL[64] -> LARGE[16k] (all 19k elements/s)
ORDER: LARGE[16k] -> SMALL[64] (all 19k elements/s)
Note: server ~80% faster than client
`