Simple benchmark - the usual disclaimers about benchmark implementation, coding mistakes etc. apply.
The code implements element-wise vector multiplication A = A*B for vectors A and B of length 4*1024*1024, populated with uniformly distributed random floats in [0,100).
The first set of numbers is for float arrays, the second for (direct) FloatBuffers, and the third for a native implementation (direct byte buffers are used to transfer the data).
The “Well…” lines don’t really serve any purpose except possibly preventing the number-crunching loop from being optimized away (?).
java version:
java version “1.6.0-rc”
Java™ 2 Runtime Environment, Standard Edition (build 1.6.0-rc-b68)
Java HotSpot™ Client VM (build 1.6.0-rc-b68, mixed mode, sharing)
gcc version:
gcc (GCC) 3.4.5 (Gentoo 3.4.5, ssp-3.4.5-1.0, pie-8.7.9)
command used to compile shared library:
gcc -pipe -msse -march=pentium4 -O3 simd_SimdTest.c -o libtestsimd.so -shared -ljvm -lverify -lnet -ljava -lnio -I/opt/sun-jdk-1.6.0/include -I/opt/sun-jdk-1.6.0/include/linux -L/opt/sun-jdk-1.6.0/jre/lib/i386 -L/opt/sun-jdk-1.6.0/jre/lib/i386/client -Wl,-soname,libtestsimd.so -lc -fPIC -D_REENTRANT -W -Wall -Wno-unused -Wno-parentheses -Werror
<snip>
Well...: 7412,38965.
Array calc took 34631 us.
Well...: 4627,65234.
Array calc took 35024 us.
Well...: 1983,89075.
Array calc took 34717 us.
<snip>
Well...: 578,43506.
Buffer calc took 34490 us.
Well...: 5744,37891.
Buffer calc took 34895 us.
Well...: 3716,52808.
Buffer calc took 33800 us.
<snip>
Well...: 1946,09839.
SIMD calc took 12619 us.
Well...: 657,79602.
SIMD calc took 12674 us.
Well...: 3669,88257.
SIMD calc took 12666 us.
Java:
package simd;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.DoubleBuffer;
import java.nio.FloatBuffer;
/**
 * Micro-benchmark that times an in-place, element-wise float multiplication
 * ({@code A[i] = A[i] * B[i]}) over {@code SIZE} elements with three back ends:
 * plain {@code float[]} arrays, direct {@link FloatBuffer}s, and a native
 * routine from the {@code testsimd} library that is handed the same buffers.
 *
 * <p>Each back end is run {@code ITERS} times; every run re-fills the operands
 * with random values in [0, 100), times only the multiplication, and prints one
 * sample element followed by the elapsed time in microseconds.
 */
public class SimdTest {
    /** Number of timed runs per back end. */
    private static final int ITERS = 10;
    /** Number of float elements in each operand. */
    private static final int SIZE = 4 * 1024 * 1024;

    /** Operands for the array-based variant; {@code dataA} also receives the result. */
    private static final float[] dataA = new float[SIZE];
    private static final float[] dataB = new float[SIZE];

    /** Operands for the buffer-based and native variants; {@code bufferA} also receives the result. */
    private static final FloatBuffer bufferA = newNativeOrderFloatBuffer();
    private static final FloatBuffer bufferB = newNativeOrderFloatBuffer();

    /**
     * Runs the array, buffer and native benchmarks in that order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The same randomly chosen element is printed after every run of every back end.
        final int probe = (int) (SIZE * Math.random());

        timeArrayRuns(probe);
        timeBufferRuns(probe);

        // The native library is loaded only after the pure-Java variants have finished.
        System.loadLibrary("testsimd");
        timeNativeRuns(probe);
    }

    /** Times {@link #calcArrays()} {@code ITERS} times, printing the element at {@code probe} after each run. */
    private static void timeArrayRuns(int probe) {
        for (int run = 0; run < ITERS; run++) {
            initArrays();
            final long start = System.nanoTime();
            calcArrays();
            final long stop = System.nanoTime();
            System.out.printf("Well...:\t\t\t%4.5f.\n", dataA[probe]);
            // Nanoseconds to microseconds (integer division).
            final long elapsedMicros = (stop - start) / 1000;
            System.out.printf("Array calc took %d us.\n", elapsedMicros);
        }
    }

    /** Times {@link #calcBuffers()} {@code ITERS} times, printing the element at {@code probe} after each run. */
    private static void timeBufferRuns(int probe) {
        for (int run = 0; run < ITERS; run++) {
            initBuffers();
            final long start = System.nanoTime();
            calcBuffers();
            final long stop = System.nanoTime();
            System.out.printf("Well...:\t\t\t%4.5f.\n", bufferA.get(probe));
            // Nanoseconds to microseconds (integer division).
            final long elapsedMicros = (stop - start) / 1000;
            System.out.printf("Buffer calc took %d us.\n", elapsedMicros);
        }
    }

    /** Times the native {@code calcSIMD} call {@code ITERS} times; requires the library to be loaded already. */
    private static void timeNativeRuns(int probe) {
        for (int run = 0; run < ITERS; run++) {
            initBuffers();
            final long start = System.nanoTime();
            calcSIMD(bufferA, bufferB);
            final long stop = System.nanoTime();
            System.out.printf("Well...:\t\t\t%4.5f.\n", bufferA.get(probe));
            // Nanoseconds to microseconds (integer division).
            final long elapsedMicros = (stop - start) / 1000;
            System.out.printf("SIMD calc took %d us.\n", elapsedMicros);
        }
    }

    /** Allocates a direct buffer holding {@code SIZE} floats (4 bytes each) in the platform's native byte order. */
    private static FloatBuffer newNativeOrderFloatBuffer() {
        final ByteBuffer raw = ByteBuffer.allocateDirect(SIZE * 4);
        raw.order(ByteOrder.nativeOrder());
        return raw.asFloatBuffer();
    }

    /** Multiplies {@code dataA} by {@code dataB} element by element, storing the product in {@code dataA}. */
    private static void calcArrays() {
        for (int idx = 0; idx < SIZE; idx++) {
            dataA[idx] *= dataB[idx];
        }
    }

    /** Multiplies {@code bufferA} by {@code bufferB} element by element via absolute get/put on the buffers. */
    private static void calcBuffers() {
        for (int idx = 0; idx < SIZE; idx++) {
            final float product = bufferA.get(idx) * bufferB.get(idx);
            bufferA.put(idx, product);
        }
    }

    /**
     * Native counterpart of {@link #calcBuffers()}, provided by the {@code testsimd} library.
     *
     * @param bufferA first operand, passed to native code; the benchmark reads the result from it
     * @param bufferB second operand
     */
    private static native void calcSIMD(FloatBuffer bufferA, FloatBuffer bufferB);

    /** Fills both arrays with fresh random values in [0, 100). */
    private static void initArrays() {
        for (int idx = 0; idx < SIZE; idx++) {
            dataA[idx] = (float) (Math.random() * 100);
            dataB[idx] = (float) (Math.random() * 100);
        }
    }

    /** Fills both direct buffers with fresh random values in [0, 100). */
    private static void initBuffers() {
        for (int idx = 0; idx < SIZE; idx++) {
            bufferA.put(idx, (float) (Math.random() * 100));
            bufferB.put(idx, (float) (Math.random() * 100));
        }
    }
}
C header:
/* DO NOT EDIT THIS FILE - it is machine generated */
/*
 * JNI prototype for the native method calcSIMD of the Java class simd.SimdTest.
 * NOTE(review): this header appears to be stale relative to the Java class it
 * accompanies (see the notes below); regenerate it from the compiled class
 * rather than patching it by hand.
 */
#include <jni.h>
/* Header for class simd_SimdTest */
#ifndef _Included_simd_SimdTest
#define _Included_simd_SimdTest
#ifdef __cplusplus
extern "C" {
#endif
/*
 * NOTE(review): 409600 does not match SIZE = 4*1024*1024 (4194304) declared in
 * the Java class. The accompanying C implementation does not reference this
 * macro, so the mismatch has no effect there.
 */
#undef simd_SimdTest_SIZE
#define simd_SimdTest_SIZE 409600L
/*
 * Class: simd_SimdTest
 * Method: calcSIMD
 * Signature: (Ljava/nio/DoubleBuffer;Ljava/nio/DoubleBuffer;)V
 *
 * NOTE(review): the Java declaration takes two java.nio.FloatBuffer arguments,
 * so the signature recorded above is out of date. The C prototype itself is
 * unaffected because both buffer types are passed as jobject.
 */
JNIEXPORT void JNICALL Java_simd_SimdTest_calcSIMD
(JNIEnv *, jclass, jobject, jobject);
#ifdef __cplusplus
}
#endif
#endif
C-impl:
#include <jni.h>
#include <stdio.h>
#include "simd_SimdTest.h"
//typedef float v4sf __attribute__ ((mode(V4SF)));
/*
 * JNI entry point for simd.SimdTest.calcSIMD: multiplies the floats of direct
 * buffer `ba` by the corresponding floats of direct buffer `bb`, in place
 * (ba[i] = ba[i] * bb[i]).
 *
 * env - JNI environment used to resolve the buffers' native storage.
 * cls - the calling class (unused).
 * ba  - direct buffer holding the first operand; receives the products.
 * bb  - direct buffer holding the second operand; only read.
 *
 * The call is a no-op if either buffer has no accessible native storage
 * (GetDirectBufferAddress returns NULL) or if either capacity is reported as
 * non-positive. When the capacities differ, only the common prefix is
 * processed so neither buffer is accessed out of bounds.
 */
JNIEXPORT void JNICALL Java_simd_SimdTest_calcSIMD(JNIEnv *env, jclass cls, jobject ba, jobject bb)
{
    float *bufferA = (float *) (*env)->GetDirectBufferAddress(env, ba);
    float *bufferB = (float *) (*env)->GetDirectBufferAddress(env, bb);
    /* Capacities are jlong and are -1 when the object is not a usable direct buffer. */
    jlong sizeA = (*env)->GetDirectBufferCapacity(env, ba);
    jlong sizeB = (*env)->GetDirectBufferCapacity(env, bb);
    jlong count = (sizeA < sizeB) ? sizeA : sizeB;
    const float *endA;

    if (bufferA == NULL || bufferB == NULL || count <= 0)
    {
        return;
    }

    /*
     * Original author's note: GCC 3.4 compiles the plain scalar loop below to
     * the same speed as the more verbose explicit SSE version, which would
     * process 4 floats per iteration using the v4sf vector type:
     *
     *   v4sf a, b, result;
     *   a = __builtin_ia32_loadups(bufferA);       // loads 4 floats from bufferA
     *   b = __builtin_ia32_loadups(bufferB);       // loads 4 floats from bufferB
     *   result = __builtin_ia32_mulps (a, b);      // a * b
     *   __builtin_ia32_storeups(bufferA, result);  // stores 4 floats at bufferA
     *   bufferA = bufferA + 4;
     *   bufferB = bufferB + 4;
     */

    /* Walk both buffers with pointers; the bound is computed once up front. */
    endA = bufferA + count;
    while (bufferA < endA)
    {
        *bufferA = (*bufferA) * (*bufferB);
        bufferA++;
        bufferB++;
    }
}