I have this simple C source code that is supposed to be
faster than Java, as the Java VM can’t do SIMD yet.
The problem is that the non-SIMD code in C (1100ms)
is faster than the SIMD code in C (1750ms),
and both are beaten by the Java Server VM (750ms).
Now, the JVM can’t possibly be faster, as the C version
is supposed to do 2-4x as much in every operation
(2x on P3/P4, 4x on C2D/C2Q).
So, where am I screwing up?
Even in C the SSE version is slower…
C initialization code:
#include <stdio.h>
#include <xmmintrin.h>
#include <time.h>
#include <sys/time.h>
#include <errno.h>
#include <windows.h>
__m128 a, b, c;
float f4a[4] __attribute__((aligned(16))) = { +1.2, +3.5, +1.7, +2.8 };
float f4b[4] __attribute__((aligned(16))) = { -0.7, +2.6, +3.3, -4.0 };
float f4c[4] __attribute__((aligned(16))) = { -0.7, +2.6, +3.3, -4.0 };
/**
 * Return the current system time in milliseconds.
 *
 * The value is derived from GetSystemTimeAsFileTime(), whose FILETIME result
 * counts 100-nanosecond intervals since January 1, 1601 (UTC). Despite the
 * Java-inspired name, the result is therefore NOT relative to the Unix epoch;
 * it is intended for computing elapsed time as the difference of two calls.
 *
 * @return milliseconds elapsed since the FILETIME epoch.
 */
unsigned long long System_currentTimeMillis(void) {
    /* 10,000 ticks of 100 ns each make up one millisecond. */
    enum { FILETIME_TICKS_PER_MS = 10000 };
    FILETIME ft;

    GetSystemTimeAsFileTime(&ft);

    /* Combine the two 32-bit halves in unsigned arithmetic throughout, so the
     * 64-bit tick count never passes through a signed intermediate. */
    const unsigned long long ticks =
        ((unsigned long long)ft.dwHighDateTime << 32) |
        (unsigned long long)ft.dwLowDateTime;

    return ticks / FILETIME_TICKS_PER_MS;
}
Normal (x86) C code: (takes 1100ms)
/* Scalar baseline: time `end` iterations of a 4-lane add / subtract / multiply
 * sequence, written out one float at a time on the file-scope arrays
 * f4a, f4b and f4c. */
t0 = System_currentTimeMillis();
for(i = 0; i < end; i++)
{
/* f4c = f4a + f4b, lane by lane (replaces whatever f4c held from the previous pass). */
f4c[0] = f4a[0] + f4b[0];
f4c[1] = f4a[1] + f4b[1];
f4c[2] = f4a[2] + f4b[2];
f4c[3] = f4a[3] + f4b[3];
/* f4a = f4c - f4b, lane by lane. */
f4a[0] = f4c[0] - f4b[0];
f4a[1] = f4c[1] - f4b[1];
f4a[2] = f4c[2] - f4b[2];
f4a[3] = f4c[3] - f4b[3];
/* f4c = f4a * f4c, lane by lane. */
f4c[0] = f4a[0] * f4c[0];
f4c[1] = f4a[1] * f4c[1];
f4c[2] = f4a[2] * f4c[2];
f4c[3] = f4a[3] * f4c[3];
}
t1 = System_currentTimeMillis();
/* The elapsed time is narrowed to int so that it matches the %d conversion. */
printf("x86 took: %dms\n", (int)(t1-t0));
SIMD SSE code: (takes 1750ms)
/* SSE variant: the same add / subtract / multiply sequence, applied to all
 * four floats at once through the packed-single intrinsics, using the
 * file-scope __m128 variables a, b and c.
 * Note that the initial loads and the final store are inside the timed region. */
t0 = System_currentTimeMillis();
/* Load the three float arrays into __m128 values once, before the loop.
 * _mm_load_ps expects a 16-byte-aligned address; the arrays are declared
 * with aligned(16). */
a = _mm_load_ps(f4a);
b = _mm_load_ps(f4b);
c = _mm_load_ps(f4c);
for(i = 0; i < end; i++)
{
/* c = a + b */
c = _mm_add_ps(a, b);
/* a = c - b */
a = _mm_sub_ps(c, b);
/* c = a * c */
c = _mm_mul_ps(a, c);
}
/* Only c is written back to memory; f4a is not updated here, unlike in the
 * element-by-element version, which assigns f4a inside its loop. */
_mm_store_ps(f4c,c);
t1 = System_currentTimeMillis();
/* The elapsed time is narrowed to int so that it matches the %d conversion. */
printf("SSE took: %dms\n", (int)(t1-t0));
Java code: (takes 750ms)
/**
 * Multiplies the first four elements of {@code a} and {@code b} pairwise and
 * writes the products into the first four elements of {@code dst}.
 *
 * Each lane is read before it is written and lanes do not depend on one
 * another, so {@code dst} may be the same array as {@code a} or {@code b}.
 *
 * @param a   left-hand operands (at least four elements)
 * @param b   right-hand operands (at least four elements)
 * @param dst receives the four products (at least four elements)
 */
static void _mm_mul_ps(float[] a, float[] b, float[] dst)
{
    for (int lane = 0; lane < 4; lane++)
    {
        dst[lane] = a[lane] * b[lane];
    }
}
// Body elided in the original post ("..........").
// NOTE(review): presumably writes a[i] + b[i] into dst[i] for the first four
// elements, mirroring _mm_mul_ps; confirm against the full source.
static void _mm_add_ps(float[] a, float[] b, float[] dst) { .......... }
// Body elided in the original post ("..........").
// NOTE(review): presumably writes a[i] - b[i] into dst[i] for the first four
// elements, mirroring _mm_mul_ps; confirm against the full source.
static void _mm_sub_ps(float[] a, float[] b, float[] dst) { .......... }
/**
 * Runs the add / subtract / multiply sequence 64 * 1024 * 1024 times on three
 * four-element float arrays and returns the array holding the last product.
 *
 * Per iteration: result = lhs + rhs, then lhs = result - rhs, then
 * result = lhs * result (the multiply writes into one of its own inputs).
 *
 * @return the four-element result array after the final iteration
 */
static float[] run()
{
    final int iterations = 1024 * 1024 * 64;

    final float[] lhs = { 1.2f, 3.5f, 1.7f, 2.8f };
    final float[] rhs = { -0.7f, 2.6f, 3.3f, -4.0f };
    final float[] result = { -0.7f, 2.6f, 3.3f, -4.0f };

    for (int iter = 0; iter < iterations; iter++)
    {
        _mm_add_ps(lhs, rhs, result);
        _mm_sub_ps(result, rhs, lhs);
        _mm_mul_ps(lhs, result, result);
    }

    return result;
}
I am compiling with:
M:\MinGW_C_compiler\bin\gcc -Wall -Wl,-subsystem,console -march=pentium3 -mfpmath=sse -fomit-frame-pointer -funroll-loops sse.c -o "sse.exe"