My na(t)ive PerlinNoise impl. is 3 times slower than server VM

I’m pretty much a C n00b, but I thought I’d be able to write a Perlin Noise implementation in C that would be at least as fast as my Java version. Guess not! Here is my code:


#include <math.h> // for floor()
#include <jni.h>

int   perm[512];

void initPerlinNoise()
   {
      int i, permutation[] = { 151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9, 129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
         14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157, 184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78, 66, 215, 61, 156, 180 };

      for (i = 0; i < 256; i++) // duplicate the table so lookups like perm[a0 + 1] never wrap
         perm[256 + i] = perm[i] = permutation[i];
   }


inline float fade(float t)
   {
      return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
   }

inline float lerp(float t, float a, float b)
   {
      return a + t * (b - a);
   }

inline float grad(int hash, float x, float y, float z)
   {
      int h = hash & 15; // low 4 bits pick one of 16 gradient directions
      float u = (h < 8) ? x : y;
      float v = (h < 4) ? y : ((h == 12 || h == 14) ? x : z);
      return ((h & 1) == 0 ? u : -u) + ((h & 2) == 0 ? v : -v);
   }

inline float noise(float x, float y, float z)
   {
      float fx = floor(x);
      float fy = floor(y);
      float fz = floor(z);

      int gx = (int) fx & 0xFF;
      int gy = (int) fy & 0xFF;
      int gz = (int) fz & 0xFF;

      float u = fade(x -= fx);
      float v = fade(y -= fy);
      float w = fade(z -= fz);

      int a0 = perm[gx + 0] + gy;
      int b0 = perm[gx + 1] + gy;
      int aa = perm[a0 + 0] + gz;
      int ab = perm[a0 + 1] + gz;
      int ba = perm[b0 + 0] + gz;
      int bb = perm[b0 + 1] + gz;

      float a1 = grad(perm[bb + 1], x - 1, y - 1, z - 1);
      float a2 = grad(perm[ab + 1], x - 0, y - 1, z - 1);
      float a3 = grad(perm[ba + 1], x - 1, y - 0, z - 1);
      float a4 = grad(perm[aa + 1], x - 0, y - 0, z - 1);
      float a5 = grad(perm[bb + 0], x - 1, y - 1, z - 0);
      float a6 = grad(perm[ab + 0], x - 0, y - 1, z - 0);
      float a7 = grad(perm[ba + 0], x - 1, y - 0, z - 0);
      float a8 = grad(perm[aa + 0], x - 0, y - 0, z - 0);

      float a2_1 = a2+u*(a1-a2);       //lerp(u, a2, a1);
      float a4_3 = a4+u*(a3-a4);       //lerp(u, a4, a3);
      float a6_5 = a6+u*(a5-a6);       //lerp(u, a6, a5);
      float a8_7 = a8+u*(a7-a8);       //lerp(u, a8, a7);
      float a8_5 = a8_7+v*(a6_5-a8_7); //lerp(v, a8_7, a6_5);
      float a4_1 = a4_3+v*(a2_1-a4_3); //lerp(v, a4_3, a2_1);
      float a8_1 = a8_5+w*(a4_1-a8_5); //lerp(w, a8_5, a4_1);

      return a8_1;
   }
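
(For anyone who wants to run the noise function standalone, a minimal driver - the sample coordinates are arbitrary:)

#include <stdio.h>

int main(void)
{
   initPerlinNoise(); /* must run before any noise() call */
   printf("%f\n", noise(1.5f, 2.25f, 3.75f));
   return 0;
}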

   //

JNIEXPORT void JNICALL Java_pack_Code_perlin_1noise
	(JNIEnv *env, jclass clz, jlong off, jint pow)
{
	// pow=5  => dim=32  => 32*32*32 samples
	int bits, mask, elems, i, x, y, z, pow0, pow1, pow2;
	bits = 1 << pow;
	mask = bits - 1;
	elems = bits*bits*bits;

	pow0 = pow * 0; // x lives in bits [0, pow)
	pow1 = pow * 1; // y lives in bits [pow, 2*pow)
	pow2 = pow * 2; // z lives in bits [2*pow, 3*pow)

	float* p = asFloat4Pointer(env, off); // helper (not shown) that presumably maps the jlong 'off' to a float*

	for(i=elems-1; i>=0; i--)
	{
		x = (i>>pow0) & mask;
		y = (i>>pow1) & mask;
		z = (i>>pow2) & mask;

		p[i] = noise(x*0.01f, y*0.01f, z*0.01f);
	}	
}

My Java code is pretty much identical.

The benchmark: fill a 32×32×32 grid (32,768 samples).
C: 8.2ms (avg of 64 runs; no JNI overhead, as I benchmark it 100% in C)
Java: 2.3ms (after warmup, avg of 64 runs)

The Java server VM is 3.5x faster?
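
(For reference, the C-side timing loop is conceptually just this - a sketch assuming a POSIX clock_gettime(); on MinGW you’d substitute QueryPerformanceCounter. fill_grid() is a hypothetical name for the sampling loop from the JNI function above.)

#include <time.h>

#define RUNS 64

extern void fill_grid(void); /* hypothetical: the 32x32x32 sampling loop above */

double benchmark_avg_ms(void)
{
	struct timespec t0, t1;
	int run;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (run = 0; run < RUNS; run++)
		fill_grid();
	clock_gettime(CLOCK_MONOTONIC, &t1);

	double ms = (t1.tv_sec - t0.tv_sec) * 1000.0
	          + (t1.tv_nsec - t0.tv_nsec) / 1000000.0;
	return ms / RUNS; /* average per run, as quoted above */
}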

What rookie mistake did I make? I expected HotSpot to bounds-check every array access; ‘random’ accesses like perm[perm[gx + 1] + 1] should be rather hard for HotSpot to optimize away, or so I thought.

I compile C code with:

gcc -Wall -D_JNI_IMPLEMENTATION_ -Wl,--kill-at    -march=pentium3 -mfpmath=sse -fomit-frame-pointer -funroll-loops -O3

Unrolling the loop manually didn’t change anything either.

You’re using a mix of C and C++; the keyword “inline” comes from C++.

http://www.greenend.org.uk/rjk/2003/03/inline.html

inline has been part of C since the C99 spec, and most compilers supported it well before that.

That, and I’m still curious why my C code is so slow.

How is floor() implemented in C?

I think you’ve hit a good point. floor() will be the one function that actually gets called. The default floor() most likely works by changing the FPU’s control register. You can try adding the -ffast-math option.

Hm… that command-line option didn’t help, but:


      float fx = x; //floor(x);
      float fy = y; //floor(y);
      float fz = z; //floor(z);

=> 0.25ms

Although incorrect, it’s significantly faster.


      int ix = (int)x;
      int iy = (int)y;
      int iz = (int)z;

      int gx = ix & 0xFF;
      int gy = iy & 0xFF;
      int gz = iz & 0xFF;

      float u = fade(x -= ix);
      float v = fade(y -= iy);
      float w = fade(z -= iz);

=> C: 3.3ms => J: 2.3ms
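
(Side note: the (int) cast truncates toward zero, so it only matches floor() for non-negative inputs - fine here, since the benchmark coordinates are all >= 0. If negative coordinates ever matter, a branch-based fast floor is one possible sketch:)

/* floor() without the libm call; exact for values that fit in an int */
static inline int fast_floor(float x)
{
   int i = (int)x;             /* truncates toward zero */
   return (x < i) ? i - 1 : i; /* step down for negative fractions */
}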

Reordered some operations so that execution stays in FPU or CPU (integer) mode longer:

inline float noise(float x, float y, float z)
   {
      int ix,iy,iz;
      int gx,gy,gz;
      float u,v,w;
      int a0,b0,aa,ab,ba,bb;
      int   aa0,ab0,ba0,bb0; // int, not float: grad() takes an int hash
      int   aa1,ab1,ba1,bb1;
      float a1,a2,a3,a4,a5,a6,a7,a8;
      float a8_5, a4_1;

      ix = (int)x; x-=ix;
      iy = (int)y; y-=iy;
      iz = (int)z; z-=iz;      

      gx = ix & 0xFF;
      gy = iy & 0xFF;
      gz = iz & 0xFF;
      
      a0 = gy+perm[gx];
      b0 = gy+perm[gx + 1];
      aa = gz+perm[a0];
      ab = gz+perm[a0 + 1];
      ba = gz+perm[b0];
      bb = gz+perm[b0 + 1];

      aa0 = perm[aa]; aa1 = perm[aa + 1];
      ab0 = perm[ab]; ab1 = perm[ab + 1];
      ba0 = perm[ba]; ba1 = perm[ba + 1];
      bb0 = perm[bb]; bb1 = perm[bb + 1];

      a1 = grad(bb1, x - 1, y - 1, z - 1);
      a2 = grad(ab1, x    , y - 1, z - 1);
      a3 = grad(ba1, x - 1, y    , z - 1);
      a4 = grad(aa1, x    , y    , z - 1);
      a5 = grad(bb0, x - 1, y - 1, z    );
      a6 = grad(ab0, x    , y - 1, z    );
      a7 = grad(ba0, x - 1, y    , z    );
      a8 = grad(aa0, x    , y    , z    );

      u = fade(x);
      v = fade(y);
      w = fade(z);

      a8_5 = lerp(v, lerp(u, a8, a7), lerp(u, a6, a5));
      a4_1 = lerp(v, lerp(u, a4, a3), lerp(u, a2, a1));
      return lerp(w, a8_5, a4_1);
   }

=> C: 2.9ms => J: 2.3ms

So what am I missing here? Maybe MinGW/GCC is simply not generating efficient machine code?

I hardly ever use GCC. If you’re using Linux, you can grab the Intel compiler for non-commercial usage, at least for comparison purposes. Maybe you need a newer version of GCC? Also, why limit the instruction set to the P3? Isn’t the P4 a reasonable bottom end?

Plenty of P3’s about.

Cas :)

Would your noise function even get inlined? If I remember correctly, the larger a function is, the less chance it has of being inlined. You should be able to check how much inlining is happening by removing the inline directives and comparing the resulting exe file sizes; if there is a significant difference, a lot of inlining is happening. I also thought that calling one inline function from another significantly reduces the chance of anything being inlined. Try removing inline from noise() and see if it has any effect.

An easier way to verify would be to just add “-c -S” and inspect the asm output.
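
(For example, reusing the flags from the build line above; the file names are just placeholders:)

gcc -O3 -march=pentium3 -mfpmath=sse -S perlin.c -o perlin.s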

4x inline: 2906 microseconds
3x inline: 3000 microseconds
0x inline: 3006 microseconds

Barely a difference…

ASM dump: http://pastebin.com/DWuQyw7H

I’m way too new at this to draw any conclusions.

Lol, it seems difficult to beat Java :P

Optimized grad() by hardcoding the branches in a switch ;D


inline float grad(int hash, float x, float y, float z)
   {  
	//float u = (h < 8) ? x : y;
	//float v = (h < 4) ? y : ((h == 12 || h == 14) ? x : z);
	//return ((h & 1) == 0 ? u : -u) + ((h & 2) == 0 ? v : -v);

	switch(hash & 0xF)
	{
		case 0x0: return  x + y;
		case 0x1: return -x + y;
		case 0x2: return  x - y;
		case 0x3: return -x - y;
		case 0x4: return  x + z;
		case 0x5: return -x + z;
		case 0x6: return  x - z;
		case 0x7: return -x - z;
		case 0x8: return  y + z;
		case 0x9: return -y + z;
		case 0xA: return  y - z;
		case 0xB: return -y - z;
		case 0xC: return  y + x; // cases 0xC-0xF repeat gradients, as in the branchy version
		case 0xD: return -y + z;
		case 0xE: return  y - x;
		case 0xF: return -y - z;
		default: return 0; // never happens
	}
   }

C: 1.5ms, J: 2.3ms (the new grad() made the Java version slower, so I’m not using it there)

Well, noise() is a leaf function internally (grad(), fade() and lerp() are fully inlined into it), but noise() itself is not getting inlined into the caller. Line 714 of the asm dump: call _noise.

You can (most likely) reduce the calling overhead by changing the prototype of noise to:

static inline float __fastcall noise(float x, float y, float z)

(register calling instead of stack call convention - which is good for GCC, MS & Intel)

My guess here is that the JIT is using SSE2 which will be a big win.

Interesting stuff, but it didn’t make any difference. The branchy grad() seems to have been the bottleneck (hence the win from the hardcoded switch), and the call overhead of noise() is probably negligible.

Could you tell me how? Would the costs of switching between CPU/FPU be removed? HotSpot can’t do SIMD yet.
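
(Rough answer: with -mfpmath=sse the compiler keeps scalar float math in XMM registers and converts float to int with cvttss2si, which truncates directly - no FPU control-word save/modify/restore. If you want to be certain the conversion bypasses x87 regardless of flags, an intrinsic sketch:)

#include <xmmintrin.h> /* SSE intrinsics; needs -msse, which -march=pentium3 implies */

/* truncating float->int via cvttss2si, no x87 control-word juggling */
static inline int trunc_sse(float x)
{
	return _mm_cvttss_si32(_mm_set_ss(x));
}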