Arithmetic Performance

Thanks for the test. Any chance of a higher-resolution test, please? :slight_smile: (up to F16)

But wow, I am really surprised by how fast the lookup is.

EDIT: Hmm, it seems the rules have changed on Android with the lookup table. I ran your test again (on a laptop, sorry, I don't have an Android device yet) using Riven's lookup, Java's Math.cos, and the original Taylor version (but converted from double to float and extended up to F20).

Here is the whole test project: http://demo.dzzd.net/testCos.zip

[quote]//Time
Java: 0.045886
Riven: 1.163765
DzzD: 0.166013

//Results
Input: 0.84525967
DzzD: 0.66299015
Riven: 0.663537
Java: 0.6635370371144133

//Time
Java: 0.046026
Riven: 1.100349
DzzD: 0.166362

//Results
Input: 0.37585977
DzzD: 0.92992324
Riven: 0.93019235
Java: 0.9301923689294419

//Time
Java: 0.045956
Riven: 1.101327
DzzD: 0.166292

//Results
Input: 0.35580623
DzzD: 0.937606
Riven: 0.93736595
Java: 0.9373659458091854
[/quote]

Nate, DzzD, what are your accuracy/result figures measuring, respectively?

[s]

It shows the original Java Math.cos result and the results of the two other methods; it is not a percentage.

EDIT: Modifying the random generator (numbers = (float)(Math.random()*7919);) to use a higher range of values gives the following results:

[quote]//times
Java: 0.049936
Riven: 1.734927
DzzD: 0.164825

//results
Input: 4377.789
DzzD: -0.020707069
Riven: -0.020298883110866583
Java: -0.02029888311062082

//times
Java: 0.050845
Riven: 1.757276
DzzD: 0.16755

//results
Input: 2180.3757
DzzD: 0.9939912
Riven: 0.9939086387101796
Java: 0.9939087098649356

//times
Java: 0.049937
Riven: 1.734369
DzzD: 0.161892

//results
Input: 4083.068
DzzD: 0.53823364
Riven: 0.5383364782146423
Java: 0.538336478350398
[/quote]
Two things to point out:

Math.cos has become a lot faster on recent JVMs.
Lookup tables/arrays are a lot faster on Android than on a desktop (which is a shame…).

All tests were done on an Intel Core Duo T7500 2.2GHz with Java 6u14[/s]

EDIT: I really need to get some rest.

Are your measurements showing my code to be 34.6x slower than pure Java? :o :persecutioncomplex:

You seem to have swapped the implementations :slight_smile:

FastMath is mine. FastMath2 is yours.

Your benchmark was fundamentally flawed: because the results were never used, it allowed the JVM to completely remove the Math.cos() call.
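A minimal sketch of the pattern (paraphrasing the fix used in the benchmark below, not the exact original code):


   // Flawed: the result is discarded, so the JIT can prove the loop has
   // no side effects and drop the Math.cos() calls, timing an empty loop.
   for (int i = 0; i < count; i++)
      Math.cos(numbers[i]);

   // Fixed: every result is stored (and later read), so the calls survive.
   for (int i = 0; i < count; i++)
      java_dst[i] = Math.cos(numbers[i]);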

I renamed the classes (see next post)

Bench.java


   public static void main(String[] args)
   {
      // warm up both benchmarks without logging, then log
      for (int i = 0; i < 64; i++)
         testCos(false);
      for (int i = 0; i < 64; i++)
         testCos(true);
      for (int i = 0; i < 64; i++)
         testAtan2(false);
      for (int i = 0; i < 64; i++)
         testAtan2(true);
   }

   public static void testCos(boolean log)
   {
      int count = 50000;

      long s, e;

      float[] numbers = new float[count];
      for (int i = 0; i < count; i++)
         numbers[i] = (float) (Math.random() * Math.PI * 4.0 - Math.PI * 2.0);

      // ensure the JVM doesn't optimize those silly calls away!!
      double[] java_dst = new double[count];
      float[] dzzd_dst = new float[count];
      float[] riven_dst = new float[count];

      s = System.nanoTime();
      for (int i = 0; i < count; i++)
         java_dst[i] = Math.cos(numbers[i]);
      e = System.nanoTime();
      if (log)
         System.out.println("Java.cos:  " + (e - s) / 1000 + "us");

      s = System.nanoTime();
      for (int i = 0; i < count; i++)
         dzzd_dst[i] = FastMathDzzD.cos(numbers[i]);
      e = System.nanoTime();
      if (log)
         System.out.println("DzzD.cos:  " + (e - s) / 1000 + "us");

      s = System.nanoTime();
      for (int i = 0; i < count; i++)
         riven_dst[i] = FastMathRiven.cos(numbers[i]);
      e = System.nanoTime();
      if (log)
         System.out.println("Riven.cos: " + (e - s) / 1000 + "us");

      double dzzdAvgErr = 0.0;
      double rivenAvgErr = 0.0;
      double dzzdMaxErr = 0.0;
      double rivenMaxErr = 0.0;

      for (int i = 0; i < count; i++)
      {
         double dzzdErr = Math.abs(Math.cos(numbers[i]) - FastMathDzzD.cos(numbers[i]));
         double rivenErr = Math.abs(Math.cos(numbers[i]) - FastMathRiven.cos(numbers[i]));

         dzzdAvgErr += dzzdErr;
         if (dzzdErr > dzzdMaxErr)
            dzzdMaxErr = dzzdErr;

         rivenAvgErr += rivenErr;
         if (rivenErr > rivenMaxErr)
            rivenMaxErr = rivenErr;
      }
      dzzdAvgErr /= count;
      rivenAvgErr /= count;

      if (log)
      {
         System.out.println("Input: " + numbers[3]);
         System.out.println("DzzD: " + FastMathDzzD.cos(numbers[3]) + ", avg.error=" + dzzdAvgErr + ", max.error=" + dzzdMaxErr);
         System.out.println("Riven: " + FastMathRiven.cos(numbers[3]) + ", avg.error=" + rivenAvgErr + ", max.error=" + rivenMaxErr);
         System.out.println("Java: " + Math.cos(numbers[3]));
         System.out.println("~~~prevent opt. ~~~" + dzzd_dst[13] + "~~~" + java_dst[13] + "~~~" + riven_dst[13]);
         System.out.println();
      }
   }

   public static void testAtan2(boolean log)
   {
      int count = 50000;

      long s, e;

      float[] xNumbers = new float[count];
      float[] yNumbers = new float[count];
      for (int i = 0; i < count; i++)
      {
         xNumbers[i] = (float) (Math.random() * Math.PI * 2.0 - Math.PI);
         yNumbers[i] = (float) (Math.random() * Math.PI * 2.0 - Math.PI);
      }

      // ensure the JVM doesn't optimize those silly calls away!!
      double[] java_dst = new double[count];
      float[] nate_dst = new float[count];
      float[] riven_dst = new float[count];

      s = System.nanoTime();
      for (int i = 0; i < count; i++)
         java_dst[i] = Math.atan2(yNumbers[i], xNumbers[i]);
      e = System.nanoTime();
      if (log)
         System.out.println("Java.atan2:  " + (e - s) / 1000 + "us");

      s = System.nanoTime();
      for (int i = 0; i < count; i++)
         nate_dst[i] = FastMathNate.atan2(yNumbers[i], xNumbers[i]);
      e = System.nanoTime();
      if (log)
         System.out.println("Nate.atan2:  " + (e - s) / 1000 + "us");

      s = System.nanoTime();
      for (int i = 0; i < count; i++)
         riven_dst[i] = FastMathRiven.atan2(yNumbers[i], xNumbers[i]);
      e = System.nanoTime();
      if (log)
         System.out.println("Riven.atan2: " + (e - s) / 1000 + "us");

      double nateAvgErr = 0.0;
      double rivenAvgErr = 0.0;
      double nateMaxErr = 0.0;
      double rivenMaxErr = 0.0;

      for (int i = 0; i < count; i++)
      {
         double nateErr = Math.abs(Math.atan2(yNumbers[i], xNumbers[i]) - FastMathNate.atan2(yNumbers[i], xNumbers[i]));
         double rivenErr = Math.abs(Math.atan2(yNumbers[i], xNumbers[i]) - FastMathRiven.atan2(yNumbers[i], xNumbers[i]));

         nateAvgErr += nateErr;
         if (nateErr > nateMaxErr)
            nateMaxErr = nateErr;

         rivenAvgErr += rivenErr;
         if (rivenErr > rivenMaxErr)
            rivenMaxErr = rivenErr;
      }
      nateAvgErr /= count;
      rivenAvgErr /= count;

      if (log)
      {
         System.out.println("Input: " + xNumbers[3]);
         System.out.println("Nate: " + FastMathDzzD.cos(xNumbers[3]) + ", avg.error=" + nateAvgErr + ", max.error=" + nateMaxErr);
         System.out.println("Riven: " + FastMathRiven.cos(xNumbers[3]) + ", avg.error=" + rivenAvgErr + ", max.error=" + rivenMaxErr);
         System.out.println("Java: " + Math.cos(xNumbers[3]));
         System.out.println("~~~prevent opt. ~~~" + nate_dst[13] + "~~~" + java_dst[13] + "~~~" + riven_dst[13]);
         System.out.println();
      }
   }

FastMathDzzD.java


public class FastMathDzzD
{
   public static float  PI   = (float) Math.PI;

   private static float f2   = (float) (-0.5);
   private static float f4   = (float) (-f2 / (3.0 * 4.0));
   private static float f6   = (float) (-f4 / (5.0 * 6.0));
   private static float f8   = (float) (-f6 / (7.0 * 8.0));
   private static float f10  = (float) (-f8 / (9.0 * 10.0));
   private static float f12  = (float) (-f10 / (11.0 * 12.0));
   private static float f14  = (float) (-f12 / (13.0 * 14.0));
   private static float f16  = (float) (-f14 / (15.0 * 16.0));
   //MORE PRECISE BUT NOT COMPATIBLE WITH THE MS JVM =>
   private static float f18  = (float) (-f16 / (17.0 * 18.0));
   private static float f20  = (float) (-f18 / (19.0 * 20.0));
   private static float PI2  = (float) (2.0 * PI);
   private static float PI05 = (float) (0.5 * PI);

   /**
   * Compute and return the sine of its parameter using a Taylor series.
   * @param x angle in radians
   * @return sine value for the given parameter
   */
   public static final float sin(float x)
   {
      //return Math.sin(x);
      return cos(x - PI05);
   }

   /**
   * Compute and return the cosine of its parameter using a Taylor series.
   * @param x angle in radians
   * @return cosine value for the given parameter
   */
   public static final float cos(float x)
   {

      //return Math.cos(x);

      if (x < 0.0)
         x = -x;

      if (x < PI2)
      {
         if (x < PI)
         {
            float x2 = x * x;
            //return 1.0f+x2*(f2+x2*(f4+x2*(f6+x2*(f8+x2*(f10+x2*(f12+x2*(f14+x2*(f16))))))));
            //MORE PRECISE BUT NOT COMPATIBLE WITH THE MS JVM =>
            return 1.0f + x2 * (f2 + x2 * (f4 + x2 * (f6 + x2 * (f8 + x2 * (f10 + x2 * (f12 + x2 * (f14 + x2 * (f16 + x2 * (f18 + x2 * f20)))))))));
         }
         else
         {
            x -= PI;
            float x2 = x * x;
            //return -(1.0f+x2*(f2+x2*(f4+x2*(f6+x2*(f8+x2*(f10+x2*(f12+x2*(f14+x2*(f16)))))))));
            //MORE PRECISE BUT NOT COMPATIBLE WITH THE MS JVM =>
            return -(1.0f + x2 * (f2 + x2 * (f4 + x2 * (f6 + x2 * (f8 + x2 * (f10 + x2 * (f12 + x2 * (f14 + x2 * (f16 + x2 * (f18 + x2 * f20))))))))));
         }
      }

      x %= PI2;
      x -= PI;
      float x2 = x * x;

      //return -(1.0f+x2*(f2+x2*(f4+x2*(f6+x2*(f8+x2*(f10+x2*(f12+x2*(f14+x2*f16))))))));
      //MORE PRECISE BUT NOT COMPATIBLE WITH THE MS JVM =>
      return -(1.0f + x2 * (f2 + x2 * (f4 + x2 * (f6 + x2 * (f8 + x2 * (f10 + x2 * (f12 + x2 * (f14 + x2 * (f16 + x2 * (f18 + x2 * f20))))))))));

   }

}

FastMathRiven.java

public class FastMathRiven
{

   private static final int     SIN_BITS              = 12;                          // adjust these .......
   private static final int     ATAN2_BITS            = 7;                           // adjust these .......

   private static final float   RAD                   = (float) Math.PI / 180.0f;
   private static final float   DEG                   = 180.0f / (float) Math.PI;

   private static final int     SIN_MASK              = ~(-1 << SIN_BITS);
   private static final int     SIN_COUNT             = SIN_MASK + 1;

   private static final float   radFull               = (float) (Math.PI * 2.0);
   private static final float   degFull               = (float) (360.0);
   private static final float   radToIndex            = SIN_COUNT / radFull;
   private static final float   degToIndex            = SIN_COUNT / degFull;

   private static final float[] sin                   = new float[SIN_COUNT];
   private static final float[] cos                   = new float[SIN_COUNT];

   private static final int     ATAN2_BITS2           = ATAN2_BITS << 1;
   private static final int     ATAN2_MASK            = ~(-1 << ATAN2_BITS2);
   private static final int     ATAN2_COUNT           = ATAN2_MASK + 1;
   private static final int     ATAN2_DIM             = (int) Math.sqrt(ATAN2_COUNT);

   private static final float   INV_ATAN2_DIM_MINUS_1 = 1.0f / (ATAN2_DIM - 1);

   private static final float[] atan2                 = new float[ATAN2_COUNT];

   static
   {
      for (int i = 0; i < SIN_COUNT; i++)
      {
         sin[i] = (float) Math.sin((i + 0.5f) / SIN_COUNT * radFull);
         cos[i] = (float) Math.cos((i + 0.5f) / SIN_COUNT * radFull);
      }

      for (int i = 0; i < ATAN2_DIM; i++)
      {
         for (int j = 0; j < ATAN2_DIM; j++)
         {
            // 0 .. 1
            float x0 = (float) i / ATAN2_DIM;
            float y0 = (float) j / ATAN2_DIM;

            atan2[j * ATAN2_DIM + i] = (float) Math.atan2(y0, x0);
         }
      }
   }

   public static final float sin(float rad)
   {
      // map radians onto the table and wrap the index with the bit mask
      return sin[(int) (rad * radToIndex) & SIN_MASK];
   }

   public static final float cos(float rad)
   {
      return cos[(int) (rad * radToIndex) & SIN_MASK];
   }

   public static final float atan2(float y, float x)
   {
      float add, mul;

      if (x < 0.0f)
      {
         if (y < 0.0f)
         {
            x = -x;
            y = -y;

            mul = 1.0f;
         }
         else
         {
            x = -x;
            mul = -1.0f;
         }

         add = -3.141592653f;
      }
      else
      {
         if (y < 0.0f)
         {
            y = -y;
            mul = -1.0f;
         }
         else
         {
            mul = 1.0f;
         }

         add = 0.0f;
      }

      float invDiv = 1.0f / (((x < y) ? y : x) * INV_ATAN2_DIM_MINUS_1);

      int xi = (int) (x * invDiv);
      int yi = (int) (y * invDiv);

      return (atan2[yi * ATAN2_DIM + xi] + add) * mul;
   }
}

FastMathNate.java


public class FastMathNate
{
   // coarse first-order atan2 approximation (max error ~0.07 rad, as measured below)
   static public float atan2(float y, float x)
   {
      float abs_y = y < 0 ? -y : y;
      float angle;
      if (x >= 0)
         angle = 0.7853981633974483f - 0.7853981633974483f * (x - abs_y) / (x + abs_y);
      else
         angle = 2.356194490192345f - 0.7853981633974483f * (x + abs_y) / (abs_y - x);
      return y < 0 ? -angle : angle;
   }
}

I just knew my code couldn’t be 35x slower than a call to Math.cos(); it turns out it is nearly 50x faster.

[b]Result on [u]cos[/u]: JRE 1.6.0u13 32bit[/b]
`Java.cos: 5937us
DzzD.cos: 1755us
Riven.cos: 119us
Input: 0.13322899
DzzD: 0.99113816, avg.error=6.511363215909248E-8, max.error=4.2913641251640655E-7
Riven: 0.9912097, avg.error=6.112402706577345E-4, max.error=0.0022978588251646535
Java: 0.9911381382357358


Java.cos:  5182us
DzzD.cos:  1756us
Riven.cos: 106us
Input: -4.931952
DzzD: 0.217803, avg.error=6.461738699229257E-8, max.error=4.4446321678659473E-7
Riven: 0.21685559, avg.error=6.134784039957628E-4, max.error=0.0022999601748480807
Java: 0.21780315388830504
~~~prevent opt. ~~~0.59424895~~~0.5942489528581584~~~0.5950832

Java.cos:  5215us
DzzD.cos:  1767us
Riven.cos: 114us
Input: 5.194904
DzzD: 0.46400833, avg.error=6.49894411417267E-8, max.error=4.217999866051869E-7
Riven: 0.46393955, avg.error=6.083748864382175E-4, max.error=0.002297156823630276
Java: 0.4640083899859773
~~~prevent opt. ~~~-0.99691343~~~-0.9969134135444965~~~-0.9968811
`

[b]Result on [u]atan2[/u](-PI..+PI, -PI..+PI): JRE 1.6.0u13 32bit[/b]
`
Java.atan2:  9594us
Nate.atan2:  966us
Riven.atan2: 1659us
Input: -0.12577985
Nate: 0.9921001, avg.error=0.04307788871778716, max.error=0.07111472846664224
Riven: 0.9923854, avg.error=0.0029046661205663024, max.error=0.00787278381587564
Java: 0.9921001376530111
~~~prevent opt. ~~~-0.5958918~~~-0.5486365662017681~~~-0.5450384

Java.atan2:  9093us
Nate.atan2:  925us
Riven.atan2: 1650us
Input: -2.7883778
Nate: -0.9382652, avg.error=0.04323209591878118, max.error=0.07111474342576019
Riven: -0.9376059, avg.error=0.00290633592197212, max.error=0.007867688886052049
Java: -0.9382654809616374
~~~prevent opt. ~~~1.0878927~~~1.1530357950577979~~~1.1554981

Java.atan2:  9194us
Nate.atan2:  935us
Riven.atan2: 1650us
Input: 1.9041518
Nate: -0.32721555, avg.error=0.043134051519980066, max.error=0.07111471440464712
Riven: -0.3274853, avg.error=0.0029083043816270954, max.error=0.007872691136467047
Java: -0.32721561538512783
~~~prevent opt. ~~~-1.2705337~~~-1.3387262269635727~~~-1.3370532
`

This is great stuff guys! Let’s do sqrt next! :smiley:

Accuracy was just showing the accurate value from java.lang.Math and the inaccurate value from the other algorithms. Riven greatly improved (and clarified) the benchmark! The time results give some (vague) idea of the speed improvement from using the less accurate algorithms.

On a desktop, this stuff is mostly moot, but it does end up mattering for the current Android phones, which I think is kind of fun. :slight_smile:

You’d be surprised. Back in the 1.4 days, when JNI was slooooooow, doing trig was the major bottleneck in a game I was working on, and it ended up being worthwhile to port the native implementations of sin and cos to Java.

/**
 * Total table size: 64kB.
 */
public class FastMathPjt
{
	// Coarse (s1, c1) and fine (s2, c2) tables: each angle index is split
	// into a high and a low 12-bit part, recombined via angle addition.
	private static final float[] s1, s2, c1, c2;
	// SCALE maps an angle of PI onto 2^23, so 24 index bits cover 2*PI
	private static final float SCALE = (float)((1 << 23) / Math.PI);

	static
	{
		s1 = new float[1 << 12];
		s2 = new float[1 << 12];
		c1 = new float[1 << 12];
		c2 = new float[1 << 12];

		for (int i = 0; i < s1.length; i++)
		{
			float a = (i << 12) / SCALE;
			float b = i / SCALE;
			s1[i] = (float)Math.sin(a);
			s2[i] = (float)Math.sin(b);
			c1[i] = (float)Math.cos(a);
			c2[i] = (float)Math.cos(b);
		}
	}

	public static final float sin(float x)
	{
		if (x >= 0)
		{
			int i = (int)(x * SCALE);
			int a = (i >> 12) & 0xfff; // coarse index (high bits)
			int b = i & 0xfff;         // fine index (low bits)
			// sin(a+b) = sin(a)cos(b) + cos(a)sin(b)
			return s1[a] * c2[b] + c1[a] * s2[b];
		}
		else
		{
			int i = (int)(-x * SCALE);
			int a = (i >> 12) & 0xfff;
			int b = i & 0xfff;
			return -(s1[a] * c2[b] + c1[a] * s2[b]);
		}
	}

	public static final float cos(float x)
	{
		if (x < 0) x = -x; // cos is even
		int i = (int)(x * SCALE);
		int a = (i >> 12) & 0xfff;
		int b = i & 0xfff;
		// cos(a+b) = cos(a)cos(b) - sin(a)sin(b)
		return c1[a] * c2[b] - s1[a] * s2[b];
	}
}

This is #3 in my list above. Twice the memory usage of FastMathRiven, twice as fast as FastMathDzzD, and a similar error magnitude to FastMathDzzD. Memory usage could be reduced at the cost of speed.

FYI, some easy intros:

http://www.research.scea.com/gdc2003/fast-math-functions_p1.pdf
http://www.research.scea.com/gdc2003/fast-math-functions_p2.pdf

Note:

  1. It’s very rare for a well-designed approximation not to be much faster than a general library call. These general-purpose calls are (almost always) designed to have well-defined results for all inputs. Also, Java only supports the common analytic functions for doubles. So you could write a function mySin(float) such that mySin(x)==(float)Math.sin(x) for all values of ‘x’, which I would expect to be about 2x faster (a sketch of how such a claim can be verified follows this list). That can be improved further by throwing out exact results for one or more of the following: NaNs, denormals, and inputs outside the common or specifically required range.
  2. The timing methods used here will tend to show table-based methods in a more favorable light than might be the case under real usage patterns (a tight benchmark loop keeps the lookup tables resident in cache, which rarely holds in a real workload).
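To illustrate point 1: there are only 2^32 float bit patterns, so an equality claim like mySin(x)==(float)Math.sin(x) can actually be verified exhaustively in roughly minutes. A sketch, where mySin is the hypothetical candidate (not code from this thread):


   // Hypothetical check: compare a candidate mySin(float) against
   // (float)Math.sin(x) for every float bit pattern (~4.3e9 inputs).
   static boolean matchesJavaSin()
   {
      for (int bits = Integer.MIN_VALUE;; bits++)
      {
         float x = Float.intBitsToFloat(bits);
         // NaN != NaN, so NaN inputs are skipped to avoid false mismatches
         if (!Float.isNaN(x) && mySin(x) != (float) Math.sin(x))
            return false;
         if (bits == Integer.MAX_VALUE)
            return true;
      }
   }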

Wow, really sorry man, good that you saw it :slight_smile:

[quote]Your benchmark was fundamentally flawed. It allowed the JVM to completely remove the Math.cos() call.
[/quote]
Not mine; I just copy/pasted the one posted in this thread.

Good catch, as it was a big surprise to see Math.cos being so fast… and the results now seem to be what we would all expect them to be.

EDIT :

Just one point: you are measuring the error between mine and Java, but Java does not give the exact result either (e.g. for 0.5 * PI, where the Taylor version is more accurate than the original Java cos), though not off by much, yes… (anyway, this Taylor cos version uses float, so it should be quite a bit less accurate)

About sqrt:

As I mainly use it to normalize vectors, I use a lookup table of ints where lookup[n] = 2147483647/sqrt(n*k) // k is greater than 1 and allows scaling the range of the input (a sketch follows below).
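A minimal sketch of that idea (the table size, the value of k, the guard entry, and the normalize helper are my assumptions, not DzzD’s actual code):


public class InvSqrtLut
{
   static final int    SIZE = 4096;
   static final double K    = 16.0;          // assumed scale factor, > 1
   static final int[]  lut  = new int[SIZE];

   static
   {
      for (int n = 1; n < SIZE; n++)
         lut[n] = (int) (2147483647.0 / Math.sqrt(n * K));
      lut[0] = lut[1];                        // guard against n == 0
   }

   // Normalize (x, y) from the table instead of a real sqrt:
   // lut[n] / 2147483647 ~ 1/sqrt(n*K) ~ 1/sqrt(len2) when n*K ~ len2.
   static float[] normalize(float x, float y)
   {
      float len2 = x * x + y * y;
      int n = (int) (len2 / K);               // pick n so that n*K ~ len2
      if (n >= SIZE) n = SIZE - 1;            // clamp out-of-range lengths
      float inv = lut[n] / 2147483647.0f;
      return new float[] { x * inv, y * inv };
   }
}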

There is also an iterative algorithm using a dichotomic approach, something like:

    static double approxSqrt(double n, double error)
    {
        double r = n / 2;
        double r2 = r * r;
        double d = r * 0.5;   // step, halved every iteration
        int loop = 0;
        do
        {
            System.out.println("(" + loop++ + ") r=" + r);
            if (r2 > n)
                r -= d;
            else
                r += d;
            r2 = r * r;
            d *= 0.5;
        }
        while (Math.abs(r2 - n) > error);
        return r;
    }

Output for approxSqrt(100, 0.1):

EDIT:

And the same for int:

    static int intSQRT(int input)
    {
        int r = input >> 1;
        int r2 = r * r;
        int d = r >> 1;   // step, halved every iteration
        int loop = 0;
        do
        {
            System.out.println("(" + loop++ + ") r=" + r + " r*r=" + (r * r));
            r += (r2 > input) ? -d : d;
            r2 = r * r;
            d >>= 1;
        }
        while (r2 != input && d != 0);
        return r;
    }

Output for intSQRT(100):

This is only the basic idea; the first value of r should be optimized using a lookup table or some other way to make it smarter (one possible seeding trick is sketched below).
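For example (my sketch, not DzzD’s code), a cheap smarter start is to seed r from the bit length of the input, which lands within a factor of two of the true root:


    // Sketch: for input > 0, 1 << (floor(log2(input)) / 2) is within 2x of
    // sqrt(input), so far fewer halving steps are needed than with input/2.
    static int initialGuess(int input)
    {
        int log2 = 31 - Integer.numberOfLeadingZeros(input);
        return 1 << (log2 >> 1);
    }

For instance, initialGuess(100) returns 8, already close to the exact root 10.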

A better sin minimax example; the maximum error is 0x1.8p-23 at x ~ 1.466838 (again on +/- pi/2):


    static float sinMinimax(float x)
    {
        // minimax polynomial in x2 = x*x, evaluated by Horner's rule
        float x2 = x * x;
        float r  = -2.39e-08f;
        r *= x2;
        r += 2.7526e-06f;
        r *= x2;
        r -= 1.98409e-04f;
        r *= x2;
        r += 8.3333315e-03f;
        r *= x2;
        r -= 1.666666664e-01f;
        r *= x2;
        r += 1.f;
        r *= x;
        return r;
    }

OK, but then the original Java cos error is greater, no? Also, the version I usually use is based on double, not float (which, on the computer I tested, runs faster than float).

EDIT :

System.out.println("Math.cos(Math.PI*0.5))="+ Math.cos(Math.PI*0.5)  );

ouput this :
Math.cos(Math.PI*0.5))=6.123233995736766E-17

and should be 0.0

The “error” you’re seeing is an argument-reduction issue: Math.PI/2 is not pi/2. The double Math.PI*0.5 falls short of the true pi/2 by about 6.12e-17, and cos has slope -1 there, so ~6.123e-17 is the correctly rounded cosine of the argument the function was actually given. (I agree your version makes more sense.) Example:

double d = Math.PI*0.5;
System.out.printf("%s\n", Double.toHexString(Math.cos(d)));
System.out.printf("%s\n", Double.toHexString(Math.cos(Math.nextUp(d))));

Most hardware will execute floats faster than doubles (a notable counter-example is Intel-alikes using x87 instead of SSE).

Note: I’m only attempting to compare truncated power series vs. other polynomial approximation methods. Truncated power series ignore finite precision and are centered on a single point, whereas most numerical-approximation methods take finite precision into account and target a range. I’m ignoring things like argument reduction and the handling of special values (NaN, +/-Infinity, denormals, -zero).

Stuck a sqrt(x) and 1/sqrt(x) here: http://www.java-gaming.org/index.php/topic,20997.0.html

Sorry, but I don't agree. IMHO double is probably faster, or at least the same speed (and for sure will become faster).

You can check a nice discussion on GameDev about that (back in 2005): http://www.gamedev.net/community/forums/topic.asp?topic_id=343479

According to this discussion, for example, double is native on the Xbox 360.

Most of these comments are ill-informed. For example, an SSE machine can perform 4 float ops faster than the 2 double ops that will fit in the same registers. Example: dividing 4x4 floats has a throughput of 36 cycles and 2x2 doubles is 62 (for CPUID 69), so 9 cycles/divide for floats and 31 cycles/divide for doubles. I know of no hardware on which doubles are faster than floats and don't expect to see it in my lifetime (at least for consumer hardware).

I played with Riven's lookup version and tried a small accuracy improvement by adding linear interpolation to the results; it is twice as slow but a little more accurate.

I just replaced:

 public static final float cos(float rad)
   {
      return cos[(int) (rad * radToIndex) & SIN_MASK];
   }

with:

public static final float cos(float rad)
   {
      // fixed-point table index with 8 extra fractional bits
      int idx = (int) (rad * radToIndex * 256f);

      float r = (idx & 0xFF) * 0.00390625f; // fractional part (1/256)
      idx >>= 8;
      float c1 = cos[idx & SIN_MASK];
      float c2 = cos[idx + 1 & SIN_MASK];
      return c1 * (1f - r) + c2 * r;        // linear interpolation
   }

and re-ran the benchmark code.

It gives the following results:

Before:
time between 290 and 300 us
Riven: -0.78881806, avg.error=6.071037300951922E-4, max.error=0.0022978310394030435

After adding linear interpolation:
time between 550 and 560 us
Riven: 0.9108251, avg.error=4.901244049417835E-4, max.error=7.731781248493941E-4

EDIT: Maybe not really useful :slight_smile: but it makes cos act as a continuous function rather than a sampled one.

Right… forgetting the fact that Java has no SIMD? Besides that, the statement ‘has a throughput of 36 cycles’ makes no sense at all.

Benchmark to show that float / double performance is nearly identical:


 // needs: import java.util.Arrays; import java.util.Random;
 public static void main(String[] args)
   {
      int elems = 1024;
      int runs = 16;

      Random r = new Random();

      for (int k = 0; k < 8; k++)
      {
         float[] f1 = new float[elems];
         float[] f2 = new float[elems];
         float[] f4x4 = new float[16];
         for (int i = 0; i < 16; i++)
            f4x4[i] = r.nextFloat();
         long float_ms = benchFloat(f1, f2, f4x4, elems, runs);
         System.out.println("float performance: " + float_ms / 1000000L + "ms (midpoint)");

         double[] d1 = new double[elems];
         double[] d2 = new double[elems];
         double[] d4x4 = new double[16];
         for (int i = 0; i < 16; i++)
            d4x4[i] = r.nextDouble();
         long double_ms = benchDouble(d1, d2, d4x4, elems, runs);
         System.out.println("double performance: " + double_ms / 1000000L + "ms (midpoint)");
      }
   }

   public static long benchFloat(float[] f1, float[] f2, float[] mat, int elems, int runs)
   {
      long[] ts = new long[runs];
      for (int i = 0; i < ts.length; i++)
      {
         long a = System.nanoTime();
         for (float t1 = 0.0f; t1 < 1.0f; t1 += 0.01f)
         {
            for (float t2 = 0.0f; t2 < 1.0f; t2 += 0.02f)
            {
               fiddleFloat(t1, t2, elems, f1, f2, mat);
            }
         }
         long b = System.nanoTime();
         ts[i] = b - a;
      }
      Arrays.sort(ts);
      return ts[ts.length / 2];
   }

   public static long benchDouble(double[] d1, double[] d2, double[] mat, int elems, int runs)
   {
      long[] ts = new long[runs];
      for (int i = 0; i < ts.length; i++)
      {
         long a = System.nanoTime();
         for (double t1 = 0.0; t1 < 1.0; t1 += 0.01f)
         {
            for (double t2 = 0.0; t2 < 1.0; t2 += 0.02f)
            {
               fiddleDouble(t1, t2, elems, d1, d2, mat);
            }
         }
         long b = System.nanoTime();
         ts[i] = b - a;
      }
      Arrays.sort(ts);
      return ts[ts.length / 2];
   }

   public static float fiddleFloat(float t1, float t2, int elems, float[] op1, float[] op2, float[] m4x4)
   {
      float sum = 0.0f;
      for (int i = 0; i < elems; i++)
      {
         float f1 = op1[i];
         float f2 = op2[i];
         float diff1 = f2 - f1;
         float f3 = t1 * diff1 + f1;
         float diff2 = f3 - f2;

         sum += t2 * diff2 + f2;
         sum -= m4x4[0] * f1 + m4x4[1] * f2 + m4x4[2] * f3 + m4x4[3];
         sum += m4x4[4] * f1 + m4x4[5] * f2 + m4x4[6] * f3 + m4x4[7];
         sum -= m4x4[8] * f1 + m4x4[9] * f2 + m4x4[10] * f3 + m4x4[11];
      }
      return sum;
   }

   public static double fiddleDouble(double t1, double t2, int elems, double[] op1, double[] op2, double[] m4x4)
   {
      double sum = 0.0;
      for (int i = 0; i < elems; i++)
      {
         double f1 = op1[i];
         double f2 = op2[i];
         double diff1 = f2 - f1;
         double f3 = t1 * diff1 + f1;
         double diff2 = f3 - f2;

         sum += t2 * diff2 + f2;
         sum -= m4x4[0] * f1 + m4x4[1] * f2 + m4x4[2] * f3 + m4x4[3];
         sum += m4x4[4] * f1 + m4x4[5] * f2 + m4x4[6] * f3 + m4x4[7];
         sum -= m4x4[8] * f1 + m4x4[9] * f2 + m4x4[10] * f3 + m4x4[11];
      }
      return sum;
   }


float performance: 51ms (midpoint)
double performance: 52ms (midpoint)
float performance: 75ms (midpoint)
double performance: 52ms (midpoint)
float performance: 50ms (midpoint)
double performance: 51ms (midpoint)
float performance: 50ms (midpoint)
double performance: 52ms (midpoint)
float performance: 50ms (midpoint)
double performance: 50ms (midpoint)
float performance: 50ms (midpoint)
double performance: 51ms (midpoint)
float performance: 50ms (midpoint)
double performance: 52ms (midpoint)
float performance: 50ms (midpoint)
double performance: 72ms (midpoint)