Extremely Fast atan2

mooman219 · July 25, 2015, 4:38pm

Hello,

I’ve just been looking around for a faster way to do Math.atan2(y,x) and I came across a number of methods. To compare them, I setup a simple JMH benchmark and accuracy test. Criticisms and new atan2 submissions are appreciated.

Recommended implementation with lookup table: Icecore’s
Recommended implementation without lookup table: Kappa’s

Relevant data

     Current results:
     apache_atan2   : Average Error 0.00000 / Largest Error 0.00000 - Performance 367.945 ns/op
     default_atan2  : Average Error 0.00000 / Largest Error 0.00000 - Performance 579.170 ns/op
     DSP.atan2_acc  : Average Error 0.00344 / Largest Error 1.57079 - Performance  64.812 ns/op
     DSP.atan2_fast : Average Error 0.04318 / Largest Error 1.57080 - Performance  51.691 ns/op
     Icecore.atan2  : Average Error 0.00004 / Largest Error 0.00010 - Performance  27.291 ns/op
     Kappa.atan2    : Average Error 0.00231 / Largest Error 0.00488 - Performance  38.766 ns/op
     Riven.atan2    : Average Error 0.00288 / Largest Error 0.00787 - Performance  36.640 ns/op

I posted the raw data below for anyone to interpret for yourselves. The links to the original sources are provided in the comments.

public class ATAN2 {

    // Values declared above to avoid the JIT from inlining
    public static int nTen = -10;
    public static int nFour = -4;
    public static int nTwo = -2;
    public static int nOne = -1;
    public static int pOne = 1;
    public static int pThree = 3;
    public static int pFour = 4;
    public static int pNine = 9;

    ///////////////////////////////////////
    // Default atan2
    ///////////////////////////////////////

    @Benchmark
    public double math_default() {
        return Math.atan2(pThree, pFour) + Math.atan2(nTwo, pOne) + Math.atan2(nFour, nOne) + Math.atan2(pNine, nTen);
    }

    ///////////////////////////////////////
    // Riven's atan2 ( http://www.java-gaming.org/index.php?topic=14647.0 )
    ///////////////////////////////////////

    public static final class Riven {

        private static final int ATAN2_BITS = 7;

        private static final int ATAN2_BITS2 = ATAN2_BITS << 1;
        private static final int ATAN2_MASK = ~(-1 << ATAN2_BITS2);
        private static final int ATAN2_COUNT = ATAN2_MASK + 1;
        private static final int ATAN2_DIM = (int) Math.sqrt(ATAN2_COUNT);

        private static final float INV_ATAN2_DIM_MINUS_1 = 1.0f / (ATAN2_DIM - 1);

        private static final float[] atan2 = new float[ATAN2_COUNT];

        static {
            for (int i = 0; i < ATAN2_DIM; i++) {
                for (int j = 0; j < ATAN2_DIM; j++) {
                    float x0 = (float) i / ATAN2_DIM;
                    float y0 = (float) j / ATAN2_DIM;
                    atan2[j * ATAN2_DIM + i] = (float) Math.atan2(y0, x0);
                }
            }
        }

        public static final float atan2(float y, float x) {
            float add, mul;

            if (x < 0.0f) {
                if (y < 0.0f) {
                    x = -x;
                    y = -y;

                    mul = 1.0f;
                } else {
                    x = -x;
                    mul = -1.0f;
                }
                add = -3.141592653f;
            } else {
                if (y < 0.0f) {
                    y = -y;
                    mul = -1.0f;
                } else {
                    mul = 1.0f;
                }
                add = 0.0f;
            }

            float invDiv = 1.0f / (((x < y) ? y : x) * INV_ATAN2_DIM_MINUS_1);

            int xi = (int) (x * invDiv);
            int yi = (int) (y * invDiv);

            return (atan2[yi * ATAN2_DIM + xi] + add) * mul;
        }
    }


    @Benchmark
    public double math_riven() {
        return Riven.atan2(pThree, pFour) + Riven.atan2(nTwo, pOne) + Riven.atan2(nFour, nOne) + Riven.atan2(pNine, nTen);
    }

    ///////////////////////////////////////
    // DSP's atan2 ( http://dspguru.com/dsp/tricks/fixed-point-atan2-with-self-normalization )
    ///////////////////////////////////////

    public static final class DSP {

        private static final double coeff_1 = Math.PI / 4;
        private static final double coeff_2 = coeff_1 * 3;

        public static final double atan2_fast(double y, double x) {
            double r;
            if (y < 0) {
                y = -y;
                if (x > 0) {
                    r = (x - y) / (x + y);
                    return -(coeff_1 - coeff_1 * r);
                } else {
                    r = (x + y) / (y - x);
                    return -(coeff_2 - coeff_1 * r);
                }
            } else {
                if (y == 0) {
                    y = 1.0E-25;
                }
                if (x > 0) {
                    r = (x - y) / (x + y);
                    return coeff_1 - coeff_1 * r;
                } else {
                    r = (x + y) / (y - x);
                    return coeff_2 - coeff_1 * r;
                }
            }
        }

        public static final double atan2_accurate(double y, double x) {
            double r;
            if (y < 0) {
                y = -y;
                if (x > 0) {
                    r = (x - y) / (x + y);
                    return -(0.1963 * r * r * r - 0.9817 * r + coeff_1);
                } else {
                    r = (x + y) / (y - x);
                    return -(0.1963 * r * r * r - 0.9817 * r + coeff_2);
                }
            } else {
                if (y == 0) {
                    y = 1.0E-25;
                }
                if (x > 0) {
                    r = (x - y) / (x + y);
                    return 0.1963 * r * r * r - 0.9817 * r + coeff_1;
                } else {
                    r = (x + y) / (y - x);
                    return 0.1963 * r * r * r - 0.9817 * r + coeff_2;
                }
            }
        }
    }

    @Benchmark
    public double math_dsp_fast() {
        return DSP.atan2_fast(pThree, pFour) + DSP.atan2_fast(nTwo, pOne) + DSP.atan2_fast(nFour, nOne) + DSP.atan2_fast(pNine, nTen);
    }

    @Benchmark
    public double math_dsp_accurate() {
        return DSP.atan2_accurate(pThree, pFour) + DSP.atan2_accurate(nTwo, pOne) + DSP.atan2_accurate(nFour, nOne) + DSP.atan2_accurate(pNine, nTen);
    }

    ///////////////////////////////////////
    // kappa's atan2 ( http://www.java-gaming.org/topics/extremely-fast-atan2/36467/msg/346112/view.html#msg346112 )
    ///////////////////////////////////////

    public static final class Kappa {

        static final float PI = 3.1415927f;
        static final float PI_2 = PI / 2f;
        static final float MINUS_PI_2 = -PI_2;

        public static final float atan2(float y, float x) {
            if (x == 0.0f) {
                if (y > 0.0f) {
                    return PI_2;
                }
                if (y == 0.0f) {
                    return 0.0f;
                }
                return MINUS_PI_2;
            }

            final float atan;
            final float z = y / x;
            if (Math.abs(z) < 1.0f) {
                atan = z / (1.0f + 0.28f * z * z);
                if (x < 0.0f) {
                    return (y < 0.0f) ? atan - PI : atan + PI;
                }
                return atan;
            } else {
                atan = PI_2 - z / (z * z + 0.28f);
                return (y < 0.0f) ? atan - PI : atan;
            }
        }
    }

    @Benchmark
    public double math_kappa() {
        return Kappa.atan2(pThree, pFour) + Kappa.atan2(nTwo, pOne) + Kappa.atan2(nFour, nOne) + Kappa.atan2(pNine, nTen);
    }

    ///////////////////////////////////////
    // Icecore's atan2 ( http://www.java-gaming.org/topics/extremely-fast-atan2/36467/msg/346145/view.html#msg346145 )
    ///////////////////////////////////////

    public static final class Icecore {

        private static final int Size_Ac = 100000;
        private static final int Size_Ar = Size_Ac + 1;
        private static final float Pi = (float) Math.PI;
        private static final float Pi_H = Pi / 2;

        private static final float Atan2[] = new float[Size_Ar];
        private static final float Atan2_PM[] = new float[Size_Ar];
        private static final float Atan2_MP[] = new float[Size_Ar];
        private static final float Atan2_MM[] = new float[Size_Ar];

        private static final float Atan2_R[] = new float[Size_Ar];
        private static final float Atan2_RPM[] = new float[Size_Ar];
        private static final float Atan2_RMP[] = new float[Size_Ar];
        private static final float Atan2_RMM[] = new float[Size_Ar];

        static {
            for (int i = 0; i <= Size_Ac; i++) {
                double d = (double) i / Size_Ac;
                double x = 1;
                double y = x * d;
                float v = (float) Math.atan2(y, x);
                Atan2[i] = v;
                Atan2_PM[i] = Pi - v;
                Atan2_MP[i] = -v;
                Atan2_MM[i] = -Pi + v;

                Atan2_R[i] = Pi_H - v;
                Atan2_RPM[i] = Pi_H + v;
                Atan2_RMP[i] = -Pi_H + v;
                Atan2_RMM[i] = -Pi_H - v;
            }
        }

        public static final float atan2(float y, float x) {
            if (y < 0) {
                if (x < 0) {
                    //(y < x) because == (-y > -x)
                    if (y < x) {
                        return Atan2_RMM[(int) (x / y * Size_Ac)];
                    } else {
                        return Atan2_MM[(int) (y / x * Size_Ac)];
                    }
                } else {
                    y = -y;
                    if (y > x) {
                        return Atan2_RMP[(int) (x / y * Size_Ac)];
                    } else {
                        return Atan2_MP[(int) (y / x * Size_Ac)];
                    }
                }
            } else {
                if (x < 0) {
                    x = -x;
                    if (y > x) {
                        return Atan2_RPM[(int) (x / y * Size_Ac)];
                    } else {
                        return Atan2_PM[(int) (y / x * Size_Ac)];
                    }
                } else {
                    if (y > x) {
                        return Atan2_R[(int) (x / y * Size_Ac)];
                    } else {
                        return Atan2[(int) (y / x * Size_Ac)];
                    }
                }
            }
        }
    }

    @Benchmark
    public double math_icecore() {
        return Icecore.atan2(pThree, pFour) + Icecore.atan2(nTwo, pOne) + Icecore.atan2(nFour, nOne) + Icecore.atan2(pNine, nTen);
    }

    ///////////////////////////////////////
    // Apache's FastMath.atan2 ( http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/util/FastMath.html )
    ///////////////////////////////////////

    @Benchmark
    public double math_apache() {
        return FastMath.atan2(pThree, pFour) + FastMath.atan2(nTwo, pOne) + FastMath.atan2(nFour, nOne) + FastMath.atan2(pNine, nTen);
    }
}

Accuracy

    ///////////////////////////////////////
    // Accuracy
    ///////////////////////////////////////

    public static void main(String[] args) {
        int range = 1500;
        double[] total = new double[7];
        double[] largestError = new double[7];
        int count = 0;
        for (int y = -range; y <= range; y++) {
            for (int x = -range; x <= range; x++) {
                double result;

                result = Math.abs(Math.atan2(y, x) - Math.atan2(y, x));
                total[0] += result;
                largestError[0] = result > largestError[0] ? result : largestError[0];

                result = Math.abs(Math.atan2(y, x) - DSP.atan2_accurate(y, x));
                total[1] += result;
                largestError[1] = result > largestError[1] ? result : largestError[1];

                result = Math.abs(Math.atan2(y, x) - DSP.atan2_fast(y, x));
                total[2] += result;
                largestError[2] = result > largestError[2] ? result : largestError[2];

                result = Math.abs(Math.atan2(y, x) - Icecore.atan2(y, x));
                total[3] += result;
                largestError[3] = result > largestError[3] ? result : largestError[3];

                result = Math.abs(Math.atan2(y, x) - Kappa.atan2(y, x));
                total[4] += result;
                largestError[4] = result > largestError[4] ? result : largestError[4];

                result = Math.abs(Math.atan2(y, x) - Riven.atan2(y, x));
                total[5] += result;
                largestError[5] = result > largestError[5] ? result : largestError[5];


                result = Math.abs(Math.atan2(y, x) - FastMath.atan2(y, x));
                total[6] += result;
                largestError[6] = result > largestError[6] ? result : largestError[6];

                count++;
            }
        }
        System.out.println(String.format("A lower average means higher accuracy. Results over %,d samples.", count));
        System.out.println(String.format("apache_atan2   : Average Error %.5f / Largest Error %.5f", total[6] / count, largestError[6]));
        System.out.println(String.format("default_atan2  : Average Error %.5f / Largest Error %.5f", total[0] / count, largestError[0]));
        System.out.println(String.format("DSP.atan2_acc  : Average Error %.5f / Largest Error %.5f", total[1] / count, largestError[1]));
        System.out.println(String.format("DSP.atan2_fast : Average Error %.5f / Largest Error %.5f", total[2] / count, largestError[2]));
        System.out.println(String.format("Icecore.atan2  : Average Error %.5f / Largest Error %.5f", total[3] / count, largestError[3]));
        System.out.println(String.format("Kappa.atan2    : Average Error %.5f / Largest Error %.5f", total[4] / count, largestError[4]));
        System.out.println(String.format("Riven.atan2    : Average Error %.5f / Largest Error %.5f", total[5] / count, largestError[5]));
        /*
         A lower average means higher accuracy. Results over 9,006,001 samples.
         apache_atan2   : Average Error 0.00000 / Largest Error 0.00000
         default_atan2  : Average Error 0.00000 / Largest Error 0.00000
         DSP.atan2_acc  : Average Error 0.00344 / Largest Error 1.57079
         DSP.atan2_fast : Average Error 0.04318 / Largest Error 1.57080
         Icecore.atan2  : Average Error 0.00000 / Largest Error 0.00001
         Kappa.atan2    : Average Error 0.00231 / Largest Error 0.00488
         Riven.atan2    : Average Error 0.00288 / Largest Error 0.00787
         */
    }

Benchmark results

# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_apache

# Run progress: 0.00% complete, ETA 00:01:10
# Fork: 1 of 1
# Warmup Iteration   1: 374.275 ns/op
# Warmup Iteration   2: 372.820 ns/op
# Warmup Iteration   3: 369.142 ns/op
# Warmup Iteration   4: 368.814 ns/op
# Warmup Iteration   5: 367.294 ns/op
Iteration   1: 366.987 ns/op
Iteration   2: 371.812 ns/op
Iteration   3: 366.907 ns/op
Iteration   4: 366.877 ns/op
Iteration   5: 367.142 ns/op


Result "math_apache":
  367.945 ñ(99.9%) 8.334 ns/op [Average]
  (min, avg, max) = (366.877, 367.945, 371.812), stdev = 2.164
  CI (99.9%): [359.611, 376.279] (assumes normal distribution)


# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_default

# Run progress: 14.29% complete, ETA 00:01:02
# Fork: 1 of 1
# Warmup Iteration   1: 590.515 ns/op
# Warmup Iteration   2: 585.678 ns/op
# Warmup Iteration   3: 578.451 ns/op
# Warmup Iteration   4: 577.610 ns/op
# Warmup Iteration   5: 578.103 ns/op
Iteration   1: 577.544 ns/op
Iteration   2: 585.795 ns/op
Iteration   3: 577.450 ns/op
Iteration   4: 577.399 ns/op
Iteration   5: 577.663 ns/op


Result "math_default":
  579.170 ñ(99.9%) 14.266 ns/op [Average]
  (min, avg, max) = (577.399, 579.170, 585.795), stdev = 3.705
  CI (99.9%): [564.904, 593.437] (assumes normal distribution)


# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_dsp_accurate

# Run progress: 28.57% complete, ETA 00:00:51
# Fork: 1 of 1
# Warmup Iteration   1: 66.682 ns/op
# Warmup Iteration   2: 65.408 ns/op
# Warmup Iteration   3: 64.661 ns/op
# Warmup Iteration   4: 64.637 ns/op
# Warmup Iteration   5: 64.707 ns/op
Iteration   1: 64.645 ns/op
Iteration   2: 65.505 ns/op
Iteration   3: 64.629 ns/op
Iteration   4: 64.631 ns/op
Iteration   5: 64.652 ns/op


Result "math_dsp_accurate":
  64.812 ñ(99.9%) 1.492 ns/op [Average]
  (min, avg, max) = (64.629, 64.812, 65.505), stdev = 0.387
  CI (99.9%): [63.320, 66.304] (assumes normal distribution)


# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_dsp_fast

# Run progress: 42.86% complete, ETA 00:00:41
# Fork: 1 of 1
# Warmup Iteration   1: 52.145 ns/op
# Warmup Iteration   2: 51.808 ns/op
# Warmup Iteration   3: 51.245 ns/op
# Warmup Iteration   4: 51.228 ns/op
# Warmup Iteration   5: 51.298 ns/op
Iteration   1: 51.269 ns/op
Iteration   2: 52.402 ns/op
Iteration   3: 51.282 ns/op
Iteration   4: 51.425 ns/op
Iteration   5: 52.078 ns/op


Result "math_dsp_fast":
  51.691 ñ(99.9%) 1.993 ns/op [Average]
  (min, avg, max) = (51.269, 51.691, 52.402), stdev = 0.518
  CI (99.9%): [49.698, 53.684] (assumes normal distribution)


# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_icecore

# Run progress: 57.14% complete, ETA 00:00:31
# Fork: 1 of 1
# Warmup Iteration   1: 44.693 ns/op
# Warmup Iteration   2: 28.011 ns/op
# Warmup Iteration   3: 27.220 ns/op
# Warmup Iteration   4: 27.225 ns/op
# Warmup Iteration   5: 27.249 ns/op
Iteration   1: 27.204 ns/op
Iteration   2: 27.599 ns/op
Iteration   3: 27.220 ns/op
Iteration   4: 27.212 ns/op
Iteration   5: 27.221 ns/op


Result "math_icecore":
  27.291 ñ(99.9%) 0.663 ns/op [Average]
  (min, avg, max) = (27.204, 27.291, 27.599), stdev = 0.172
  CI (99.9%): [26.628, 27.955] (assumes normal distribution)


# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_kappa

# Run progress: 71.43% complete, ETA 00:00:20
# Fork: 1 of 1
# Warmup Iteration   1: 40.813 ns/op
# Warmup Iteration   2: 40.480 ns/op
# Warmup Iteration   3: 38.620 ns/op
# Warmup Iteration   4: 38.945 ns/op
# Warmup Iteration   5: 38.653 ns/op
Iteration   1: 38.622 ns/op
Iteration   2: 39.381 ns/op
Iteration   3: 38.611 ns/op
Iteration   4: 38.615 ns/op
Iteration   5: 38.600 ns/op


Result "math_kappa":
  38.766 ñ(99.9%) 1.325 ns/op [Average]
  (min, avg, max) = (38.600, 38.766, 39.381), stdev = 0.344
  CI (99.9%): [37.441, 40.090] (assumes normal distribution)


# JMH 1.9.3 (released 73 days ago)
# VM invoker: C:\Program Files\Java\jdk1.8.0_45\jre\bin\java.exe
# VM options: <none>
# Warmup: 5 iterations, 1 s each
# Measurement: 5 iterations, 1 s each
# Timeout: 10 min per iteration
# Threads: 1 thread, will synchronize iterations
# Benchmark mode: Average time, time/op
# Benchmark: com.gmail.mooman219.benchmark_test.ATAN2.math_riven

# Run progress: 85.71% complete, ETA 00:00:10
# Fork: 1 of 1
# Warmup Iteration   1: 45.780 ns/op
# Warmup Iteration   2: 40.609 ns/op
# Warmup Iteration   3: 36.467 ns/op
# Warmup Iteration   4: 36.467 ns/op
# Warmup Iteration   5: 36.502 ns/op
Iteration   1: 36.458 ns/op
Iteration   2: 37.000 ns/op
Iteration   3: 36.473 ns/op
Iteration   4: 36.435 ns/op
Iteration   5: 36.836 ns/op


Result "math_riven":
  36.640 ñ(99.9%) 1.002 ns/op [Average]
  (min, avg, max) = (36.435, 36.640, 37.000), stdev = 0.260
  CI (99.9%): [35.638, 37.643] (assumes normal distribution)


# Run complete. Total time: 00:01:12

Benchmark                Mode  Cnt    Score    Error  Units
ATAN2.math_apache        avgt    5  367.945 ñ  8.334  ns/op
ATAN2.math_default       avgt    5  579.170 ñ 14.266  ns/op
ATAN2.math_dsp_accurate  avgt    5   64.812 ñ  1.492  ns/op
ATAN2.math_dsp_fast      avgt    5   51.691 ñ  1.993  ns/op
ATAN2.math_icecore       avgt    5   27.291 ñ  0.663  ns/op
ATAN2.math_kappa         avgt    5   38.766 ñ  1.325  ns/op
ATAN2.math_riven         avgt    5   36.640 ñ  1.002  ns/op

Keep in mind these are micro benchmarks and not indicative of real world performance; the JMH can only do so much to make sure the JIT doesn’t interfere. There may be a fair amount of branch prediction and caching that’s going on. For example, the lookup table in Riven’s atan2 implementation might just be sitting in a cpu cache.

Riven · July 25, 2015, 7:22pm

Nice comparison. I doubt the accuracy of dsp_atan2 is enough for game purposes, but it’s indeed almost 80% faster than my LUT.

mooman219 · July 25, 2015, 7:47pm

Yeah, that’s pretty unreasonable error, so I have updated the original post with a more accurate version. The faster version was within ~0.07 radians. The new version I just added is within ~0.01 radians which should be acceptable in most cases. The benchmark/error/results have been updated to reflect this new addition. Given it’s just a couple of more multiplications, there doesn’t seem to be a performance difference between the accurate and faster version when using JMH. I suspect the minor difference would only matter on embedded devices meant for digital signal processing (What the original C implementation was targeting).

Riven · July 25, 2015, 7:50pm

Nice one. :point:

KaiHH · July 25, 2015, 8:26pm

Really nice find! Because I really wanted to know how it performs in a jitted environment with dynamic input data, I too conducted a test between the dsp accurate version and Java’s default Math.atan2.
The result was, that the dsp accurate version is about 2.8 to 3.2 times faster than Java’s version.
I gave the JVM 100,000,000 warmup iterations and performed 100,000,000 computations separately on both versions with random input values.

mooman219 · July 25, 2015, 8:31pm

That sounds great KaiHH! Do you think you could post a comparison with Riven’s in there too? That one is also quite fast too and I would like to see how dsp’s version holds up.

KaiHH · July 25, 2015, 8:40pm

Sure.

Riven’s is 1.7 times faster than Java’s version, but the accurate dsp version is 0.55 times faster than Riven’s.
I bumped up the test iterations to 600,000,000 invocations and now the accurate dsp version is consistently 3.2 times faster than Java’s.
I tested under Win7 x64 with jre1.8.0_51.

kappa · July 25, 2015, 8:40pm

There was also another version of a fast atan floating around on the forums a while back, not sure how it’ll compare to the above but might be worth giving it a spin in one of the benchmarks


public class Atan {
	
	static final float PI = 3.1415927f;
	static final float PI_2 = PI / 2f;
	static final float MINUS_PI_2 = -PI_2;
	
	public static final float atan2(float y, float x) {
		if (x == 0.0f) {
			if (y > 0.0f)
				return PI_2;
			if (y == 0.0f)
				return 0.0f;
			return MINUS_PI_2;
		}

		final float atan;
		final float z = y / x;
		if (Math.abs(z) < 1.0f) {
			atan = z / (1.0f + 0.28f * z * z);
			if (x < 0.0f)
				return (y < 0.0f) ? atan - PI : atan + PI;
			return atan;
		} else {
			atan = PI_2 - z / (z * z + 0.28f);
			return (y < 0.0f) ? atan - PI : atan;
		}
	}

}

mooman219 · July 25, 2015, 8:42pm

I’ll add that into the JMH and update the results. Should be a couple of minutes to run.
Edit: Kappa’s version was added to the benchmark.

KaiHH · July 25, 2015, 8:47pm

The version kappa posted is exactly as fast as the accurate dsp version. In 600 million invocations the dsp accurate version is in total only 4 milliseconds faster…

mooman219 · July 25, 2015, 8:54pm

This post is turning into a repository for atan2 implementations haha.
Kappa’s version was added and it’s very fast as well, testing only about 10% slower than the dsp version, which at ~3ns is pretty negligible.

KaiHH · July 25, 2015, 8:59pm

That’s great! Now please another one for fast sin() and cos() replacements within [-pi, +pi).
Rumor has it that the GNU Scientific Library is a treasure trove of nice and fast math functions.

kappa · July 25, 2015, 9:02pm

Heres a fast sin and cos from around the same time


public class FastCosSin {
	
	static final float PI = 3.1415927f;
	static final float MINUS_PI = -PI;
	static final float DOUBLE_PI = PI * 2f;
	static final float PI_2 = PI / 2f;
	
	static final float CONST_1 = 4f / PI;
	static final float CONST_2 = 4f / (PI * PI);

	public static final float sin(float x) {
		if (x < MINUS_PI)
			x += DOUBLE_PI;
		else if (x > PI)
			x -= DOUBLE_PI;

		return (x < 0f) ? (CONST_1 * x + CONST_2 * x * x)
				: (CONST_1 * x - CONST_2 * x * x);
	}

	public static final float cos(float x) {
		if (x < MINUS_PI)
			x += DOUBLE_PI;
		else if (x > PI)
			x -= DOUBLE_PI;

		x += PI_2;

		if (x > PI)
			x -= DOUBLE_PI;

		return (x < 0f) ? (CONST_1 * x + CONST_2 * x * x)
				: (CONST_1 * x - CONST_2 * x * x);
	}
}

KaiHH · July 25, 2015, 9:34pm

Okay, this one is quite interesting. It seems to be the same as: http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
I tested that and it is exactly as fast as the Math.sin() while also being more imprecise.
So, nowadays there really seems to be no need anymore for a “hyperfast” sine/cosine function, because Java’s (i.e. the processor’s) version is really really fast and accurate and nothing can beat that anymore it seems (except for SIMD of course to compute 4 sines at once ).

theagentd · July 25, 2015, 9:39pm

I had some big sine performance problems for the grass wind simulation in WSW. Ended up using a look-up table without any interpolation as my precision requirements were extremely low and it was faster by a magnitude. I find i hard to believe an approximated sin() is slower than Math.sin(). I can test it in WSW when I get back home tomorrow.

KaiHH · July 25, 2015, 9:45pm

I find that hard to believe, too, but those were the facts on my machine at least.
Both the approximation and Math.sin() were only off by a total of 40 milliseconds in 600,000,000 invocations with proper JIT warm up.

Here is my test code:


    private static float sine(float x) {
        float PI = 3.1415927f;
        float B = 4/PI;
        float C = -4/(PI*PI);
        float y = B * x + C * x * Math.abs(x);
        float P = 0.225f;
        y = P * (y * Math.abs(y) - y) + y;
        return y;
    }

    public static void main(String[] args) {
        float res = 0;
        int warm = 10_000_000;
        int inv = 600_000_000;
        Random rnd = new Random();
        for (int i = 0; i < warm; i++) {
            float y = Math.max(-PI, Math.min(rnd.nextFloat() - 0.5f, PI));
            res = (float) Math.sin(y);
            if (i == 0) {
                System.err.println(res);
            }
            res = sine(y);
            if (i == 0) {
                System.err.println(res);
            }
        }
        long time1 = System.nanoTime();
        for (int i = 0; i < inv; i++) {
            Math.sin(rnd.nextFloat() - 0.5f);
        }
        long time2 = System.nanoTime();
        System.err.println("Took: " + (time2 - time1) / 1E3f + " µs.");
        time1 = System.nanoTime();
        for (int i = 0; i < inv; i++) {
            sine(rnd.nextFloat() - 0.5f);
        }
        time2 = System.nanoTime();
        System.err.println("Took: " + (time2 - time1) / 1E3f + " µs.");
    }

On my machine that prints:


0.46587604
0.46571487
Took: 6771749.0 µs.
Took: 6731231.0 µs.

EDIT:
Hmm… it seems to depend heavily on the input that you give to both functions. If I just were to use ‘i’ (the loop counter) then Math.sin() is dramatically slow, making the approximation about 100 times faster, but of course also totally inaccurate, since the input range is outside of the valid values [-pi, +pi). Seems that the processor’s sine function normalizes the values first.

theagentd · July 25, 2015, 9:53pm

Random.nextFloat() is dominating that benchmark.

KaiHH · July 25, 2015, 9:58pm

You are right. I wanted to make sure with this that absolutely no inlining and any optimistic optimization is happening.
It is quite hard to do a “real” benchmark to trick the jit to not do those things.
Now, I think I’ve found a way by computing a single random value first, feed that to both functions in both loops, BUT ALSO increment an accumulator float variable with the results of the sine functions.
Now I get for 600,000,000 invocations:
Took: 1558867.5 µs.
Took: 520580.34 µs.
So, the approximation being around 2 times faster than Java’s version.

EDIT: My benchmark:


    public static void main(String[] args) {
        int warm = 10_000_000;
        int inv = 600_000_000;
        Random rnd = new Random();
        float res = 0;
        for (int i = 0; i < warm; i++) {
            float y = Math.max(-PI, Math.min(rnd.nextFloat() - 0.5f, PI));
            res = (float) Math.sin(y);
            res = sine(y);
        }
        float v = rnd.nextFloat();
        res = Float.POSITIVE_INFINITY;
        long time1 = System.nanoTime();
        for (int i = 0; i < inv; i++) {
            res += Math.sin(v);
        }
        long time2 = System.nanoTime();
        System.err.println("Took: " + (time2 - time1) / 1E3f + " µs.");
        time1 = System.nanoTime();
        for (int i = 0; i < inv; i++) {
            res += sine(v);
        }
        time2 = System.nanoTime();
        System.err.println("Took: " + (time2 - time1) / 1E3f + " µs.");
    }

Icecore · July 25, 2015, 10:00pm

I want also try
“Ifs” can be optimize for better performance. ^^

UP:add atan2_Op

(May have bugs because write in 30 minutes, but test don’t find them =) )


	static int Size_Ac = 10000;
	static double Atan2[] = new double[Size_Ac + 1];
	static double Pi = Math.PI;
	static double Pi_H = Pi / 2;
	static{
		for(int i = 0; i <= Size_Ac; i++){
			double d = (double)i / Size_Ac;
			double x = 1;
			double y = x * d;
			Atan2[i] = Math.atan2(y, x);
		}
	}
	
	static final public double atan2(double y, double x){
		double res;
		boolean ym = false;
		boolean xm = false;
		if(y < 0){
			ym = true;
			y = -y;
		}
		if(x < 0){
			xm = true;
			x = -x;
		}

		if(y > x){
			double d = x / y;
			int p = (int)(d * Size_Ac);
			res = Atan2[p];
			res = Pi_H - res;
		}
		else{
			double d = y / x;
			int p = (int)(d * Size_Ac);
			res = Atan2[p];
		}
		
		if(xm){
			res = Pi - res;
		}
		
		if(ym){
			res = -res;
		}
		return res;
	}
	
	static final public double atan2_Op(double y, double x){
		double res;
		if(y < 0){
			y = -y;
			if(x < 0){
				x = -x;
				if(y > x){
					double d = x / y;
					int p = (int)(d * Size_Ac);
					res = -Pi_H - Atan2[p];
				}
				else{
			        double d = y / x;
			        int p = (int)(d * Size_Ac);
			        res = -Pi + Atan2[p];
				}
			}
			else{
				if(y > x){
					double d = x / y;
					int p = (int)(d * Size_Ac);
					res = -Pi_H + Atan2[p];
				}
				else{
			        double d = y / x;
			        int p = (int)(d * Size_Ac);
			        res = -Atan2[p];
				}
			}
		}
		else{
			if(x < 0){
				x = -x;
				if(y > x){
					double d = x / y;
					int p = (int)(d * Size_Ac);
					res = Pi_H + Atan2[p];
				}
				else{
			        double d = y / x;
			        int p = (int)(d * Size_Ac);
			        res = Pi - Atan2[p];
				}
			}
			else{
				if(y > x){
					double d = x / y;
					int p = (int)(d * Size_Ac);
					res = Pi_H - Atan2[p];
				}
				else{
			        double d = y / x;
			        int p = (int)(d * Size_Ac);
			        res = Atan2[p];
				}
			}
		}
		return res;
	}
	static public void main(String[] args){
		double ep = 1d / Size_Ac;
		for(int x = -100; x <= 100; x++){
			for(int y = -100; y <= 100; y++){
				double a = Math.atan2(x, y);
				double b = atan2(x, y);
				double dif = a - b;
				if(Math.abs(dif) > ep){
					System.out.println(x + "Dif " + y);
					System.out.println(a);
					System.out.println(b);
				}
			}
		}

		System.out.println("End");
		System.out.println(Math.atan2(2222, -45454));
		System.out.println(atan2(2222, -45454));
	}

mooman219 · July 25, 2015, 10:13pm

I’ll put together a benchmark for sin/cos real quick with the two versions you guys posted and then break out into a new topic for sin/cos specifically probably.
As with the original post, I’ll post the benchmark code, the accuracy through some random tests, and the performance report.

Edit: Removed the benchmark code because it would be troublesome to keep it updated in two places. I moved it to http://www.java-gaming.org/topics/extremely-fast-sine-cosine/36469/view.html