I’d like to say that ‘my’ lookup tables take 16KB of memory, or 4 pages. That means that they are basically always in L2 cache, and not big enough to push other data out. They can even pretty much fully stay within L1 cache, in your inner loop.
math=202,067us full[]=7,586us half[]=12,851us taylor=26,783us
math=201,610us full[]=7,701us half[]=12,878us taylor=26,448us
math=202,474us full[]=7,672us half[]=12,886us taylor=26,438us
math=202,321us full[]=7,592us half[]=12,864us taylor=26,306us
I know it’s micro-benchmark, but it’s rotating vectors, which is a pretty common use case:
import java.util.Random;
public class FastMathBench {
public static void main(String[] args) {
System.out.println(Math.sin(0.5f));
System.out.println(sinFull(0.5f));
System.out.println(sinHalf(0.5f));
System.out.println(sinTaylor(0.5f));
System.out.println();
System.out.println(Math.sin(4.5f));
System.out.println(sinFull(4.5f));
System.out.println(sinHalf(4.5f));
System.out.println(sinTaylor(4.5f));
System.out.println();
float[] angles = new float[1024 * 1024];
float[] vectors = new float[angles.length * 2];
Random r = new Random(235);
for (int i = 0; i < angles.length; i++) {
angles[i] = (float) (r.nextDouble() * Math.PI * 2.0);
vectors[i * 2 + 0] = r.nextFloat();
vectors[i * 2 + 1] = r.nextFloat();
}
for (int i = 0; i < 16; i++) {
long t0 = System.nanoTime();
benchMathSin(vectors, angles);
long t1 = System.nanoTime();
benchLookupFullSin(vectors, angles);
long t2 = System.nanoTime();
benchLookupHalfSin(vectors, angles);
long t3 = System.nanoTime();
benchLookupTaylorSin(vectors, angles);
long t4 = System.nanoTime();
long tMath = (t1 - t0) / 1000;
long tLUT1 = (t2 - t1) / 1000;
long tLUT2 = (t3 - t2) / 1000;
long tTayl = (t4 - t3) / 1000;
System.out.println("math=" + tMath + "us\tfull[]=" + tLUT1 + "us\thalf[]=" + tLUT2 + "us\ttaylor=" + tTayl + "us");
}
}
private static void benchMathSin(float[] vectors, float[] angles) {
for (int i = 0; i < angles.length; i++) {
float x = vectors[i * 2 + 0];
float y = vectors[i * 2 + 1];
float a = angles[i];
float cos = (float) Math.cos(a);
float sin = (float) Math.sin(a);
vectors[i * 2 + 0] = x * cos - y * sin;
vectors[i * 2 + 1] = y * cos + x * sin;
}
}
private static void benchLookupFullSin(float[] vectors, float[] angles) {
for (int i = 0; i < angles.length; i++) {
float x = vectors[i * 2 + 0];
float y = vectors[i * 2 + 1];
float a = angles[i];
float cos = sinFull(a);
float sin = cosFull(a);
vectors[i * 2 + 0] = x * cos - y * sin;
vectors[i * 2 + 1] = y * cos + x * sin;
}
}
private static void benchLookupHalfSin(float[] vectors, float[] angles) {
for (int i = 0; i < angles.length; i++) {
float x = vectors[i * 2 + 0];
float y = vectors[i * 2 + 1];
float a = angles[i];
float cos = cosHalf(a);
float sin = sinHalf(a);
vectors[i * 2 + 0] = x * cos - y * sin;
vectors[i * 2 + 1] = y * cos + x * sin;
}
}
private static void benchLookupTaylorSin(float[] vectors, float[] angles) {
for (int i = 0; i < angles.length; i++) {
float x = vectors[i * 2 + 0];
float y = vectors[i * 2 + 1];
float a = angles[i];
float cos = cosTaylor(a);
float sin = sinTaylor(a);
vectors[i * 2 + 0] = x * cos - y * sin;
vectors[i * 2 + 1] = y * cos + x * sin;
}
}
private static final float RAD, DEG, SIN_TO_COS;
private static final int SIN_BITS, SIN_MASK, SIN_MASK2, SIN_COUNT, SIN_COUNT2;
private static final float radFull, radToIndex;
private static final float degFull, degToIndex;
private static final float[] sinFull, sinHalf;
static {
RAD = (float) Math.PI / 180.0f;
DEG = 180.0f / (float) Math.PI;
SIN_TO_COS = (float) (Math.PI * 0.5f);
SIN_BITS = 12;
SIN_MASK = ~(-1 << SIN_BITS);
SIN_MASK2 = SIN_MASK >> 1;
SIN_COUNT = SIN_MASK + 1;
SIN_COUNT2 = SIN_MASK2 + 1;
radFull = (float) (Math.PI * 2.0);
degFull = (float) (360.0);
radToIndex = SIN_COUNT / radFull;
degToIndex = SIN_COUNT / degFull;
sinFull = new float[SIN_COUNT];
for (int i = 0; i < SIN_COUNT; i++) {
sinFull[i] = (float) Math.sin((i + Math.min(1, i % (SIN_COUNT / 4)) * 0.5) / SIN_COUNT * radFull);
}
sinHalf = new float[SIN_COUNT2];
for (int i = 0; i < SIN_COUNT2; i++) {
sinHalf[i] = (float) Math.sin((i + Math.min(1, i % (SIN_COUNT / 4)) * 0.5) / SIN_COUNT * radFull);
}
}
//
public static final float sinFull(float rad) {
return sinFull[(int) (rad * radToIndex) & SIN_MASK];
}
public static final float cosFull(float rad) {
return sinFull(rad + SIN_TO_COS);
}
//
public static final float sinHalf(float rad) {
int index1 = (int) (rad * radToIndex) & SIN_MASK;
int index2 = index1 & SIN_MASK2;
// int mul = (((index1 - index2) >>> 31) << 1) + 1;
int mul = ((index1 == index2) ? +1 : -1);
return sinHalf[index2] * mul;
}
public static final float cosHalf(float rad) {
return sinHalf(rad + SIN_TO_COS);
}
//
public static final float sinTaylor(float rad) {
// TODO: clamp to -PI..+PI
double x = rad;
double x2 = x * x;
double x3 = x2 * x;
double x5 = x2 * x3;
double x7 = x2 * x5;
double x9 = x2 * x7;
double x11 = x2 * x9;
double x13 = x2 * x11;
double x15 = x2 * x13;
double x17 = x2 * x15;
double val = x;
val -= x3 * 0.16666666666666666666666666666667;
val += x5 * 0.00833333333333333333333333333333;
val -= x7 * 1.984126984126984126984126984127e-4;
val += x9 * 2.7557319223985890652557319223986e-6;
val -= x11 * 2.5052108385441718775052108385442e-8;
val += x13 * 1.6059043836821614599392377170155e-10;
val -= x15 * 7.6471637318198164759011319857881e-13;
val += x17 * 2.8114572543455207631989455830103e-15;
return (float) val;
}
public static final float cosTaylor(float rad) {
return sinTaylor(rad + SIN_TO_COS);
}
}