Using a lookup table it is possible to vastly outperform Math.hypot(x,y), as long as the input values are integers and they are in a limited range (in this case: -512…+511). Within this range, we observe a factor 40 speedup and an accuracy of 100%, while outside of this range, it’s still a respectable factor 4, with a worst case accuracy of 98%.
This code is used to speedup A* cost heuristic.
public class FastMath
{
private static final int HYPOT_LUT_SHIFT = 9; // 2^9=512
private static final int HYPOT_LUT_MAX_RANGE = 1 << HYPOT_LUT_SHIFT;
private static final float[] HYPOT_LUT;
private static final int NARROW_DOWN_MUL = 9;
private static final int NARROW_DOWN_DIV = NARROW_DOWN_MUL + 1;
private static final float NARROW_DOWN_FACTOR = (float) NARROW_DOWN_DIV / NARROW_DOWN_MUL;
static {
class Helper {
public float[] compute() {
float[] arr = new float[HYPOT_LUT_MAX_RANGE * HYPOT_LUT_MAX_RANGE];
EasyMath.init(arr);
return arr;
}
}
long t0 = System.nanoTime();
HYPOT_LUT = new Helper().compute();
long t1 = System.nanoTime();
System.out.println("EasyMath.LUT calculation took " + ((t1 - t0) / 1000000) + "ms");
//verify();
//bench();
}
public static float hypot(int dx, int dy) {
int ix = Math.abs(dx);
int iy = Math.abs(dy);
float mul = 1.0f;
while (((ix | iy) >> HYPOT_LUT_SHIFT) > 0) {
// narrow down, reasonably accurately
ix = ix * NARROW_DOWN_MUL / NARROW_DOWN_DIV;
iy = iy * NARROW_DOWN_MUL / NARROW_DOWN_DIV;
mul *= NARROW_DOWN_FACTOR;
}
return HYPOT_LUT[(iy << HYPOT_LUT_SHIFT) | ix] * mul;
}
private static void init(final float[] lut) {
if (false) {
WorkerPool.distibute((Object[]) null, HYPOT_LUT_MAX_RANGE, WorkerPool.THREAD_COUNT, new WorkerPool.TaskArrayOperator<Object>() {
@Override
public void operate(int taskid, Object[] data, int off, int end) {
for (int dx = off; dx < end; dx++) {
for (int dy = 0; dy < HYPOT_LUT_MAX_RANGE; dy++) {
lut[(dy << HYPOT_LUT_SHIFT) | dx] = (float) Math.hypot(dx, dy);
}
}
}
});
} else {
for (int dx = 0; dx < HYPOT_LUT_MAX_RANGE; dx++) {
for (int dy = 0; dy < HYPOT_LUT_MAX_RANGE; dy++) {
lut[(dy << HYPOT_LUT_SHIFT) | dx] = (float) Math.hypot(dx, dy);
}
}
}
}
private static void verify() {
final int ext = 3;
for (int dx = 0; dx < HYPOT_LUT_MAX_RANGE; dx++) {
for (int dy = 0; dy < HYPOT_LUT_MAX_RANGE; dy++) {
float d1 = hypot(dx, dy);
float d2 = (float) Math.hypot(dx, dy);
if (d1 != d2) {
throw new IllegalStateException(dx + "," + dy + " --> " + d1 + ", " + d2);
}
}
}
for (int dx = -HYPOT_LUT_MAX_RANGE + 1; dx < HYPOT_LUT_MAX_RANGE; dx++) {
for (int dy = -HYPOT_LUT_MAX_RANGE + 1; dy < HYPOT_LUT_MAX_RANGE; dy++) {
float d1 = hypot(dx, dy);
float d2 = (float) Math.hypot(dx, dy);
if (d1 != d2) {
throw new IllegalStateException(dx + "," + dy + " --> " + d1 + ", " + d2);
}
}
}
float maxError = 0.98f;
for (int dx = -HYPOT_LUT_MAX_RANGE * ext; dx < HYPOT_LUT_MAX_RANGE * ext; dx++) {
for (int dy = -HYPOT_LUT_MAX_RANGE * ext; dy < HYPOT_LUT_MAX_RANGE * ext; dy++) {
float d1 = hypot(dx, dy);
float d2 = (float) Math.hypot(dx, dy);
if (d1 / d2 < maxError || d2 / d1 < maxError) {
System.err.println(dx + "," + dy + " --> " + d1 + ", " + d2 + " (" + d1 / d2 + ")");
}
}
System.out.println("dx=" + dx);
}
}
private static void bench() {
final int ext = 3;
float sum0 = 0.0f;
float sum1 = 0.0f;
float sum2 = 0.0f;
int ops0 = 0;
int ops1 = 0;
int ops2 = 0;
System.out.println("-");
long t0 = System.nanoTime();
for (int dx = -HYPOT_LUT_MAX_RANGE * ext; dx < HYPOT_LUT_MAX_RANGE * ext; dx++) {
for (int dy = -HYPOT_LUT_MAX_RANGE * ext; dy < HYPOT_LUT_MAX_RANGE * ext; dy++) {
sum0 += hypot(dx, dy);
ops0 += 1;
}
}
System.out.println("a");
long t1 = System.nanoTime();
for (int dx = -HYPOT_LUT_MAX_RANGE + 1; dx < HYPOT_LUT_MAX_RANGE; dx++) {
for (int dy = -HYPOT_LUT_MAX_RANGE + 1; dy < HYPOT_LUT_MAX_RANGE; dy++) {
sum1 += hypot(dx, dy);
ops1 += 1;
}
}
System.out.println("b");
long t2 = System.nanoTime();
for (int dx = -HYPOT_LUT_MAX_RANGE + 1; dx < HYPOT_LUT_MAX_RANGE; dx++) {
for (int dy = -HYPOT_LUT_MAX_RANGE + 1; dy < HYPOT_LUT_MAX_RANGE; dy++) {
sum2 += (float) Math.hypot(dx, dy);
ops2 += 1;
}
}
System.out.println("c");
long t3 = System.nanoTime();
System.out.println("EasyMath.LUT out of range: " + (int) (ops0 / ((t1 - t0) / 1000000.0)) + "K ops/sec [" + sum0 + "]");
System.out.println("EasyMath.LUT in range: " + (int) (ops1 / ((t2 - t1) / 1000000.0)) + "K ops/sec [" + sum1 + "]");
System.out.println("EasyMath.LUT native: " + (int) (ops2 / ((t3 - t2) / 1000000.0)) + "K ops/sec [" + sum2 + "]");
}
}
EasyMath.LUT calculation took 65ms
EasyMath.LUT out of range: 4861K ops/sec [1.10062418E10]
EasyMath.LUT in range: 57928K ops/sec [4.09633024E8]
EasyMath.LUT native: 1432K ops/sec [4.09633024E8]