I’d like to say that ‘my’ lookup tables take 16KB of memory, or 4 pages. That means that they are basically always in L2 cache, and not big enough to push other data out. They can even pretty much fully stay within L1 cache, in your inner loop.
math=202,067us	full[]=7,586us	half[]=12,851us	taylor=26,783us
math=201,610us	full[]=7,701us	half[]=12,878us	taylor=26,448us
math=202,474us	full[]=7,672us	half[]=12,886us	taylor=26,438us
math=202,321us	full[]=7,592us	half[]=12,864us	taylor=26,306us
I know it’s micro-benchmark, but it’s rotating vectors, which is a pretty common use case:
import java.util.Random;
public class FastMathBench {
	public static void main(String[] args) {
		System.out.println(Math.sin(0.5f));
		System.out.println(sinFull(0.5f));
		System.out.println(sinHalf(0.5f));
		System.out.println(sinTaylor(0.5f));
		System.out.println();
		System.out.println(Math.sin(4.5f));
		System.out.println(sinFull(4.5f));
		System.out.println(sinHalf(4.5f));
		System.out.println(sinTaylor(4.5f));
		System.out.println();
		float[] angles = new float[1024 * 1024];
		float[] vectors = new float[angles.length * 2];
		Random r = new Random(235);
		for (int i = 0; i < angles.length; i++) {
			angles[i] = (float) (r.nextDouble() * Math.PI * 2.0);
			vectors[i * 2 + 0] = r.nextFloat();
			vectors[i * 2 + 1] = r.nextFloat();
		}
		for (int i = 0; i < 16; i++) {
			long t0 = System.nanoTime();
			benchMathSin(vectors, angles);
			long t1 = System.nanoTime();
			benchLookupFullSin(vectors, angles);
			long t2 = System.nanoTime();
			benchLookupHalfSin(vectors, angles);
			long t3 = System.nanoTime();
			benchLookupTaylorSin(vectors, angles);
			long t4 = System.nanoTime();
			long tMath = (t1 - t0) / 1000;
			long tLUT1 = (t2 - t1) / 1000;
			long tLUT2 = (t3 - t2) / 1000;
			long tTayl = (t4 - t3) / 1000;
			System.out.println("math=" + tMath + "us\tfull[]=" + tLUT1 + "us\thalf[]=" + tLUT2 + "us\ttaylor=" + tTayl + "us");
		}
	}
	private static void benchMathSin(float[] vectors, float[] angles) {
		for (int i = 0; i < angles.length; i++) {
			float x = vectors[i * 2 + 0];
			float y = vectors[i * 2 + 1];
			float a = angles[i];
			float cos = (float) Math.cos(a);
			float sin = (float) Math.sin(a);
			vectors[i * 2 + 0] = x * cos - y * sin;
			vectors[i * 2 + 1] = y * cos + x * sin;
		}
	}
	private static void benchLookupFullSin(float[] vectors, float[] angles) {
		for (int i = 0; i < angles.length; i++) {
			float x = vectors[i * 2 + 0];
			float y = vectors[i * 2 + 1];
			float a = angles[i];
			float cos = sinFull(a);
			float sin = cosFull(a);
			vectors[i * 2 + 0] = x * cos - y * sin;
			vectors[i * 2 + 1] = y * cos + x * sin;
		}
	}
	private static void benchLookupHalfSin(float[] vectors, float[] angles) {
		for (int i = 0; i < angles.length; i++) {
			float x = vectors[i * 2 + 0];
			float y = vectors[i * 2 + 1];
			float a = angles[i];
			float cos = cosHalf(a);
			float sin = sinHalf(a);
			vectors[i * 2 + 0] = x * cos - y * sin;
			vectors[i * 2 + 1] = y * cos + x * sin;
		}
	}
	private static void benchLookupTaylorSin(float[] vectors, float[] angles) {
		for (int i = 0; i < angles.length; i++) {
			float x = vectors[i * 2 + 0];
			float y = vectors[i * 2 + 1];
			float a = angles[i];
			float cos = cosTaylor(a);
			float sin = sinTaylor(a);
			vectors[i * 2 + 0] = x * cos - y * sin;
			vectors[i * 2 + 1] = y * cos + x * sin;
		}
	}
	private static final float RAD, DEG, SIN_TO_COS;
	private static final int SIN_BITS, SIN_MASK, SIN_MASK2, SIN_COUNT, SIN_COUNT2;
	private static final float radFull, radToIndex;
	private static final float degFull, degToIndex;
	private static final float[] sinFull, sinHalf;
	static {
		RAD = (float) Math.PI / 180.0f;
		DEG = 180.0f / (float) Math.PI;
		SIN_TO_COS = (float) (Math.PI * 0.5f);
		SIN_BITS = 12;
		SIN_MASK = ~(-1 << SIN_BITS);
		SIN_MASK2 = SIN_MASK >> 1;
		SIN_COUNT = SIN_MASK + 1;
		SIN_COUNT2 = SIN_MASK2 + 1;
		radFull = (float) (Math.PI * 2.0);
		degFull = (float) (360.0);
		radToIndex = SIN_COUNT / radFull;
		degToIndex = SIN_COUNT / degFull;
		sinFull = new float[SIN_COUNT];
		for (int i = 0; i < SIN_COUNT; i++) {
			sinFull[i] = (float) Math.sin((i + Math.min(1, i % (SIN_COUNT / 4)) * 0.5) / SIN_COUNT * radFull);
		}
		sinHalf = new float[SIN_COUNT2];
		for (int i = 0; i < SIN_COUNT2; i++) {
			sinHalf[i] = (float) Math.sin((i + Math.min(1, i % (SIN_COUNT / 4)) * 0.5) / SIN_COUNT * radFull);
		}
	}
	//
	public static final float sinFull(float rad) {
		return sinFull[(int) (rad * radToIndex) & SIN_MASK];
	}
	public static final float cosFull(float rad) {
		return sinFull(rad + SIN_TO_COS);
	}
	//
	public static final float sinHalf(float rad) {
		int index1 = (int) (rad * radToIndex) & SIN_MASK;
		int index2 = index1 & SIN_MASK2;
		// int mul = (((index1 - index2) >>> 31) << 1) + 1;
		int mul = ((index1 == index2) ? +1 : -1);
		return sinHalf[index2] * mul;
	}
	public static final float cosHalf(float rad) {
		return sinHalf(rad + SIN_TO_COS);
	}
	//
	public static final float sinTaylor(float rad) {
		// TODO: clamp to -PI..+PI
		double x = rad;
		double x2 = x * x;
		double x3 = x2 * x;
		double x5 = x2 * x3;
		double x7 = x2 * x5;
		double x9 = x2 * x7;
		double x11 = x2 * x9;
		double x13 = x2 * x11;
		double x15 = x2 * x13;
		double x17 = x2 * x15;
		double val = x;
		val -= x3 * 0.16666666666666666666666666666667;
		val += x5 * 0.00833333333333333333333333333333;
		val -= x7 * 1.984126984126984126984126984127e-4;
		val += x9 * 2.7557319223985890652557319223986e-6;
		val -= x11 * 2.5052108385441718775052108385442e-8;
		val += x13 * 1.6059043836821614599392377170155e-10;
		val -= x15 * 7.6471637318198164759011319857881e-13;
		val += x17 * 2.8114572543455207631989455830103e-15;
		return (float) val;
	}
	public static final float cosTaylor(float rad) {
		return sinTaylor(rad + SIN_TO_COS);
	}
}