I had an hour to spare, and decided to write and optimize something that I did not need.
I decided to try to make Arrays.binarySearch(…) less cache-trashy.
I also changed the specification of my binarySearch algorithm a bit, to make it behave more like a traversable database-index, by demanding it must return the lowest index of a specific value in the array, in case the value occurs multiple times.
My initial (hierarchial) approach shows (on average) performance 3.1x as fast as a naive implementation:
Creating data-set...
Generating random values...
Sorting data-set...
Generating lookup indices...
Generating lookup tables...
Printing table lengths...
67108864
2097152
65536
2048
64
Performing sanity-checks...
Starting benchmark...
...
norm took 955ms (100%, 1097K ops/s)
rec1 took 610ms (156%, 1718K ops/s)
rec2 took 523ms (182%, 2004K ops/s)
rec3 took 505ms (189%, 2076K ops/s)
rec4 took 388ms (246%, 2702K ops/s)
...
4GB: (die hard mode)
...
norm took 12564ms (100%, 667K ops/s)
rec1 took 8137ms (154%, 1030K ops/s)
rec2 took 5306ms (236%, 1580K ops/s)
rec3 took 4962ms (253%, 1690K ops/s)
rec4 took 4137ms (303%, 2027K ops/s)
...
Die-hards are supposed to set [icode]isDiehard[/icode] to [icode]true[/icode].
Who feels like wasting some time squeezing even more single threaded performance out of this puppy?
Low hanging fruits are frowned upon, but graciously incorporated!
import java.util.Arrays;
import java.util.Random;
public class QuickBinarySearch {
public static void main(String[] args) {
Random rndm = new Random(1235L);
boolean isDiehard = false; // 4 GB (512 MB for whips)
System.out.println("Creating data-set...");
long[] somedata = new long[(isDiehard ? 512 : 64) * 1024 * 1024];
System.out.println("Generating random values...");
for (int i = 0; i < somedata.length; i++) {
somedata[i] = Math.abs(rndm.nextLong()) % somedata.length * 8;
}
System.out.println("Sorting data-set...");
Arrays.sort(somedata);
System.out.println("Generating lookup indices...");
long[] lookups = new long[somedata.length / 64];
for (int i = 0; i < lookups.length; i++) {
lookups[i] = somedata[rndm.nextInt(somedata.length)] + rndm.nextInt(10) - 5;
}
System.out.println("Generating lookup tables...");
final int shift = 5;
long[] reduced1 = new long[somedata.length >> (shift * 1)];
long[] reduced2 = new long[somedata.length >> (shift * 2)];
long[] reduced3 = new long[somedata.length >> (shift * 3)];
long[] reduced4 = new long[somedata.length >> (shift * 4)];
reduce(somedata, reduced1, shift);
reduce(reduced1, reduced2, shift);
reduce(reduced2, reduced3, shift);
reduce(reduced3, reduced4, shift);
System.out.println("Printing table lengths...");
System.out.println(somedata.length);
System.out.println(reduced1.length);
System.out.println(reduced2.length);
System.out.println(reduced3.length);
System.out.println(reduced4.length);
System.out.println("Performing sanity-checks...");
{
for (int i = -1_000_000; i <= 1_000_000; i++) {
int i1 = binarySearchLo(somedata, i);
int i2 = binarySearchLo(somedata, reduced1, shift, i);
int i3 = binarySearchLo(somedata, reduced1, reduced2, shift, i);
int i4 = binarySearchLo(somedata, reduced1, reduced2, reduced3, shift, i);
int i5 = binarySearchLo(somedata, reduced1, reduced2, reduced3, reduced4, shift, i);
if (i1 != i2)
throw new IllegalStateException();
if (i1 != i3)
throw new IllegalStateException();
if (i1 != i4)
throw new IllegalStateException();
if (i1 != i5)
throw new IllegalStateException();
}
}
int dummy = 0;
System.out.println("Starting benchmark...");
for (int k = 0; k < 8; k++) {
System.out.println();
long normTook;
{
long t0 = System.nanoTime();
for (long lookup : lookups) {
dummy += binarySearchLo(somedata, lookup);
}
long t1 = System.nanoTime();
long took = (t1 - t0) / 1_000_000L;
System.out.println("norm took " + took + "ms (100%, " + (lookups.length / took) + "K ops/s)");
normTook = took;
}
{
long t0 = System.nanoTime();
for (long lookup : lookups) {
dummy += binarySearchLo(somedata, reduced1, shift, lookup);
}
long t1 = System.nanoTime();
long took = (t1 - t0) / 1_000_000L;
System.out.println("rec1 took " + took + "ms (" + (int) (normTook * 100.0f / took) + "%, "
+ (lookups.length / took) + "K ops/s)");
}
{
long t0 = System.nanoTime();
for (long lookup : lookups) {
dummy += binarySearchLo(somedata, reduced1, reduced2, shift, lookup);
}
long t1 = System.nanoTime();
long took = (t1 - t0) / 1_000_000L;
System.out.println("rec2 took " + took + "ms (" + (int) (normTook * 100.0f / took) + "%, "
+ (lookups.length / took) + "K ops/s)");
}
{
long t0 = System.nanoTime();
for (long lookup : lookups) {
dummy += binarySearchLo(somedata, reduced1, reduced2, reduced3, shift, lookup);
}
long t1 = System.nanoTime();
long took = (t1 - t0) / 1_000_000L;
System.out.println("rec3 took " + took + "ms (" + (int) (normTook * 100.0f / took) + "%, "
+ (lookups.length / took) + "K ops/s)");
}
{
long t0 = System.nanoTime();
for (long lookup : lookups) {
dummy += binarySearchLo(somedata, reduced1, reduced2, reduced3, reduced4, shift, lookup);
}
long t1 = System.nanoTime();
long took = (t1 - t0) / 1_000_000L;
System.out.println("rec4 took " + took + "ms (" + (int) (normTook * 100.0f / took) + "%, "
+ (lookups.length / took) + "K ops/s)");
}
}
System.out.println("dummy:" + dummy);
}
public static int binarySearchLo(long[] data1, long[] data2, long[] data3, long[] data4, long[] data5, int shift,
long lookup) {
int idx = binarySearchLo(data2, data3, data4, data5, shift, lookup);
return binarySearchLoNextLevel(data1, idx, shift, lookup);
}
public static int binarySearchLo(long[] data1, long[] data2, long[] data3, long[] data4, int shift, long lookup) {
int idx = binarySearchLo(data2, data3, data4, shift, lookup);
return binarySearchLoNextLevel(data1, idx, shift, lookup);
}
public static int binarySearchLo(long[] data1, long[] data2, long[] data3, int shift, long lookup) {
int idx = binarySearchLo(data2, data3, shift, lookup);
return binarySearchLoNextLevel(data1, idx, shift, lookup);
}
public static int binarySearchLo(long[] data1, long[] data2, int shift, long lookup) {
int idx = binarySearchLo(data2, lookup);
return binarySearchLoNextLevel(data1, idx, shift, lookup);
}
public static long[] reduce(long[] data, int shift) {
long[] lookup = new long[data.length >> shift];
reduce(data, lookup, shift);
return lookup;
}
public static void reduce(long[] src, long[] dst, int shift) {
if (src.length != (dst.length << shift)) {
throw new IllegalStateException();
}
for (int i = 0; i < dst.length; i++) {
dst[i] = src[i << shift];
}
}
private static int binarySearchLoNextLevel(long[] data, int idx, int shift, long lookup) {
idx = (idx >> 31) ^ idx;
// int min = Math.max(0, (idx - 1) << shift);
int min = (idx - 1) << shift;
min &= ~(min >> 31);
int max = Math.min(data.length, (idx << shift) + 1);
return binarySearchLo(data, min, max, lookup);
}
private static int binarySearchLo(long[] a, long key) {
return binarySearchLo(a, 0, a.length, key);
}
private static int binarySearchLo(long[] a, int fromIndex, int toIndex, long key) {
int low = fromIndex;
int high = toIndex - 1;
while (low <= high) {
int idx = (low + high) >>> 1;
long val = a[idx];
if (val < key)
low = idx + 1;
else if (val > key)
high = idx - 1;
else if (idx != fromIndex && val == a[idx - 1])
high = idx - 1; // find lower index
else
return idx; // lowest index with value
}
return -(low + 1);
}
}