I read in some article* by a JVM engineer that creating new objects was ‘almost at the cost of shifting a pointer’.
* I tried hard to find the article, but sometimes java.sun.com is kinda hard to wade through
Further, the GC is considered so intelligent and efficient that its effect should be ‘noise’ even in performance-critical code.
Combining these two would almost make you think that allocating and discarding tiny objects is nearly free, or at least has only a small impact.
I decided to put this to the test in a real-world application whose bottleneck is a sphere<->triangle method.
Basic vector-math (Vec3) was implemented like:
/**
 * Adds two vectors component by component.
 *
 * @param a first operand; only read
 * @param b second operand; only read
 * @return a newly allocated Vec3 holding (a.x + b.x, a.y + b.y, a.z + b.z)
 */
public static final Vec3 add(Vec3 a, Vec3 b) {
    final float sumX = a.x + b.x;
    final float sumY = a.y + b.y;
    final float sumZ = a.z + b.z;
    return new Vec3(sumX, sumY, sumZ);
}
When I was writing this code it seemed horribly inefficient.
The following code shows the algorithm:
// Object-based version: from the three vertices a, b, c and the point p, returns
// either one of the vertices, a point along one of the edges, or a weighted
// combination of all three vertices.
// NOTE(review): the structure (vertex tests, then edge tests, then interior) matches
// the well-known closest-point-on-triangle routine — confirm against the caller.
// NOTE(review): sub/cross/mul presumably allocate a fresh Vec3 per call, like add does.

// Vectors from vertex a to b, to c and to p.
Vec3 ba = sub(b, a);
Vec3 ca = sub(c, a);
Vec3 pa = sub(p, a);
// Projections of (p - a) onto the two edges leaving a.
float snom = dot(pa, ba);
float tnom = dot(pa, ca);
// Both projections non-positive: the result is vertex a itself.
if (snom <= 0.0f && tnom <= 0.0f)
return a;
// Same test around vertex b, using the edges b->c and b->a.
Vec3 cb = sub(c, b);
Vec3 pb = sub(p, b);
float unom = dot(pb, cb);
float sdenom = dot(pb, sub(a, b));
if (sdenom <= 0.0f && unom <= 0.0f)
return b;
// Same test around vertex c, using the edges c->a and c->b.
Vec3 pc = sub(p, c);
float tdenom = dot(pc, sub(a, c));
float udenom = dot(pc, sub(b, c));
if (tdenom <= 0.0f && udenom <= 0.0f)
return c;
// n = (b - a) x (c - a); it is perpendicular to both edges leaving a.
Vec3 n = cross(ba, ca);
Vec3 ap = sub(a, p);
Vec3 bp = sub(b, p);
// vc = n . ((a - p) x (b - p)); together with the two projections it selects edge a-b.
float vc = dot(n, cross(ap, bp));
if (vc <= 0.0f && snom >= 0.0f && sdenom >= 0.0f)
// Point on edge a-b: a + t * (b - a) with t = snom / (snom + sdenom).
return add(a, mul(snom / (snom + sdenom), ba));
Vec3 cp = sub(c, p);
// va = n . ((b - p) x (c - p)); selects edge b-c.
float va = dot(n, cross(bp, cp));
if (va <= 0.0f && unom >= 0.0f && udenom >= 0.0f)
// Point on edge b-c: b + t * (c - b) with t = unom / (unom + udenom).
return add(b, mul(unom / (unom + udenom), cb));
// vb = n . ((c - p) x (a - p)); selects edge c-a.
float vb = dot(n, cross(cp, ap));
if (vb <= 0.0f && tnom >= 0.0f && tdenom >= 0.0f)
// Point on edge a-c: a + t * (c - a) with t = tnom / (tnom + tdenom).
return add(a, mul(tnom / (tnom + tdenom), ca));
// Remaining case: blend the three vertices with weights u, v, w (w = 1 - u - v).
// NOTE(review): if va + vb + vc is 0 the float divisions yield NaN/Infinity rather
// than throwing — confirm callers never pass such input.
float u = va / (va + vb + vc);
float v = vb / (va + vb + vc);
float w = 1.0f - u - v;
return add(add(mul(u, a), mul(v, b)), mul(w, c));
The following is the version where all Vec3 methods are inlined:
// Inlined version: the same sequence of tests as the object-based code, but every
// vector operation is written out per component on float locals. A Vec3 is only
// allocated on the four return paths that produce a new point.
// NOTE(review): the structure (vertex tests, then edge tests, then interior) matches
// the well-known closest-point-on-triangle routine — confirm against the caller.

// ba = b - a
float bax = b.x - a.x;
float bay = b.y - a.y;
float baz = b.z - a.z;
// ca = c - a
float cax = c.x - a.x;
float cay = c.y - a.y;
float caz = c.z - a.z;
// pa = p - a
float pax = p.x - a.x;
float pay = p.y - a.y;
float paz = p.z - a.z;
// Dot products of pa with the two edges leaving a.
float snom = pax * bax + pay * bay + paz * baz;
float tnom = pax * cax + pay * cay + paz * caz;
// Both projections non-positive: the result is vertex a itself (no allocation).
if (snom <= 0.0f && tnom <= 0.0f)
return a;
// ab = a - b
float abx = a.x - b.x;
float aby = a.y - b.y;
float abz = a.z - b.z;
// cb = c - b
float cbx = c.x - b.x;
float cby = c.y - b.y;
float cbz = c.z - b.z;
// pb = p - b
float pbx = p.x - b.x;
float pby = p.y - b.y;
float pbz = p.z - b.z;
// Dot products of pb with the two edges leaving b.
float unom = pbx * cbx + pby * cby + pbz * cbz;
float sdenom = pbx * abx + pby * aby + pbz * abz;
// Same vertex test around b.
if (sdenom <= 0.0f && unom <= 0.0f)
return b;
// pc = p - c
float pcx = p.x - c.x;
float pcy = p.y - c.y;
float pcz = p.z - c.z;
// ac = a - c
float acx = a.x - c.x;
float acy = a.y - c.y;
float acz = a.z - c.z;
// bc = b - c
float bcx = b.x - c.x;
float bcy = b.y - c.y;
float bcz = b.z - c.z;
// Dot products of pc with the two edges leaving c.
float tdenom = pcx * acx + pcy * acy + pcz * acz;
float udenom = pcx * bcx + pcy * bcy + pcz * bcz;
// Same vertex test around c.
if (tdenom <= 0.0f && udenom <= 0.0f)
return c;
// n = ba x ca (cross product written out per component).
float nx = bay * caz - baz * cay;
float ny = baz * cax - bax * caz;
float nz = bax * cay - bay * cax;
// ap = a - p
float apx = a.x - p.x;
float apy = a.y - p.y;
float apz = a.z - p.z;
// bp = b - p
float bpx = b.x - p.x;
float bpy = b.y - p.y;
float bpz = b.z - p.z;
// APBP = ap x bp
float APBPx = apy * bpz - apz * bpy;
float APBPy = apz * bpx - apx * bpz;
float APBPz = apx * bpy - apy * bpx;
// vc = n . (ap x bp); together with the two projections it selects edge a-b.
float vc = nx * APBPx + ny * APBPy + nz * APBPz;
if (vc <= 0.0f && snom >= 0.0f && sdenom >= 0.0f)
{
// Point on edge a-b: a + t * ba.
Vec3 r = new Vec3();
float t = snom / (snom + sdenom);
r.x = bax * t + a.x;
r.y = bay * t + a.y;
r.z = baz * t + a.z;
return r;
}
// cp = c - p
float cpx = c.x - p.x;
float cpy = c.y - p.y;
float cpz = c.z - p.z;
// BPCP = bp x cp
float BPCPx = bpy * cpz - bpz * cpy;
float BPCPy = bpz * cpx - bpx * cpz;
float BPCPz = bpx * cpy - bpy * cpx;
// va = n . (bp x cp); selects edge b-c.
float va = nx * BPCPx + ny * BPCPy + nz * BPCPz;
if (va <= 0.0f && unom >= 0.0f && udenom >= 0.0f)
{
// Point on edge b-c: b + t * cb.
Vec3 r = new Vec3();
float t = unom / (unom + udenom);
r.x = cbx * t + b.x;
r.y = cby * t + b.y;
r.z = cbz * t + b.z;
return r;
}
// CPAP = cp x ap
float CPAPx = cpy * apz - cpz * apy;
float CPAPy = cpz * apx - cpx * apz;
float CPAPz = cpx * apy - cpy * apx;
// vb = n . (cp x ap); selects edge c-a.
float vb = nx * CPAPx + ny * CPAPy + nz * CPAPz;
if (vb <= 0.0f && tnom >= 0.0f && tdenom >= 0.0f)
{
// Point on edge a-c: a + t * ca.
Vec3 r = new Vec3();
float t = (tnom / (tnom + tdenom));
r.x = cax * t + a.x;
r.y = cay * t + a.y;
r.z = caz * t + a.z;
return r;
}
// Remaining case: blend the three vertices with weights u, v, w (w = 1 - u - v).
// NOTE(review): if va + vb + vc is 0 the float divisions yield NaN/Infinity rather
// than throwing — confirm callers never pass such input.
float u = va / (va + vb + vc);
float v = vb / (va + vb + vc);
float w = 1.0f - u - v;
Vec3 r = new Vec3();
r.x = u * a.x + v * b.x + w * c.x;
r.y = u * a.y + v * b.y + w * c.y;
r.z = u * a.z + v * b.z + w * c.z;
return r;
After warming both loops for several seconds, allowing the JVM to inline and optimize, these are the results:
[tr][td]Objects:[/td][td]1548ms[/td][td]1553ms[/td][td]1551ms[/td][/tr]
[tr][td]Inlined:[/td][td]505ms[/td][td]500ms[/td][td]558ms[/td][/tr]
This is clearly not ‘noise’ anymore (in terms of the timing difference).
Some of you guys (to be honest, including me) would say: doh! But I had kinda started to believe that they had really reduced the overhead of objects. Sadly, this doesn’t seem to be the case as of yet.