The source code is something of a black box, but here is some sample input and output:
Input settings:
Declarations:
FB a;
FB b;
f c;
FB d;
* e;
Statements: (for a lerp function)
e=b-a
e=e*c
d=e+a
Generated C code:
// e=b-a; e=e*c; d=e+a
//
// JNI entry point for the generated lerp kernel. For i in [0, elements):
//     d[dPos + i] = (b[bPos + i] - a[aPos + i]) * cVal + a[aPos + i]
//
// aBuf, bBuf and dBuf are resolved with GetDirectBufferAddress, so they must
// be direct buffers; aPos, bPos and dPos are offsets counted in floats from
// each buffer's base address. Groups of four floats are processed with SSE,
// the remaining (elements % 4) floats one at a time.
//
// The exported symbol spells the package separators as '_': a '.' cannot
// appear in a C/C++ identifier, and JNI resolves native methods through the
// mangled name.
JNIEXPORT void JNICALL Java_com_eyeriv_simd_PerfMath_lerp0(JNIEnv * env, jclass clazz, jobject aBuf, jint aPos, jobject bBuf, jint bPos, jfloat cVal, jobject dBuf, jint dPos, jint elements) {
    float* aBase = static_cast<float*>(env->GetDirectBufferAddress(aBuf));
    float* bBase = static_cast<float*>(env->GetDirectBufferAddress(bBuf));
    float* dBase = static_cast<float*>(env->GetDirectBufferAddress(dBuf));

    if (!aBase || !bBase || !dBase) {
        // GetDirectBufferAddress yields NULL for a non-direct buffer; report
        // that to the Java caller instead of dereferencing a null pointer.
        jclass iae = env->FindClass("java/lang/IllegalArgumentException");
        if (iae) {
            env->ThrowNew(iae, "lerp0 requires direct FloatBuffers");
        }
        return;
    }

    const float* aData = aBase + aPos;
    const float* bData = bBase + bPos;
    float* dData = dBase + dPos;

    // Number of leading elements handled four at a time (0 when elements <= 0).
    const jint simdEnd = elements > 0 ? elements - (elements % 4) : 0;
    const __m128 cVec = _mm_set_ps1(cVal);

    jint i = 0;
    for (; i < simdEnd; i += 4) {
        // Unaligned load/store: base + pos is only guaranteed to be
        // float-aligned, not 16-byte aligned, so a plain __m128 dereference
        // could fault.
        const __m128 aVec = _mm_loadu_ps(aData + i);
        __m128 eVec = _mm_sub_ps(_mm_loadu_ps(bData + i), aVec);
        eVec = _mm_mul_ps(eVec, cVec);
        _mm_storeu_ps(dData + i, _mm_add_ps(eVec, aVec));
    }

    // Scalar tail for the last (elements % 4) floats.
    for (; i < elements; i++) {
        const float aVal = aData[i];
        float eTmp = bData[i] - aVal;
        eTmp = eTmp * cVal;
        dData[i] = eTmp + aVal;
    }
}
Generated Java code:
/**
 * Element-wise lerp over the remaining floats of {@code a}, written into {@code d}.
 * Each element is evaluated as the statement sequence: e=b-a; e=e*c; d=e+a
 * <p>
 * Only absolute {@code get}/{@code put} calls are used, so no buffer position is changed
 * by this method itself. The buffers are first passed to {@code checkBuffers}.
 *
 * @param a start values; its {@code remaining()} count determines how many elements are processed
 * @param b end values
 * @param c interpolation factor applied to {@code b - a}
 * @param d destination buffer
 */
public final void lerp(FloatBuffer a, FloatBuffer b, float c, FloatBuffer d) {
    checkBuffers(a, b, d);

    final int count = a.remaining();
    final int aBase = a.position();
    final int bBase = b.position();
    final int dBase = d.position();

    // The unrolled section addresses all three buffers with one shared absolute index,
    // which is only correct when every buffer starts at position 0.
    final boolean allAtZero = aBase == 0 && bBase == 0 && dBase == 0;
    final int unrolledEnd = allAtZero ? (count / 8) * 8 : 0;

    // Eight elements per iteration.
    for (int idx = 0; idx < unrolledEnd; idx += 8) {
        lerpElement(a, b, c, d, idx, idx, idx);
        lerpElement(a, b, c, d, idx + 1, idx + 1, idx + 1);
        lerpElement(a, b, c, d, idx + 2, idx + 2, idx + 2);
        lerpElement(a, b, c, d, idx + 3, idx + 3, idx + 3);
        lerpElement(a, b, c, d, idx + 4, idx + 4, idx + 4);
        lerpElement(a, b, c, d, idx + 5, idx + 5, idx + 5);
        lerpElement(a, b, c, d, idx + 6, idx + 6, idx + 6);
        lerpElement(a, b, c, d, idx + 7, idx + 7, idx + 7);
    }

    // Whatever the unrolled section did not cover, addressed relative to each position.
    for (int off = unrolledEnd; off < count; off++) {
        lerpElement(a, b, c, d, aBase + off, bBase + off, dBase + off);
    }
}

/** Computes one element: e=b-a; e=e*c; d=e+a, at the given absolute indices. */
private static void lerpElement(FloatBuffer a, FloatBuffer b, float c, FloatBuffer d, int aIdx, int bIdx, int dIdx) {
    float e = b.get(bIdx) - a.get(aIdx);
    e = e * c;
    d.put(dIdx, e + a.get(aIdx));
}