For those not interested in an elaborate background story, I’ll sum up the functionality of the code below:
- It uses unsynchronized mapped VBOs
- To make this guaranteed to be safe, it has 6 VBOs (worst case - see below)
- For every frame, it picks the next VBO (round robin)
- When it reuses a VBO, it is so ‘old’ (6 frames old) that it is guaranteed to be no longer in use by the GPU
Here is a straightforward code dump:
import java.nio.ByteBuffer;
import org.lwjgl.opengl.ARBMapBufferRange;
import org.lwjgl.opengl.GL15;
import org.lwjgl.opengl.GL30;
import org.lwjgl.opengl.GLContext;
import static org.lwjgl.opengl.GL15.*;
/**
 * Round-robin pool of buffer objects that are mapped with
 * {@code GL_MAP_UNSYNCHRONIZED_BIT}.
 * <p>
 * One buffer of the pool is used per frame; {@link #nextFrame()} selects the next one.
 * A buffer therefore comes back into use only after {@code MAX_FRAMEBUFFER_COUNT}
 * frames, and the class relies on that age (instead of driver synchronization) for the
 * buffer no longer being in use by the GPU.
 * <p>
 * {@link #ensureSize(int)}, {@link #trimToSize()}, {@link #map()} and {@link #unmap()}
 * issue GL calls against the target, i.e. against whatever buffer is bound to it, so
 * call {@link #bind()} first. Per-frame methods throw {@link IllegalStateException}
 * until {@link #nextFrame()} has been called at least once.
 */
public class Unsync {
    // triple buffering in stereo mode is rather rare though..
    private static final int MAX_FRAMEBUFFER_COUNT = 2 * 3;

    private final int glTarget, glUsage;
    private final int[] bufferHandles, requestedSizes, allocatedSizes;
    // -1 until the first nextFrame() call
    private int currentBufferIndex;

    /**
     * Generates the buffer object names for the whole pool; no storage is allocated yet.
     *
     * @param glTarget buffer target, e.g. GL_ARRAY_BUFFER or GL_ELEMENT_ARRAY_BUFFER
     * @param glUsage  usage hint passed to glBufferData, e.g. GL_STATIC_DRAW or GL_STREAM_DRAW
     */
    public Unsync(int glTarget, int glUsage) {
        this.glTarget = glTarget; // GL_ARRAY_BUFFER, GL_ELEMENT_ARRAY_BUFFER
        this.glUsage = glUsage; // GL_STATIC_DRAW, GL_STREAM_DRAW

        requestedSizes = new int[MAX_FRAMEBUFFER_COUNT];
        allocatedSizes = new int[MAX_FRAMEBUFFER_COUNT];
        bufferHandles = new int[MAX_FRAMEBUFFER_COUNT];
        for (int i = 0; i < this.bufferHandles.length; i++) {
            bufferHandles[i] = glGenBuffers();
        }

        currentBufferIndex = -1;
    }

    /** Advances to the next buffer of the pool, wrapping around at the end. */
    public void nextFrame() {
        currentBufferIndex = (currentBufferIndex + 1) % MAX_FRAMEBUFFER_COUNT;
    }

    /** Binds the current frame's buffer to the target. */
    public void bind() {
        glBindBuffer(glTarget, currentBufferHandle());
    }

    /**
     * @return the GL name of the current frame's buffer
     * @throws IllegalStateException if {@link #nextFrame()} has not been called yet
     */
    public int currentBufferHandle() {
        return bufferHandles[currentIndex()];
    }

    /**
     * Records {@code size} as the number of bytes to map and (re)allocates the data
     * store only when it is currently smaller than that.
     *
     * @param size required size in bytes, must be positive
     * @throws IllegalStateException if {@link #nextFrame()} has not been called yet
     */
    public void ensureSize(int size) {
        assert size > 0;

        int index = currentIndex();
        requestedSizes[index] = size;
        if (size > allocatedSizes[index]) {
            glBufferData(glTarget, size, glUsage);
            allocatedSizes[index] = size;
        }
    }

    /**
     * Reallocates the data store to exactly the last requested size, if it differs.
     *
     * @throws IllegalStateException if {@link #nextFrame()} has not been called yet
     */
    public void trimToSize() {
        int index = currentIndex();
        if (requestedSizes[index] != allocatedSizes[index]) {
            glBufferData(glTarget, requestedSizes[index], glUsage);
            allocatedSizes[index] = requestedSizes[index];
        }
    }

    /**
     * Maps the first {@code requestedSize} bytes for writing without synchronization,
     * using GL 3.0 or ARB_map_buffer_range when available. Without either, falls back
     * to a plain write-only glMapBuffer, which takes no unsynchronized flag.
     *
     * @return the mapped memory as returned by the GL binding
     * @throws IllegalStateException if {@link #nextFrame()} has not been called yet
     */
    public ByteBuffer map() {
        long offset = 0;
        long length = requestedSizes[currentIndex()];

        if (GLContext.getCapabilities().OpenGL30) {
            int flags = GL30.GL_MAP_WRITE_BIT | GL30.GL_MAP_UNSYNCHRONIZED_BIT;
            return GL30.glMapBufferRange(glTarget, offset, length, flags, null);
        }

        if (GLContext.getCapabilities().GL_ARB_map_buffer_range) {
            int flags = ARBMapBufferRange.GL_MAP_WRITE_BIT | ARBMapBufferRange.GL_MAP_UNSYNCHRONIZED_BIT;
            return ARBMapBufferRange.glMapBufferRange(glTarget, offset, length, flags, null);
        }

        return GL15.glMapBuffer(glTarget, GL15.GL_WRITE_ONLY, null);
    }

    /** Unmaps the buffer bound to the target. */
    public void unmap() {
        glUnmapBuffer(glTarget);
    }

    /** Deletes every buffer of the pool and marks its handle as -1. */
    public void deleteAll() {
        for (int i = 0; i < this.bufferHandles.length; i++) {
            glDeleteBuffers(this.bufferHandles[i]);
            this.bufferHandles[i] = -1;
        }
    }

    /**
     * Returns the index of the current frame's buffer, turning "nextFrame() was never
     * called" into a descriptive error instead of an out-of-bounds array access.
     */
    private int currentIndex() {
        if (currentBufferIndex == -1) {
            throw new IllegalStateException("not in a frame");
        }
        return currentBufferIndex;
    }
}
// Usage sketch for Unsync; "..." and bytesInVBO are placeholders.
Unsync vbo = new Unsync(...);
while(true) { // render loop
vbo.nextFrame(); // mandatory: selects the next buffer of the pool for this frame
vbo.bind(); // the calls below act on whatever buffer is bound to the target
vbo.ensureSize(bytesInVBO); // reallocates only when the buffer is smaller than requested
ByteBuffer mapped = vbo.map();
// fill it
vbo.unmap();
// render things
// swap buffers
}
OpenGL VBO performance is incredibly hard to optimize. After you managed to fill the VBO data as fast as possible, you’re pretty much relying on the performance of glMapBuffer(…) and glUnmapBuffer() to pump the data over to the graphics card.
It turns out that these calls have a significant overhead, because the driver has to verify that the memory block it is about to return, is not currently in use by the GPU. Especially when doing many of these calls per frame for small batches of geometry, you’ll see it drags the framerate under 60Hz quickly: on my particular (low end) graphics card, I render 64x 4 tiny triangles per frame and end up with an abysmal 45fps.
@@ // what not to do...
// Sketch of the slow path: one synchronized glMapBuffer/glUnmapBuffer pair per draw call.
while (!Display.isCloseRequested()) {
glClearColor(0, 0, 0, 1);
glClear(GL_COLOR_BUFFER_BIT);
for (int x = 0; x < drawCalls; x++) {
glVertexPointer(2, GL_FLOAT, stride, 0 << 2);
glColorPointer(4, GL_UNSIGNED_BYTE, stride, 2 << 2);
// per the surrounding text, the driver must verify here that the memory is not in use by the GPU
@@ FloatBuffer fb = glMapBuffer(...).asFloatBuffer();
// three vertices per triangle: two position floats followed by one float-packed colour
for (int y = 0, i = 0; y < trisPerDrawCall; y++) {
fb.position((i++) * floatStride);
fb.put(x * 3 + 16).put(y * 3 + 16);
fb.put(packRGBA(0xFF, 0x00, 0x00, 0xFF));
fb.position((i++) * floatStride);
fb.put(x * 3 + 32).put(y * 3 + 16);
fb.put(packRGBA(0x00, 0xFF, 0x00, 0xFF));
fb.position((i++) * floatStride);
fb.put(x * 3 + 16).put(y * 3 + 32);
fb.put(packRGBA(0x00, 0x00, 0xFF, 0xFF));
}
@@ glUnmapBuffer();
glDrawArrays(GL_TRIANGLES, 0, 3 * trisPerDrawCall);
}
//
Display.update();
}
Taking full advantage of the performance of the GPU, means we have to keep the driver from doing these costly verifications. At first I thought that guaranteeing that the VBOs are not used in rendering anymore, by using a pool of VBOs would be enough, but the driver simply has to perform a lot of checks to prove what the application already knows. We somehow have to make the driver trust our input and disable any check.
Fortunately, we have [icode]glMapBufferRange( … | GL_MAP_UNSYNCHRONIZED_BIT)[/icode] to do exactly that! But it leaves us with a problem: now we have to ensure that every VBO mapping we do is done on memory guaranteed not to be in use by the GPU. As the GPU is fully asynchronous, that’s no easy feat.
But let’s first check performance by simply using 1 VBO and using glMapBufferRange(…) instead of glMapBuffer(…). I got ~1450fps, that’s an improvement of over a factor of 32! Awesome! The downside is that the rendering results are, as the specs say, ‘undefined’, and I indeed see a lot of garbled renderings on the framebuffer.
@@ // what not to do either...
glVertexPointer(2, GL_FLOAT, stride, 0 << 2);
glColorPointer(4, GL_UNSIGNED_BYTE, stride, 2 << 2);
// the only change: map through glMapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT instead of glMapBuffer
- FloatBuffer fb = glMapBuffer(..., GL_WRITE_ONLY, ...).asFloatBuffer();
+ FloatBuffer fb = glMapBufferRange(..., GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT, ...).asFloatBuffer();
for (int y = 0, i = 0; y < trisPerDrawCall; y++) {
fb.position((i++) * floatStride);
// (excerpt is cut off here; the rest presumably matches the previous listing)
After a bit of messing around, I found a way to guarantee that the VBOs we map, are not in use anymore by the GPU. We can assume that on a (common) double buffered framebuffer, we know that 1 frame is being rendered into, and the other frame is displayed. On a triple buffered setup, 2 frames are being rendered into, and the last frame is displayed. On a stereo triple buffered setup, 4 frames are being rendered into, and the last pair of frames are displayed. This means that there can be up to 6 frames active in any game!
So if we create an array (of length 6) of lists of VBOs, we can pick a list of VBOs each frame that was last used 6 frames ago, and is therefore guaranteed not to be used in any rendering. For every frame, we reuse and/or allocate as many VBOs as we need, trusting that we will encounter these VBOs again after 6 frames.
It wouldn’t be a ‘shared code’ post if I didn’t dump the full code, so you can take advantage of the performance boost of doing the verification work of the driver yourself:
import static org.lwjgl.opengl.GL11.*;
import static org.lwjgl.opengl.GL15.*;
import java.nio.*;
import org.lwjgl.*;
import org.lwjgl.opengl.*;
public class MappedVBOTest {
private static float packRGBA(int r, int g, int b, int a) {
return Float.intBitsToFloat((r << 0) | (g << 8) | (b << 16) | (a << 24));
}
public static void main(String[] main) throws LWJGLException {
Display.setDisplayMode(new DisplayMode(800, 600));
Display.create();
{
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0, 800, 600, 0, -1, +1);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
}
@@ boolean isUnsynchronized = true;
MappedVertexBufferObjectProvider provider;
provider = new MappedVertexBufferObjectProvider(GL_ARRAY_BUFFER, GL_STATIC_DRAW, isUnsynchronized);
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_COLOR_ARRAY);
int stride = (2 + 1) << 2;
{
// round up to multiple of 16 (for SIMD)
stride += 16 - 1;
stride /= 16;
stride *= 16;
}
int strideFloat = stride >> 2;
int drawCalls = 64;
int trisPerDrawCall = 4;
long lastSecond = System.nanoTime();
int frameCount = 0;
while (!Display.isCloseRequested()) {
@@ provider.nextFrame();
glClearColor(0, 0, 0, 1);
glClear(GL_COLOR_BUFFER_BIT);
for (int x = 0; x < drawCalls; x++) {
@@ MappedVertexBufferObject vbo = provider.nextVBO();
vbo.ensureSize(trisPerDrawCall * 3 * stride);
glVertexPointer(2, GL_FLOAT, stride, 0 << 2);
glColorPointer(4, GL_UNSIGNED_BYTE, stride, 2 << 2);
FloatBuffer fb = vbo.map().asFloatBuffer();
for (int y = 0, i = 0; y < trisPerDrawCall; y++) {
fb.position((i++) * strideFloat);
fb.put(x * 3 + 16).put(y * 3 + 16);
fb.put(packRGBA(0xFF, 0x00, 0x00, 0xFF));
fb.position((i++) * strideFloat);
fb.put(x * 3 + 32).put(y * 3 + 16);
fb.put(packRGBA(0x00, 0xFF, 0x00, 0xFF));
fb.position((i++) * strideFloat);
fb.put(x * 3 + 16).put(y * 3 + 32);
fb.put(packRGBA(0x00, 0x00, 0xFF, 0xFF));
}
vbo.unmap();
glDrawArrays(GL_TRIANGLES, 0, 3 * trisPerDrawCall);
}
//
Display.update();
frameCount++;
if (System.nanoTime() > lastSecond + 1_000_000_000L) {
lastSecond += 1_000_000_000L;
Display.setTitle(frameCount + "fps / " + (1000.0f / frameCount) + "ms");
frameCount = 0;
}
}
Display.destroy();
}
}
Set [icode]isUnsynchronized = false[/icode], and you’ll see the framerate drop by a factor of anywhere between 30 and 60 (!).
AMD Radeon 5500: 1450fps vs 45fps
AMD Radeon 5870: 5250fps vs 88fps
import java.util.*;
/**
 * Hands out {@link MappedVertexBufferObject}s from a per-frame pool.
 * <p>
 * The provider keeps {@code MAX_WINDOW_BUFFER_COUNT} lists of VBOs and cycles through
 * them, one list per frame ({@link #nextFrame()}). Within a frame, {@link #nextVBO()}
 * returns the list's VBOs in order and creates new ones on demand, so a VBO is handed
 * out again only after a full cycle of frames.
 */
public class MappedVertexBufferObjectProvider {
    // triple buffering in stereo mode is rather rare though..
    private static final int MAX_WINDOW_BUFFER_COUNT = 2 * 3;

    private final int glTarget;
    private final int glUsage;
    private final boolean unsync;

    // one list of VBOs per frame slot, indexed by frameIndex
    final List<MappedVertexBufferObject>[] frameToBufferObjects;
    // -1 until the first nextFrame() call
    private int frameIndex = -1;
    // index of the VBO handed out last in the current frame, -1 if none yet
    private int vboIndex = -1;

    /**
     * @param glTarget buffer target, e.g. GL_ARRAY_BUFFER or GL_ELEMENT_ARRAY_BUFFER
     * @param glUsage  usage hint for the data store, e.g. GL_STATIC_DRAW or GL_STREAM_DRAW
     * @param unsync   passed on to every VBO created by this provider
     */
    @SuppressWarnings("unchecked")
    public MappedVertexBufferObjectProvider(int glTarget, int glUsage, boolean unsync) {
        this.glTarget = glTarget; // GL_ARRAY_BUFFER, GL_ELEMENT_ARRAY_BUFFER
        this.glUsage = glUsage; // GL_STATIC_DRAW, GL_STREAM_DRAW
        this.unsync = unsync;

        frameToBufferObjects = new ArrayList[MAX_WINDOW_BUFFER_COUNT];
        for (int i = 0; i < frameToBufferObjects.length; i++) {
            frameToBufferObjects[i] = new ArrayList<>();
        }
    }

    /** Moves on to the next frame slot (wrapping around) and restarts at its first VBO. */
    public void nextFrame() {
        frameIndex += 1;
        frameIndex %= frameToBufferObjects.length;
        vboIndex = -1;
    }

    /**
     * Returns the next VBO of the current frame slot, creating one if the slot has run
     * out, and binds it.
     *
     * @throws IllegalStateException if {@link #nextFrame()} has not been called yet, or
     *                               if the VBO to hand out is already the bound one
     */
    public MappedVertexBufferObject nextVBO() {
        if (frameIndex == -1) {
            throw new IllegalStateException("not in a frame");
        }

        vboIndex += 1;
        List<MappedVertexBufferObject> vbos = frameToBufferObjects[frameIndex];
        if (vboIndex == vbos.size()) {
            vbos.add(new MappedVertexBufferObject(glTarget, glUsage, unsync));
        }

        MappedVertexBufferObject object = vbos.get(vboIndex);
        object.bind();
        return object;
    }

    /**
     * Orphans the data store of every pooled VBO.
     * <p>
     * Each VBO is bound in turn, because {@code orphan()} only works on the bound
     * object; afterwards nothing is left bound to the target.
     */
    public void orphanAll() {
        boolean boundAny = false;
        for (List<MappedVertexBufferObject> vbos : frameToBufferObjects) {
            for (MappedVertexBufferObject object : vbos) {
                bindTolerantly(object);
                object.orphan();
                boundAny = true;
            }
        }
        unbindIf(boundAny);
    }

    /**
     * Shrinks (or grows) the data store of every pooled VBO to its last requested size.
     * <p>
     * Each VBO is bound in turn, because {@code trimToSize()} only works on the bound
     * object; afterwards nothing is left bound to the target.
     */
    public void trimAllToSize() {
        boolean boundAny = false;
        for (List<MappedVertexBufferObject> vbos : frameToBufferObjects) {
            for (MappedVertexBufferObject object : vbos) {
                bindTolerantly(object);
                object.trimToSize();
                boundAny = true;
            }
        }
        unbindIf(boundAny);
    }

    /**
     * Deletes every pooled VBO and removes it from the pool, so that later frames get
     * fresh VBOs instead of deleted ones.
     *
     * @throws IllegalStateException if a pooled VBO is still bound or still mapped;
     *                               VBOs deleted before that point stay removed
     */
    public void delete() {
        for (List<MappedVertexBufferObject> vbos : frameToBufferObjects) {
            for (Iterator<MappedVertexBufferObject> it = vbos.iterator(); it.hasNext();) {
                it.next().delete();
                it.remove();
            }
        }
        // the current frame's list is empty now; start handing out from its beginning
        vboIndex = -1;
    }

    @Override
    public String toString() {
        int[] vboCounts = new int[frameToBufferObjects.length];
        for (int i = 0; i < vboCounts.length; i++) {
            vboCounts[i] = frameToBufferObjects[i].size();
        }
        return this.getClass().getSimpleName() + "[" + Arrays.toString(vboCounts) + "]";
    }

    /**
     * Makes {@code object} the bound VBO. {@code bind()} rejects the one object that is
     * already bound and offers no query for it, so that rejection is treated as success.
     */
    private static void bindTolerantly(MappedVertexBufferObject object) {
        try {
            object.bind();
        } catch (IllegalStateException alreadyBound) {
            // already the bound object, which is all that is needed here
        }
    }

    /** Releases the binding left behind by a maintenance pass, if it bound anything. */
    private void unbindIf(boolean boundAny) {
        if (boundAny) {
            MappedVertexBufferObject.unbind(glTarget);
        }
    }
}
import static org.lwjgl.opengl.GL15.*;
import java.nio.*;
import org.lwjgl.opengl.*;
/**
 * A single buffer object that is filled through mapping.
 * <p>
 * The class tracks which instance is bound (one instance at a time, across all
 * instances) and whether this instance is mapped, and throws
 * {@link IllegalStateException} when a method is called in the wrong state. All
 * size and mapping operations require this instance to be the bound one.
 */
public class MappedVertexBufferObject {
    private final int glTarget, glUsage;
    private final int handle;
    // requestedSize: bytes to map; allocatedSize: bytes of the current data store
    private int requestedSize, allocatedSize;
    private boolean isMapped;
    private final boolean unsync;

    // the instance bound last through bind(), or null after unbind(int)
    private static MappedVertexBufferObject bound;

    /**
     * Generates the buffer object name; no storage is allocated yet.
     *
     * @param glTarget buffer target all GL calls of this object are issued against
     * @param glUsage  usage hint passed to glBufferData
     * @param unsync   when true, range mappings additionally request
     *                 GL_MAP_UNSYNCHRONIZED_BIT; has no effect on the glMapBuffer fallback
     */
    public MappedVertexBufferObject(int glTarget, int glUsage, boolean unsync) {
        this.glTarget = glTarget;
        this.glUsage = glUsage;
        this.unsync = unsync;
        this.handle = glGenBuffers();
    }

    /**
     * Binds this buffer to its target and records it as the bound instance.
     *
     * @throws IllegalStateException if this instance is already the bound one
     */
    public void bind() {
        if (bound == this) {
            throw new IllegalStateException("already bound");
        }
        bound = this;
        glBindBuffer(glTarget, this.handle);
    }

    /**
     * Records {@code size} as the number of bytes to map and (re)allocates the data
     * store only when it is currently smaller than that.
     *
     * @param size required size in bytes, must be positive
     * @throws IllegalStateException if this instance is not bound
     */
    public void ensureSize(int size) {
        assert size > 0;
        requireBound();

        requestedSize = size;
        if (size > allocatedSize) {
            glBufferData(glTarget, size, glUsage);
            allocatedSize = size;
        }
    }

    /**
     * Reallocates the data store to exactly the last requested size, if it differs.
     *
     * @throws IllegalStateException if this instance is not bound
     */
    public void trimToSize() {
        requireBound();

        if (requestedSize != allocatedSize) {
            glBufferData(glTarget, requestedSize, glUsage);
            allocatedSize = requestedSize;
        }
    }

    /**
     * Replaces the data store with an empty one and resets both sizes to zero.
     *
     * @throws IllegalStateException if this instance is not bound
     */
    public void orphan() {
        requireBound();

        glBufferData(glTarget, 0, glUsage);
        allocatedSize = requestedSize = 0;
    }

    /**
     * Maps the buffer for writing.
     * <p>
     * The object is marked as mapped only once the GL binding has returned a mapping.
     * If the call throws or returns {@code null}, the object stays unmapped, so it can
     * still be mapped again or deleted.
     *
     * @return the mapped memory, as returned by the GL binding
     * @throws IllegalStateException if this instance is not bound, no size has been
     *                               requested, or it is already mapped
     */
    public ByteBuffer map() {
        requireBound();
        if (requestedSize == 0) {
            throw new IllegalStateException("no data");
        }
        if (isMapped) {
            throw new IllegalStateException("already mapped");
        }

        ByteBuffer mapped = mapForWriting();
        isMapped = mapped != null;
        return mapped;
    }

    /**
     * Unmaps the buffer.
     *
     * @throws IllegalStateException if this instance is not bound or not mapped
     */
    public void unmap() {
        requireBound();
        if (!isMapped) {
            throw new IllegalStateException("not mapped");
        }
        isMapped = false;
        glUnmapBuffer(glTarget);
    }

    /**
     * Deletes the buffer object.
     *
     * @throws IllegalStateException if this instance is still bound or still mapped
     */
    public void delete() {
        if (bound == this) {
            throw new IllegalStateException("still bound");
        }
        if (isMapped) {
            throw new IllegalStateException("still mapped");
        }
        glDeleteBuffers(handle);
    }

    /**
     * Binds buffer 0 to {@code glTarget} and clears the record of the bound instance.
     *
     * @throws IllegalStateException if no instance is recorded as bound
     */
    public static void unbind(int glTarget) {
        if (bound == null) {
            throw new IllegalStateException("none bound");
        }
        glBindBuffer(glTarget, 0);
        bound = null;
    }

    /** Rejects the call unless this instance is the one recorded as bound. */
    private void requireBound() {
        if (bound != this) {
            throw new IllegalStateException("not bound");
        }
    }

    /**
     * Issues the actual mapping call: the first {@code requestedSize} bytes through
     * glMapBufferRange (GL 3.0, else ARB_map_buffer_range), or a plain write-only
     * glMapBuffer when neither is available.
     */
    private ByteBuffer mapForWriting() {
        long offset = 0;
        long length = requestedSize;

        if (GLContext.getCapabilities().OpenGL30) {
            int access = GL30.GL_MAP_WRITE_BIT;
            if (unsync) {
                access |= GL30.GL_MAP_UNSYNCHRONIZED_BIT;
            }
            return GL30.glMapBufferRange(glTarget, offset, length, access, null);
        }

        if (GLContext.getCapabilities().GL_ARB_map_buffer_range) {
            int access = ARBMapBufferRange.GL_MAP_WRITE_BIT;
            if (unsync) {
                access |= ARBMapBufferRange.GL_MAP_UNSYNCHRONIZED_BIT;
            }
            return ARBMapBufferRange.glMapBufferRange(glTarget, offset, length, access, null);
        }

        int access = GL_WRITE_ONLY;
        return glMapBuffer(glTarget, access, null);
    }
}