VBO performance

Most people associate immediate mode with the fixed function pipeline, that’s what I was talking about… Sorry OP and everyone else.

No, it’s not that. There is virtually no overdraw, and most sprites are culled before they get written out. I’m barely touching the fillrate of the card (which is a GTX280 on my development machine - 5 years old but still pretty hot). The cost is almost entirely due to transforming and writing sprite data out to VBOs, which I’ve just multithreaded; that has given me a tidy speed increase :slight_smile: Various incantations involving glMapBufferRange flags give serious speed increases too.

Cas :slight_smile:

That is so odd. On my laptop’s integrated GPU (first-gen i5) I can easily push 50K+ at a solid 60. This is without any game logic, so there is that, but I would think that it wouldn’t vary that much. I am sure your sprite batcher is more complicated than mine, but it makes me wonder why it would be the bottleneck.

PS. multithreading…nice job man.

Well, here’s the code to write just one sprite:


	/**
	 * Writes the four vertices of one sprite's quad into {@code vertexBuf}.
	 *
	 * <p>Mirror/flip state and alpha are accumulated over the sprite and all of its
	 * ancestors. The image rectangle is scaled, rotated by the sprite's angle and
	 * offset by the sprite's position; when the sprite has a parent, each corner is
	 * additionally passed through {@code transform}.
	 *
	 * <p>Per vertex the following values are emitted, in this order: x, y,
	 * {@code y - sprite.y}, texture x, texture y, z, packed colour (via
	 * {@link #writeColor}), {@code mode}, {@code custom00}, {@code custom01},
	 * mirror sign (-1 or 1) and the angle in radians.
	 *
	 * @param sprite    the sprite to write
	 * @param transform transform applied to the corners when the sprite has a parent
	 * @param vertexBuf destination buffer, written at its current position
	 */
	public void writeSprite(Sprite sprite, Matrix4f transform, FloatBuffer vertexBuf) {
		final SpriteImage image = sprite.getImage();
		final float xscale = sprite.xscale;
		final float yscale = sprite.yscale;

		// Fold alpha and mirror/flip state up through the ancestor chain.
		int alpha = (int) ((sprite.alpha * sprite.masterAlpha) * BattleRenderingConstants.ALPHA_DIV);
		boolean flipped = sprite.isFlipped();
		boolean mirrored = sprite.isMirrored();
		final Sprite parent = sprite.getParent();
		Sprite ancestor = parent;
		while (ancestor != null) {
			flipped ^= ancestor.isFlipped();
			mirrored ^= ancestor.isMirrored();
			alpha = (int) ((alpha * ancestor.alpha * ancestor.masterAlpha) * BattleRenderingConstants.ALPHA_DIV * BattleRenderingConstants.ALPHA_DIV);
			ancestor = ancestor.getParent();
		}

		final float x = sprite.x + (mirrored ? -sprite.ox : sprite.ox);
		final float y = sprite.y + (flipped ? -sprite.oy : sprite.oy);

		// Scaled extents of the image rectangle relative to its hotspot.
		final float nearX;
		final float farX;
		if (mirrored) {
			nearX = (image.ow - image.ox - image.w - image.hotspotx) * xscale;
			farX = (image.ow - image.hotspotx - image.ox) * xscale;
		} else {
			nearX = (image.ox - image.hotspotx) * xscale;
			farX = (image.ox + image.w - image.hotspotx) * xscale;
		}
		final float nearY;
		final float farY;
		if (flipped) {
			nearY = (image.oh - image.oy - image.h - image.hotspoty) * yscale;
			farY = (image.oh - image.hotspoty - image.oy) * yscale;
		} else {
			nearY = (image.oy - image.hotspoty) * yscale;
			farY = (image.oy + image.h - image.hotspoty) * yscale;
		}

		// Rotation terms; the angle is negated for mirrored sprites.
		final float angle = (float) toRadians(mirrored ? -sprite.angle : sprite.angle);
		final float cos = (float) Math.cos(angle);
		final float sin = (float) Math.sin(angle);

		final float cosNearX = cos * nearX;
		final float sinNearY = sin * nearY;
		final float cosFarX = cos * farX;
		final float sinFarY = sin * farY;
		final float sinNearX = sin * nearX;
		final float cosNearY = cos * nearY;
		final float sinFarX = sin * farX;
		final float cosFarY = cos * farY;

		// Rotated and translated corners, before any parent transform.
		final float localX0 = (cosNearX - sinNearY) + x;
		final float localY0 = (sinNearX + cosNearY) + y;
		final float localX1 = (cosFarX - sinNearY) + x;
		final float localY1 = (sinFarX + cosNearY) + y;
		final float localX2 = (cosFarX - sinFarY) + x;
		final float localY2 = (sinFarX + cosFarY) + y;
		final float localX3 = (cosNearX - sinFarY) + x;
		final float localY3 = (sinNearX + cosFarY) + y;

		final float x0, y0, x1, y1, x2, y2, x3, y3;
		if (parent == null) {
			x0 = localX0;
			y0 = localY0;
			x1 = localX1;
			y1 = localY1;
			x2 = localX2;
			y2 = localY2;
			x3 = localX3;
			y3 = localY3;
		} else {
			// Child sprite: run every corner through the supplied transform.
			final Vector4f temp = TEMP.get(), dest = DEST.get();

			temp.x = localX0;
			temp.y = localY0;
			transform(transform, temp, dest);
			x0 = dest.x;
			y0 = dest.y;

			temp.x = localX1;
			temp.y = localY1;
			transform(transform, temp, dest);
			x1 = dest.x;
			y1 = dest.y;

			temp.x = localX2;
			temp.y = localY2;
			transform(transform, temp, dest);
			x2 = dest.x;
			y2 = dest.y;

			temp.x = localX3;
			temp.y = localY3;
			transform(transform, temp, dest);
			x3 = dest.x;
			y3 = dest.y;
		}

		// Texture coordinates swap ends when the sprite is mirrored / flipped.
		final float txA = mirrored ? image.tx1 : image.tx0;
		final float txB = mirrored ? image.tx0 : image.tx1;
		final float tyA = flipped ? image.ty0 : image.ty1;
		final float tyB = flipped ? image.ty1 : image.ty0;

		final float z = image.z;
		final float m = mirrored ? -1.0f : 1.0f;

		putQuadVertex(vertexBuf, sprite, x0, y0, txA, tyA, z, sprite.color00, alpha, m, angle);
		putQuadVertex(vertexBuf, sprite, x1, y1, txB, tyA, z, sprite.color10, alpha, m, angle);
		putQuadVertex(vertexBuf, sprite, x2, y2, txB, tyB, z, sprite.color11, alpha, m, angle);
		putQuadVertex(vertexBuf, sprite, x3, y3, txA, tyB, z, sprite.color01, alpha, m, angle);
	}

	/** Emits a single vertex of the sprite quad in the layout described on {@code writeSprite}. */
	private void putQuadVertex(FloatBuffer vertexBuf, Sprite sprite, float vx, float vy, float tx, float ty, float z, int color, int alpha, float m, float angle) {
		vertexBuf.put(vx);
		vertexBuf.put(vy);
		vertexBuf.put(vy - sprite.y);
		vertexBuf.put(tx);
		vertexBuf.put(ty);
		vertexBuf.put(z);
		writeColor(color, alpha, vertexBuf);
		vertexBuf.put(mode);
		vertexBuf.put(sprite.custom00);
		vertexBuf.put(sprite.custom01);
		vertexBuf.put(m);
		vertexBuf.put(angle);
	}

	/**
	 * Packs a colour into the bits of a single float and appends it to {@code dest}.
	 *
	 * <p>The top byte of {@code color} is combined with {@code alpha} and scaled by
	 * {@code ALPHA_DIV}; the three lower bytes are each multiplied by that value
	 * scaled by {@code ALPHA_DIV} once more (pre-multiplied alpha). When the style
	 * is glowing and not baking, the alpha byte of the packed value is left at zero.
	 *
	 * @param color packed colour; bits 24-31 are treated as its alpha
	 * @param alpha additional alpha applied on top of the colour's own alpha
	 * @param dest  destination buffer, written at its current position
	 */
	@Override
	public void writeColor(int color, int alpha, FloatBuffer dest) {
		final float scaledAlpha = ((color >> 24) & 0xFF) * alpha * BattleRenderingConstants.ALPHA_DIV;
		final float premultiplier = scaledAlpha * BattleRenderingConstants.ALPHA_DIV;

		// Pre-multiply each of the three low bytes and put it back in its slot.
		final int byte0 = (int) ((color & 0xFF) * premultiplier);
		final int byte1 = (int) (((color >> 8) & 0xFF) * premultiplier);
		final int byte2 = (int) (((color >> 16) & 0xFF) * premultiplier);
		int packed = byte0 | (byte1 << 8) | (byte2 << 16);

		// Only non-glowing (or baking) output carries the alpha byte.
		if (!glowing || isBaking()) {
			packed |= ((int) scaledAlpha) << 24;
		}

		// NOTE(review): some packed values are NaN bit patterns; confirm the target
		// platform preserves them through Float.intBitsToFloat.
		dest.put(Float.intBitsToFloat(packed));
	}

I have to do that 20,000 times in 17ms… after I’ve figured out which order to draw them in!

The other huge speedup I’ve managed is by putting the game logic in - wait for it - a separate thread. This is both more difficult and easier than it looks, though it helps that this particular game is designed around doing something like this. The game logic thread only needs to be interrupted for a very short time whilst sprite parameters are updated so the logic thread is usually well utilised.

Cas :slight_smile:

Any way you could move some stuff, such as rotation, to a shader? It may speed things up. It looks similar to what I used, which I posted in (I think) the tutorials section. Mine is not using a VBO but the built-in VA. I have since changed it to use a single interleaved buffer but still use the built-in VA.

I would do but it’s not the whole story, and besides, the rotation is just a few multiplies… to get the shader to do it I’d have to pass in a whole bunch of extra data to the VBO instead. Here’s the method that calls writeSprite:


		/**
		 * Adds a sprite that is the root of a hierarchy, together with its descendants.
		 *
		 * <p>The hierarchy is flattened breadth-first into {@code context.hierarchy}
		 * (skipping children that are invisible or have no render style), sorted with
		 * {@code ORDER_COMPARATOR}, buffered in a single {@code DrawMode.BUFFER} pass
		 * starting at the root, and finally emitted sprite by sprite in sorted order
		 * with {@code DrawMode.OUTPUT}.
		 *
		 * @param sprite   root of the hierarchy
		 * @param useStyle style used for the root during the buffering pass
		 * @param context  per-call scratch state (hierarchy array, transform stack)
		 */
		public void addRoot(Sprite sprite, Style useStyle, Context context) {
			final Sprite[] flat = context.hierarchy;

			// Breadth-first flatten: 'count' grows as children are appended behind the cursor.
			int count = 1;
			flat[0] = sprite;
			for (int cursor = 0; cursor < count; cursor++) {
				final Sprite current = flat[cursor];
				final Sprite[] kids = current.getChildren();
				if (kids != null) {
					final int numKids = current.getNumChildren();
					for (int k = 0; k < numKids; k++) {
						final Sprite kid = kids[k];
						if (kid.isVisible() && kid.getRenderStyle() != null) {
							flat[count++] = kid;
						}
					}
				}
			}

			// The element count is tiny, so GnomeSort is used: cache friendly and
			// fast enough at this size.
			GnomeSort.sort(flat, 0, count, ORDER_COMPARATOR);

			// Pass 1: compute and freeze the vertex data without touching the vertex buffers.
			add(sprite, useStyle, DrawMode.BUFFER, context);

			// Pass 2: emit the frozen data in draw order.
			// NOTE(review): the root is emitted with its own getRenderStyle() here,
			// not with useStyle - confirm the two always agree.
			int next = 0;
			while (next < count) {
				final Sprite ordered = flat[next++];
				add(ordered, ordered.getRenderStyle(), DrawMode.OUTPUT, context);
			}
		}

		/**
		 * Starts a new state run when the given sprite cannot be drawn with the
		 * current one.
		 *
		 * <p>A new run is needed when there is no current run or style, when the
		 * style ID changes, or when any non-null texture of the sprite's image (for
		 * the texture slots the style uses) differs from the currently tracked
		 * texture in that slot. If the vertex stride changed as well, the run's
		 * cursors restart from zero at fresh buffer offsets; otherwise the run
		 * continues from the current cursors and the previous run's offsets.
		 *
		 * @param sprite   the sprite about to be drawn
		 * @param useStyle the style it will be drawn with; must not be null
		 */
		private void checkStartNewState(Sprite sprite, Style useStyle) {
			assert useStyle != null;

			boolean startNew = currentRun == null
					|| currentStyle == null
					|| useStyle.getStyleID() != currentStyle.getStyleID();

			if (!startNew) {
				// Same style: only a texture change can still force a new run.
				final SpriteImage candidate = sprite.image;
				if (candidate != null) {
					for (int slot = 0, n = useStyle.getNumTextures(); slot < n && slot < candidate.texture.length; slot++) {
						final GLBaseTexture wanted = candidate.texture[slot];
						if (wanted != null && currentTexture[slot] != wanted) {
							startNew = true;
							break;
						}
					}
				}
			}

			if (!startNew) {
				return;
			}

			// Remember the old run's offsets before nextStateRun() replaces currentRun.
			final int previousVertexOffset = currentRun == null ? 0 : currentRun.vertexOffset;
			final int previousIndexOffset = currentRun == null ? 0 : currentRun.indexOffset;
			nextStateRun();
			currentRun.style = useStyle;

			final SpriteImage img = sprite.image;
			if (img != null) {
				for (int slot = 0, n = useStyle.getNumTextures(); slot < n && slot < img.texture.length; slot++) {
					final GLBaseTexture tex = img.texture[slot];
					if (tex != null) {
						currentTexture[slot] = tex;
						currentRun.texture[slot] = tex;
					}
				}
			}

			final int newStride = useStyle.getVertexStride();
			if (currentStride == newStride) {
				// Same stride as before: carry on from the current cursors and offsets.
				currentRun.startIndex = indexCursor;
				currentRun.endIndex = indexCursor;
				currentRun.startVertex = vertexCursor;
				currentRun.endVertex = vertexCursor;
				currentRun.vertexOffset = previousVertexOffset;
				currentRun.indexOffset = previousIndexOffset;
			} else {
				// Different stride: restart the cursors and begin a new offset.
				currentStride = newStride;
				indexCursor = 0;
				vertexCursor = 0;
				currentRun.startIndex = indexCursor;
				currentRun.endIndex = 0;
				currentRun.startVertex = vertexCursor;
				currentRun.endVertex = 0;
				currentRun.vertexOffset = vertexPosition * 4;
				currentRun.indexOffset = mappedIndices.position() * 4;
			}

			currentStyle = useStyle;
		}

		/**
		 * Adds one sprite to the batch in the given draw mode.
		 *
		 * <p>In {@code DrawMode.BUFFER} the sprite's vertex data is computed into its
		 * own raw buffer (allocated on demand) unless the sprite is frozen and already
		 * has one; nothing is written to the shared vertex or index buffers, and the
		 * method then recurses into the sprite's visible children with a transform
		 * pushed onto {@code context.stack}.
		 *
		 * <p>In every other mode {@link #checkStartNewState} is called first, the
		 * sprite's vertex data ends up in the shared vertex buffer at
		 * {@code vertexPosition}, six indices (two triangles) are appended, the
		 * cursors and current run are advanced, and the method returns without
		 * visiting children. In {@code DrawMode.OUTPUT} the data is always copied from
		 * the sprite's raw buffer and {@code writeSprite} is never called.
		 *
		 * @param sprite   the sprite to add
		 * @param useStyle style supplying the vertex stride and the {@code writeSprite} implementation
		 * @param mode     draw mode, see above
		 * @param context  holds the transform stack and scratch vector used while recursing
		 */
		public void add(Sprite sprite, Style useStyle, DrawMode mode, Context context) {
			if (mode != DrawMode.BUFFER) {
				// System.out.println("Position at " + vertexPosition + " for sprite " + System.identityHashCode(sprite) + " (base: " + baseVertexOffset + ")");
				checkStartNewState(sprite, useStyle);
			}

			boolean frozen = sprite.isFrozen();
			ByteBuffer raw = sprite.getRaw();
			FloatBuffer vertexBuf = mappedVertices;
			int startPosition = vertexPosition;
			vertexBuf.position(startPosition);
			// The stride is counted in 4-byte elements: byte sizes below use "* 4" / "<< 2".
			// Presumably it covers all four vertices of one sprite - TODO confirm.
			int vertexStride = useStyle.getVertexStride();
			if (frozen) {
				if (raw == null) {
					// Frozen but never computed: allocate the cache and fall through to compute it once.
					raw = ByteBuffer.allocate(vertexStride * 4).order(ByteOrder.nativeOrder());
					sprite.setRaw(raw);
					frozen = false; // cause calcs to be made and raw to be written to
				} else {
					if (mode != DrawMode.BUFFER) {
						// Frozen with cached data: copy the cached bytes straight into the shared buffer.
						raw = sprite.getRaw();
						raw.limit(vertexStride * 4).rewind();
						rawVertices.limit((startPosition + vertexStride) << 2).position(startPosition << 2);
						rawVertices.put(raw);
					}
				}
			} else if (mode == DrawMode.BUFFER) {
				// Buffering a non-frozen sprite: make sure it has a raw buffer to compute into.
				if (raw == null) {
					raw = ByteBuffer.allocate(vertexStride * 4).order(ByteOrder.nativeOrder());
					sprite.setRaw(raw);
				}
			}

			if (mode == DrawMode.OUTPUT) {
				// Output pass: emit whatever is stored in the sprite's raw buffer.
				// NOTE(review): assumes an earlier BUFFER pass gave this sprite a raw buffer;
				// getRaw() returning null here would throw - confirm against callers.
				raw = sprite.getRaw();
				raw.limit(vertexStride * 4).rewind();
				rawVertices.limit((startPosition + vertexStride) << 2).position(startPosition << 2);
				rawVertices.put(raw);
			} else if (!frozen) {
				// Compute fresh vertex data: into the raw buffer when there is one,
				// otherwise directly into mappedVertices at startPosition.
				if (raw != null) {
					raw.rewind();
					vertexBuf = raw.asFloatBuffer();
				}

				useStyle.writeSprite(sprite, context.stack[context.stackDepth], vertexBuf);

				if (raw != null && mode != DrawMode.BUFFER) {
					// Data was computed into the raw buffer but must also be drawn now: copy it across.
					raw.rewind();
					rawVertices.limit((startPosition + vertexStride) << 2).position(startPosition << 2);
					rawVertices.put(raw);
				}
			}

			if (mode != DrawMode.BUFFER) {
				// Write indices: need 6, for two triangles
				IntBuffer indexBuf = mappedIndices;
				int vx = vertexCursor;
				indexBuf.put(vx + 0);
				indexBuf.put(vx + 1);
				indexBuf.put(vx + 2);
				indexBuf.put(vx + 0);
				indexBuf.put(vx + 2);
				indexBuf.put(vx + 3);

				// Advance the cursors past this sprite and extend the current run.
				indexCursor += 6;
				vertexPosition += vertexStride;
				vertexCursor += 4;
				currentRun.endIndex += 6;
				currentRun.endVertex += 4;
			}

			// Only the BUFFER pass walks the hierarchy; drawing modes stop here.
			if (mode != DrawMode.BUFFER) {
				return;
			}

			Sprite[] children = sprite.getChildren();
			if (children != null) {
				// Push a new transform for the children: parent transform, then this
				// sprite's translation, rotation and scale.
				context.stackDepth++;

				// Effective mirror/flip is the XOR of this sprite's flags with all ancestors'.
				boolean mirrored = sprite.isMirrored();
				boolean flipped = sprite.isFlipped();
				Sprite parent = sprite.getParent();
				while (parent != null) {
					mirrored ^= parent.isMirrored();
					flipped ^= parent.isFlipped();
					parent = parent.getParent();
				}
				float ox = mirrored ? -sprite.ox : sprite.ox;
				float oy = flipped ? -sprite.oy : sprite.oy;
				context.stack[context.stackDepth].load(context.stack[context.stackDepth - 1]);
				context.temp_vec3.set(sprite.x + ox, sprite.y + oy, 0.0f);
				context.stack[context.stackDepth].translate(context.temp_vec3);

				float angle = sprite.angle;
				if (angle != 0.0f) {
					// Mirrored sprites rotate the opposite way.
					if (mirrored) {
						angle = -angle;
					}
					context.stack[context.stackDepth].rotate((float) Math.toRadians(angle), ROTATE_VEC);
				}
				if (sprite.xscale != 1.0f || sprite.yscale != 1.0f) {
					context.temp_vec3.set(sprite.xscale, sprite.yscale, 0.0f);
					context.stack[context.stackDepth].scale(context.temp_vec3);
				}

				// Buffer each visible child that has a render style, using the transform just pushed.
				for (int i = 0, n = sprite.getNumChildren(); i < n; i++) {
					Sprite child = children[i];
					if (!child.isVisible()) {
						continue;
					}

					Style childUseStyle = child.getRenderStyle();
					if (childUseStyle == null) {
						continue;
					}

					if (childUseStyle.getRenderSprite()) {
						add(child, childUseStyle, DrawMode.BUFFER, context);
					} else {
						assert false : "Not yet implemented properly"; // We need to apply the current transform to the geometry...
						                                               // hmmm
					}
				}

				// Pop the transform pushed above.
				context.stackDepth--;
			}
		}

Because our sprites exist in hierarchies we have two ways to add sprites to the scene; as the root of a hierarchy, or as a plain sprite. When you start adding child sprites there’s a cumulative transform applied. Note also how at that point I’m tracking GL state changes to build up batches.

Cas :slight_smile: