Improve IQM CPU vertex skinning performance

Only calculate the vertex blend matrix once per surface for each unique combination of bone indexes and weights, instead of recalculating it for every vertex. For best performance, a model's surfaces need to use few distinct combinations of vertex bone indexes and weights. Unroll loops so GCC optimizes them better. In my tests, drawing an animated IQM may take 50% as long as before in opengl1 and 70% as long in opengl2. It will vary by model, though, and might not help much at all. Unanimated IQM models now skip the matrix math altogether.
2018-07-27 17:40:21 -05:00 · 2018-07-27 17:40:21 -05:00 · cccd283be8
commit cccd283be8
parent fdc08e860e
4 changed files with 601 additions and 254 deletions
--- a/code/renderergl2/tr_local.h
+++ b/code/renderergl2/tr_local.h
@@ -950,21 +950,26 @@ typedef struct {
 	struct srfIQModel_s	*surfaces;

 	int		*triangles;
+
+	// vertex arrays
 	float		*positions;
 	float		*texcoords;
 	float		*normals;
 	float		*tangents;
-	byte		*blendIndexes;
+	byte		*colors;
+	int		*influences; // [num_vertexes] indexes into influenceBlendIndexes / influenceBlendWeights
+
+	// unique list of vertex blend indexes/weights for faster CPU vertex skinning
+	byte		*influenceBlendIndexes; // [num_influences]
 	union {
 		float	*f;
 		byte	*b;
-	} blendWeights;
-	byte		*colors;
+	} influenceBlendWeights; // [num_influences]

 	// depending upon the exporter, blend indices and weights might be int/float
 	// as opposed to the recommended byte/byte, for example Noesis exports
 	// int/float whereas the official IQM tool exports byte/byte
-	byte blendWeightsType; // IQM_UBYTE or IQM_FLOAT
+	int		blendWeightsType; // IQM_UBYTE or IQM_FLOAT

 	char		*jointNames;
 	int		*jointParents;
@@ -981,6 +986,7 @@ typedef struct srfIQModel_s {
 	iqmData_t	*data;
 	int		first_vertex, num_vertexes;
 	int		first_triangle, num_triangles;
+	int		first_influence, num_influences;
 } srfIQModel_t;

 typedef struct srfVaoMdvMesh_s