Loading...

Details

Type: Bug
Resolution: Invalid
Priority: Not Evaluated
Fix Version/s: None
Affects Version/s: 5.12.0
Component/s: GUI: OpenGL
Labels:
None
Environment:
Windows 10, running x86 Qt 5.12 build

Platform/s:

Windows

Description

I am experiencing very slow compilation time for the following compute shader (to be precise, on calls to glGetShaderiv), from 5-10+ minutes:

// TODO: Determine active clusters
// See: http://www.aortiz.me/2018/12/21/CG.html#tiled-shading--forward
// NOTE: Hanging on compute shader call glGetShaderiv(m_shaderID, GL_COMPILE_STATUS, &status) means that the compute shader is taking a long time to compile//  The shader is dispatched six times because each thread group contains four z slices (24/4=6)// Thread groups sizes are actually relevant in this compute shader since I'm using shared GPU memory to reduce the number of reads and writes by only loading each light once per thread group, instead of once per cluster
// gl_NumWorkGroups - the number of work groups passed to the dispatch function
// gl_WorkGroupID - the current work group for this shader invocation. Each of the XZ components will be in the half-open rnage [0, gl_NumWorkGroups.XYZ)
// gl_LocalInvocationID - the current invocation of the shader within the work group
// gl_GlobalInvocationID - the current invocation of this shader within ALL work groups
// gl_LocalInvocationIndex - 1D representation of the gl_LocalInvocationID(used for indexing into a shared array)// TODO: Figure out optimal tile size, currently using a 16x9x24 subdivision#define FLT_MAX 3.402823466e+38
#define FLT_MIN 1.175494351e-38
#define DBL_MAX 1.7976931348623158e+308
#define DBL_MIN 2.2250738585072014e-308
#define LOCAL_SIZE_X 8
#define LOCAL_SIZE_Y 9
#define LOCAL_SIZE_Z 4layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;// Uniform block for light settings
// layout (std140) uniform LightSettingsBuffer
// {
	// int lightingModel;
	// int lightCount;
// };// TODO: Pack this more efficiently
struct Light {
	vec4 position;
	vec4 direction;
	vec4 ambientColor;
	vec4 diffuseColor;
	vec4 specularColor;
	vec4 attributes;
	vec4 intensity;
	ivec4 typeIndexAndFlags;
	// uint flags;
};// Array containing offset and number of lights in a cluster
struct LightGrid{
    uint offset;
    uint count;
};struct VolumeTileAABB{
    vec4 minPoint;
    vec4 maxPoint;
};layout(std430, binding = 0) readonly buffer LightBuffer {
	Light data[];
} lightBuffer;
layout (std430, binding = 1) buffer clusterAABB{
    VolumeTileAABB cluster[ ];
};layout (std430, binding = 2) buffer screenToView{
    mat4 inverseProjection;
    uvec4 tileSizes;
    uvec2 screenDimensions;
};// layout (std430, binding = 3) buffer lightSSBO{
    // PointLight pointLight[];
// };// SSBO of active light indices
layout (std430, binding = 4) buffer lightIndexSSBO{
    uint globalLightIndexList[];
};layout (std430, binding = 5) buffer lightGridSSBO{
    LightGrid lightGrid[];
};layout (std430, binding = 6) buffer globalIndexCountSSBO{
    uint globalIndexCount;
};// Shared variables, shared between all invocations WITHIN A WORK GROUP
// TODO: See if I can use gl_WorkGroupSize for this, gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z
// A grouped-shared array which contains all the lights being evaluated
shared Light sharedLights[LOCAL_SIZE_X*LOCAL_SIZE_Y*LOCAL_SIZE_Z]; // A grouped-shared array which contains all the lights being evaluated, size is thread-countuniform mat4 viewMatrix;bool testSphereAABB(uint light, uint tile);
float sqDistPointAABB(vec3 point, uint tile);
bool testConeAABB(uint light, uint tile);
float getLightRange(uint lightIndex);
bool isEnabled(uint lightIndex);// Runs in batches of multiple Z slices at once
// In this implementation, 6 batches, since each thread group contains four z slices (24/4=6)
// We begin by each thread representing a cluster
// Then in the light traversal loop they change to representing lights
// Then change again near the end to represent clusters
// NOTE: Tiles actually mean clusters, it's just a legacy name from tiled shading
void main(){
	// Reset every frame
    globalIndexCount = 0; // How many lights are active in t  his scene
    uint threadCount = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; // Number of threads in a group, same as local_size_x, local_size_y, local_size_z
    uint lightCount  = lightBuffer.data.length(); // Number of total lights in the scene
    uint numBatches = uint((lightCount + threadCount -1) / threadCount); // Number of groups of lights that will be completed, i.e., number of passes
    uint tileIndex = gl_LocalInvocationIndex + gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * gl_WorkGroupID.z;
    // uint tileIndex = gl_GlobalInvocationID; // doesn't wortk, is uvec3
	
	// Local thread variables
    uint visibleLightCount = 0;
    uint visibleLightIndices[100]; // local light index list, to be transferred to global list	// Every light is being checked against every cluster in the view frustum
	// TODO: Perform active cluster determination
	// Each individual thread will be responsible for loading a light and writing it to shared memory so other threads can read it
    for( uint batch = 0; batch < numBatches; ++batch){
        uint lightIndex = batch * threadCount + gl_LocalInvocationIndex;        //Prevent overflow by clamping to last light which is always null
        lightIndex = min(lightIndex, lightCount);        //Populating shared light array
		// NOTE: It is VERY important that lightBuffer.data not be referenced after this point,
		// since that is not thread-safe
        sharedLights[gl_LocalInvocationIndex] = lightBuffer.data[lightIndex];
        barrier(); // Synchronize read/writes between invocations within a work group        //Iterating within the current batch of lights
        for( uint light = 0; light < threadCount; ++light){
            if( isEnabled(light)){
				uint lightType = uint(sharedLights[light].typeIndexAndFlags[0]);
				if(lightType == 0){
					// Point light
					if( testSphereAABB(light, tileIndex) ){
						visibleLightIndices[visibleLightCount] = batch * threadCount + light;
						visibleLightCount += 1;
					}
				}
				else if(lightType == 1){
					// Directional light
					visibleLightIndices[visibleLightCount] = batch * threadCount + light;
					visibleLightCount += 1;
				}
				else if(lightType == 2){
					// Spot light
					if( testConeAABB(light, tileIndex) ){
						visibleLightIndices[visibleLightCount] = batch * threadCount + light;
						visibleLightCount += 1;
					}
				}
            }
        }
    }    // We want all thread groups to have completed the light tests before continuing
    barrier();
	
	// Back to every thread representing a cluster	// Adding the light indices to the cluster light index list
    uint offset = atomicAdd(globalIndexCount, visibleLightCount);
    for(uint i = 0; i < visibleLightCount; ++i){
        globalLightIndexList[offset + i] = visibleLightIndices[i];
    }	// Updating the light grid for each cluster
    lightGrid[tileIndex].offset = offset;
    lightGrid[tileIndex].count = visibleLightCount;
}// Return whether or not the specified light intersects with the specified tile (cluster)
bool testSphereAABB(uint light, uint tile){
    float radius = getLightRange(light);
    vec3 center  = vec3(viewMatrix * sharedLights[light].position);
    float squaredDistance = sqDistPointAABB(center, tile);    return squaredDistance <= (radius * radius);
}// TODO: Different test for spot-lights
// Has been done by using several AABBs for spot-light cone, this could be a good approach, or even just use one to start.
bool testConeAABB(uint light, uint tile){
	// Light light = lightBuffer.data[lightIndex];
	// float innerAngleCos = light.attributes[0];
	// float outerAngleCos = light.attributes[1];
	// float innerAngle = acos(innerAngleCos);
	// float outerAngle = acos(outerAngleCos);
	// FIXME: Actually do something clever here
	return true;
}
// Get range of light given the specified light index
float getLightRange(uint lightIndex){
	int lightType = sharedLights[lightIndex].typeIndexAndFlags[0];
	float range;
	if(lightType == 0){
		// Point light
		float brightness = 0.01; // cutoff for end of range
		float c = sharedLights[lightIndex].attributes.x;
		float lin = sharedLights[lightIndex].attributes.y;
		float quad = sharedLights[lightIndex].attributes.z;
		
		range = (-lin + sqrt(lin*lin - 4.0 * c * quad + (4.0/brightness)* quad)) / (2.0 * quad);
	}
	else if(lightType == 1){
		// Directional light
		range = FLT_MAX;
	}
	else{
		// Spot light
		range = FLT_MAX;
	}
	return range;
}
// Whether the light at the specified index is enabled
bool isEnabled(uint lightIndex){
	uint flags = sharedLights[lightIndex].typeIndexAndFlags[2];
	return (flags | 1) != 0;
}
// Get squared distance from a point to the AABB of the specified tile (cluster)
float sqDistPointAABB(vec3 point, uint tile){
    float sqDist = 0.0;
    VolumeTileAABB currentCell = cluster[tile];
    cluster[tile].maxPoint[3] = tile;
    for(int i = 0; i < 3; ++i){
        float v = point[i];
        if(v < currentCell.minPoint[i]){
            sqDist += (currentCell.minPoint[i] - v) * (currentCell.minPoint[i] - v);
        }
        if(v > currentCell.maxPoint[i]){
            sqDist += (v - currentCell.maxPoint[i]) * (v - currentCell.maxPoint[i]);
        }
    }
	return sqDist;
}

This is using:

{
    m_shaderID = glCreateShader(GL_COMPUTE_SHADER);
    const char* sourceCStr = m_source.c_str();
    glShaderSource(m_shaderID,
        1, 
        &sourceCStr,
        NULL);    
    glCompileShader(m_shaderID);

    int status; 
    glGetShaderiv(m_shaderID, GL_COMPILE_STATUS, &status);
}

But replacing the sqDistPointAABB function with one that doesn't have a loop drops compilation time down to less than a second:

float sqDistPointAABB(vec3 point, uint tile){ 
float sqDist = 0.0; VolumeTileAABB currentCell = cluster[tile]; cluster[tile].maxPoint[3] = tile; 
vec3 minPoint = currentCell.minPoint.xyz; 
vec3 maxPoint = currentCell.maxPoint.xyz; 
vec3 t1 = vec3(lessThan(point, minPoint)); 
vec3 t2 = vec3(greaterThan(point, maxPoint)); 
vec3 sqDist = t1 * (minPoint - point) * (minPoint - point) + t2 * (maxPoint - point) * (maxPoint - point); 
return sqDist.x + sqDist.y + sqDist.z;
}

Attachments

- Sort By Name
- Sort By Date
- Ascending
- Descending
- Thumbnails
- List

light_culling.comp
10 kB
11 Aug '20 17:57

Gerrit Reviews

- Issue Only
- Show All Reviews
- Show Open Reviews
- Show All Issues
- Show Open Issues

No reviews matched the request. Check your Options in the drop-down menu of this sections header.

Extremely slow (10+ minute) compute shader compilation with a relatively simple shader

Details

Description

Attachments

Attachments

Gerrit Reviews

Activity

People

Dates

Gerrit Reviews