Uploaded image for project: 'Qt'
  1. Qt
  2. QTBUG-85995

Extremely slow (10+ minute) compute shader compilation with a relatively simple shader

    XMLWordPrintable

Details

    • Bug
    • Resolution: Invalid
    • Not Evaluated
    • None
    • 5.12.0
    • GUI: OpenGL
    • None
    • Windows 10, running x86 Qt 5.12 build
    • Windows

    Description

      I am experiencing very slow compilation time for the following compute shader (to be precise, on calls to glGetShaderiv), from 5-10+ minutes:

       

       

      // TODO: Determine active clusters
      // See: http://www.aortiz.me/2018/12/21/CG.html#tiled-shading--forward
      // NOTE: Hanging on compute shader call glGetShaderiv(m_shaderID, GL_COMPILE_STATUS, &status) means that the compute shader is taking a long time to compile//  The shader is dispatched six times because each thread group contains four z slices (24/4=6)// Thread groups sizes are actually relevant in this compute shader since I'm using shared GPU memory to reduce the number of reads and writes by only loading each light once per thread group, instead of once per cluster
      // gl_NumWorkGroups - the number of work groups passed to the dispatch function
      // gl_WorkGroupID - the current work group for this shader invocation. Each of the XZ components will be in the half-open rnage [0, gl_NumWorkGroups.XYZ)
      // gl_LocalInvocationID - the current invocation of the shader within the work group
      // gl_GlobalInvocationID - the current invocation of this shader within ALL work groups
      // gl_LocalInvocationIndex - 1D representation of the gl_LocalInvocationID(used for indexing into a shared array)// TODO: Figure out optimal tile size, currently using a 16x9x24 subdivision#define FLT_MAX 3.402823466e+38
      #define FLT_MIN 1.175494351e-38
      #define DBL_MAX 1.7976931348623158e+308
      #define DBL_MIN 2.2250738585072014e-308
      #define LOCAL_SIZE_X 8
      #define LOCAL_SIZE_Y 9
      #define LOCAL_SIZE_Z 4layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;// Uniform block for light settings
      // layout (std140) uniform LightSettingsBuffer
      // {
      	// int lightingModel;
      	// int lightCount;
      // };// TODO: Pack this more efficiently
      struct Light {
      	vec4 position;
      	vec4 direction;
      	vec4 ambientColor;
      	vec4 diffuseColor;
      	vec4 specularColor;
      	vec4 attributes;
      	vec4 intensity;
      	ivec4 typeIndexAndFlags;
      	// uint flags;
      };// Array containing offset and number of lights in a cluster
      struct LightGrid{
          uint offset;
          uint count;
      };struct VolumeTileAABB{
          vec4 minPoint;
          vec4 maxPoint;
      };layout(std430, binding = 0) readonly buffer LightBuffer {
      	Light data[];
      } lightBuffer;
      layout (std430, binding = 1) buffer clusterAABB{
          VolumeTileAABB cluster[ ];
      };layout (std430, binding = 2) buffer screenToView{
          mat4 inverseProjection;
          uvec4 tileSizes;
          uvec2 screenDimensions;
      };// layout (std430, binding = 3) buffer lightSSBO{
          // PointLight pointLight[];
      // };// SSBO of active light indices
      layout (std430, binding = 4) buffer lightIndexSSBO{
          uint globalLightIndexList[];
      };layout (std430, binding = 5) buffer lightGridSSBO{
          LightGrid lightGrid[];
      };layout (std430, binding = 6) buffer globalIndexCountSSBO{
          uint globalIndexCount;
      };// Shared variables, shared between all invocations WITHIN A WORK GROUP
      // TODO: See if I can use gl_WorkGroupSize for this, gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z
      // A grouped-shared array which contains all the lights being evaluated
      shared Light sharedLights[LOCAL_SIZE_X*LOCAL_SIZE_Y*LOCAL_SIZE_Z]; // A grouped-shared array which contains all the lights being evaluated, size is thread-countuniform mat4 viewMatrix;bool testSphereAABB(uint light, uint tile);
      float sqDistPointAABB(vec3 point, uint tile);
      bool testConeAABB(uint light, uint tile);
      float getLightRange(uint lightIndex);
      bool isEnabled(uint lightIndex);// Runs in batches of multiple Z slices at once
      // In this implementation, 6 batches, since each thread group contains four z slices (24/4=6)
      // We begin by each thread representing a cluster
      // Then in the light traversal loop they change to representing lights
      // Then change again near the end to represent clusters
      // NOTE: Tiles actually mean clusters, it's just a legacy name from tiled shading
      void main(){
      	// Reset every frame
          globalIndexCount = 0; // How many lights are active in t  his scene
          uint threadCount = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; // Number of threads in a group, same as local_size_x, local_size_y, local_size_z
          uint lightCount  = lightBuffer.data.length(); // Number of total lights in the scene
          uint numBatches = uint((lightCount + threadCount -1) / threadCount); // Number of groups of lights that will be completed, i.e., number of passes
          uint tileIndex = gl_LocalInvocationIndex + gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * gl_WorkGroupID.z;
          // uint tileIndex = gl_GlobalInvocationID; // doesn't wortk, is uvec3
      	
      	// Local thread variables
          uint visibleLightCount = 0;
          uint visibleLightIndices[100]; // local light index list, to be transferred to global list	// Every light is being checked against every cluster in the view frustum
      	// TODO: Perform active cluster determination
      	// Each individual thread will be responsible for loading a light and writing it to shared memory so other threads can read it
          for( uint batch = 0; batch < numBatches; ++batch){
              uint lightIndex = batch * threadCount + gl_LocalInvocationIndex;        //Prevent overflow by clamping to last light which is always null
              lightIndex = min(lightIndex, lightCount);        //Populating shared light array
      		// NOTE: It is VERY important that lightBuffer.data not be referenced after this point,
      		// since that is not thread-safe
              sharedLights[gl_LocalInvocationIndex] = lightBuffer.data[lightIndex];
              barrier(); // Synchronize read/writes between invocations within a work group        //Iterating within the current batch of lights
              for( uint light = 0; light < threadCount; ++light){
                  if( isEnabled(light)){
      				uint lightType = uint(sharedLights[light].typeIndexAndFlags[0]);
      				if(lightType == 0){
      					// Point light
      					if( testSphereAABB(light, tileIndex) ){
      						visibleLightIndices[visibleLightCount] = batch * threadCount + light;
      						visibleLightCount += 1;
      					}
      				}
      				else if(lightType == 1){
      					// Directional light
      					visibleLightIndices[visibleLightCount] = batch * threadCount + light;
      					visibleLightCount += 1;
      				}
      				else if(lightType == 2){
      					// Spot light
      					if( testConeAABB(light, tileIndex) ){
      						visibleLightIndices[visibleLightCount] = batch * threadCount + light;
      						visibleLightCount += 1;
      					}
      				}
                  }
              }
          }    // We want all thread groups to have completed the light tests before continuing
          barrier();
      	
      	// Back to every thread representing a cluster	// Adding the light indices to the cluster light index list
          uint offset = atomicAdd(globalIndexCount, visibleLightCount);
          for(uint i = 0; i < visibleLightCount; ++i){
              globalLightIndexList[offset + i] = visibleLightIndices[i];
          }	// Updating the light grid for each cluster
          lightGrid[tileIndex].offset = offset;
          lightGrid[tileIndex].count = visibleLightCount;
      }// Return whether or not the specified light intersects with the specified tile (cluster)
      bool testSphereAABB(uint light, uint tile){
          float radius = getLightRange(light);
          vec3 center  = vec3(viewMatrix * sharedLights[light].position);
          float squaredDistance = sqDistPointAABB(center, tile);    return squaredDistance <= (radius * radius);
      }// TODO: Different test for spot-lights
      // Has been done by using several AABBs for spot-light cone, this could be a good approach, or even just use one to start.
      bool testConeAABB(uint light, uint tile){
      	// Light light = lightBuffer.data[lightIndex];
      	// float innerAngleCos = light.attributes[0];
      	// float outerAngleCos = light.attributes[1];
      	// float innerAngle = acos(innerAngleCos);
      	// float outerAngle = acos(outerAngleCos);
      	// FIXME: Actually do something clever here
      	return true;
      }
      // Get range of light given the specified light index
      float getLightRange(uint lightIndex){
      	int lightType = sharedLights[lightIndex].typeIndexAndFlags[0];
      	float range;
      	if(lightType == 0){
      		// Point light
      		float brightness = 0.01; // cutoff for end of range
      		float c = sharedLights[lightIndex].attributes.x;
      		float lin = sharedLights[lightIndex].attributes.y;
      		float quad = sharedLights[lightIndex].attributes.z;
      		
      		range = (-lin + sqrt(lin*lin - 4.0 * c * quad + (4.0/brightness)* quad)) / (2.0 * quad);
      	}
      	else if(lightType == 1){
      		// Directional light
      		range = FLT_MAX;
      	}
      	else{
      		// Spot light
      		range = FLT_MAX;
      	}
      	return range;
      }
      // Whether the light at the specified index is enabled
      bool isEnabled(uint lightIndex){
      	uint flags = sharedLights[lightIndex].typeIndexAndFlags[2];
      	return (flags | 1) != 0;
      }
      // Get squared distance from a point to the AABB of the specified tile (cluster)
      float sqDistPointAABB(vec3 point, uint tile){
          float sqDist = 0.0;
          VolumeTileAABB currentCell = cluster[tile];
          cluster[tile].maxPoint[3] = tile;
          for(int i = 0; i < 3; ++i){
              float v = point[i];
              if(v < currentCell.minPoint[i]){
                  sqDist += (currentCell.minPoint[i] - v) * (currentCell.minPoint[i] - v);
              }
              if(v > currentCell.maxPoint[i]){
                  sqDist += (v - currentCell.maxPoint[i]) * (v - currentCell.maxPoint[i]);
              }
          }
      	return sqDist;
      }
      

       

       

      This is using:

      {
          m_shaderID = glCreateShader(GL_COMPUTE_SHADER);
          const char* sourceCStr = m_source.c_str();
          glShaderSource(m_shaderID,
              1, 
              &sourceCStr,
              NULL);    
          glCompileShader(m_shaderID);
      
          int status; 
          glGetShaderiv(m_shaderID, GL_COMPILE_STATUS, &status);
      }
      

       

       

      But replacing the sqDistPointAABB  function with one that doesn't have a loop drops compilation time down to less than a second:

       

       

      float sqDistPointAABB(vec3 point, uint tile){ 
      float sqDist = 0.0; VolumeTileAABB currentCell = cluster[tile]; cluster[tile].maxPoint[3] = tile; 
      vec3 minPoint = currentCell.minPoint.xyz; 
      vec3 maxPoint = currentCell.maxPoint.xyz; 
      vec3 t1 = vec3(lessThan(point, minPoint)); 
      vec3 t2 = vec3(greaterThan(point, maxPoint)); 
      vec3 sqDist = t1 * (minPoint - point) * (minPoint - point) + t2 * (maxPoint - point) * (maxPoint - point); 
      return sqDist.x + sqDist.y + sqDist.z;
      }
      

       

       

       

      Attachments

        No reviews matched the request. Check your Options in the drop-down menu of this sections header.

        Activity

          People

            lagocs Laszlo Agocs
            feistykittykat Dante Tufano
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved:

              Gerrit Reviews

                There are no open Gerrit changes