// TODO: finer culling for spot lights
//
// Culls a range of punctual lights against every tile in [istart, iend) x [jstart, jend).
// For each tile, the lights that pass Clip() are collected, handed to AddTileData(), and the
// returned offset plus the resulting light count are written to the first two words of the tile header.
//
// punctualLights  : lights to test; indexed through lightIndices.
// lightIndices    : indices into punctualLights; only [lightStartIndex, lightStartIndex + lightCount) is read.
// lightStartIndex : first entry of lightIndices to consider.
// lightCount      : number of entries of lightIndices to consider.
// istart, iend    : half-open tile range along x.
// jstart, jend    : half-open tile range along y.
unsafe public void CullIntermediateLights(ref NativeArray<PrePunctualLight> punctualLights, ref NativeArray<ushort> lightIndices, int lightStartIndex, int lightCount, int istart, int iend, int jstart, int jend)
{
    // Raw pointers rather than NativeArray indexers: measured as 2-3% faster by the original author.
    uint* headers = (uint*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_TileHeaders);
    ushort* indices = (ushort*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(lightIndices);
    PrePunctualLight* lights = (PrePunctualLight*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(punctualLights);

    // No light to test: every tile in the range gets a zero offset and a zero count.
    if (lightCount == 0)
    {
        for (int y = jstart; y < jend; ++y)
        {
            for (int x = istart; x < iend; ++x)
            {
                int emptyHeader = GetTileHeaderOffset(x, y);
                headers[emptyHeader] = 0;
                headers[emptyHeader + 1] = 0;
            }
        }

        return;
    }

    // Scratch list of the lights that survive culling for the tile currently processed.
    ushort* survivors = stackalloc ushort[lightCount];
    int lightStopIndex = lightStartIndex + lightCount;

    for (int y = jstart; y < jend; ++y)
    {
        for (int x = istart; x < iend; ++x)
        {
            PreTile tile = m_PreTiles[x + y * m_TileXCount];
            int survivorCount = 0;

            for (int cursor = lightStartIndex; cursor < lightStopIndex; ++cursor)
            {
                ushort candidate = indices[cursor];
                PrePunctualLight light = lights[candidate];

                // Clip() is slightly faster than IntersectionLineSphere().
                if (Clip(ref tile, light.posVS, light.radius))
                {
                    survivors[survivorCount++] = candidate;
                }
            }

            // Copy the culled light list. AddTileData() receives the count by ref and may adjust it,
            // so the header is written from the value it leaves behind.
            int dataOffset = 0;
            if (survivorCount > 0)
            {
                dataOffset = AddTileData(survivors, ref survivorCount);
            }

            int header = GetTileHeaderOffset(x, y);
            headers[header] = (uint)dataOffset;
            headers[header + 1] = (uint)survivorCount;
        }
    }
}
// This differs from CullIntermediateLights in 3 ways:
// - tile-frustums/light intersection use different algorithm
// - depth range of the light shape intersecting the tile-frustums is output in the tile list header section
// - light indices written out are indexing visible_lights, rather than the array of PrePunctualLights.
//
// For every tile in [istart, iend) x [jstart, jend) this writes four header words:
//   [0] offset returned by AddTileData() (0 when no light touches the tile)
//   [1] number of lights in the tile (0 when AddTileData() reports a data size of 0)
//   [2] two half floats (a in the low 16 bits, b in the high 16 bits) such that bitIndex = depth * a + b
//   [3] union of the 32-bit depth masks of all lights in the tile
// The tile data handed to AddTileData() is culledLightCount light indices followed by
// culledLightCount packed depth ranges (firstBit in the low byte, bitCount in the high byte).
//
// punctualLights  : lights to test; indexed through lightIndices.
// lightIndices    : indices into punctualLights; only [lightStartIndex, lightStartIndex + lightCount) is read.
// lightStartIndex : first entry of lightIndices to consider.
// lightCount      : number of entries of lightIndices to consider.
// istart, iend    : half-open tile range along x.
// jstart, jend    : half-open tile range along y.
unsafe public void CullFinalLights(ref NativeArray<PrePunctualLight> punctualLights, ref NativeArray<ushort> lightIndices, int lightStartIndex, int lightCount, int istart, int iend, int jstart, int jend)
{
    // Interestingly, 2-3% faster when using unsafe arrays.
    PrePunctualLight* _punctualLights = (PrePunctualLight*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(punctualLights);
    ushort* _lightIndices = (ushort*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(lightIndices);
    uint* _tileHeaders = (uint*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_TileHeaders);

    if (lightCount == 0)
    {
        for (int j = jstart; j < jend; ++j)
        {
            for (int i = istart; i < iend; ++i)
            {
                int headerOffset = GetTileHeaderOffset(i, j);
                _tileHeaders[headerOffset + 0] = 0;
                _tileHeaders[headerOffset + 1] = 0;
                _tileHeaders[headerOffset + 2] = 0;
                _tileHeaders[headerOffset + 3] = 0;
            }
        }

        return;
    }

    // Store culled lights in temporary buffer. Additionally store depth range of each light for a given tile too.
    // the depth range is a 32bit mask, but packed into a 16bits value since the range of the light is continuous
    // (only need to store first bit enabled, and count of enabled bits).
    ushort* tiles = stackalloc ushort[lightCount * 2];
    float2* depthRanges = stackalloc float2[lightCount];

    int maxLightPerTile = 0; // for stats
    int lightEndIndex = lightStartIndex + lightCount;
    float2 tileSize = new float2((m_FrustumPlanes.right - m_FrustumPlanes.left) / m_TileXCount, (m_FrustumPlanes.top - m_FrustumPlanes.bottom) / m_TileYCount);
    float2 tileExtents = tileSize * 0.5f;
    float2 tileExtentsInv = new float2(1.0f / tileExtents.x, 1.0f / tileExtents.y);

    for (int j = jstart; j < jend; ++j)
    {
        float tileYCentre = m_FrustumPlanes.top - (tileExtents.y + j * tileSize.y);

        for (int i = istart; i < iend; ++i)
        {
            float tileXCentre = m_FrustumPlanes.left + tileExtents.x + i * tileSize.x;

            // Constant for the whole tile, so built once instead of once per light.
            float2 tileCentre = new float2(tileXCentre, tileYCentre);

            int culledLightCount = 0;

            // For the current tile's light list, min&max depth range (absolute values).
            float listMinDepth = float.MaxValue;
            float listMaxDepth = -float.MaxValue;

            // Duplicate the inner loop twice. Testing for the orthographic case inside the inner loop would cost an extra 8% otherwise.
            // Missing C++ template argument here!
            if (!m_IsOrthographic)
            {
                for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                {
                    ushort lightIndex = _lightIndices[vi];
                    PrePunctualLight ppl = _punctualLights[lightIndex];

                    // Offset tileCentre toward the light to calculate a more conservative minMax depth bound,
                    // but it must remains inside the tile and must not pass further than the light centre.
                    float2 dir = ppl.screenPos - tileCentre;
                    float2 d = abs(dir * tileExtentsInv);

                    float sInv = 1.0f / max3(d.x, d.y, 1.0f);
                    float3 tileOffCentre = new float3(tileCentre.x + dir.x * sInv, tileCentre.y + dir.y * sInv, -m_FrustumPlanes.zNear);
                    float3 tileOrigin = new float3(0.0f);

                    float t0, t1;
                    // This is more expensive than Clip() but allow to compute min&max depth range for the part of the light inside the tile.
                    if (!IntersectionLineSphere(ppl.posVS, ppl.radius, tileOrigin, tileOffCentre, out t0, out t1))
                    {
                        continue;
                    }

                    listMinDepth = listMinDepth < t0 ? listMinDepth : t0;
                    listMaxDepth = listMaxDepth > t1 ? listMaxDepth : t1;
                    depthRanges[culledLightCount] = new float2(t0, t1);

                    // Because this always output to the finest tiles, contrary to CullLights(),
                    // the result are indices into visibleLights, instead of indices into punctualLights.
                    tiles[culledLightCount] = ppl.visLightIndex;
                    ++culledLightCount;
                }
            }
            else
            {
                for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                {
                    ushort lightIndex = _lightIndices[vi];
                    PrePunctualLight ppl = _punctualLights[lightIndex];

                    // Offset tileCentre toward the light to calculate a more conservative minMax depth bound,
                    // but it must remains inside the tile and must not pass further than the light centre.
                    float2 dir = ppl.screenPos - tileCentre;
                    float2 d = abs(dir * tileExtentsInv);

                    float sInv = 1.0f / max3(d.x, d.y, 1.0f);
                    float3 tileOffCentre = new float3(0, 0, -m_FrustumPlanes.zNear);
                    float3 tileOrigin = new float3(tileCentre.x + dir.x * sInv, tileCentre.y + dir.y * sInv, 0.0f);

                    float t0, t1;
                    // This is more expensive than Clip() but allow to compute min&max depth range for the part of the light inside the tile.
                    if (!IntersectionLineSphere(ppl.posVS, ppl.radius, tileOrigin, tileOffCentre, out t0, out t1))
                    {
                        continue;
                    }

                    listMinDepth = listMinDepth < t0 ? listMinDepth : t0;
                    listMaxDepth = listMaxDepth > t1 ? listMaxDepth : t1;
                    depthRanges[culledLightCount] = new float2(t0, t1);

                    // Because this always output to the finest tiles, contrary to CullLights(),
                    // the result are indices into visibleLights, instead of indices into punctualLights.
                    tiles[culledLightCount] = ppl.visLightIndex;
                    ++culledLightCount;
                }
            }

            // Post-multiply by zNear to get actual world unit absolute depth values, then clamp to valid depth range.
            listMinDepth = max2(listMinDepth * m_FrustumPlanes.zNear, m_FrustumPlanes.zNear);
            listMaxDepth = min2(listMaxDepth * m_FrustumPlanes.zNear, m_FrustumPlanes.zFar);

            // The list depth range is empty or inverted when no light touched the tile, when a single light
            // has t0 == t1, or when clamping collapses the range. Dividing by it would give Inf/NaN, so in
            // that case everything is mapped to bit 0 (scale 0, bias 0) instead.
            float depthRange = listMaxDepth - listMinDepth;
            bool hasDepthRange = depthRange > 0.0f;
            float depthRangeInv = hasDepthRange ? 1.0f / depthRange : 0.0f;

            // Calculate bitmask for 2.5D culling.
            uint bitMask = 0;
            for (int culledLightIndex = 0; culledLightIndex < culledLightCount; ++culledLightIndex)
            {
                float lightMinDepth = max2(depthRanges[culledLightIndex].x * m_FrustumPlanes.zNear, m_FrustumPlanes.zNear);
                float lightMaxDepth = min2(depthRanges[culledLightIndex].y * m_FrustumPlanes.zNear, m_FrustumPlanes.zFar);
                int firstBit = (int)((lightMinDepth - listMinDepth) * 32.0f * depthRangeInv);
                int lastBit = (int)((lightMaxDepth - listMinDepth) * 32.0f * depthRangeInv);

                // Keep both bit indices inside the 32-bit mask and ordered. C# only uses the low 5 bits of a
                // 32-bit shift count, so an unclamped firstBit of 32 or a bitCount of 0 would silently turn
                // into a shift by 0 and corrupt the mask. For indices that were already valid this yields the
                // same bitCount as min(lastBit - firstBit + 1, 32 - firstBit).
                firstBit = min(max(firstBit, 0), 31);
                lastBit = min(max(lastBit, firstBit), 31);
                int bitCount = lastBit - firstBit + 1;

                bitMask |= (uint)((0xFFFFFFFF >> (32 - bitCount)) << firstBit);

                tiles[culledLightCount + culledLightIndex] = (ushort)((uint)firstBit | (uint)(bitCount << 8));
            }

            // As listMinDepth and listMaxDepth are used to calculate the geometry 2.5D bitmask,
            // we can optimize the shader execution (TileDepthInfo.shader) by refactoring the calculation.
            //   int bitIndex = 32.0h * (geoDepth - listMinDepth) / (listMaxDepth - listMinDepth);
            // Equivalent to:
            //   a =                 32.0 / (listMaxDepth - listMinDepth)
            //   b = -listMinDepth * 32.0 / (listMaxDepth - listMinDepth)
            //   int bitIndex = geoDepth * a + b;
            // With a degenerate range both are 0, which matches the bit 0 assigned to the lights above
            // (assuming the shader evaluates bitIndex as described here).
            float a = 32.0f * depthRangeInv;
            float b = hasDepthRange ? -listMinDepth * a : 0.0f;

            int tileDataSize = culledLightCount * 2;
            int tileOffset = culledLightCount > 0 ? AddTileData(tiles, ref tileDataSize) : 0;

            int headerOffset = GetTileHeaderOffset(i, j);
            _tileHeaders[headerOffset + 0] = (uint)tileOffset;
            _tileHeaders[headerOffset + 1] = (uint)(tileDataSize == 0 ? 0 : culledLightCount);
            _tileHeaders[headerOffset + 2] = _f32tof16(a) | (_f32tof16(b) << 16);
            _tileHeaders[headerOffset + 3] = bitMask;

            maxLightPerTile = max(maxLightPerTile, culledLightCount);
        }
    }

    m_Counters[0] = max(m_Counters[0], maxLightPerTile); // TODO make it atomic
}