static bool Clip(ref PreTile tile, float3 posVS, float radius)
        {
            // Tests a sphere (view-space centre posVS, given radius) against the four side planes of a tile.
            // The near and far planes are deliberately left out: the light is presumed to have already been
            // found visible by the camera frustum.
            //
            // Every ClipPartial() call is handed one primary plane followed by the two planes of the other axis,
            // and shares the same counter through a ref argument. The first call that answers anything other than
            // ClipResult.Unknown settles the result and the remaining calls are skipped. When all four calls are
            // inconclusive, the sphere is accepted only if the shared counter reached 4.

            int   sidesInside = 0;
            float sqRadius    = radius * radius;

            ClipResult outcome = ClipPartial(tile.planeLeft, tile.planeBottom, tile.planeTop, posVS, radius, sqRadius, ref sidesInside);

            if (outcome == ClipResult.Unknown)
            {
                outcome = ClipPartial(tile.planeRight, tile.planeBottom, tile.planeTop, posVS, radius, sqRadius, ref sidesInside);
            }

            if (outcome == ClipResult.Unknown)
            {
                outcome = ClipPartial(tile.planeTop, tile.planeLeft, tile.planeRight, posVS, radius, sqRadius, ref sidesInside);
            }

            if (outcome == ClipResult.Unknown)
            {
                outcome = ClipPartial(tile.planeBottom, tile.planeLeft, tile.planeRight, posVS, radius, sqRadius, ref sidesInside);
            }

            if (outcome == ClipResult.Unknown)
            {
                // No plane gave a definite answer: fall back to the accumulated counter.
                return sidesInside == 4;
            }

            return outcome == ClipResult.In;
        }
        // TODO: finer culling for spot lights
        //
        // For every tile (i, j) with i in [istart, iend) and j in [jstart, jend), tests the lights referenced by
        // lightIndices[lightStartIndex .. lightStartIndex + lightCount) against the tile with Clip(), hands the
        // surviving indices to AddTileData(), and records the returned offset and the list length in the first
        // two words of the tile header. The indices written out are the values read from lightIndices, i.e. they
        // index punctualLights.
        unsafe public void CullIntermediateLights(ref NativeArray <PrePunctualLight> punctualLights,
                                                  ref NativeArray <ushort> lightIndices, int lightStartIndex, int lightCount,
                                                  int istart, int iend, int jstart, int jend)
        {
            // Interestingly, 2-3% faster when using unsafe arrays.
            PrePunctualLight *lightsPtr  = (PrePunctualLight *)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(punctualLights);
            ushort *          indicesPtr = (ushort *)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(lightIndices);
            uint *            headersPtr = (uint *)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_TileHeaders);

            // No candidate lights at all: every tile in range simply gets an empty list.
            if (lightCount == 0)
            {
                for (int row = jstart; row < jend; ++row)
                {
                    for (int col = istart; col < iend; ++col)
                    {
                        int emptyHeader = GetTileHeaderOffset(col, row);
                        headersPtr[emptyHeader + 0] = 0;
                        headersPtr[emptyHeader + 1] = 0;
                    }
                }
                return;
            }

            // Scratch list of surviving light indices; it can never hold more than lightCount entries.
            ushort *survivors = stackalloc ushort[lightCount];

            int onePastLastLight = lightStartIndex + lightCount;

            for (int row = jstart; row < jend; ++row)
            {
                for (int col = istart; col < iend; ++col)
                {
                    PreTile currentTile   = m_PreTiles[col + row * m_TileXCount];
                    int     survivorCount = 0;

                    for (int cursor = lightStartIndex; cursor < onePastLastLight; ++cursor)
                    {
                        ushort           candidate = indicesPtr[cursor];
                        PrePunctualLight light     = lightsPtr[candidate];

                        // This is slightly faster than IntersectionLineSphere().
                        if (Clip(ref currentTile, light.posVS, light.radius))
                        {
                            survivors[survivorCount] = candidate;
                            ++survivorCount;
                        }
                    }

                    // Copy the culled light list. AddTileData() receives the count by ref, so the value
                    // stored in the header is whatever it leaves there.
                    int listOffset = 0;
                    if (survivorCount > 0)
                    {
                        listOffset = AddTileData(survivors, ref survivorCount);
                    }

                    int header = GetTileHeaderOffset(col, row);
                    headersPtr[header + 0] = (uint)listOffset;
                    headersPtr[header + 1] = (uint)survivorCount;
                }
            }
        }
        // Final culling pass: for every tile (i, j) with i in [istart, iend) and j in [jstart, jend), intersects
        // the lights referenced by lightIndices[lightStartIndex .. lightStartIndex + lightCount) with a line
        // through the tile, and writes the surviving lights plus 2.5D depth-culling data.
        //
        // This differs from CullIntermediateLights in 3 ways:
        // - tile-frustums/light intersection use different algorithm
        // - depth range of the light shape intersecting the tile-frustums is output in the tile list header section
        // - light indices written out are indexing visible_lights, rather than the array of PrePunctualLights.
        //
        // Tile header layout written here (4 words):
        //   [0] offset returned by AddTileData()
        //   [1] number of lights in the tile (0 when AddTileData() leaves the data size at 0)
        //   [2] _f32tof16(a) | _f32tof16(b) << 16, with a and b such that bitIndex = geoDepth * a + b
        //   [3] union of the per-light depth bitmasks
        // Tile data layout: culledLightCount visible-light indices, followed by culledLightCount packed
        // depth ranges (firstBit | bitCount << 8).
        unsafe public void CullFinalLights(ref NativeArray <PrePunctualLight> punctualLights,
                                           ref NativeArray <ushort> lightIndices, int lightStartIndex, int lightCount,
                                           int istart, int iend, int jstart, int jend)
        {
            // Interestingly, 2-3% faster when using unsafe arrays.
            PrePunctualLight *_punctualLights = (PrePunctualLight *)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(punctualLights);
            ushort *          _lightIndices   = (ushort *)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(lightIndices);
            uint *            _tileHeaders    = (uint *)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_TileHeaders);

            if (lightCount == 0)
            {
                for (int j = jstart; j < jend; ++j)
                {
                    for (int i = istart; i < iend; ++i)
                    {
                        int headerOffset = GetTileHeaderOffset(i, j);
                        _tileHeaders[headerOffset + 0] = 0;
                        _tileHeaders[headerOffset + 1] = 0;
                        _tileHeaders[headerOffset + 2] = 0;
                        _tileHeaders[headerOffset + 3] = 0;
                    }
                }
                return;
            }

            // Store culled lights in temporary buffer. Additionally store depth range of each light for a given tile too.
            // The depth range is a 32-bit mask, but packed into a 16-bit value since the range of the light is continuous
            // (only need to store first bit enabled, and count of enabled bits).
            ushort *tiles       = stackalloc ushort[lightCount * 2];
            float2 *depthRanges = stackalloc float2[lightCount];

            int    maxLightPerTile = 0; // for stats
            int    lightEndIndex   = lightStartIndex + lightCount;
            float2 tileSize        = new float2((m_FrustumPlanes.right - m_FrustumPlanes.left) / m_TileXCount, (m_FrustumPlanes.top - m_FrustumPlanes.bottom) / m_TileYCount);
            float2 tileExtents     = tileSize * 0.5f;
            float2 tileExtentsInv  = new float2(1.0f / tileExtents.x, 1.0f / tileExtents.y);

            for (int j = jstart; j < jend; ++j)
            {
                float tileYCentre = m_FrustumPlanes.top - (tileExtents.y + j * tileSize.y);

                for (int i = istart; i < iend; ++i)
                {
                    float tileXCentre = m_FrustumPlanes.left + tileExtents.x + i * tileSize.x;

                    int culledLightCount = 0;

                    // For the current tile's light list, min&max depth range (absolute values).
                    float listMinDepth = float.MaxValue;
                    float listMaxDepth = -float.MaxValue;

                    // Identical for every light of this tile, so built once per tile.
                    float2 tileCentre = new float2(tileXCentre, tileYCentre);

                    // Duplicate the inner loop twice. Testing for the orthographic case inside the inner loop would cost an extra 8% otherwise.
                    // Missing C++ template argument here!
                    if (!m_IsOrthographic)
                    {
                        // Perspective: the line starts at the view-space origin and goes through a point of the tile.
                        float3 tileOrigin = new float3(0.0f);

                        for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                        {
                            ushort           lightIndex = _lightIndices[vi];
                            PrePunctualLight ppl        = _punctualLights[lightIndex];

                            // Offset tileCentre toward the light to calculate a more conservative minMax depth bound,
                            // but it must remain inside the tile and must not pass further than the light centre.
                            float2 dir = ppl.screenPos - tileCentre;
                            float2 d   = abs(dir * tileExtentsInv);

                            float  sInv          = 1.0f / max3(d.x, d.y, 1.0f);
                            float3 tileOffCentre = new float3(tileCentre.x + dir.x * sInv, tileCentre.y + dir.y * sInv, -m_FrustumPlanes.zNear);

                            float t0, t1;
                            // This is more expensive than Clip() but allows computing the min&max depth range for the part of the light inside the tile.
                            if (!IntersectionLineSphere(ppl.posVS, ppl.radius, tileOrigin, tileOffCentre, out t0, out t1))
                            {
                                continue;
                            }

                            listMinDepth = listMinDepth < t0 ? listMinDepth : t0;
                            listMaxDepth = listMaxDepth > t1 ? listMaxDepth : t1;
                            depthRanges[culledLightCount] = new float2(t0, t1);
                            // Because this always outputs to the finest tiles, contrary to CullIntermediateLights(),
                            // the results are indices into visibleLights, instead of indices into punctualLights.
                            tiles[culledLightCount] = ppl.visLightIndex;
                            ++culledLightCount;
                        }
                    }
                    else
                    {
                        // Orthographic: the line direction is constant, only its origin depends on the tile and light.
                        float3 tileOffCentre = new float3(0, 0, -m_FrustumPlanes.zNear);

                        for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                        {
                            ushort           lightIndex = _lightIndices[vi];
                            PrePunctualLight ppl        = _punctualLights[lightIndex];

                            // Offset tileCentre toward the light to calculate a more conservative minMax depth bound,
                            // but it must remain inside the tile and must not pass further than the light centre.
                            float2 dir = ppl.screenPos - tileCentre;
                            float2 d   = abs(dir * tileExtentsInv);

                            float  sInv       = 1.0f / max3(d.x, d.y, 1.0f);
                            float3 tileOrigin = new float3(tileCentre.x + dir.x * sInv, tileCentre.y + dir.y * sInv, 0.0f);

                            float t0, t1;
                            // This is more expensive than Clip() but allows computing the min&max depth range for the part of the light inside the tile.
                            if (!IntersectionLineSphere(ppl.posVS, ppl.radius, tileOrigin, tileOffCentre, out t0, out t1))
                            {
                                continue;
                            }

                            listMinDepth = listMinDepth < t0 ? listMinDepth : t0;
                            listMaxDepth = listMaxDepth > t1 ? listMaxDepth : t1;
                            depthRanges[culledLightCount] = new float2(t0, t1);
                            // Because this always outputs to the finest tiles, contrary to CullIntermediateLights(),
                            // the results are indices into visibleLights, instead of indices into punctualLights.
                            tiles[culledLightCount] = ppl.visLightIndex;
                            ++culledLightCount;
                        }
                    }

                    if (culledLightCount == 0)
                    {
                        // No light touches this tile. listMinDepth/listMaxDepth still hold their sentinels, so any
                        // depth scale derived from them would be meaningless (and possibly NaN). Write the same
                        // all-zero header as the lightCount == 0 path above.
                        int emptyHeaderOffset = GetTileHeaderOffset(i, j);
                        _tileHeaders[emptyHeaderOffset + 0] = 0;
                        _tileHeaders[emptyHeaderOffset + 1] = 0;
                        _tileHeaders[emptyHeaderOffset + 2] = 0;
                        _tileHeaders[emptyHeaderOffset + 3] = 0;
                        continue;
                    }

                    // Post-multiply by zNear to get actual world unit absolute depth values, then clamp to valid depth range.
                    listMinDepth = max2(listMinDepth * m_FrustumPlanes.zNear, m_FrustumPlanes.zNear);
                    listMaxDepth = min2(listMaxDepth * m_FrustumPlanes.zNear, m_FrustumPlanes.zFar);

                    // Calculate bitmask for 2.5D culling.
                    uint  bitMask       = 0;
                    float depthRangeInv = 1.0f / (listMaxDepth - listMinDepth);
                    if (float.IsInfinity(depthRangeInv) || float.IsNaN(depthRangeInv))
                    {
                        // Degenerate depth range (listMaxDepth == listMinDepth): a non-finite scale would turn the
                        // slice computations below into NaN. A zero scale maps every depth to slice 0 instead.
                        depthRangeInv = 0.0f;
                    }

                    for (int culledLightIndex = 0; culledLightIndex < culledLightCount; ++culledLightIndex)
                    {
                        float lightMinDepth = max2(depthRanges[culledLightIndex].x * m_FrustumPlanes.zNear, m_FrustumPlanes.zNear);
                        float lightMaxDepth = min2(depthRanges[culledLightIndex].y * m_FrustumPlanes.zNear, m_FrustumPlanes.zFar);
                        int   firstBit      = (int)((lightMinDepth - listMinDepth) * 32.0f * depthRangeInv);
                        int   lastBit       = (int)((lightMaxDepth - listMinDepth) * 32.0f * depthRangeInv);

                        // Keep both slice indices inside [0, 31] with lastBit >= firstBit. For in-range inputs this
                        // yields the same bitCount as min(lastBit - firstBit + 1, 32 - firstBit); for out-of-range
                        // inputs it prevents shift counts outside [0, 31] and a corrupted packed range.
                        firstBit = min(max(firstBit, 0), 31);
                        lastBit  = min(max(lastBit, firstBit), 31);
                        int bitCount = lastBit - firstBit + 1;

                        bitMask |= (uint)((0xFFFFFFFF >> (32 - bitCount)) << firstBit);

                        // Packed depth range of this light, stored after the culledLightCount light indices.
                        tiles[culledLightCount + culledLightIndex] = (ushort)((uint)firstBit | (uint)(bitCount << 8));
                    }

                    // As listMinDepth and listMaxDepth are used to calculate the geometry 2.5D bitmask,
                    // we can optimize the shader execution (TileDepthInfo.shader) by refactoring the calculation.
                    //   int bitIndex = 32.0h * (geoDepth - listMinDepth) / (listMaxDepth - listMinDepth);
                    // Equivalent to:
                    //   a =                 32.0 / (listMaxDepth - listMinDepth)
                    //   b = -listMinDepth * 32.0 / (listMaxDepth - listMinDepth)
                    //   int bitIndex = geoDepth * a + b;
                    float a = 32.0f * depthRangeInv;
                    float b = -listMinDepth * a;

                    // culledLightCount > 0 is guaranteed here. AddTileData() takes the size by ref; a size of 0
                    // afterwards is reported as an empty tile in the header.
                    int tileDataSize = culledLightCount * 2;
                    int tileOffset   = AddTileData(tiles, ref tileDataSize);

                    int headerOffset = GetTileHeaderOffset(i, j);
                    _tileHeaders[headerOffset + 0] = (uint)tileOffset;
                    _tileHeaders[headerOffset + 1] = (uint)(tileDataSize == 0 ? 0 : culledLightCount);
                    _tileHeaders[headerOffset + 2] = _f32tof16(a) | (_f32tof16(b) << 16);
                    _tileHeaders[headerOffset + 3] = bitMask;

                    maxLightPerTile = max(maxLightPerTile, culledLightCount);
                }
            }

            m_Counters[0] = max(m_Counters[0], maxLightPerTile); // TODO make it atomic
        }