static float ComputeSumSimd(float *arr, int count)
    {
        // Returns the sum of the `count` floats starting at `arr`.
        //
        // Full groups of 4 elements are summed with SIMD when an SSE3 or Neon path is available. Whatever is left
        // over (up to 3 trailing elements, or every element when no SIMD path is available) is added by the scalar
        // loop at the end, so `count` does not need to be a multiple of 4. For a non-positive `count` no element is
        // read and the result is 0.

        // Number of elements covered by full groups of 4. For a negative `count` this is never smaller than
        // `count`, so neither loop below executes.
        int simdCount = count - (count % 4);

        float total = 0f;
        // Index of the first element that has not been added to `total` yet.
        int next = 0;

        // The only instruction used below that goes beyond plain SSE is hadd_ps, which is SSE3, so SSE3 support is
        // all this path needs.
        if (Sse3.IsSse3Supported)
        {
            // To sum up the values, we split the vectorized range into 4 interleaved subarrays and keep their
            // running sums in the four lanes of `sum`.
            v128 sum = new v128(0f);
            for (int i = 0; i < simdCount; i += 4)
            {
                // Load 4 floats from memory (unaligned load) and add them lane-wise.
                v128 reg = loadu_ps(arr + i);
                sum = add_ps(sum, reg);
            }

            // At this point, we have the sums of 4 subarrays in `sum` and we still need to merge them. SSE3 has a
            // helpful instruction for this:
            sum = Sse3.hadd_ps(sum, sum);
            // Now the first and third lane hold the sum of the first two subarrays and the second and fourth lane
            // contain the sum of the last two subarrays.
            sum = Sse3.hadd_ps(sum, sum);
            // Finally, all four lanes hold the same value (the sum of all subarrays) and we can read the first lane
            // as a float.
            // (Alternatively, simply write: sum.Float0 + sum.Float1 + sum.Float2 + sum.Float3)
            total = cvtss_f32(sum);
            next = simdCount;
        }
        else if (IsNeonSupported)
        {
            // Same as above: 4 subarrays to accumulate the sum
            v128 sum = new v128(0f);
            for (int i = 0; i < simdCount; i += 4)
            {
                // Load 4 floats from memory.
                v128 reg = vld1q_f32(arr + i);
                sum = vaddq_f32(sum, reg);
            }

            // Add the four lanes together into a single float.
            total = vaddvq_f32(sum);
            next = simdCount;
        }

        // Scalar part: adds the elements the SIMD loop did not cover. When neither SIMD path was taken, `next` is
        // still 0 and this loop is the managed fallback that sums the whole array in order.
        for (int i = next; i < count; i++)
        {
            total += arr[i];
        }

        return total;
    }