/// <summary>
/// SIMD-accelerated bulk conversion of packed <see cref="Color"/> values into <see cref="Vector4"/> values.
/// Intended to be used only when `count` is divisible by <see cref="Vector{UInt32}.Count"/>.
/// </summary>
/// <param name="sourceColors">The <see cref="BufferSpan{T}"/> to the source colors.</param>
/// <param name="destVectors">The <see cref="BufferSpan{T}"/> to the destination vectors.</param>
/// <param name="count">The number of pixels to convert.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when <see cref="Vector.IsHardwareAccelerated"/> is false.
/// </exception>
/// <remarks>
/// Implementation adapted from:
/// <see href="http://stackoverflow.com/a/5362789"/>
/// TODO: We can replace this implementation in the future using new Vector API-s:
/// <see href="https://github.com/dotnet/corefx/issues/15957"/>
/// </remarks>
internal static unsafe void ToVector4SimdAligned(
    BufferSpan<Color> sourceColors,
    BufferSpan<Vector4> destVectors,
    int count)
{
    if (!Vector.IsHardwareAccelerated)
    {
        throw new InvalidOperationException(
            "Color.BulkOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!");
    }

    int lanes = Vector<uint>.Count;

    DebugGuard.IsTrue(
        count % lanes == 0,
        nameof(count),
        "Argument 'count' should divisible by Vector<uint>.Count!");

    // Byte -> float trick: OR-ing a value b in [0, 255] into the low mantissa bits of
    // 32768.0f yields 32768 + b / 256. Subtracting 32768 and scaling by 256 / 255
    // leaves b / 255.
    var keepLowByte = new Vector<uint>(255);
    var biasBits = new Vector<uint>(1191182336); // bit pattern of 32768.0f reinterpreted as uint
    var bias = new Vector<float>(32768.0f);
    var scale = new Vector<float>(256.0f / 255.0f);

    // Four uint slots per pixel in the scratch buffer.
    int componentCount = count * 4;

    uint* packed = (uint*)sourceColors.PointerAtOffset;

    // The scratch buffer is allocated one vector width larger than the unpacked data.
    using (PinnedBuffer<uint> scratch = new PinnedBuffer<uint>(componentCount + Vector<uint>.Count))
    {
        uint[] rawBits = scratch.Array;

        // Same array viewed as floats, so results can be written back in place.
        float[] rawFloats = Unsafe.As<float[]>(rawBits);

        UnpackedRGBA* unpacked = (UnpackedRGBA*)(uint*)scratch.Pointer;

        // Step 1: unpack every source pixel into the scratch buffer.
        // NOTE(review): presumably Load() spreads the four channel bytes of the packed
        // value over four consecutive uints - confirm against UnpackedRGBA.
        for (int pixel = 0; pixel < count; pixel++)
        {
            // This call is the bottleneck now:
            unpacked[pixel].Load(packed[pixel]);
        }

        // Step 2: convert the scratch buffer in place, one vector width at a time.
        int offset = 0;
        while (offset < componentCount)
        {
            Vector<uint> bits = (new Vector<uint>(rawBits, offset) & keepLowByte) | biasBits;
            Vector<float> normalized = (Vector.AsVectorSingle(bits) - bias) * scale;

            normalized.CopyTo(rawFloats, offset);
            offset += lanes;
        }

        // Step 3: copy the converted components into the destination.
        BufferSpan.Copy<uint>(scratch, (BufferSpan<byte>)destVectors, componentCount);
    }
}
/// <summary> /// SIMD optimized bulk implementation of <see cref="IPixel.PackFromVector4(Vector4)"/> /// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>. /// </summary> /// <param name="sourceColors">The <see cref="Span{T}"/> to the source colors.</param> /// <param name="destVectors">The <see cref="Span{T}"/> to the dstination vectors.</param> /// <param name="count">The number of pixels to convert.</param> /// <remarks> /// Implementation adapted from: /// <see> /// <cref>http://stackoverflow.com/a/5362789</cref> /// </see> /// TODO: We can replace this implementation in the future using new Vector API-s: /// <see> /// <cref>https://github.com/dotnet/corefx/issues/15957</cref> /// </see> /// </remarks> internal static void ToVector4SimdAligned(Span <Rgba32> sourceColors, Span <Vector4> destVectors, int count) { if (!Vector.IsHardwareAccelerated) { throw new InvalidOperationException( "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); } DebugGuard.IsTrue( count % Vector <uint> .Count == 0, nameof(count), "Argument 'count' should divisible by Vector<uint>.Count!"); Vector <float> bVec = new Vector <float>(256.0f / 255.0f); Vector <float> magicFloat = new Vector <float>(32768.0f); Vector <uint> magicInt = new Vector <uint>(1191182336); // reinterpreded value of 32768.0f Vector <uint> mask = new Vector <uint>(255); int unpackedRawCount = count * 4; ref uint sourceBase = ref Unsafe.As <Rgba32, uint>(ref sourceColors.DangerousGetPinnableReference());