/// <summary>
            /// SIMD optimized bulk conversion of packed <see cref="Color"/> values into <see cref="Vector4"/> values
            /// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>.
            /// </summary>
            /// <param name="sourceColors">The <see cref="BufferPointer{T}"/> to the source colors.</param>
            /// <param name="destVectors">The <see cref="BufferPointer{T}"/> to the destination vectors.</param>
            /// <param name="count">The number of pixels to convert.</param>
            /// <remarks>
            /// The conversion runs in three steps: every source color is expanded into a temporary buffer of
            /// unsigned integers, that buffer is converted in place to floats one SIMD vector at a time, and the
            /// result is copied to <paramref name="destVectors"/>.
            /// The integer-to-float step avoids a numeric conversion: a value b in the range 0..255 is OR-ed into
            /// the low mantissa bits of 32768.0f, which yields the float 32768 + b / 256. Subtracting 32768 and
            /// multiplying by 256 / 255 then gives b / 255.
            /// Implementation adapted from:
            /// <see>
            ///     <cref>http://stackoverflow.com/a/5362789</cref>
            /// </see>
            /// TODO: We can replace this implementation in the future using new Vector API-s:
            /// <see>
            ///     <cref>https://github.com/dotnet/corefx/issues/15957</cref>
            /// </see>
            /// </remarks>
            internal static unsafe void ToVector4SimdAligned(
                BufferPointer <Color> sourceColors,
                BufferPointer <Vector4> destVectors,
                int count)
            {
                int vecSize = Vector <uint> .Count;

                // The vector loop below has no scalar tail, so 'count' has to be a multiple of the vector width.
                DebugGuard.IsTrue(
                    count % vecSize == 0,
                    nameof(count),
                    "Argument 'count' should divisible by Vector<uint>.Count!");

                // Constants of the conversion trick, broadcast to every vector lane.
                Vector <float> bVec       = new Vector <float>(256.0f / 255.0f);
                Vector <float> magicFloat = new Vector <float>(32768.0f);
                Vector <uint>  magicInt   = new Vector <uint>(1191182336); // reinterpreted bits of 32768.0f (0x47000000)
                Vector <uint>  mask       = new Vector <uint>(255);

                // Four raw values are processed per pixel.
                int unpackedRawCount = count * 4;

                // The source colors are read as raw 32 bit unsigned integers, one per pixel.
                uint *src    = (uint *)sourceColors.PointerAtOffset;
                uint *srcEnd = src + count;

                // NOTE(review): the constructor argument is presumably the element count of the temporary buffer,
                // i.e. room for all unpacked values plus one extra vector width - confirm against PinnedBuffer.
                using (PinnedBuffer <uint> tempBuf = new PinnedBuffer <uint>(
                           unpackedRawCount + Vector <uint> .Count))
                {
                    uint *        tPtr  = (uint *)tempBuf.Pointer;
                    uint[]        temp  = tempBuf.Array;

                    // Same array instance viewed as float[]: writes through fTemp overwrite the data in temp.
                    float[]       fTemp = Unsafe.As <float[]>(temp);
                    UnpackedRGBA *dst   = (UnpackedRGBA *)tPtr;

                    // Step 1: expand every source color into the temporary buffer.
                    // NOTE(review): Load presumably splits the packed value into one uint per channel
                    // (UnpackedRGBA is not visible here) - confirm.
                    for (; src < srcEnd; src++, dst++)
                    {
                        // This call is the bottleneck now:
                        dst->Load(*src);
                    }

                    // Step 2: convert the buffer in place, vecSize values per iteration.
                    for (int i = 0; i < unpackedRawCount; i += vecSize)
                    {
                        Vector <uint> vi = new Vector <uint>(temp, i);

                        // Keep only the low 8 bits, then merge them into the bit pattern of 32768.0f.
                        vi &= mask;
                        vi |= magicInt;

                        // Reinterpret the bits as floats (no numeric conversion): each lane is now 32768 + b / 256.
                        Vector <float> vf = Vector.AsVectorSingle(vi);

                        // Remove the 32768 offset and rescale by 256 / 255, giving b / 255.
                        vf = (vf - magicFloat) * bVec;

                        // Store the floats over the integers they were computed from.
                        vf.CopyTo(fTemp, i);
                    }

                    // Step 3: copy the converted values to the destination, which is reinterpreted as a byte buffer.
                    // NOTE(review): the last argument is presumably a count of uint elements, not bytes - confirm.
                    BufferPointer.Copy <uint>(tempBuf, (BufferPointer <byte>)destVectors, unpackedRawCount);
                }
            }
 /// <inheritdoc />
 internal override void ToXyzwBytes(
     BufferPointer <Color> sourceColors,
     BufferPointer <byte> destBytes,
     int count)
     // Pure delegation: the arguments are handed to BufferPointer.Copy unchanged.
     // NOTE(review): presumably valid because the in-memory layout of Color already matches XYZW byte order - confirm.
     => BufferPointer.Copy(sourceColors, destBytes, count);
 /// <inheritdoc />
 internal override void PackFromXyzwBytes(
     BufferPointer <byte> sourceBytes,
     BufferPointer <Color> destColors,
     int count)
     // Pure delegation: the arguments are handed to BufferPointer.Copy unchanged.
     // NOTE(review): presumably valid because the in-memory layout of Color already matches XYZW byte order - confirm.
     => BufferPointer.Copy(sourceBytes, destColors, count);