Пример #1
0
 /// <summary>
 /// Type initializer: asserts that the layout constants of this type hold the values
 /// the implementation was written against.
 /// </summary>
 static XxHash3()
 {
     // Stripe and accumulator sizes.
     StripeLength.AssertEqual(64U);
     AccumulatorBytes.AssertEqual(8U);

     // Block geometry.
     StripesPerBlock.AssertEqual(16U);
     BlockLength.AssertEqual(1024U);

     // The secret span must contain exactly SecretLength elements.
     SecretSpan.Length.AssertEqual((int)SecretLength);
 }
Пример #2
0
    /// <summary>
    /// Multiplies the three low (color) bytes of every 32-bit pixel in <paramref name="data"/> by the
    /// pixel's top (alpha) byte, in place, using 256-bit AVX2 operations. Each color byte becomes
    /// (color * alpha + 255) shifted right by 8; the alpha byte is written back unchanged.
    /// </summary>
    /// <param name="data">Pixels to process in place. Pixels that do not fill a whole vector are handed to <c>ProcessTextureScalar</c>.</param>
    internal static unsafe void ProcessTextureAvx2(Span<Color8> data)
    {
        uint pixelsPerVector = (uint)Vector256<uint>.Count;

        // Sanity check: one pixel must occupy exactly one 32-bit element of the vector.
        pixelsPerVector.AssertEqual((uint)(sizeof(Vector256<uint>) / sizeof(Color8)));

        // Loop-invariant constants.
        Vector256<byte>   zeroBytes = Vector256<byte>.Zero;
        Vector256<uint>   alphaBits = Vector256.Create(0xFF000000U);
        Vector256<uint>   colorBits = Vector256.Create(0x00FFFFFFU);
        Vector256<ushort> bias      = Vector256.Create((ushort)0x00FFU);

        // Shuffle control that copies the low byte of each widened pixel's alpha element into the low byte
        // of all four 16-bit elements of that pixel; an index with the top bit set writes a zero byte.
        // The byte shuffle works per 128-bit lane and only uses the low four index bits, so the
        // indices for the upper lane address bytes 6 and 14 of that lane.
        const byte zeroed     = 0xFF;
        const byte alphaByte0 = 6;
        const byte alphaByte1 = alphaByte0 + 8;
        const byte alphaByte2 = alphaByte1 + 8;
        const byte alphaByte3 = alphaByte2 + 8;
        Vector256<byte> alphaBroadcast = Vector256.Create(
            alphaByte0, zeroed, alphaByte0, zeroed, alphaByte0, zeroed, alphaByte0, zeroed,
            alphaByte1, zeroed, alphaByte1, zeroed, alphaByte1, zeroed, alphaByte1, zeroed,
            alphaByte2, zeroed, alphaByte2, zeroed, alphaByte2, zeroed, alphaByte2, zeroed,
            alphaByte3, zeroed, alphaByte3, zeroed, alphaByte3, zeroed, alphaByte3, zeroed
            );

        uint offset = 0;

        fixed (Color8* pixels = data)
        {
            uint* words = (uint*)pixels;

            while (offset + (pixelsPerVector - 1U) < data.Length)
            {
                uint* cursor = words + offset;

                Vector256<uint> source      = Avx2.LoadVector256(cursor);
                Vector256<uint> sourceAlpha = Avx2.And(source, alphaBits);

                // Widen every byte to 16 bits so the products below cannot overflow.
                Vector256<byte>   sourceBytes = source.AsByte();
                Vector256<ushort> lowHalf     = Avx2.UnpackLow(sourceBytes, zeroBytes).AsUInt16();
                Vector256<ushort> highHalf    = Avx2.UnpackHigh(sourceBytes, zeroBytes).AsUInt16();

                Vector256<ushort> lowAlpha  = Avx2.Shuffle(lowHalf.AsByte(), alphaBroadcast).AsUInt16();
                Vector256<ushort> highAlpha = Avx2.Shuffle(highHalf.AsByte(), alphaBroadcast).AsUInt16();

                // (channel * alpha + 255) >> 8
                Vector256<ushort> lowScaled  = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(lowHalf, lowAlpha), bias), 8);
                Vector256<ushort> highScaled = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(highHalf, highAlpha), bias), 8);

                // Narrow back to bytes, then replace the scaled alpha byte with the original alpha.
                Vector256<uint> result = Avx2.PackUnsignedSaturate(lowScaled.AsInt16(), highScaled.AsInt16()).AsUInt32();
                result = Avx2.Or(Avx2.And(result, colorBits), sourceAlpha);

                Avx2.Store(cursor, result);

                offset += pixelsPerVector;
            }
        }

        // Unlikely, but the pixel count may not be a multiple of the vector width; finish the remainder with the scalar path.
        if (offset < data.Length)
        {
            ProcessTextureScalar(data.SliceUnsafe(offset));
        }
    }
Пример #3
0
    /// <summary>
    /// Four-way unrolled variant of the AVX2 path: multiplies the three low (color) bytes of every 32-bit
    /// pixel in <paramref name="data"/> by the pixel's top (alpha) byte, in place, handling four 256-bit
    /// vectors per iteration. Each color byte becomes (color * alpha + 255) shifted right by 8; the alpha
    /// byte is written back unchanged.
    /// </summary>
    /// <param name="data">Pixels to process in place. Pixels that do not fill a whole four-vector group are handed to <c>ProcessTextureScalar</c>.</param>
    internal static unsafe void ProcessTextureAvx2Unrolled(Span<Color8> data)
    {
        const uint unrollFactor = 4U;

        // Number of pixels consumed per loop iteration (four full vectors).
        uint registerElements = (uint)Vector256<uint>.Count * unrollFactor;

        // Sanity check: one pixel per 32-bit vector element, times the unroll factor.
        // (The expected value must include the unroll factor, otherwise it can never match the stride.)
        registerElements.AssertEqual((uint)(sizeof(Vector256<uint>) / sizeof(Color8)) * unrollFactor);

        // Loop-invariant constants.
        Vector256<byte>   zero      = Vector256<byte>.Zero;
        Vector256<uint>   alphaMask = Vector256.Create(0xFF000000U);
        Vector256<uint>   mask      = Vector256.Create(0x00FFFFFFU);
        Vector256<ushort> addend    = Vector256.Create((ushort)0x00FFU);

        // Shuffle control that copies the low byte of each widened pixel's alpha element into the low byte
        // of all four 16-bit elements of that pixel (0xFF writes a zero byte). The byte shuffle works per
        // 128-bit lane and only uses the low four index bits.
        const byte offset0 = 6;
        const byte offset1 = offset0 + 8;
        const byte offset2 = offset1 + 8;
        const byte offset3 = offset2 + 8;
        Vector256<byte> alphaShuffle = Vector256.Create(
            offset0, 0xFF, offset0, 0xFF, offset0, 0xFF, offset0, 0xFF,
            offset1, 0xFF, offset1, 0xFF, offset1, 0xFF, offset1, 0xFF,
            offset2, 0xFF, offset2, 0xFF, offset2, 0xFF, offset2, 0xFF,
            offset3, 0xFF, offset3, 0xFF, offset3, 0xFF, offset3, 0xFF
            );

        uint offset;

        fixed (Color8* dataPtr8 = data)
        {
            uint* dataPtr = (uint*)dataPtr8;

            for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
            {
                uint* block = dataPtr + offset;

                Vector256<uint> rawColor0 = Avx2.LoadVector256(block + 0x00);
                Vector256<uint> rawColor1 = Avx2.LoadVector256(block + 0x08);
                Vector256<uint> rawColor2 = Avx2.LoadVector256(block + 0x10);
                Vector256<uint> rawColor3 = Avx2.LoadVector256(block + 0x18);

                // Keep the original alpha bytes so they can be restored after scaling.
                Vector256<uint> alpha0 = Avx2.And(rawColor0, alphaMask);
                Vector256<uint> alpha1 = Avx2.And(rawColor1, alphaMask);
                Vector256<uint> alpha2 = Avx2.And(rawColor2, alphaMask);
                Vector256<uint> alpha3 = Avx2.And(rawColor3, alphaMask);

                // Widen every byte to 16 bits so the products below cannot overflow.
                Vector256<ushort> lo0 = Avx2.UnpackLow(rawColor0.AsByte(), zero).AsUInt16();
                Vector256<ushort> lo1 = Avx2.UnpackLow(rawColor1.AsByte(), zero).AsUInt16();
                Vector256<ushort> lo2 = Avx2.UnpackLow(rawColor2.AsByte(), zero).AsUInt16();
                Vector256<ushort> lo3 = Avx2.UnpackLow(rawColor3.AsByte(), zero).AsUInt16();
                Vector256<ushort> hi0 = Avx2.UnpackHigh(rawColor0.AsByte(), zero).AsUInt16();
                Vector256<ushort> hi1 = Avx2.UnpackHigh(rawColor1.AsByte(), zero).AsUInt16();
                Vector256<ushort> hi2 = Avx2.UnpackHigh(rawColor2.AsByte(), zero).AsUInt16();
                Vector256<ushort> hi3 = Avx2.UnpackHigh(rawColor3.AsByte(), zero).AsUInt16();

                // Broadcast each pixel's alpha across that pixel's four 16-bit elements.
                Vector256<ushort> alphaLo0 = Avx2.Shuffle(lo0.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaLo1 = Avx2.Shuffle(lo1.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaLo2 = Avx2.Shuffle(lo2.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaLo3 = Avx2.Shuffle(lo3.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaHi0 = Avx2.Shuffle(hi0.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaHi1 = Avx2.Shuffle(hi1.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaHi2 = Avx2.Shuffle(hi2.AsByte(), alphaShuffle).AsUInt16();
                Vector256<ushort> alphaHi3 = Avx2.Shuffle(hi3.AsByte(), alphaShuffle).AsUInt16();

                Vector256<ushort> prodLo0 = Avx2.MultiplyLow(lo0, alphaLo0);
                Vector256<ushort> prodLo1 = Avx2.MultiplyLow(lo1, alphaLo1);
                Vector256<ushort> prodLo2 = Avx2.MultiplyLow(lo2, alphaLo2);
                Vector256<ushort> prodLo3 = Avx2.MultiplyLow(lo3, alphaLo3);
                Vector256<ushort> prodHi0 = Avx2.MultiplyLow(hi0, alphaHi0);
                Vector256<ushort> prodHi1 = Avx2.MultiplyLow(hi1, alphaHi1);
                Vector256<ushort> prodHi2 = Avx2.MultiplyLow(hi2, alphaHi2);
                Vector256<ushort> prodHi3 = Avx2.MultiplyLow(hi3, alphaHi3);

                Vector256<ushort> sumLo0 = Avx2.Add(prodLo0, addend);
                Vector256<ushort> sumLo1 = Avx2.Add(prodLo1, addend);
                Vector256<ushort> sumLo2 = Avx2.Add(prodLo2, addend);
                Vector256<ushort> sumLo3 = Avx2.Add(prodLo3, addend);
                Vector256<ushort> sumHi0 = Avx2.Add(prodHi0, addend);
                Vector256<ushort> sumHi1 = Avx2.Add(prodHi1, addend);
                Vector256<ushort> sumHi2 = Avx2.Add(prodHi2, addend);
                Vector256<ushort> sumHi3 = Avx2.Add(prodHi3, addend);

                Vector256<ushort> shiftLo0 = Avx2.ShiftRightLogical(sumLo0, 8);
                Vector256<ushort> shiftLo1 = Avx2.ShiftRightLogical(sumLo1, 8);
                Vector256<ushort> shiftLo2 = Avx2.ShiftRightLogical(sumLo2, 8);
                Vector256<ushort> shiftLo3 = Avx2.ShiftRightLogical(sumLo3, 8);
                Vector256<ushort> shiftHi0 = Avx2.ShiftRightLogical(sumHi0, 8);
                Vector256<ushort> shiftHi1 = Avx2.ShiftRightLogical(sumHi1, 8);
                Vector256<ushort> shiftHi2 = Avx2.ShiftRightLogical(sumHi2, 8);
                Vector256<ushort> shiftHi3 = Avx2.ShiftRightLogical(sumHi3, 8);

                // Narrow back to bytes; the pack works per 128-bit lane, which matches the unpack above.
                Vector256<uint> packed0 = Avx2.PackUnsignedSaturate(shiftLo0.AsInt16(), shiftHi0.AsInt16()).AsUInt32();
                Vector256<uint> packed1 = Avx2.PackUnsignedSaturate(shiftLo1.AsInt16(), shiftHi1.AsInt16()).AsUInt32();
                Vector256<uint> packed2 = Avx2.PackUnsignedSaturate(shiftLo2.AsInt16(), shiftHi2.AsInt16()).AsUInt32();
                Vector256<uint> packed3 = Avx2.PackUnsignedSaturate(shiftLo3.AsInt16(), shiftHi3.AsInt16()).AsUInt32();

                // Drop the scaled alpha byte and put the original alpha back.
                packed0 = Avx2.And(packed0, mask);
                packed1 = Avx2.And(packed1, mask);
                packed2 = Avx2.And(packed2, mask);
                packed3 = Avx2.And(packed3, mask);
                packed0 = Avx2.Or(packed0, alpha0);
                packed1 = Avx2.Or(packed1, alpha1);
                packed2 = Avx2.Or(packed2, alpha2);
                packed3 = Avx2.Or(packed3, alpha3);

                Avx2.Store(block + 0x00, packed0);
                Avx2.Store(block + 0x08, packed1);
                Avx2.Store(block + 0x10, packed2);
                Avx2.Store(block + 0x18, packed3);
            }
        }

        // Unlikely, but the pixel count may not be a multiple of the unrolled stride; finish the remainder with the scalar path.
        if (offset < data.Length)
        {
            ProcessTextureScalar(data.SliceUnsafe(offset));
        }
    }
    /// <summary>
    /// Multiplies the three low (color) bytes of every 32-bit pixel in <paramref name="data"/> by the
    /// pixel's top (alpha) byte, in place, using 128-bit SSE2 operations (with an SSSE3 byte shuffle when
    /// available). Each color byte becomes (color * alpha + 255) shifted right by 8; the alpha byte is
    /// written back unchanged.
    /// </summary>
    /// <param name="data">Pixels to process in place. Pixels that do not fill a whole vector are handed to <c>ProcessTextureScalar</c>.</param>
    internal static unsafe void ProcessTextureSse2(Span<Color8> data)
    {
        uint pixelsPerVector = (uint)Vector128<uint>.Count;

        // Sanity check: one pixel must occupy exactly one 32-bit element of the vector.
        pixelsPerVector.AssertEqual((uint)(sizeof(Vector128<uint>) / sizeof(Color8)));

        // Loop-invariant constants.
        Vector128<byte>   zeroBytes = Vector128<byte>.Zero;
        Vector128<uint>   alphaBits = Vector128.Create(0xFF000000U);
        Vector128<uint>   colorBits = Vector128.Create(0x00FFFFFFU);
        Vector128<ushort> bias      = Vector128.Create((ushort)0x00FFU);

        uint offset = 0;

        fixed (Color8* pixels = data)
        {
            uint* words = (uint*)pixels;

            while (offset + (pixelsPerVector - 1U) < data.Length)
            {
                uint* cursor = words + offset;

                Vector128<uint> source      = Sse2.LoadVector128(cursor);
                Vector128<uint> sourceAlpha = Sse2.And(source, alphaBits);

                // Widen every byte to 16 bits so the products below cannot overflow.
                Vector128<byte>   sourceBytes = source.AsByte();
                Vector128<ushort> lowHalf     = Sse2.UnpackLow(sourceBytes, zeroBytes).AsUInt16();
                Vector128<ushort> highHalf    = Sse2.UnpackHigh(sourceBytes, zeroBytes).AsUInt16();

                // Broadcast each pixel's alpha across that pixel's four 16-bit elements.
                Vector128<ushort> lowAlpha, highAlpha;
                if (Ssse3.IsSupported)
                {
                    // Byte 6 / byte 14 hold the low byte of the widened alpha of the first / second pixel;
                    // an index with the top bit set writes a zero byte.
                    const byte zeroed = 0xFF;
                    const byte first  = 6;
                    const byte second = 14;
                    Vector128<byte> alphaBroadcast = Vector128.Create(
                        first, zeroed, first, zeroed, first, zeroed, first, zeroed,
                        second, zeroed, second, zeroed, second, zeroed, second, zeroed
                        );

                    lowAlpha  = Ssse3.Shuffle(lowHalf.AsByte(), alphaBroadcast).AsUInt16();
                    highAlpha = Ssse3.Shuffle(highHalf.AsByte(), alphaBroadcast).AsUInt16();
                }
                else
                {
                    // Without a byte shuffle: widen the isolated alpha (it lands in the top 16-bit element of
                    // each pixel), then smear it downwards with a 16-bit and a 32-bit shift-and-or.
                    Vector128<uint> wideLow = Sse2.UnpackLow(sourceAlpha.AsByte(), zeroBytes).AsUInt32();
                    wideLow = Sse2.Or(wideLow, Sse2.ShiftRightLogical(wideLow, 16));
                    lowAlpha = Sse2.Or(wideLow.AsUInt64(), Sse2.ShiftRightLogical(wideLow.AsUInt64(), 32)).AsUInt16();

                    Vector128<uint> wideHigh = Sse2.UnpackHigh(sourceAlpha.AsByte(), zeroBytes).AsUInt32();
                    wideHigh = Sse2.Or(wideHigh, Sse2.ShiftRightLogical(wideHigh, 16));
                    highAlpha = Sse2.Or(wideHigh.AsUInt64(), Sse2.ShiftRightLogical(wideHigh.AsUInt64(), 32)).AsUInt16();
                }

                // (channel * alpha + 255) >> 8
                Vector128<ushort> lowScaled  = Sse2.ShiftRightLogical(Sse2.Add(Sse2.MultiplyLow(lowHalf, lowAlpha), bias), 8);
                Vector128<ushort> highScaled = Sse2.ShiftRightLogical(Sse2.Add(Sse2.MultiplyLow(highHalf, highAlpha), bias), 8);

                // Narrow back to bytes, then replace the scaled alpha byte with the original alpha.
                Vector128<uint> result = Sse2.PackUnsignedSaturate(lowScaled.AsInt16(), highScaled.AsInt16()).AsUInt32();
                result = Sse2.Or(Sse2.And(result, colorBits), sourceAlpha);

                Sse2.Store(cursor, result);

                offset += pixelsPerVector;
            }
        }

        // Unlikely, but the pixel count may not be a multiple of the vector width; finish the remainder with the scalar path.
        if (offset < data.Length)
        {
            ProcessTextureScalar(data.SliceUnsafe(offset));
        }
    }
    /// <summary>
    /// Four-way unrolled variant of the SSE2 path: multiplies the three low (color) bytes of every 32-bit
    /// pixel in <paramref name="data"/> by the pixel's top (alpha) byte, in place, handling four 128-bit
    /// vectors per iteration (with an SSSE3 byte shuffle when available). Each color byte becomes
    /// (color * alpha + 255) shifted right by 8; the alpha byte is written back unchanged.
    /// </summary>
    /// <param name="data">Pixels to process in place. Pixels that do not fill a whole four-vector group are handed to <c>ProcessTextureScalar</c>.</param>
    internal static unsafe void ProcessTextureSse2Unrolled(Span<Color8> data)
    {
        const uint unrollFactor = 4U;

        // Number of pixels consumed per loop iteration (four full vectors).
        uint registerElements = (uint)Vector128<uint>.Count * unrollFactor;

        // Sanity check: one pixel per 32-bit vector element, times the unroll factor.
        // (The expected value must include the unroll factor, otherwise it can never match the stride.)
        registerElements.AssertEqual((uint)(sizeof(Vector128<uint>) / sizeof(Color8)) * unrollFactor);

        // Loop-invariant constants.
        Vector128<byte>   zero      = Vector128<byte>.Zero;
        Vector128<uint>   alphaMask = Vector128.Create(0xFF000000U);
        Vector128<uint>   mask      = Vector128.Create(0x00FFFFFFU);
        Vector128<ushort> addend    = Vector128.Create((ushort)0x00FFU);

        uint offset;

        fixed (Color8* dataPtr8 = data)
        {
            uint* dataPtr = (uint*)dataPtr8;

            for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
            {
                uint* block = dataPtr + offset;

                Vector128<uint> rawColor0 = Sse2.LoadVector128(block + 0x0);
                Vector128<uint> rawColor1 = Sse2.LoadVector128(block + 0x4);
                Vector128<uint> rawColor2 = Sse2.LoadVector128(block + 0x8);
                Vector128<uint> rawColor3 = Sse2.LoadVector128(block + 0xC);

                // Keep the original alpha bytes so they can be restored after scaling.
                Vector128<uint> alpha0 = Sse2.And(rawColor0, alphaMask);
                Vector128<uint> alpha1 = Sse2.And(rawColor1, alphaMask);
                Vector128<uint> alpha2 = Sse2.And(rawColor2, alphaMask);
                Vector128<uint> alpha3 = Sse2.And(rawColor3, alphaMask);

                // Widen every byte to 16 bits so the products below cannot overflow.
                Vector128<ushort> lo0 = Sse2.UnpackLow(rawColor0.AsByte(), zero).AsUInt16();
                Vector128<ushort> lo1 = Sse2.UnpackLow(rawColor1.AsByte(), zero).AsUInt16();
                Vector128<ushort> lo2 = Sse2.UnpackLow(rawColor2.AsByte(), zero).AsUInt16();
                Vector128<ushort> lo3 = Sse2.UnpackLow(rawColor3.AsByte(), zero).AsUInt16();
                Vector128<ushort> hi0 = Sse2.UnpackHigh(rawColor0.AsByte(), zero).AsUInt16();
                Vector128<ushort> hi1 = Sse2.UnpackHigh(rawColor1.AsByte(), zero).AsUInt16();
                Vector128<ushort> hi2 = Sse2.UnpackHigh(rawColor2.AsByte(), zero).AsUInt16();
                Vector128<ushort> hi3 = Sse2.UnpackHigh(rawColor3.AsByte(), zero).AsUInt16();

                // Broadcast each pixel's alpha across that pixel's four 16-bit elements.
                Vector128<uint> alphaLo0, alphaHi0;
                Vector128<uint> alphaLo1, alphaHi1;
                Vector128<uint> alphaLo2, alphaHi2;
                Vector128<uint> alphaLo3, alphaHi3;
                if (Ssse3.IsSupported)
                {
                    // Byte 6 / byte 14 hold the low byte of the widened alpha of the first / second pixel;
                    // 0xFF writes a zero byte.
                    Vector128<byte> alphaShuffle = Vector128.Create(6, 0xFF, 6, 0xFF, 6, 0xFF, 6, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF);

                    alphaLo0 = Ssse3.Shuffle(lo0.AsByte(), alphaShuffle).AsUInt32();
                    alphaLo1 = Ssse3.Shuffle(lo1.AsByte(), alphaShuffle).AsUInt32();
                    alphaLo2 = Ssse3.Shuffle(lo2.AsByte(), alphaShuffle).AsUInt32();
                    alphaLo3 = Ssse3.Shuffle(lo3.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi0 = Ssse3.Shuffle(hi0.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi1 = Ssse3.Shuffle(hi1.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi2 = Ssse3.Shuffle(hi2.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi3 = Ssse3.Shuffle(hi3.AsByte(), alphaShuffle).AsUInt32();
                }
                else
                {
                    // Without a byte shuffle: widen the isolated alpha (it lands in the top 16-bit element
                    // of each pixel) ...
                    alphaLo0 = Sse2.UnpackLow(alpha0.AsByte(), zero).AsUInt32();
                    alphaLo1 = Sse2.UnpackLow(alpha1.AsByte(), zero).AsUInt32();
                    alphaLo2 = Sse2.UnpackLow(alpha2.AsByte(), zero).AsUInt32();
                    alphaLo3 = Sse2.UnpackLow(alpha3.AsByte(), zero).AsUInt32();
                    alphaHi0 = Sse2.UnpackHigh(alpha0.AsByte(), zero).AsUInt32();
                    alphaHi1 = Sse2.UnpackHigh(alpha1.AsByte(), zero).AsUInt32();
                    alphaHi2 = Sse2.UnpackHigh(alpha2.AsByte(), zero).AsUInt32();
                    alphaHi3 = Sse2.UnpackHigh(alpha3.AsByte(), zero).AsUInt32();

                    // ... copy it into the neighbouring 16-bit element ...
                    alphaLo0 = Sse2.Or(alphaLo0, Sse2.ShiftRightLogical(alphaLo0, 16));
                    alphaLo1 = Sse2.Or(alphaLo1, Sse2.ShiftRightLogical(alphaLo1, 16));
                    alphaLo2 = Sse2.Or(alphaLo2, Sse2.ShiftRightLogical(alphaLo2, 16));
                    alphaLo3 = Sse2.Or(alphaLo3, Sse2.ShiftRightLogical(alphaLo3, 16));
                    alphaHi0 = Sse2.Or(alphaHi0, Sse2.ShiftRightLogical(alphaHi0, 16));
                    alphaHi1 = Sse2.Or(alphaHi1, Sse2.ShiftRightLogical(alphaHi1, 16));
                    alphaHi2 = Sse2.Or(alphaHi2, Sse2.ShiftRightLogical(alphaHi2, 16));
                    alphaHi3 = Sse2.Or(alphaHi3, Sse2.ShiftRightLogical(alphaHi3, 16));

                    // ... and then into the lower 32 bits of the pixel, filling all four elements.
                    alphaLo0 = Sse2.Or(alphaLo0.AsUInt64(), Sse2.ShiftRightLogical(alphaLo0.AsUInt64(), 32)).AsUInt32();
                    alphaLo1 = Sse2.Or(alphaLo1.AsUInt64(), Sse2.ShiftRightLogical(alphaLo1.AsUInt64(), 32)).AsUInt32();
                    alphaLo2 = Sse2.Or(alphaLo2.AsUInt64(), Sse2.ShiftRightLogical(alphaLo2.AsUInt64(), 32)).AsUInt32();
                    alphaLo3 = Sse2.Or(alphaLo3.AsUInt64(), Sse2.ShiftRightLogical(alphaLo3.AsUInt64(), 32)).AsUInt32();
                    alphaHi0 = Sse2.Or(alphaHi0.AsUInt64(), Sse2.ShiftRightLogical(alphaHi0.AsUInt64(), 32)).AsUInt32();
                    alphaHi1 = Sse2.Or(alphaHi1.AsUInt64(), Sse2.ShiftRightLogical(alphaHi1.AsUInt64(), 32)).AsUInt32();
                    alphaHi2 = Sse2.Or(alphaHi2.AsUInt64(), Sse2.ShiftRightLogical(alphaHi2.AsUInt64(), 32)).AsUInt32();
                    alphaHi3 = Sse2.Or(alphaHi3.AsUInt64(), Sse2.ShiftRightLogical(alphaHi3.AsUInt64(), 32)).AsUInt32();
                }

                Vector128<ushort> prodLo0 = Sse2.MultiplyLow(lo0, alphaLo0.AsUInt16());
                Vector128<ushort> prodLo1 = Sse2.MultiplyLow(lo1, alphaLo1.AsUInt16());
                Vector128<ushort> prodLo2 = Sse2.MultiplyLow(lo2, alphaLo2.AsUInt16());
                Vector128<ushort> prodLo3 = Sse2.MultiplyLow(lo3, alphaLo3.AsUInt16());
                Vector128<ushort> prodHi0 = Sse2.MultiplyLow(hi0, alphaHi0.AsUInt16());
                Vector128<ushort> prodHi1 = Sse2.MultiplyLow(hi1, alphaHi1.AsUInt16());
                Vector128<ushort> prodHi2 = Sse2.MultiplyLow(hi2, alphaHi2.AsUInt16());
                Vector128<ushort> prodHi3 = Sse2.MultiplyLow(hi3, alphaHi3.AsUInt16());

                Vector128<ushort> sumLo0 = Sse2.Add(prodLo0, addend);
                Vector128<ushort> sumLo1 = Sse2.Add(prodLo1, addend);
                Vector128<ushort> sumLo2 = Sse2.Add(prodLo2, addend);
                Vector128<ushort> sumLo3 = Sse2.Add(prodLo3, addend);
                Vector128<ushort> sumHi0 = Sse2.Add(prodHi0, addend);
                Vector128<ushort> sumHi1 = Sse2.Add(prodHi1, addend);
                Vector128<ushort> sumHi2 = Sse2.Add(prodHi2, addend);
                Vector128<ushort> sumHi3 = Sse2.Add(prodHi3, addend);

                Vector128<ushort> shiftLo0 = Sse2.ShiftRightLogical(sumLo0, 8);
                Vector128<ushort> shiftLo1 = Sse2.ShiftRightLogical(sumLo1, 8);
                Vector128<ushort> shiftLo2 = Sse2.ShiftRightLogical(sumLo2, 8);
                Vector128<ushort> shiftLo3 = Sse2.ShiftRightLogical(sumLo3, 8);
                Vector128<ushort> shiftHi0 = Sse2.ShiftRightLogical(sumHi0, 8);
                Vector128<ushort> shiftHi1 = Sse2.ShiftRightLogical(sumHi1, 8);
                Vector128<ushort> shiftHi2 = Sse2.ShiftRightLogical(sumHi2, 8);
                Vector128<ushort> shiftHi3 = Sse2.ShiftRightLogical(sumHi3, 8);

                // Narrow back to bytes.
                Vector128<uint> packed0 = Sse2.PackUnsignedSaturate(shiftLo0.AsInt16(), shiftHi0.AsInt16()).AsUInt32();
                Vector128<uint> packed1 = Sse2.PackUnsignedSaturate(shiftLo1.AsInt16(), shiftHi1.AsInt16()).AsUInt32();
                Vector128<uint> packed2 = Sse2.PackUnsignedSaturate(shiftLo2.AsInt16(), shiftHi2.AsInt16()).AsUInt32();
                Vector128<uint> packed3 = Sse2.PackUnsignedSaturate(shiftLo3.AsInt16(), shiftHi3.AsInt16()).AsUInt32();

                // Drop the scaled alpha byte and put the original alpha back.
                packed0 = Sse2.And(packed0, mask);
                packed1 = Sse2.And(packed1, mask);
                packed2 = Sse2.And(packed2, mask);
                packed3 = Sse2.And(packed3, mask);
                packed0 = Sse2.Or(packed0, alpha0);
                packed1 = Sse2.Or(packed1, alpha1);
                packed2 = Sse2.Or(packed2, alpha2);
                packed3 = Sse2.Or(packed3, alpha3);

                Sse2.Store(block + 0x0, packed0);
                Sse2.Store(block + 0x4, packed1);
                Sse2.Store(block + 0x8, packed2);
                Sse2.Store(block + 0xC, packed3);
            }
        }

        // Unlikely, but the pixel count may not be a multiple of the unrolled stride; finish the remainder with the scalar path.
        if (offset < data.Length)
        {
            ProcessTextureScalar(data.SliceUnsafe(offset));
        }
    }