/// <summary>
/// Places <paramref name="value"/> in a 128-bit register, permutes its bytes with
/// <c>ShuffleMask</c>, and combines the two nibbles of every 16-bit lane before
/// adding <c>ShortCharA</c> to each lane.
/// </summary>
/// <param name="value">The 64-bit value to convert.</param>
/// <returns>The sixteen resulting bytes.</returns>
private Vector128<byte> LongToUtf8_16(long value)
        {
            Vector128<sbyte> packed;

            if (Sse41.X64.IsSupported)
            {
                // The 64-bit value can be moved into the register in one step.
                packed = Vector128.CreateScalarUnsafe(value).AsSByte();
            }
            else
            {
                // Otherwise build the register from the two 32-bit halves.
                int lowHalf = (int)value;
                int highHalf = (int)((ulong)value >> 32);
                packed = Sse41.Insert(Vector128.CreateScalarUnsafe(lowHalf), highHalf, 1).AsSByte();
            }

            var lanes = Ssse3.Shuffle(packed, ShuffleMask).AsInt16();

            // Per 16-bit lane: (lane >> 4) | ((lane & LowMask) << 8).
            var shiftedDown = Sse2.ShiftRightLogical(lanes, 4);
            var maskedUp = Sse2.ShiftLeftLogical(Sse2.And(lanes, LowMask), 8);
            var combined = Sse2.Or(shiftedDown, maskedUp);

            return Sse2.Add(combined, ShortCharA).AsByte();
        }
        /// <summary>
        /// Scenario: creates a fresh test instance as a local, pins its two vector fields,
        /// loads them through pointers, runs <c>Ssse3.MultiplyHighRoundScale</c> on the
        /// loaded vectors and validates the stored result.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var instance = new SimpleBinaryOpTest__MultiplyHighRoundScaleInt16();

            fixed (Vector128<Int16>* firstField = &instance._fld1)
            fixed (Vector128<Int16>* secondField = &instance._fld2)
            {
                // Load both operands explicitly from the pinned field addresses.
                var left = Sse2.LoadVector128((Int16*)firstField);
                var right = Sse2.LoadVector128((Int16*)secondField);

                var result = Ssse3.MultiplyHighRoundScale(left, right);

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
            }
        }
示例#3
0
        /// <summary>
        /// Rearranges the 64-bit lanes of rows 2, 3 and 4 of a state held as low/high
        /// 128-bit halves. Row 2 and row 4 are rotated by one lane in opposite directions
        /// using byte-wise <c>AlignRight</c> by 8; the two halves of row 3 are swapped.
        /// Row 1 is not touched.
        /// </summary>
        /// <remarks>
        /// <paramref name="b0"/> is used as scratch storage and holds the original value of
        /// <paramref name="row3l"/> when the method returns.
        /// </remarks>
        private static void diagonalize(ref Vector128<ulong> row1l, ref Vector128<ulong> row2l, ref Vector128<ulong> row3l, ref Vector128<ulong> row4l,
                                        ref Vector128<ulong> row1h, ref Vector128<ulong> row2h, ref Vector128<ulong> row3h, ref Vector128<ulong> row4h, ref Vector128<ulong> b0)
        {
            // Row 2: both new halves are computed before either is written back.
            var newRow2Low = Ssse3.AlignRight(row2h.As<sbyte>(), row2l.As<sbyte>(), 8);
            var newRow2High = Ssse3.AlignRight(row2l.As<sbyte>(), row2h.As<sbyte>(), 8);
            row2l = newRow2Low.As<ulong>();
            row2h = newRow2High.As<ulong>();

            // Row 3: exchange the halves through the caller-supplied scratch slot.
            b0 = row3l;
            row3l = row3h;
            row3h = b0;

            // Row 4: same two alignments as row 2, but stored into the opposite halves.
            var newRow4High = Ssse3.AlignRight(row4h.As<sbyte>(), row4l.As<sbyte>(), 8);
            var newRow4Low = Ssse3.AlignRight(row4l.As<sbyte>(), row4h.As<sbyte>(), 8);
            row4l = newRow4Low.As<ulong>();
            row4h = newRow4High.As<ulong>();
        }
示例#4
0
        /// <summary>
        /// Scenario: creates a fresh test instance as a local, pins its two vector fields,
        /// loads them through pointers, runs <c>Ssse3.HorizontalSubtractSaturate</c> on the
        /// loaded vectors and validates the stored result.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var instance = new HorizontalBinaryOpTest__HorizontalSubtractSaturateInt16();

            fixed (Vector128<Int16>* firstField = &instance._fld1)
            fixed (Vector128<Int16>* secondField = &instance._fld2)
            {
                // Load both operands explicitly from the pinned field addresses.
                var left = Sse2.LoadVector128((Int16*)firstField);
                var right = Sse2.LoadVector128((Int16*)secondField);

                var result = Ssse3.HorizontalSubtractSaturate(left, right);

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
            }
        }
示例#5
0
        /// <summary>
        /// Scenario: creates a fresh test instance as a local, pins its two vector fields,
        /// loads them through pointers, runs <c>Ssse3.Sign</c> on the loaded vectors and
        /// validates the stored result.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var instance = new SimpleBinaryOpTest__SignSByte();

            fixed (Vector128<SByte>* firstField = &instance._fld1)
            fixed (Vector128<SByte>* secondField = &instance._fld2)
            {
                // Load both operands explicitly from the pinned field addresses.
                var left = Sse2.LoadVector128((SByte*)firstField);
                var right = Sse2.LoadVector128((SByte*)secondField);

                var result = Ssse3.Sign(left, right);

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
            }
        }
示例#6
0
            /// <summary>
            /// Writes three consecutive copies of every input element of type <c>T</c> to the output.
            /// </summary>
            /// <param name="ipstart">Start of the input line.</param>
            /// <param name="opstart">Start of the output line; receives three elements per input element.</param>
            /// <param name="cb">Length of the input line in bytes.</param>
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, nint cb)
            {
                T *ip = (T *)ipstart, ipe = (T *)(ipstart + cb), op = (T *)opstart;

#if HWINTRINSICS
                // Vector path: byte elements only, and only when more than one full 16-byte vector is available.
                if (typeof(T) == typeof(byte) && Ssse3.IsSupported && cb > Vector128 <byte> .Count)
                {
                    // 48 shuffle indices: every source byte index 0..15 repeated three times in a row.
                    // NOTE(review): taking a raw pointer via Unsafe.AsPointer is only safe if this array is
                    // emitted as static data rather than a movable heap allocation — confirm.
                    var mask = (ReadOnlySpan <byte>)(new byte[] {
                        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5,
                        5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10,
                        10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15
                    });
                    byte *pmask = (byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(mask));

                    // One shuffle mask per 16-byte third of the 48-byte output group.
                    var vmask0 = Sse2.LoadVector128(pmask);
                    var vmask1 = Sse2.LoadVector128(pmask + Vector128 <byte> .Count);
                    var vmask2 = Sse2.LoadVector128(pmask + Vector128 <byte> .Count * 2);

                    // Pull the end pointer back so the loop condition means "a full vector is still readable".
                    ipe -= Vector128 <byte> .Count;
                    do
                    {
                        var v0 = Sse2.LoadVector128((byte *)ip);
                        ip += Vector128 <byte> .Count;

                        // 16 input bytes expand to 48 output bytes.
                        Sse2.Store((byte *)op, Ssse3.Shuffle(v0, vmask0));
                        Sse2.Store((byte *)op + Vector128 <byte> .Count, Ssse3.Shuffle(v0, vmask1));
                        Sse2.Store((byte *)op + Vector128 <byte> .Count * 2, Ssse3.Shuffle(v0, vmask2));
                        op += Vector128 <byte> .Count * 3;
                    } while (ip <= ipe);
                    // Restore the real end pointer for the scalar tail below.
                    ipe += Vector128 <byte> .Count;
                }
#endif

                // Scalar path: handles everything when the vector path is skipped, or the leftover elements after it.
                while (ip < ipe)
                {
                    var i0 = *ip;
                    op[0] = i0;
                    op[1] = i0;
                    op[2] = i0;

                    ip++;
                    op += 3;
                }
            }
示例#7
0
        /// <summary>
        /// Runs the automaton over <paramref name="input"/> by composing per-byte transition
        /// vectors with byte shuffles, then checks whether the state reached from
        /// <c>_start</c> is one of the entries in <c>_accept</c>.
        /// </summary>
        /// <param name="input">The bytes to feed to the automaton.</param>
        /// <returns><c>true</c> if the resulting state equals any entry of <c>_accept</c>.</returns>
        public bool Accepts(ReadOnlySpan<byte> input)
        {
            var composed = default(Vector128Impl).Id;

            int length = input.Length;
            int pos = 0;

            // Main loop: seven symbols per iteration, combined pairwise as a tree
            // instead of one long dependent chain of shuffles.
            while (pos + 6 < length)
            {
                var s0 = _transitions[input[pos]];
                var s1 = _transitions[input[pos + 1]];
                var s2 = _transitions[input[pos + 2]];
                var s3 = _transitions[input[pos + 3]];
                var s4 = _transitions[input[pos + 4]];
                var s5 = _transitions[input[pos + 5]];
                var s6 = _transitions[input[pos + 6]];

                var pairA = Ssse3.Shuffle(s0, composed);
                var pairB = Ssse3.Shuffle(s2, s1);
                var pairC = Ssse3.Shuffle(s4, s3);
                var pairD = Ssse3.Shuffle(s6, s5);

                var quadLow = Ssse3.Shuffle(pairB, pairA);
                var quadHigh = Ssse3.Shuffle(pairD, pairC);

                composed = Ssse3.Shuffle(quadHigh, quadLow);
                pos += 7;
            }

            // Leftover symbols, one at a time.
            while (pos < length)
            {
                composed = Ssse3.Shuffle(_transitions[input[pos]], composed);
                pos++;
            }

            var state = Sse41.Extract(composed, (byte)_start);

            // Non-short-circuit OR: every accept entry is always compared.
            bool found = false;
            for (int k = 0; k < _accept.Length; ++k)
            {
                found |= _accept[k] == state;
            }

            return found;
        }
示例#8
0
    /// <summary>
    /// Splits <paramref name="initial"/> into two vectors by byte-shuffling it with a
    /// 16-byte window of <c>_shuffleMasks</c> starting at index <c>16 - n</c>, and with
    /// the bitwise complement of that window.
    /// </summary>
    /// <param name="initial">The vector to shuffle.</param>
    /// <param name="n">Selects the mask window; the window starts at <c>16 - n</c>.</param>
    /// <param name="outLeft">Result of shuffling with the complemented mask.</param>
    /// <param name="outRight">Result of shuffling with the mask itself.</param>
    static void ShiftRight128(Vector128<ulong> initial, uint n, out Vector128<ulong> outLeft,
                              out Vector128<ulong> outRight)
    {
        uint start = 16 - n;

        Vector128<byte> rightMask = Vector128.Create(
            _shuffleMasks[start], _shuffleMasks[start + 1], _shuffleMasks[start + 2], _shuffleMasks[start + 3],
            _shuffleMasks[start + 4], _shuffleMasks[start + 5], _shuffleMasks[start + 6], _shuffleMasks[start + 7],
            _shuffleMasks[start + 8], _shuffleMasks[start + 9], _shuffleMasks[start + 10], _shuffleMasks[start + 11],
            _shuffleMasks[start + 12], _shuffleMasks[start + 13], _shuffleMasks[start + 14], _shuffleMasks[start + 15]);

        // Comparing zero with zero yields all-ones; XOR with it flips every mask bit.
        Vector128<byte> allOnes = Sse2.CompareEqual(Vector128<byte>.Zero, Vector128<byte>.Zero);
        Vector128<byte> leftMask = Sse2.Xor(rightMask, allOnes);

        Vector128<byte> source = initial.AsByte();

        outLeft = Ssse3.Shuffle(source, leftMask).AsUInt64();
        outRight = Ssse3.Shuffle(source, rightMask).AsUInt64();
    }
示例#9
0
        /// <summary>
        /// Computes a per-component value from <paramref name="x"/> using the best available
        /// instruction set. With SSSE3 the result is <c>0 - abs(x)</c> per 32-bit lane.
        /// </summary>
        /// <param name="x">The four-component integer vector.</param>
        /// <returns>The per-component result.</returns>
        public static int4 nabs(int4 x)
        {
            if (Ssse3.IsSsse3Supported)
            {
                v128 magnitude = Ssse3.abs_epi32(*(v128 *)&x);
                v128 negated = Sse2.sub_epi32(default(v128), magnitude);

                return *(int4 *)&negated;
            }

            if (Sse2.IsSse2Supported)
            {
                // Blend between (0 - x) and x, controlled by the lanes where 0 > x.
                // NOTE(review): presumably BlendV picks the second operand where the mask is set — confirm.
                v128 flipped = Sse2.sub_epi32(default(v128), *(v128 *)&x);
                v128 isNegative = Sse2.cmpgt_epi32(default(v128), *(v128 *)&x);
                v128 selected = Mask.BlendV(flipped, *(v128 *)&x, isNegative);

                return *(int4 *)&selected;
            }

            // No vector support: apply the scalar overload to each component.
            return new int4(nabs(x.x), nabs(x.y), nabs(x.z), nabs(x.w));
        }
示例#10
0
        /// <summary>
        /// Expands sixteen 3-bit palette indices packed in <paramref name="rI"/> into
        /// sixteen bytes of <paramref name="output"/> by looking each index up in
        /// <paramref name="rPal"/>.
        /// </summary>
        /// <param name="output">Destination; the vector path writes its first 16 bytes.</param>
        /// <param name="rPal">Palette; the vector path reads its first 8 bytes.</param>
        /// <param name="rI">Packed indices, 3 bits each, lowest index first.</param>
        private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
        {
            if (!Avx2.IsSupported)
            {
                // Scalar fallback: peel one 3-bit index per output byte.
                for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
                {
                    output[i] = rPal[(int)(rI & 7)];
                }

                return;
            }

            // Palette goes into the low 8 bytes of a register to serve as the shuffle table.
            Vector128<byte> palette;

            fixed (byte* pRPal = rPal)
            {
                palette = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
            }

            Vector128<uint> laneShifts = Vector128.Create(0u, 3u, 6u, 9u);
            Vector128<uint> indexMask = Vector128.Create(7u);

            // Broadcast bits 0..31 and bits 24..55 of the index word.
            Vector128<uint> lowWord = Vector128.Create((uint)rI);
            Vector128<uint> highWord = Vector128.Create((uint)(rI >> 24));

            // Indices 0-3 and 8-11 via per-lane shifts; a further shift by 12 gives 4-7 and 12-15.
            Vector128<uint> idx0to3 = Avx2.ShiftRightLogicalVariable(lowWord, laneShifts);
            Vector128<uint> idx8to11 = Avx2.ShiftRightLogicalVariable(highWord, laneShifts);
            Vector128<uint> idx4to7 = Sse2.ShiftRightLogical(idx0to3, 12);
            Vector128<uint> idx12to15 = Sse2.ShiftRightLogical(idx8to11, 12);

            idx0to3 = Sse2.And(idx0to3, indexMask);
            idx8to11 = Sse2.And(idx8to11, indexMask);
            idx4to7 = Sse2.And(idx4to7, indexMask);
            idx12to15 = Sse2.And(idx12to15, indexMask);

            // Narrow 32-bit -> 16-bit -> 8-bit so all sixteen indices sit in one byte vector.
            Vector128<ushort> firstEight = Sse41.PackUnsignedSaturate(idx0to3.AsInt32(), idx4to7.AsInt32());
            Vector128<ushort> lastEight = Sse41.PackUnsignedSaturate(idx8to11.AsInt32(), idx12to15.AsInt32());

            Vector128<byte> allIndices = Sse2.PackUnsignedSaturate(firstEight.AsInt16(), lastEight.AsInt16());

            Span<Vector128<byte>> destination = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
            destination[0] = Ssse3.Shuffle(palette, allIndices);
        }
示例#11
0
        /// <summary>
        /// Runs the automaton over <paramref name="input"/> by composing one transition
        /// vector per byte with a byte shuffle, then checks whether the state reached from
        /// <c>_start</c> is one of the entries in <c>_accept</c>.
        /// </summary>
        /// <param name="input">The bytes to feed to the automaton.</param>
        /// <returns><c>true</c> if the resulting state equals any entry of <c>_accept</c>.</returns>
        public bool Accepts(ReadOnlySpan<byte> input)
        {
            var composed = default(Vector128Impl).Id;

            foreach (byte symbol in input)
            {
                composed = Ssse3.Shuffle(_transitions[symbol], composed);
            }

            var state = Sse41.Extract(composed, (byte)_start);

            // Non-short-circuit OR: every accept entry is always compared.
            bool found = false;
            for (int k = 0; k < _accept.Length; ++k)
            {
                found |= _accept[k] == state;
            }

            return found;
        }
示例#12
0
        /// <summary>
        /// Produces eight bytes, one per lane, by scaling <c>max - min</c> with eight
        /// successive <c>NextState()</c> values, keeping the high byte of each 16-bit
        /// product, and adding <paramref name="min"/>.
        /// </summary>
        /// <param name="min">Per-lane lower bound.</param>
        /// <param name="max">Per-lane upper bound; asserted not to be smaller than <paramref name="min"/>.</param>
        /// <returns>The eight generated bytes.</returns>
        public byte8 NextByte8(byte8 min, byte8 max)
        {
            Assert.IsNotSmaller(max.x0, min.x0);
            Assert.IsNotSmaller(max.x1, min.x1);
            Assert.IsNotSmaller(max.x2, min.x2);
            Assert.IsNotSmaller(max.x3, min.x3);
            Assert.IsNotSmaller(max.x4, min.x4);
            Assert.IsNotSmaller(max.x5, min.x5);
            Assert.IsNotSmaller(max.x6, min.x6);
            Assert.IsNotSmaller(max.x7, min.x7);

            // Both code paths start from the same 16-bit products.
            short8 scaled = (short8)(max - min) * new short8(NextState(), NextState(), NextState(), NextState(), NextState(), NextState(), NextState(), NextState());

            if (Ssse3.IsSsse3Supported)
            {
                // Odd byte positions are the high bytes of the eight 16-bit lanes.
                return min + Ssse3.shuffle_epi8(scaled, new byte8(1, 3, 5, 7, 9, 11, 13, 15));
            }

            return min + (byte8)(scaled >> 8);
        }
示例#13
0
        /// <summary>
        /// Sums all elements of <paramref name="source"/>, adding four values at a time with
        /// SSE2 and finishing any remaining elements one by one.
        /// </summary>
        /// <param name="source">The values to add up.</param>
        /// <returns>The sum of all elements.</returns>
        private unsafe int SumVectorizedSse2(ReadOnlySpan<int> source)
        {
            int length = source.Length;
            int vectorEnd = length - length % 4;
            int total;

            fixed (int* data = source)
            {
                Vector128<int> accumulator = Vector128<int>.Zero;
                int index = 0;

                for (; index < vectorEnd; index += 4)
                {
                    accumulator = Sse2.Add(accumulator, Sse2.LoadVector128(data + index));
                }

                // Fold the four partial sums so that element 0 holds the full vector sum.
                if (Ssse3.IsSupported)
                {
                    accumulator = Ssse3.HorizontalAdd(accumulator, accumulator);
                    accumulator = Ssse3.HorizontalAdd(accumulator, accumulator);
                }
                else
                {
                    accumulator = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0x4E));
                    accumulator = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0xB1));
                }

                total = accumulator.ToScalar();

                // Elements past the last full group of four.
                for (; index < length; index++)
                {
                    total += data[index];
                }
            }

            return total;
        }
        /// <summary>
        /// Reverses the byte order of <paramref name="len"/> 32-bit values read from
        /// <paramref name="source"/> and writes them to <paramref name="dest"/>.
        /// </summary>
        /// <remarks>
        /// Full groups of four values are converted with an SSSE3 byte shuffle when available;
        /// the remaining values are converted one at a time. The tail is deliberately NOT handled
        /// with an overlapping vector store: when <paramref name="dest"/> equals
        /// <paramref name="source"/>, an overlapping block would re-read elements that were already
        /// reversed and swap them back. Exact in-place conversion is therefore safe; partially
        /// overlapping buffers are still not supported.
        /// </remarks>
        /// <param name="source">Pointer to the first input value.</param>
        /// <param name="dest">Pointer to the first output value.</param>
        /// <param name="len">Number of 32-bit values to convert; values &lt;= 0 do nothing.</param>
        private static void ReverseEndianess(uint *source, uint *dest, int len)
        {
            int vecLen = Vector128<uint>.Count;
            int i = 0;

            if (Ssse3.IsSupported)
            {
                // Each element is read and written exactly once, so source == dest is fine.
                for (; len - i >= vecLen; i += vecLen)
                {
                    var vec = Sse2.LoadVector128(source + i);

                    vec = Ssse3.Shuffle(vec.AsByte(), ReverseEndianess_32_128).AsUInt32();

                    Sse2.Store(dest + i, vec);
                }
            }

            // Scalar path: the whole input without SSSE3, otherwise the (at most three) leftover values.
            for (; i < len; ++i)
            {
                dest[i] = BinaryPrimitives.ReverseEndianness(source[i]);
            }
        }
        /// <summary>
        /// Counts the even values in <paramref name="numbers"/>.
        /// </summary>
        /// <remarks>
        /// The method counts odd values (lowest bit set) and subtracts that from the array length.
        /// Groups of four values are processed with SSE2 when available; the horizontal reduction
        /// uses SSSE3 when present and an SSE2 shuffle/add sequence otherwise. On hardware without
        /// SSE2 the whole array is handled by the scalar loop, so the method no longer throws
        /// <see cref="PlatformNotSupportedException"/> there.
        /// </remarks>
        /// <param name="numbers">The values to inspect. Must not be <c>null</c>.</param>
        /// <returns>The number of even elements.</returns>
        public static unsafe int CountEvenSIMD(int[] numbers)
        {
            int len = numbers.Length;
            int oddCount = 0;
            int i = 0;

            if (Sse2.IsSupported)
            {
                fixed (int* num = numbers)
                {
                    Vector128<int> vresult = Vector128<int>.Zero;
                    Vector128<int> ones = Vector128.Create(1);

                    int lastBlockIndex = len - (len % 4);

                    for (; i < lastBlockIndex; i += 4)
                    {
                        // (value & 1) is 1 for odd values, including negative ones.
                        var odds = Sse2.And(Sse2.LoadVector128(num + i), ones);
                        vresult = Sse2.Add(vresult, odds);
                    }

                    // Fold the four per-lane counters so that element 0 holds their sum.
                    if (Ssse3.IsSupported)
                    {
                        vresult = Ssse3.HorizontalAdd(vresult, vresult);
                        vresult = Ssse3.HorizontalAdd(vresult, vresult);
                    }
                    else
                    {
                        vresult = Sse2.Add(vresult, Sse2.Shuffle(vresult, 0x4E));
                        vresult = Sse2.Add(vresult, Sse2.Shuffle(vresult, 0xB1));
                    }

                    oddCount = vresult.ToScalar();
                }
            }

            // Remaining elements (or all of them when SSE2 is unavailable).
            for (; i < len; i++)
            {
                oddCount += numbers[i] & 1;
            }

            return len - oddCount;
        }
        /// <summary>
        /// Adds up every element of <paramref name="source"/>. Full groups of four are
        /// accumulated in a 128-bit register; leftover elements are added individually.
        /// </summary>
        /// <param name="source">The values to sum.</param>
        /// <returns>The total of all elements.</returns>
        static unsafe int SumVectorizedSse(ReadOnlySpan<int> source)
        {
            int count = source.Length;
            int blockEnd = count - count % 4;
            int sum;

            fixed (int* values = source)
            {
                Vector128<int> partialSums = Vector128<int>.Zero;
                int position = 0;

                for (; position < blockEnd; position += 4)
                {
                    Vector128<int> chunk = Sse2.LoadVector128(values + position);
                    partialSums = Sse2.Add(partialSums, chunk);
                }

                // Collapse the four lanes into lane 0.
                if (Ssse3.IsSupported)
                {
                    partialSums = Ssse3.HorizontalAdd(partialSums, partialSums);
                    partialSums = Ssse3.HorizontalAdd(partialSums, partialSums);
                }
                else
                {
                    partialSums = Sse2.Add(partialSums, Sse2.Shuffle(partialSums, 0x4E));
                    partialSums = Sse2.Add(partialSums, Sse2.Shuffle(partialSums, 0xB1));
                }

                sum = partialSums.ToScalar();

                // Tail that did not fill a whole vector.
                for (; position < count; position++)
                {
                    sum += values[position];
                }
            }

            return sum;
        }
示例#17
0
        /// <summary>
        /// Counts the leading zero bits of each 8-bit component of <paramref name="x"/>.
        /// A zero component yields 8.
        /// </summary>
        /// <param name="x">Three byte components.</param>
        /// <returns>Per-component leading-zero count.</returns>
        public static byte3 lzcnt(byte3 x)
        {
            if (Ssse3.IsSsse3Supported)
            {
                // Table lookup per nibble via byte shuffle.
                v128 NIBBLE_MASK     = new v128(0x0F0F_0F0F);
                // Indexed by the low nibble: count assuming the high nibble is zero (4 + nibble count; 8 for 0).
                v128 SHUFFLE_MASK_LO = new v128(8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4);
                // Indexed by the high nibble: its own count, or 8 when it is zero so the low-nibble entry wins the min.
                v128 SHUFFLE_MASK_HI = new v128(8, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);

                // The smaller of the two lookups is the answer for the whole byte.
                return(Sse2.min_epu8(Ssse3.shuffle_epi8(SHUFFLE_MASK_LO, Sse2.and_si128(NIBBLE_MASK, x)),
                                     Ssse3.shuffle_epi8(SHUFFLE_MASK_HI, Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4)))));
            }
            else if (Sse2.IsSse2Supported)
            {
                // Branch-free binary search: start at 8 and narrow by 4, then 2, then 1 bit.
                // NOTE(review): presumably Mask.BlendV returns its second operand where the mask is set — confirm.
                byte3 y;
                byte3 n = 8;
                byte3 mask;

                // Is anything set in the upper 4 bits? If so, subtract 4 and continue with the upper part.
                y    = x >> 4;
                mask = Sse2.cmpeq_epi8(y, default(v128));
                n    = Mask.BlendV(n - 4, n, mask);
                x    = Mask.BlendV(y, x, mask);

                // Same step for the upper 2 bits of what is left.
                y    = x >> 2;
                mask = Sse2.cmpeq_epi8(y, default(v128));
                n    = Mask.BlendV(n - 2, n, mask);
                x    = Mask.BlendV(y, x, mask);

                // Final step: x is now at most 2 bits wide.
                y    = x >> 1;
                mask = Sse2.cmpeq_epi8(y, default(v128));

                // If bit 1 is set the count is n - 2, otherwise n - x (x is 0 or 1 here).
                return(Mask.BlendV(n - 2, n - x, mask));
            }
            else
            {
                // No vector support: use the scalar overload per component.
                return(new byte3(lzcnt(x.x), lzcnt(x.y), lzcnt(x.z)));
            }
        }
示例#18
0
        /// <summary>
        /// Updates the two running sums <paramref name="s1"/> and <paramref name="s2"/> with the bytes of
        /// <paramref name="buffer"/> and returns them packed as <c>s1 | (s2 &lt;&lt; 16)</c>.
        /// Whole blocks of <c>BLOCK_SIZE</c> bytes are processed with SSE2/SSSE3 instructions; the
        /// remaining bytes are processed one at a time. Both sums are reduced modulo <c>MOD32</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the layout and constants look like an Adler-32 checksum — confirm against the caller.
        /// The method does not check <c>Ssse3.IsSupported</c> itself; presumably the caller does — verify.
        /// </remarks>
        /// <param name="buffer">The bytes to fold into the sums.</param>
        /// <param name="s1">Incoming value of the first running sum.</param>
        /// <param name="s2">Incoming value of the second running sum.</param>
        /// <returns>The updated sums packed into one 32-bit value.</returns>
        internal static unsafe uint GetSse(ReadOnlySpan <byte> buffer, uint s1, uint s2)
        {
            uint len = (uint)buffer.Length;

            // Number of whole blocks for the vector loop.
            uint blocks = len / BLOCK_SIZE;

            // len becomes the number of trailing bytes left for the scalar code.
            len = len - blocks * BLOCK_SIZE;

            // Per-byte weights 32..17 and 16..1 for the first and second 16 bytes of a block.
            Vector128 <sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
            Vector128 <sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
            Vector128 <byte>  zero = Vector128 <byte> .Zero;
            Vector128 <short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

            fixed(byte *bufPtr = &MemoryMarshal.GetReference(buffer))
            {
                var buf = bufPtr;

                while (blocks != 0)
                {
                    // Limit the chunk so the sums can be reduced before they overflow.
                    uint n = NMAX32 / BLOCK_SIZE;
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    // v_ps accumulates the s1 value seen before each block; it is scaled by 32 after the loop.
                    Vector128 <uint> v_ps = Vector128.Create(0, 0, 0, s1 * n);
                    Vector128 <uint> v_s2 = Vector128.Create(0, 0, 0, s2);
                    Vector128 <uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

                    do
                    {
                        // Load 32 input bytes.
                        Vector128 <byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                        Vector128 <byte> bytes2 = Sse2.LoadVector128(&buf[16]);


                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);



                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        Vector128 <ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                        v_s1 = Sse2.Add(v_s1, sad1.AsUInt32());
                        Vector128 <short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        Vector128 <int>   mad12 = Sse2.MultiplyAddAdjacent(mad11, ones);
                        v_s2 = Sse2.Add(v_s2, mad12.AsUInt32());


                        // Same two steps for the second 16 bytes, with weights 16..1.
                        Vector128 <ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                        v_s1 = Sse2.Add(v_s1, sad2.AsUInt32());
                        Vector128 <short> mad21 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        Vector128 <int>   mad22 = Sse2.MultiplyAddAdjacent(mad21, ones);
                        v_s2 = Sse2.Add(v_s2, mad22.AsUInt32());

                        buf += BLOCK_SIZE;

                        n--;
                    } while (n != 0);

                    // Shifting left by 5 multiplies the accumulated previous-s1 terms by 32.
                    var shift = Sse2.ShiftLeftLogical(v_ps, 5);
                    v_s2 = Sse2.Add(v_s2, shift);


                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).

                    // A B C D -> B A D C
                    const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;
                    // A B C D -> C D A B
                    const int S1032 = 1 << 6 | 0 << 4 | 3 << 2 | 2;

                    // After both shuffle/add steps every lane holds the sum of all four lanes.
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
                    s1  += Sse2.ConvertToUInt32(v_s1);
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
                    // s2 is assigned, not added: its previous value was seeded into v_s2 above.
                    s2   = Sse2.ConvertToUInt32(v_s2);

                    s1 %= MOD32;
                    s2 %= MOD32;
                }

                // Trailing bytes that did not fill a whole block.
                if (len > 0)
                {
                    // Unrolled handling of one 16-byte run.
                    if (len >= 16)
                    {
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        len -= 16;
                    }

                    while (len-- > 0)
                    {
                        s2 += (s1 += *buf++);
                    }
                    // A single subtraction is used for s1 instead of a modulo.
                    if (s1 >= MOD32)
                    {
                        s1 -= MOD32;
                    }

                    s2 %= MOD32;
                }

                return(s1 | (s2 << 16));
            }
        }
示例#19
0
        /// <summary>
        /// Decodes <c>block.Count</c> integers into <paramref name="buffer"/>. Each value is a linear
        /// term built from <c>block.Base</c> and <c>block.Slope</c>, plus — when
        /// <c>block.BitsPerAdjustment</c> is non-zero — a bit-packed adjustment read from
        /// <paramref name="reader"/>. <c>reader.Index</c> is advanced past the consumed bytes.
        /// </summary>
        /// <remarks>
        /// NOTE(review): both loops write whole vectors (4 or 8 ints per step), so
        /// <paramref name="buffer"/> must have room for the count rounded up to that step, and the
        /// adjustment loop reads 16 bytes at each source offset, so <c>reader.Buffer</c> needs slack
        /// past the last packed byte — confirm the callers guarantee this.
        /// </remarks>
        /// <param name="block">Describes base, slope, count and bits per adjustment.</param>
        /// <param name="reader">Source of the packed adjustment bytes.</param>
        /// <param name="buffer">Destination for the decoded integers.</param>
        public static unsafe void ReadVector(IntBlock block, BufferedReader reader, int[] buffer)
        {
            // Build first unadjusted vector and per-vector increment
            Vector128 <int> unadjusted = SetIncrement(block.Base, block.Slope);
            Vector128 <int> increment  = Set1(block.Slope * 4);

            if (block.BitsPerAdjustment == 0)
            {
                // If no adjustments, calculate in blocks and return
                fixed(int *resultPtr = buffer)
                {
                    for (int i = 0; i < block.Count; i += 4)
                    {
                        Unsafe.WriteUnaligned(&resultPtr[i], unadjusted);
                        unadjusted = Sse2.Add(unadjusted, increment);
                    }
                }

                return;
            }

            fixed(byte *bufferPtr = reader.Buffer)
            fixed(int *resultPtr        = buffer)
            fixed(sbyte *shuffleMaskPtr = ShuffleMasks)
            fixed(int *multiplyMaskPtr  = MultiplyMasks)
            {
                byte bitsPerAdjustment = block.BitsPerAdjustment;
                int  index             = reader.Index;
                int  count             = block.Count;

                // Calculate bytes consumed for the first and second four ints decoded (different for odd bit lengths)
                // Eight values of N bits each occupy exactly N bytes.
                byte bytesPerEight = bitsPerAdjustment;
                byte bytes1        = (byte)(bytesPerEight / 2);

                // Calculate how much to shift values (from top of each int to bottom)
                byte shiftRightBits = (byte)(32 - bitsPerAdjustment);

                // Get shuffle mask (to get correct bits) and multiply value (to shift to top of each int) for halves
                // The tables hold two 16-byte entries (first half, second half) per bit width.
                Vector128 <sbyte> shuffle1  = Unsafe.ReadUnaligned <Vector128 <sbyte> >(&shuffleMaskPtr[32 * bitsPerAdjustment]);
                Vector128 <int>   multiply1 = Unsafe.ReadUnaligned <Vector128 <int> >(&multiplyMaskPtr[8 * bitsPerAdjustment]);

                Vector128 <sbyte> shuffle2  = Unsafe.ReadUnaligned <Vector128 <sbyte> >(&shuffleMaskPtr[32 * bitsPerAdjustment + 16]);
                Vector128 <int>   multiply2 = Unsafe.ReadUnaligned <Vector128 <int> >(&multiplyMaskPtr[8 * bitsPerAdjustment + 4]);

                // Eight integers are decoded per iteration: two vectors of four.
                for (int i = 0; i < count; i += 8, index += bytesPerEight)
                {
                    // Read source bytes
                    Vector128 <int> vector1 = Unsafe.ReadUnaligned <Vector128 <int> >(&bufferPtr[index]);
                    Vector128 <int> vector2 = Unsafe.ReadUnaligned <Vector128 <int> >(&bufferPtr[index + bytes1]);

                    // Shuffle to get the right bytes in each integer
                    vector1 = Sse.StaticCast <sbyte, int>(Ssse3.Shuffle(Sse.StaticCast <int, sbyte>(vector1), shuffle1));
                    vector2 = Sse.StaticCast <sbyte, int>(Ssse3.Shuffle(Sse.StaticCast <int, sbyte>(vector2), shuffle2));

                    // Multiply to shift each int so the desired bits are at the top
                    vector1 = Sse41.MultiplyLow(vector1, multiply1);
                    vector2 = Sse41.MultiplyLow(vector2, multiply2);

                    // Shift the desired bits to the bottom and zero the top
                    vector1 = Sse2.ShiftRightLogical(vector1, shiftRightBits);
                    vector2 = Sse2.ShiftRightLogical(vector2, shiftRightBits);

                    // Add the delta base value
                    vector1    = Sse2.Add(vector1, unadjusted);
                    unadjusted = Sse2.Add(unadjusted, increment);

                    vector2    = Sse2.Add(vector2, unadjusted);
                    unadjusted = Sse2.Add(unadjusted, increment);

                    // Write the decoded integers
                    Unsafe.WriteUnaligned(&resultPtr[i], vector1);
                    Unsafe.WriteUnaligned(&resultPtr[i + 4], vector2);
                }

                // Publish how far the packed data was consumed.
                reader.Index = index;
            }
        }
示例#20
0
 /// <summary>
 /// Returns the absolute value of <paramref name="a"/>, computed with the SSSE3
 /// packed-absolute-value instruction on a single lane.
 /// </summary>
 /// <param name="a">The value whose magnitude is wanted.</param>
 /// <returns>The absolute value of <paramref name="a"/>, reinterpreted as <c>int</c>.</returns>
 private static int AbsSsse3(int a)
 {
     // Only lane 0 matters; the other lanes are left undefined by CreateScalarUnsafe.
     Vector128<int> input = Vector128.CreateScalarUnsafe(a);
     Vector128<uint> magnitude = Ssse3.Abs(input);

     return (int)Sse2.ConvertToUInt32(magnitude);
 }
示例#21
0
        private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
        {
            InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

            int width  = input.Width;
            int height = input.Height;

            int yStride  = GetPitch(width, 1);
            int uvStride = GetPitch(input.UvWidth, 2);

            Surface output = new Surface(rm.SurfacePool, width, height);

            if (Sse41.IsSupported)
            {
                Vector128 <byte> shufMask = Vector128.Create(
                    (byte)0, (byte)2, (byte)3, (byte)1,
                    (byte)4, (byte)6, (byte)7, (byte)5,
                    (byte)8, (byte)10, (byte)11, (byte)9,
                    (byte)12, (byte)14, (byte)15, (byte)13);
                Vector128 <short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();

                int yStrideGap  = yStride - width;
                int uvStrideGap = uvStride - input.UvWidth;

                int widthTrunc = width & ~0xf;

                fixed(Pixel *dstPtr = output.Data)
                {
                    Pixel *op = dstPtr;

                    fixed(byte *src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
                    {
                        byte *i0p = src0Ptr;

                        for (int y = 0; y < height; y++)
                        {
                            byte *i1p = src1Ptr + (y >> 1) * uvStride;

                            int x = 0;

                            for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                            {
                                Vector128 <short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                                Vector128 <short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                                Vector128 <byte> uv = Sse2.LoadVector128(i1p);

                                Vector128 <short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                                Vector128 <short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                                Vector128 <short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                                Vector128 <short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                                Vector128 <short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                                Vector128 <short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                                rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                                rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                                rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                                rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                                Vector128 <short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                                Vector128 <short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                                Vector128 <short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                                Vector128 <short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                                Vector128 <short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                                Vector128 <short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                                Vector128 <short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                                Vector128 <short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                                rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                                rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                                rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                                rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                                rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                                rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                                rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                                rgba16_7 = Sse2.Or(rgba16_7, alphaMask);

                                rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                                rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                                rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                                rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                                rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                                rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                                rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                                rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);

                                Sse2.Store((short *)(op + (uint)x + 0), rgba16_0);
                                Sse2.Store((short *)(op + (uint)x + 2), rgba16_1);
                                Sse2.Store((short *)(op + (uint)x + 4), rgba16_2);
                                Sse2.Store((short *)(op + (uint)x + 6), rgba16_3);
                                Sse2.Store((short *)(op + (uint)x + 8), rgba16_4);
                                Sse2.Store((short *)(op + (uint)x + 10), rgba16_5);
                                Sse2.Store((short *)(op + (uint)x + 12), rgba16_6);
                                Sse2.Store((short *)(op + (uint)x + 14), rgba16_7);
                            }

                            for (; x < width; x++, i1p += (x & 1) * 2)
                            {
                                Pixel *px = op + (uint)x;

                                px->R = Upsample(*i0p++);
                                px->G = Upsample(*i1p);
                                px->B = Upsample(*(i1p + 1));
                                px->A = 0x3ff;
                            }

                            op  += width;
                            i0p += yStrideGap;
                            i1p += uvStrideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    int uvBase = (y >> 1) * uvStride;

                    for (int x = 0; x < width; x++)
                    {
                        output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                        int uvOffs = uvBase + (x & ~1);

                        output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                        output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                        output.SetA(x, y, 0x3ff);
                    }
                }
            }

            return(output);
        }
示例#22
0
        /// <summary>
        /// Converts <paramref name="input"/> into a packed 4-bytes-per-pixel image whose bytes are
        /// stored in B, G, R, A order, then hands the result to <c>WriteBuffer</c> at
        /// <c>offsets.LumaOffset</c>.
        /// </summary>
        /// <param name="rm">Resource manager; supplies the temporary buffer pool and is forwarded to <c>WriteBuffer</c>.</param>
        /// <param name="input">Source surface. Its pixels are read either through raw <c>Pixel</c> pointers (SSSE3 path) or through the <c>GetR/G/B/A</c> accessors.</param>
        /// <param name="config">Output configuration; <c>OutBlkKind == 0</c> selects linear output and <c>OutBlkHeight</c> is the log2 of the GOB block height.</param>
        /// <param name="offsets">Plane offsets; only <c>LumaOffset</c> is used here.</param>
        /// <remarks>
        /// The temporary buffer is rented from <c>rm.BufferPool</c> and is always returned, even when the
        /// conversion or <c>WriteBuffer</c> throws, so a failure cannot leave it rented.
        /// </remarks>
        private unsafe static void WriteA8R8G8B8(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int width  = input.Width;
            int height = input.Height;
            int stride = GetPitch(width, 4);

            int dstIndex = rm.BufferPool.Rent(height * stride, out Span<byte> dst);

            try
            {
                if (Ssse3.IsSupported)
                {
                    // Swaps bytes 0 and 2 of every 4-byte group, leaving bytes 1 and 3 in place.
                    // NOTE(review): assumes Pixel stores its channels as R, G, B, A so that the result
                    // matches the B, G, R, A order written by the scalar loops below - confirm.
                    Vector128<byte> shuffleMask = Vector128.Create(
                        (byte)2, (byte)1, (byte)0, (byte)3,
                        (byte)6, (byte)5, (byte)4, (byte)7,
                        (byte)10, (byte)9, (byte)8, (byte)11,
                        (byte)14, (byte)13, (byte)12, (byte)15);

                    // The vector loop consumes 8 pixels per iteration; the rest of the row is done per pixel.
                    int widthTrunc = width & ~7;
                    int strideGap  = stride - width * 4;

                    fixed (Pixel* srcPtr = input.Data)
                    {
                        Pixel* ip = srcPtr;

                        fixed (byte* dstPtr = dst)
                        {
                            byte* op = dstPtr;

                            // Source rows are 'width' pixels apart (the original re-read input.Width here).
                            for (int y = 0; y < height; y++, ip += width)
                            {
                                int x = 0;

                                for (; x < widthTrunc; x += 8)
                                {
                                    // Each 128-bit load covers two pixels (four 16-bit channels each).
                                    Vector128<ushort> pixel12 = Sse2.LoadVector128((ushort*)(ip + (uint)x));
                                    Vector128<ushort> pixel34 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 2));
                                    Vector128<ushort> pixel56 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 4));
                                    Vector128<ushort> pixel78 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 6));

                                    // Drop the two low bits of every channel before narrowing to bytes.
                                    // NOTE(review): presumably the vector equivalent of Downsample - confirm.
                                    pixel12 = Sse2.ShiftRightLogical(pixel12, 2);
                                    pixel34 = Sse2.ShiftRightLogical(pixel34, 2);
                                    pixel56 = Sse2.ShiftRightLogical(pixel56, 2);
                                    pixel78 = Sse2.ShiftRightLogical(pixel78, 2);

                                    Vector128<byte> pixel1234 = Sse2.PackUnsignedSaturate(pixel12.AsInt16(), pixel34.AsInt16());
                                    Vector128<byte> pixel5678 = Sse2.PackUnsignedSaturate(pixel56.AsInt16(), pixel78.AsInt16());

                                    pixel1234 = Ssse3.Shuffle(pixel1234, shuffleMask);
                                    pixel5678 = Ssse3.Shuffle(pixel5678, shuffleMask);

                                    Sse2.Store(op + 0x00, pixel1234);
                                    Sse2.Store(op + 0x10, pixel5678);

                                    op += 0x20;
                                }

                                for (; x < width; x++)
                                {
                                    Pixel* px = ip + (uint)x;

                                    *(op + 0) = Downsample(px->B);
                                    *(op + 1) = Downsample(px->G);
                                    *(op + 2) = Downsample(px->R);
                                    *(op + 3) = Downsample(px->A);

                                    op += 4;
                                }

                                // Skip the padding between the end of this row and the start of the next.
                                op += strideGap;
                            }
                        }
                    }
                }
                else
                {
                    for (int y = 0; y < height; y++)
                    {
                        int baseOffs = y * stride;

                        for (int x = 0; x < width; x++)
                        {
                            int offs = baseOffs + x * 4;

                            dst[offs + 0] = Downsample(input.GetB(x, y));
                            dst[offs + 1] = Downsample(input.GetG(x, y));
                            dst[offs + 2] = Downsample(input.GetR(x, y));
                            dst[offs + 3] = Downsample(input.GetA(x, y));
                        }
                    }
                }

                bool outLinear = config.OutBlkKind == 0;

                int gobBlocksInY = 1 << config.OutBlkHeight;

                WriteBuffer(rm, dst, offsets.LumaOffset, outLinear, width, height, 4, gobBlocksInY);
            }
            finally
            {
                // Hand the buffer back on every exit path, including exceptions.
                rm.BufferPool.Return(dstIndex);
            }
        }
示例#23
0
        /// <summary>
        /// Exercises <c>Ssse3.AlignRight</c> on sbyte vectors with several constant shift counts
        /// (27, 5, 250 and 228) and once more through reflection, comparing every result with the
        /// expected byte pattern.
        /// </summary>
        /// <param name="args">Unused.</param>
        /// <returns><c>Pass</c> when every check succeeds or SSSE3 is unavailable; <c>Fail</c> otherwise.</returns>
        static unsafe int Main(string[] args)
        {
            if (!Ssse3.IsSupported)
            {
                // Nothing can be exercised on this hardware.
                return Pass;
            }

            int testResult = Pass;

            // The first operand supplies the upper 16 bytes of the 32-byte concatenation, the second the lower 16.
            sbyte[] upperInput = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
            sbyte[] lowerInput = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

            // Expected outputs for the shift counts used below.
            sbyte[] shiftedBy27 = { 27, 28, 29, 30, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
            sbyte[] shiftedBy5  = { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
            sbyte[] allZero     = new sbyte[16];

            using (TestTable<sbyte> sbyteTable = new TestTable<sbyte>(upperInput, lowerInput, new sbyte[16]))
            {
                // Stores 'actual' in the output array, compares it element by element with
                // 'expected', and on a mismatch dumps the output and marks the run as failed.
                void Verify(Vector128<sbyte> actual, sbyte[] expected)
                {
                    Unsafe.Write(sbyteTable.outArrayPtr, actual);

                    bool matches = sbyteTable.CheckResult((x, y, z) =>
                    {
                        for (int i = 0; i < expected.Length; i++)
                        {
                            if (z[i] != expected[i])
                            {
                                return false;
                            }
                        }

                        return true;
                    });

                    if (matches)
                    {
                        return;
                    }

                    Console.WriteLine("SSE AlignRight failed on sbyte:");
                    foreach (var item in sbyteTable.outArray)
                    {
                        Console.Write(item + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }

                var vf1 = Unsafe.Read<Vector128<sbyte>>(sbyteTable.inArray1Ptr);
                var vf2 = Unsafe.Read<Vector128<sbyte>>(sbyteTable.inArray2Ptr);

                // The shift counts stay literal at the call sites so the intrinsic still sees constants.
                Verify(Ssse3.AlignRight(vf1, vf2, 27), shiftedBy27);
                Verify(Ssse3.AlignRight(vf1, vf2, 5), shiftedBy5);

                // These two counts exceed the 32-byte concatenation; the expected result is all zeros.
                Verify(Ssse3.AlignRight(vf1, vf2, 250), allZero);
                Verify(Ssse3.AlignRight(vf1, vf2, 228), allZero);

                // Same operation as the first check, but invoked through reflection.
                Type[] parameterTypes = { vf1.GetType(), vf2.GetType(), typeof(byte) };
                object[] arguments    = { vf1, vf2, (byte)(27) };
                var reflected = (Vector128<sbyte>)typeof(Ssse3).GetMethod(nameof(Ssse3.AlignRight), parameterTypes).Invoke(null, arguments);
                Verify(reflected, shiftedBy27);
            }

            return testResult;
        }
示例#24
0
        /// <summary>
        /// Writes <paramref name="value"/> as a MessagePack array in which every element is encoded as
        /// the <c>MessagePackCode.Float32</c> type byte followed by the element's four bytes in
        /// big-endian order. A <c>null</c> array is written as nil.
        /// </summary>
        /// <param name="writer">Destination writer; advanced by 5 bytes per element after the array header.</param>
        /// <param name="value">Array to serialize, or <c>null</c>.</param>
        /// <param name="options">Not used by this method.</param>
        public unsafe void Serialize(ref MessagePackWriter writer, float[]?value, MessagePackSerializerOptions options)
        {
            if (value == null)
            {
                writer.WriteNil();
                return;
            }

            var inputLength = value.Length;

            writer.WriteArrayHeader(inputLength);
            if (inputLength == 0)
            {
                return;
            }

            // output byte[] length can be calculated from input float[] length:
            // one type-code byte plus four payload bytes per element.
            var outputLength = inputLength * 5;
            var destination  = writer.GetSpan(outputLength);

            fixed (byte* pDestination = &destination[0])
            {
                var outputIterator = pDestination;

                fixed (float* pSource = &value[0])
                {
                    var inputEnd      = pSource + inputLength;
                    var inputIterator = (uint*)pSource;

                    // The vector path uses only SSE2 load/store/or and the SSSE3 byte shuffle, so SSSE3
                    // support is the actual requirement. Arrays shorter than 6 elements go straight to
                    // the scalar loop.
                    if (Ssse3.IsSupported && inputLength >= 6)
                    {
                        // Process 3 floats at once.
                        // From 12 bytes to 15 bytes.
                        var vectorConstant = Vector128.Create(MessagePackCode.Float32, 0, 0, 0, 0, MessagePackCode.Float32, 0, 0, 0, 0, MessagePackCode.Float32, 0, 0, 0, 0, 0);

                        // 0x80 selects zero; the other indices reverse the bytes of each 32-bit element.
                        var vectorShuffle = Vector128.Create(0x80, 3, 2, 1, 0, 0x80, 7, 6, 5, 4, 0x80, 11, 10, 9, 8, 0x80);

                        // One full group of three is always left for the scalar loop. That keeps the
                        // 16-byte load (four elements) inside the array, and the 16th byte of each store
                        // (always zero) is overwritten by the output that follows it.
                        var vectorLoopLength = ((inputLength / 3) - 1) * 3;
                        var vectorizedEnd    = inputIterator + vectorLoopLength;

                        while (inputIterator != vectorizedEnd)
                        {
                            // new float[] { 1.0, -2.0, 3.5, } is byte[12] { 00, 00, 80, 3f, 00, 00, 00, c0, 00, 00, 60, 40 } in binary expression;
                            var current = Sse2.LoadVector128((byte*)inputIterator);

                            // Output binary should be byte[15] { ca, 3f, 80, 00, 00, ca, c0, 00, 00, 00, ca, 40, 60, 00, 00 };
                            Sse2.Store(outputIterator, Sse2.Or(Ssse3.Shuffle(current, vectorShuffle), vectorConstant));

                            inputIterator  += 3;
                            outputIterator += 15;
                        }
                    }

                    // Scalar path: handles everything the vector loop did not consume.
                    while (inputIterator != inputEnd)
                    {
                        // Encode float as Big Endian
                        *outputIterator++ = MessagePackCode.Float32;
                        var current       = *inputIterator++;
                        *outputIterator++ = (byte)(current >> 24);
                        *outputIterator++ = (byte)(current >> 16);
                        *outputIterator++ = (byte)(current >> 8);
                        *outputIterator++ = (byte)current;
                    }
                }
            }

            writer.Advance(outputLength);
        }
示例#25
0
        // PolyvalPowersTable updates the POLYVAL value in polyval to include length bytes
        // of data from input, using the precomputed table in htbl. If the length is not
        // divisible by 16, the final partial block is padded with zeros to 16 bytes.
        //
        //   polyval: 16-byte accumulator; read on entry and overwritten with the result.
        //   htbl:    table of 16-byte entries; entries 0 through 7 are read. Presumably
        //            these are consecutive powers of the POLYVAL key - confirm with the
        //            code that builds the table.
        //   input:   data to absorb.
        //   length:  number of input bytes to process; 0 returns without touching polyval.
        //
        // Processing order: first the (length % 128) / 16 leading whole blocks, then all
        // remaining whole blocks in groups of eight, then the trailing partial block.
        private static void PolyvalPowersTable(byte *polyval, byte *htbl, byte *input, int length)
        {
            if (length == 0)
            {
                return;
            }

            int blocks = Math.DivRem(length, 16, out int remainder16);

            // Bytes in the whole blocks that do not fill a complete group of eight blocks.
            int remainder128 = length % 128 - remainder16;
            Vector128<ulong> tmp0, tmp1, tmp2, tmp3, tmp4;

            var xhi  = Sse2.SetZeroVector128<ulong>();
            var poly = Sse.StaticCast<uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(polyval));

            if (remainder128 != 0)
            {
                int remainder128Blocks = remainder128 / 16;
                blocks -= remainder128Blocks;

                // The running value is folded into the first block; table entries are used in
                // descending order, starting at entry remainder128Blocks - 1.
                var data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(input)));
                var h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - 1) * 16]));

                tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                tmp2 = Sse2.Xor(tmp2, tmp3);

                for (int i = 1; i < remainder128Blocks; ++i)
                {
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&input[i * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - i - 1) * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                }

                // Split the accumulated middle terms across the low (t) and high (xhi) halves
                // of the unreduced product.
                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                xhi  = Sse2.Xor(tmp3, tmp1);
                t    = Sse2.Xor(tmp0, tmp2);
            }

            if (blocks != 0)
            {
                var fixedInput = input + remainder128;

                if (remainder128 == 0)
                {
                    // First group of eight when nothing has been accumulated yet: the incoming
                    // polyval value is XORed into block 0 and no reduction is interleaved.
                    var data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[7 * 16]));
                    var h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

                    tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[6 * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[5 * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[4 * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // The original also computed CarrylessMultiply(t, poly, 0x10) into tmp4 here,
                    // but that value was never read, so the multiply has been dropped.
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[3 * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[2 * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[1 * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[0 * 16])));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                    tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                    xhi  = Sse2.Xor(tmp3, tmp1);
                    t    = Sse2.Xor(tmp0, tmp2);
                }

                // Remaining groups of eight blocks. The reduction of the previous group's result
                // (t, xhi) is interleaved with this group's multiplications, and the reduced value
                // is XORed into the group's first block (index i) just before it is multiplied.
                for (int i = remainder128 == 0 ? 8 : 0; i < blocks; i += 8)
                {
                    var data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 7) * 16]));
                    var h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

                    tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 6) * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // First reduction step for the previous result: multiply by poly and swap
                    // the 64-bit halves of t; the XOR that completes the step follows below.
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 5) * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    t    = Sse2.Xor(t, tmp4);
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 4) * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Second reduction step, same shape as the first.
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 3) * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    t    = Sse2.Xor(t, tmp4);
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 2) * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Fold in the high half of the previous product to finish its reduction.
                    t    = Sse2.Xor(t, xhi);
                    data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 1) * 16]));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[i * 16])));
                    h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                    tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                    xhi  = Sse2.Xor(tmp3, tmp1);
                    t    = Sse2.Xor(tmp0, tmp2);
                }
            }

            if (blocks != 0 || remainder128 != 0)
            {
                // Final reduction of the last unreduced product (t, xhi): two multiply-by-poly /
                // half-swap / XOR steps, then fold in the high half.
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                t    = Sse2.Xor(xhi, t);
            }

            if (remainder16 != 0)
            {
                // Build the zero-padded final block. The buffer is cleared explicitly because the
                // contents of stackalloc memory are not guaranteed to be zero.
                byte *b = stackalloc byte[16];
                var padded = new Span<byte>(b, 16);
                padded.Clear();
                new Span<byte>(input + length - remainder16, remainder16).CopyTo(padded);

                var data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(b)));
                var h    = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(htbl));

                tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                xhi  = Sse2.Xor(tmp3, tmp1);
                t    = Sse2.Xor(tmp0, tmp2);

                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                t    = Sse2.Xor(xhi, t);
            }

            Sse2.Store(polyval, Sse.StaticCast<ulong, byte>(t));
        }
示例#26
0
        /// <summary>
        /// Writes <paramref name="value"/> as a MessagePack array. Every element is emitted as the
        /// <c>MessagePackCode.Float64</c> marker byte followed by the 8 bytes of the double, most
        /// significant byte first. A <c>null</c> array is written as nil.
        /// </summary>
        /// <param name="writer">Writer that receives the encoded bytes.</param>
        /// <param name="value">Array to encode, or <c>null</c>.</param>
        /// <param name="options">Serializer options; not read by this implementation.</param>
        /// <exception cref="OverflowException">
        /// The encoded payload (9 bytes per element) does not fit in an <see cref="int"/>.
        /// Thrown before anything is written for the array.
        /// </exception>
        public unsafe void Serialize(ref MessagePackWriter writer, double[]? value, MessagePackSerializerOptions options)
        {
            if (value == null)
            {
                writer.WriteNil();
                return;
            }

            var inputLength = value.Length;

            // One marker byte plus eight payload bytes per element. The multiplication is checked:
            // a silently wrapped size would request a too-small span while the raw-pointer loops
            // below still write 9 bytes per element.
            var outputLength = checked(inputLength * 9);

            writer.WriteArrayHeader(inputLength);
            if (inputLength == 0)
            {
                return;
            }

            var destination = writer.GetSpan(outputLength);

            fixed (byte* pDestination = &destination[0])
            {
                var outputIterator = pDestination;

                fixed (double* pSource = &value[0])
                {
                    var inputEnd      = pSource + inputLength;
                    var inputIterator = (ulong*)pSource;

                    if (Avx2.IsSupported)
                    {
                        const int ShiftCount = 2;
                        const int Stride     = 1 << ShiftCount;

                        // Arrays shorter than two strides are handled entirely by the scalar loop.
                        if (inputLength >= Stride << 1)
                        {
                            // Per 128-bit lane: reverse the bytes inside each 8-byte element.
                            var vectorShuffle = Vector256.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                            for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                            {
                                // Fetch 4 doubles.
                                var current = Avx.LoadVector256((byte*)inputIterator);

                                // Reorder little-endian bytes to big-endian.
                                var answer = Avx2.Shuffle(current, vectorShuffle).AsUInt64();

                                // Write 4 big-endian doubles, each preceded by its marker byte.
                                *outputIterator++ = MessagePackCode.Float64;
                                *(ulong*)outputIterator = answer.GetElement(0);
                                outputIterator += 8;
                                *outputIterator++ = MessagePackCode.Float64;
                                *(ulong*)outputIterator = answer.GetElement(1);
                                outputIterator += 8;
                                *outputIterator++ = MessagePackCode.Float64;
                                *(ulong*)outputIterator = answer.GetElement(2);
                                outputIterator += 8;
                                *outputIterator++ = MessagePackCode.Float64;
                                *(ulong*)outputIterator = answer.GetElement(3);
                                outputIterator += 8;
                            }
                        }
                    }
                    else if (Ssse3.IsSupported)
                    {
                        const int ShiftCount = 1;
                        const int Stride     = 1 << ShiftCount;

                        // Arrays shorter than two strides are handled entirely by the scalar loop.
                        if (inputLength >= Stride << 1)
                        {
                            // Reverse the bytes inside each of the two 8-byte elements.
                            var vectorShuffle = Vector128.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                            for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                            {
                                // Fetch 2 doubles and byte-swap them.
                                var current = Sse2.LoadVector128((byte*)inputIterator);
                                var answer  = Ssse3.Shuffle(current, vectorShuffle).AsUInt64();

                                // Write 2 big-endian doubles, each preceded by its marker byte.
                                *outputIterator++ = MessagePackCode.Float64;
                                *(ulong*)outputIterator = answer.GetElement(0);
                                outputIterator += 8;
                                *outputIterator++ = MessagePackCode.Float64;
                                *(ulong*)outputIterator = answer.GetElement(1);
                                outputIterator += 8;
                            }
                        }
                    }

                    // Scalar path: the whole array when no vector loop ran, otherwise the leftover tail.
                    while (inputIterator != inputEnd)
                    {
                        *outputIterator++ = MessagePackCode.Float64;
                        var current = *inputIterator++;
                        *outputIterator++ = (byte)(current >> 56);
                        *outputIterator++ = (byte)(current >> 48);
                        *outputIterator++ = (byte)(current >> 40);
                        *outputIterator++ = (byte)(current >> 32);
                        *outputIterator++ = (byte)(current >> 24);
                        *outputIterator++ = (byte)(current >> 16);
                        *outputIterator++ = (byte)(current >> 8);
                        *outputIterator++ = (byte)current;
                    }
                }
            }

            writer.Advance(outputLength);
        }
示例#27
0
文件: Program.cs 项目: z77ma/runtime
    /// <summary>
    /// Compares the IsSupported state of each x86 hardware-intrinsic class against the expectation
    /// selected by the active compilation define, delegating every comparison to <c>Check</c>.
    /// </summary>
    /// <returns>100 when <c>s_success</c> is still true after all checks, otherwise 1.</returns>
    static int Main()
    {
        s_success = true;

        // Each expectation below is tri-state and describes what the AOT compiler should have done:
        //
        // * true  = IsSupported assumed to be true, no runtime check
        // * null  = IsSupported is a runtime check, code should be behind the check or bad things happen
        // * false = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
        //
        // The test is compiled once per define, so exactly one of the branches below is active.

#if BASELINE_INTRINSICS
        bool vectorsAccelerated = true;
        int byteVectorLength = 16;
        bool? expectSse2AndBelow = true;
        bool? expectSse3Group = null;
        bool? expectAesLzPcl = null;
        bool? expectSse4142 = null;
        bool? expectPopCnt = null;
        bool? expectAvx12 = false;
        bool? expectFmaBmi12 = false;
        bool? expectAvxVnni = false;
#elif NON_VEX_INTRINSICS
        bool vectorsAccelerated = true;
        int byteVectorLength = 16;
        bool? expectSse2AndBelow = true;
        bool? expectSse3Group = true;
        bool? expectAesLzPcl = null;
        bool? expectSse4142 = true;
        bool? expectPopCnt = null;
        bool? expectAvx12 = false;
        bool? expectFmaBmi12 = false;
        bool? expectAvxVnni = false;
#elif VEX_INTRINSICS
        bool vectorsAccelerated = true;
        int byteVectorLength = 32;
        bool? expectSse2AndBelow = true;
        bool? expectSse3Group = true;
        bool? expectAesLzPcl = null;
        bool? expectSse4142 = true;
        bool? expectPopCnt = null;
        bool? expectAvx12 = true;
        bool? expectFmaBmi12 = null;
        bool? expectAvxVnni = null;
#else
#error Who dis?
#endif

        // Sanity-check the portable Vector<T> configuration before looking at individual ISAs.
        if (Vector.IsHardwareAccelerated != vectorsAccelerated)
        {
            throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");
        }

        if (Vector<byte>.Count != byteVectorLength)
        {
            throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");
        }

        // SSE / SSE2
        Check("Sse", expectSse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128<float>.Zero, Vector128<float>.Zero).Equals(Vector128<float>.Zero));
        Check("Sse.X64", expectSse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128<float>.Zero) == 0);

        Check("Sse2", expectSse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128<ushort>.Zero, 0) == 0);
        Check("Sse2.X64", expectSse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128<double>.Zero) == 0);

        // SSE3 / SSSE3
        Check("Sse3", expectSse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128<float>.Zero).Equals(Vector128<float>.Zero));
        Check("Sse3.X64", expectSse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);

        Check("Ssse3", expectSse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128<short>.Zero).Equals(Vector128<ushort>.Zero));
        Check("Ssse3.X64", expectSse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);

        // SSE4.1 / SSE4.2
        Check("Sse41", expectSse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128<int>.Zero, Vector128<int>.Zero).Equals(Vector128<int>.Zero));
        Check("Sse41.X64", expectSse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128<long>.Zero, 0) == 0);

        Check("Sse42", expectSse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
        Check("Sse42.X64", expectSse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);

        // AES
        Check("Aes", expectAesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128<byte>.Zero, 0).Equals(Vector128.Create((byte)99)));
        Check("Aes.X64", expectAesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);

        // AVX / AVX2
        Check("Avx", expectAvx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256<double>.Zero, Vector256<double>.Zero).Equals(Vector256<double>.Zero));
        Check("Avx.X64", expectAvx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);

        Check("Avx2", expectAvx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256<int>.Zero).Equals(Vector256<uint>.Zero));
        Check("Avx2.X64", expectAvx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);

        // BMI1 / BMI2 / FMA
        Check("Bmi1", expectFmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
        Check("Bmi1.X64", expectFmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);

        Check("Bmi2", expectFmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
        Check("Bmi2.X64", expectFmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);

        Check("Fma", expectFmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128<float>.Zero, Vector128<float>.Zero, Vector128<float>.Zero).Equals(Vector128<float>.Zero));
        Check("Fma.X64", expectFmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);

        // LZCNT / PCLMULQDQ
        Check("Lzcnt", expectAesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
        Check("Lzcnt.X64", expectAesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);

        Check("Pclmulqdq", expectAesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128<long>.Zero, Vector128<long>.Zero, 0).Equals(Vector128<long>.Zero));
        Check("Pclmulqdq.X64", expectAesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);

        // POPCNT
        Check("Popcnt", expectPopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
        Check("Popcnt.X64", expectPopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);

        // AVX-VNNI
        Check("AvxVnni", expectAvxVnni, &AvxVnniIsSupported, AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128<int>.Zero, Vector128<byte>.Zero, Vector128<sbyte>.Zero).Equals(Vector128<int>.Zero));
        Check("AvxVnni.X64", expectAvxVnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null);

        return s_success ? 100 : 1;
    }
示例#28
0
        /// <summary>
        /// Finishes a hash computation from the accumulated <paramref name="state"/>: mixes in the
        /// buffered residual bytes and the total length, mixes any remaining full 32-byte blocks of
        /// the buffer, shuffles the eight lanes and folds them into a single 128-bit value.
        /// </summary>
        /// <param name="state">
        /// Source of the eight lanes (<c>xmm0</c>..<c>xmm7</c>), the pending <c>Buffer</c> and
        /// <c>TotalLengthInBytes</c>. This method only reads these members directly.
        /// </param>
        /// <param name="store128">
        /// Optional destination for the eight 16-byte lanes as they are just before the final fold.
        /// Pass an empty span to skip the dump; otherwise it must hold at least 128 bytes.
        /// </param>
        /// <returns>The folded 128-bit result.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="store128"/> is non-empty but shorter than 128 bytes.
        /// </exception>
        public static unsafe Vector128<byte> End(ref State state, Span<byte> store128)
        {
            // The lane dump near the end writes 8 x 16 bytes through a raw pointer, so the span
            // length has to be validated here; nothing downstream would catch an overrun.
            const int LaneDumpSize = 0x80;
            if (!store128.IsEmpty && store128.Length < LaneDumpSize)
            {
                throw new ArgumentException("The lane dump buffer must be empty or at least 128 bytes long.", nameof(store128));
            }

            long Len = state.TotalLengthInBytes;

            // Local names mirror the xmm0..xmm15 naming used by the state fields.
            Vector128<byte> xmm0 = state.xmm0;
            Vector128<byte> xmm1 = state.xmm1;
            Vector128<byte> xmm2 = state.xmm2;
            Vector128<byte> xmm3 = state.xmm3;
            Vector128<byte> xmm4 = state.xmm4;
            Vector128<byte> xmm5 = state.xmm5;
            Vector128<byte> xmm6 = state.xmm6;
            Vector128<byte> xmm7 = state.xmm7;

            Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

            fixed (byte* rax = state.Buffer)
            {
                xmm9  = Vector128<byte>.Zero;
                xmm11 = Vector128<byte>.Zero;

                // Last points at the final (possibly partial) 16-byte chunk of the buffered data;
                // Len8 is the number of valid bytes in that chunk.
                byte* Last = (byte*)rax + (Len & 0xf0);
                long  Len8 = (Len & 0xf);

                if (Len8 > 0)
                {
                    // Load the mask that keeps only the first Len8 bytes of the partial chunk.
                    fixed (byte* MeowMaskLen = s_meowMaskLen)
                    {
                        xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
                    }

                    xmm9 = Sse2.LoadVector128(Last);
                    xmm9 = Sse2.And(xmm9, xmm8);
                }

                // When a full 16-byte chunk precedes the partial one, it becomes xmm9 and the
                // partial chunk moves to xmm11.
                if ((Len & 0x10) != 0)
                {
                    xmm11 = xmm9;
                    xmm9  = Sse2.LoadVector128(Last - 0x10);
                }

                // Build the byte-shifted views of the residual used by the register mix.
                xmm8  = xmm9;
                xmm10 = xmm9;
                xmm8  = Ssse3.AlignRight(xmm8, xmm11, 15);
                xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

                // Build the length block: only the low 64 bits of xmm15 carry data (Len).
                xmm12 = Vector128<byte>.Zero;
                xmm13 = Vector128<byte>.Zero;
                xmm14 = Vector128<byte>.Zero;
                xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
                xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
                xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
                MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

                // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
                MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

                // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
                MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Hash all full 32-byte blocks
                //
                // LaneCount is at most 7; the chain below mixes exactly that many 32-byte blocks
                // from the start of the buffer, rotating the lane roles on every step.
                long LaneCount = (Len >> 5) & 0x7;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

                //
                // NOTE(casey): Mix the eight lanes down to one 128-bit hash
                //

MixDown:

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                // Twelve shuffle rounds, each advancing the lane assignment by one.
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
                MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
                MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
                MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
                MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                // Optional dump of the eight lanes before they are folded. The length was validated
                // on entry, so a non-empty span is known to hold at least 128 bytes.
                if (!store128.IsEmpty)
                {
                    fixed (byte* store128Ptr = store128)
                    {
                        Sse2.Store(store128Ptr + 0x00, xmm0);
                        Sse2.Store(store128Ptr + 0x10, xmm1);
                        Sse2.Store(store128Ptr + 0x20, xmm2);
                        Sse2.Store(store128Ptr + 0x30, xmm3);
                        Sse2.Store(store128Ptr + 0x40, xmm4);
                        Sse2.Store(store128Ptr + 0x50, xmm5);
                        Sse2.Store(store128Ptr + 0x60, xmm6);
                        Sse2.Store(store128Ptr + 0x70, xmm7);
                    }
                }

                // Fold eight lanes into one.
                // NOTE(review): AddQ is defined elsewhere; presumably a 64-bit lane-wise add - confirm.
                xmm0 = AddQ(xmm0, xmm2);
                xmm1 = AddQ(xmm1, xmm3);
                xmm4 = AddQ(xmm4, xmm6);
                xmm5 = AddQ(xmm5, xmm7);
                xmm0 = Sse2.Xor(xmm0, xmm1);
                xmm4 = Sse2.Xor(xmm4, xmm5);
                xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                return xmm0;
            }
        }
示例#29
0
        /// <summary>
        /// Continues an Adler-style running checksum over <paramref name="buffer"/> using
        /// SSE2/SSE3/SSSE3 intrinsics for whole 32-byte blocks and scalar code for the remainder.
        /// </summary>
        /// <param name="adler">
        /// Checksum so far: the low 16 bits hold the byte sum (s1), the high 16 bits hold the
        /// sum of sums (s2).
        /// </param>
        /// <param name="buffer">Bytes to fold into the checksum; may be empty.</param>
        /// <returns>The updated checksum, packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
        {
            uint s1 = adler & 0xFFFF;
            uint s2 = (adler >> 16) & 0xFFFF;

            // Process the data in blocks.
            const int BLOCK_SIZE = 1 << 5;

            uint length = (uint)buffer.Length;
            uint blocks = length / BLOCK_SIZE;

            // From here on, length is the number of tail bytes left after the full blocks.
            length -= blocks * BLOCK_SIZE;

            fixed (byte* bufferPtr = buffer)
            fixed (byte* tapPtr = Tap1Tap2)
            {
                var localBufferPtr = bufferPtr;

                // _mm_setr_epi8 on x86
                // NOTE(review): Tap1Tap2 is defined elsewhere; presumably the descending weights
                // 32..1 split across two 16-byte halves - confirm.
                Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
                Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
                Vector128<byte> zero = Vector128<byte>.Zero;
                var ones = Vector128.Create((short)1);

                while (blocks > 0)
                {
                    uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
                    Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
                    Vector128<uint> v_s1 = Vector128<uint>.Zero;

                    do
                    {
                        // Load 32 input bytes.
                        Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                        Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);

                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                        Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                        Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                        localBufferPtr += BLOCK_SIZE;
                    }
                    while (--n > 0);

                    // Each previous-sum term counts once per 32-byte block, hence the shift by 5 (x32).
                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                    const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
                    const byte S1032 = 0b0100_1110;  // A B C D -> C D A B

                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                    s1 += v_s1.ToScalar();

                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                    // v_s2 was seeded with the previous s2, so this is an assignment, not an add.
                    s2 = v_s2.ToScalar();

                    // Reduce.
                    s1 %= BASE;
                    s2 %= BASE;
                }

                // Scalar tail: fewer than BLOCK_SIZE bytes remain.
                if (length > 0)
                {
                    if (length >= 16)
                    {
                        // Unrolled handling of one 16-byte chunk.
                        s2 += s1 += localBufferPtr[0];
                        s2 += s1 += localBufferPtr[1];
                        s2 += s1 += localBufferPtr[2];
                        s2 += s1 += localBufferPtr[3];
                        s2 += s1 += localBufferPtr[4];
                        s2 += s1 += localBufferPtr[5];
                        s2 += s1 += localBufferPtr[6];
                        s2 += s1 += localBufferPtr[7];
                        s2 += s1 += localBufferPtr[8];
                        s2 += s1 += localBufferPtr[9];
                        s2 += s1 += localBufferPtr[10];
                        s2 += s1 += localBufferPtr[11];
                        s2 += s1 += localBufferPtr[12];
                        s2 += s1 += localBufferPtr[13];
                        s2 += s1 += localBufferPtr[14];
                        s2 += s1 += localBufferPtr[15];

                        localBufferPtr += 16;
                        length -= 16;
                    }

                    while (length-- > 0)
                    {
                        s2 += s1 += *localBufferPtr++;
                    }

                    // s1 receives a single conditional subtraction here; s2 gets a full modulo.
                    if (s1 >= BASE)
                    {
                        s1 -= BASE;
                    }

                    s2 %= BASE;
                }

                return s1 | (s2 << 16);
            }
        }
示例#30
0
        //
        // NOTE(casey): Single block version
        //
        public static unsafe Vector128 <byte> Hash(ReadOnlySpan <byte> Seed128Init, ReadOnlySpan <byte> SourceInit)
        {
            Vector128 <byte> xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;       // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
            Vector128 <byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

            int Len = SourceInit.Length;

            fixed(byte *sourceInitPtr = SourceInit)
            fixed(byte *seedInitPtr = Seed128Init)
            {
                byte *rax = sourceInitPtr;
                byte *rcx = seedInitPtr;

                //
                // NOTE(casey): Seed the eight hash registers
                //

                xmm0 = Sse2.LoadVector128(rcx + 0x00);
                xmm1 = Sse2.LoadVector128(rcx + 0x10);
                xmm2 = Sse2.LoadVector128(rcx + 0x20);
                xmm3 = Sse2.LoadVector128(rcx + 0x30);

                xmm4 = Sse2.LoadVector128(rcx + 0x40);
                xmm5 = Sse2.LoadVector128(rcx + 0x50);
                xmm6 = Sse2.LoadVector128(rcx + 0x60);
                xmm7 = Sse2.LoadVector128(rcx + 0x70);

                // MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);

                //
                // NOTE(casey): Hash all full 256-byte blocks
                //

                int BlockCount = (SourceInit.Length >> 8);

                if (BlockCount > MEOW_PREFETCH_LIMIT)
                {
                    // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
                    while (BlockCount-- > 0)
                    {
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x00);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x40);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x80);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0xc0);

                        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                        MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                        rax += 0x100;
                    }
                }
                else
                {
                    // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
                    while (BlockCount-- > 0)
                    {
                        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                        MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                        rax += 0x100;
                    }
                }

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Load any less-than-32-byte residual
                //

                xmm9  = Vector128 <byte> .Zero;
                xmm11 = Vector128 <byte> .Zero;

                //
                // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
                // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
                // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
                // to the & 0xf on the align computation.
                //

                // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
                byte *Last = (byte *)sourceInitPtr + (Len & ~0xf);
                int   Len8 = (Len & 0xf);
                if (Len8 > 0)
                {
                    // NOTE(casey): Load the mask early
                    fixed(byte *MeowMaskLen = s_meowMaskLen)
                    {
                        xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
                    }

                    byte *LastOk = (byte *)((((ulong)(((byte *)sourceInitPtr) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
                    int   Align  = (Last > LastOk) ? ((int)(ulong)Last) & 0xf : 0;

                    fixed(byte *MeowShiftAdjust = s_meowShiftAdjust)
                    {
                        xmm10 = Sse2.LoadVector128(&MeowShiftAdjust[Align]);
                    }

                    xmm9 = Sse2.LoadVector128(Last - Align);
                    xmm9 = Ssse3.Shuffle(xmm9, xmm10);

                    // NOTE(jeffr): and off the extra bytes
                    xmm9 = Sse2.And(xmm9, xmm8);
                }

                // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
                if ((Len & 0x10) != 0)
                {
                    xmm11 = xmm9;
                    xmm9  = Sse2.LoadVector128(Last - 0x10);
                }

                //
                // NOTE(casey): Construct the residual and length ingests
                //

                xmm8  = xmm9;
                xmm10 = xmm9;
                xmm8  = Ssse3.AlignRight(xmm8, xmm11, 15);
                xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

                // NOTE(casey): We have room for a 128-bit nonce and a 64-bit nonce here, but
                // the decision was made to leave them zeroed so as not to confuse people
                // about how to use them or what security implications they had.
                xmm12 = Vector128 <byte> .Zero;
                xmm13 = Vector128 <byte> .Zero;
                xmm14 = Vector128 <byte> .Zero;
                xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
                xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
                xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

                // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
                MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

                // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
                MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Hash all full 32-byte blocks
                //
                int LaneCount = (Len >> 5) & 0x7;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

                //
                // NOTE(casey): Mix the eight lanes down to one 128-bit hash
                //

MixDown:

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
                MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
                MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
                MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
                MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                xmm0 = AddQ(xmm0, xmm2);
                xmm1 = AddQ(xmm1, xmm3);
                xmm4 = AddQ(xmm4, xmm6);
                xmm5 = AddQ(xmm5, xmm7);
                xmm0 = Sse2.Xor(xmm0, xmm1);
                xmm4 = Sse2.Xor(xmm4, xmm5);
                xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                return(xmm0);
            }
        }