public override ulong Run(CancellationToken cancellationToken)
        {
            if (!Pclmulqdq.IsSupported)
            {
                return(0uL);
            }

            var randomIntSpan = new Span <long>(new[] { randomInt, randomInt });
            var dst           = new Span <long>(Enumerable.Repeat(1L, 2).ToArray());
            var iterations    = 0uL;

            unsafe
            {
                fixed(long *pdst = dst)
                fixed(long *psrc = randomIntSpan)
                {
                    var srcVector = Sse2.LoadVector128(psrc);
                    var dstVector = Sse2.LoadVector128(pdst);

                    while (!cancellationToken.IsCancellationRequested)
                    {
                        for (var j = 0; j < LENGTH; j++)
                        {
                            dstVector = Pclmulqdq.CarrylessMultiply(dstVector, srcVector, 0b00);
                        }

                        Sse2.Store(pdst, dstVector);

                        iterations++;
                    }
                }
            }

            return(iterations);
        }
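The third argument in these calls is the lane selector: bit 0 of the immediate picks the 64-bit half of the left operand and bit 4 picks the half of the right operand, so 0x00, 0x01, 0x10 and 0x11 cover the four possible 64x64 -> 128-bit carry-less products. A minimal standalone sketch (a hypothetical demo, not one of the collected examples) of the selector:

    // Hypothetical demo of the CarrylessMultiply selector byte (not part of the examples):
    // bit 0 chooses the 64-bit lane of the left operand, bit 4 the lane of the right operand.
    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class ClmulSelectorDemo
    {
        static void Main()
        {
            if (!Pclmulqdq.IsSupported)
            {
                return;
            }

            var left  = Vector128.Create(3UL, 5UL);   // lanes: low = 3, high = 5
            var right = Vector128.Create(7UL, 9UL);   // lanes: low = 7, high = 9

            // 0x00: low(left) clmul low(right)  -> 3 clmul 7 = 9  (0b011 * 0b111 without carries)
            Console.WriteLine(Pclmulqdq.CarrylessMultiply(left, right, 0x00));
            // 0x11: high(left) clmul high(right) -> 5 clmul 9 = 45 (no carries occur here either)
            Console.WriteLine(Pclmulqdq.CarrylessMultiply(left, right, 0x11));
        }
    }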
Example #2
            public void RunStructFldScenario(PclmulqdqOpTest__CarrylessMultiplyInt641 testClass)
            {
                var result = Pclmulqdq.CarrylessMultiply(_fld1, _fld2, 1);

                Unsafe.Write(testClass._dataTable.outArrayPtr, result);
                testClass.ValidateResult(testClass._dataTable.outArrayPtr);
            }
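The ValidateResult calls in these test scenarios compare the hardware result against a software model. One way to model a 64x64 -> 128-bit carry-less product in plain C# (a hedged sketch, not the test framework's actual validator) is a shift-and-XOR loop over the set bits of one operand:

            // Hedged sketch of a scalar reference for PCLMULQDQ (not the test's actual
            // ValidateResult): XOR together b shifted by every set bit position of a.
            static (ulong Lo, ulong Hi) CarrylessMultiplyScalar(ulong a, ulong b)
            {
                ulong lo = 0, hi = 0;

                for (int i = 0; i < 64; i++)
                {
                    if (((a >> i) & 1) != 0)
                    {
                        lo ^= b << i;
                        hi ^= i == 0 ? 0UL : b >> (64 - i);   // bits that spill into the upper 64
                    }
                }

                return (lo, hi);
            }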
Example #3
        // InitPowersTable writes powers 1..size of hashKey to htbl.
        private static void InitPowersTable(byte *htbl, int size, byte *hashKey)
        {
            Vector128 <ulong> tmp1, tmp2, tmp3, tmp4;

            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(hashKey));
            var h    = t;

            Sse2.Store(htbl, Sse.StaticCast <ulong, byte>(t));

            for (int i = 1; i < size; ++i)
            {
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp3, tmp1);
                tmp4 = Sse2.Xor(tmp4, tmp2);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp3, tmp2);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp3, tmp2);
                t    = Sse2.Xor(tmp4, tmp1);
                Sse2.Store(&htbl[i * 16], Sse.StaticCast <ulong, byte>(t));
            }
        }
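A hypothetical caller sketch (the surrounding class is not shown in this example; the method and parameter names below are illustrative): the table holds size 16-byte entries, so the destination needs size * 16 bytes, and entry i ends up holding the (i+1)-th power of the key.

        // Hypothetical caller; assumes it lives next to InitPowersTable in the same unsafe class.
        private static unsafe void BuildPowersTable(ReadOnlySpan <byte> key)   // key.Length == 16
        {
            const int size  = 8;
            byte *    table = stackalloc byte[size * 16];

            fixed(byte *hashKey = key)
            {
                InitPowersTable(table, size, hashKey);
            }

            // table[0..16) now holds key^1, table[16..32) holds key^2, ..., table[112..128) holds key^8.
        }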
Example #4
        public void RunClassFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

            var result = Pclmulqdq.CarrylessMultiply(_fld1, _fld2, 1);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #5
    private unsafe void GFMul(ReadOnlySpan <byte> x)
    {
        var a = _key.AsUInt64();
        Vector128 <ulong> b;

        fixed(byte *p = x)
        {
            var t = Sse2.LoadVector128(p);

            b = t.ReverseEndianness128().AsUInt64();
        }

        b = Sse2.Xor(b.AsByte(), _buffer).AsUInt64();

        var tmp3 = Pclmulqdq.CarrylessMultiply(a, b, 0x00).AsUInt32();
        var tmp4 = Pclmulqdq.CarrylessMultiply(a, b, 0x10).AsUInt32();
        var tmp5 = Pclmulqdq.CarrylessMultiply(a, b, 0x01).AsUInt32();
        var tmp6 = Pclmulqdq.CarrylessMultiply(a, b, 0x11).AsUInt32();

        tmp4 = Sse2.Xor(tmp4, tmp5);
        tmp5 = Sse2.ShiftLeftLogical128BitLane(tmp4, 8);
        tmp4 = Sse2.ShiftRightLogical128BitLane(tmp4, 8);
        tmp3 = Sse2.Xor(tmp3, tmp5);
        tmp6 = Sse2.Xor(tmp6, tmp4);

        var tmp7 = Sse2.ShiftRightLogical(tmp3, 31);
        var tmp8 = Sse2.ShiftRightLogical(tmp6, 31);

        tmp3 = Sse2.ShiftLeftLogical(tmp3, 1);
        tmp6 = Sse2.ShiftLeftLogical(tmp6, 1);
        var tmp9 = Sse2.ShiftRightLogical128BitLane(tmp7, 12);

        tmp8 = Sse2.ShiftLeftLogical128BitLane(tmp8, 4);
        tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 4);
        tmp3 = Sse2.Or(tmp3, tmp7);
        tmp6 = Sse2.Or(tmp6, tmp8);
        tmp6 = Sse2.Or(tmp6, tmp9);
        tmp7 = Sse2.ShiftLeftLogical(tmp3, 31);
        tmp8 = Sse2.ShiftLeftLogical(tmp3, 30);
        tmp9 = Sse2.ShiftLeftLogical(tmp3, 25);
        tmp7 = Sse2.Xor(tmp7, tmp8);
        tmp7 = Sse2.Xor(tmp7, tmp9);
        tmp8 = Sse2.ShiftRightLogical128BitLane(tmp7, 4);
        tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 12);
        tmp3 = Sse2.Xor(tmp3, tmp7);
        var tmp2 = Sse2.ShiftRightLogical(tmp3, 1);

        tmp4 = Sse2.ShiftRightLogical(tmp3, 2);
        tmp5 = Sse2.ShiftRightLogical(tmp3, 7);
        tmp2 = Sse2.Xor(tmp2, tmp4);
        tmp2 = Sse2.Xor(tmp2, tmp5);
        tmp2 = Sse2.Xor(tmp2, tmp8);
        tmp3 = Sse2.Xor(tmp3, tmp2);
        tmp6 = Sse2.Xor(tmp6, tmp3);

        _buffer = tmp6.AsByte();
    }
Example #6
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            var test   = TestStruct.Create();
            var result = Pclmulqdq.CarrylessMultiply(test._fld1, test._fld2, 1);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #7
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            var test   = new PclmulqdqOpTest__CarrylessMultiplyInt641();
            var result = Pclmulqdq.CarrylessMultiply(test._fld1, test._fld2, 1);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #8
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            var left   = Unsafe.Read <Vector128 <UInt64> >(_dataTable.inArray1Ptr);
            var right  = Unsafe.Read <Vector128 <UInt64> >(_dataTable.inArray2Ptr);
            var result = Pclmulqdq.CarrylessMultiply(left, right, 16);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #9
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            var left   = Pclmulqdq.LoadAlignedVector128((Int64 *)(_dataTable.inArray1Ptr));
            var right  = Pclmulqdq.LoadAlignedVector128((Int64 *)(_dataTable.inArray2Ptr));
            var result = Pclmulqdq.CarrylessMultiply(left, right, 1);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #10
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            var result = Pclmulqdq.CarrylessMultiply(
                Pclmulqdq.LoadVector128((Int64 *)(_dataTable.inArray1Ptr)),
                Pclmulqdq.LoadVector128((Int64 *)(_dataTable.inArray2Ptr)),
                1
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #11
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var result = Pclmulqdq.CarrylessMultiply(
                Unsafe.Read <Vector128 <Int64> >(_dataTable.inArray1Ptr),
                Unsafe.Read <Vector128 <Int64> >(_dataTable.inArray2Ptr),
                0
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #12
        public void RunReflectionScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario_LoadAligned));

            var result = typeof(Pclmulqdq).GetMethod(nameof(Pclmulqdq.CarrylessMultiply), new Type[] { typeof(Vector128 <Int64>), typeof(Vector128 <Int64>), typeof(byte) })
                         .Invoke(null, new object[] {
                Pclmulqdq.LoadAlignedVector128((Int64 *)(_dataTable.inArray1Ptr)),
                Pclmulqdq.LoadAlignedVector128((Int64 *)(_dataTable.inArray2Ptr)),
                (byte)1
            });

            Unsafe.Write(_dataTable.outArrayPtr, (Vector128 <Int64>)(result));
            ValidateResult(_dataTable.outArrayPtr);
        }
Example #13
    internal static ulong Step(ulong crc, byte[] data, uint length)
    {
        int               bufPos         = 16;
        const ulong       k1             = 0xe05dd497ca393ae4;
        const ulong       k2             = 0xdabe95afc7875f40;
        const ulong       mu             = 0x9c3e466c172963d5;
        const ulong       pol            = 0x92d8af2baf0e1e85;
        Vector128 <ulong> foldConstants1 = Vector128.Create(k1, k2);
        Vector128 <ulong> foldConstants2 = Vector128.Create(mu, pol);
        Vector128 <ulong> initialCrc     = Vector128.Create(~crc, 0);

        length -= 16;

        // Initial CRC can simply be added to data
        ShiftRight128(initialCrc, 0, out Vector128 <ulong> crc0, out Vector128 <ulong> crc1);

        Vector128 <ulong> accumulator =
            Sse2.Xor(Fold(Sse2.Xor(crc0, Vector128.Create(BitConverter.ToUInt64(data, 0), BitConverter.ToUInt64(data, 8))), foldConstants1),
                     crc1);

        while (length >= 32)
        {
            accumulator =
                Fold(Sse2.Xor(Vector128.Create(BitConverter.ToUInt64(data, bufPos), BitConverter.ToUInt64(data, bufPos + 8)), accumulator),
                     foldConstants1);

            length -= 16;
            bufPos += 16;
        }

        Vector128 <ulong> p = Sse2.Xor(accumulator,
                                       Vector128.Create(BitConverter.ToUInt64(data, bufPos),
                                                        BitConverter.ToUInt64(data, bufPos + 8)));

        Vector128 <ulong> r = Sse2.Xor(Pclmulqdq.CarrylessMultiply(p, foldConstants1, 0x10),
                                       Sse2.ShiftRightLogical128BitLane(p, 8));

        // Final Barrett reduction
        Vector128 <ulong> t1 = Pclmulqdq.CarrylessMultiply(r, foldConstants2, 0x00);

        Vector128 <ulong> t2 =
            Sse2.Xor(Sse2.Xor(Pclmulqdq.CarrylessMultiply(t1, foldConstants2, 0x10), Sse2.ShiftLeftLogical128BitLane(t1, 8)),
                     r);

        return(~(((ulong)Sse41.Extract(t2.AsUInt32(), 3) << 32) | Sse41.Extract(t2.AsUInt32(), 2)));
    }
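Fold and ShiftRight128 are helpers that are not part of this excerpt. Under the usual CRC folding identity, one fold step multiplies the two 64-bit halves of the accumulator by the paired fold constants and XORs the products. One plausible shape for Fold, assuming k1 pairs with the low half and k2 with the high half as laid out in foldConstants1, is:

    // Plausible shape of the Fold helper used above (an assumption, not the original code):
    // multiply the low half of value by element 0 of the constants and the high half by
    // element 1, then XOR the two carry-less products.
    private static Vector128 <ulong> Fold(Vector128 <ulong> value, Vector128 <ulong> foldConstants)
    {
        return Sse2.Xor(Pclmulqdq.CarrylessMultiply(value, foldConstants, 0x00),
                        Pclmulqdq.CarrylessMultiply(value, foldConstants, 0x11));
    }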
Example #14
 internal static uint64_t compute_quote_mask(uint64_t quote_bits)
 {
     // There should be no such thing as a processor supporting AVX2
     // but not CLMUL.
     if (Pclmulqdq.IsSupported)
     {
         uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                                                            Vector128.Create(quote_bits, 0UL /*C# reversed*/), Vector128.Create((byte)0xFF).AsUInt64(), 0));
         return(quote_mask);
     }
     else
     {
         uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
         quote_mask = quote_mask ^ (quote_mask << 2);
         quote_mask = quote_mask ^ (quote_mask << 4);
         quote_mask = quote_mask ^ (quote_mask << 8);
         quote_mask = quote_mask ^ (quote_mask << 16);
         quote_mask = quote_mask ^ (quote_mask << 32);
         return(quote_mask);
     }
 }
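Both branches compute the same value: carry-less multiplication by an all-ones 64-bit operand makes each output bit the XOR of all lower-or-equal input bits, i.e. the prefix XOR that the scalar shift/XOR ladder builds. A small hedged check (a hypothetical harness, not part of the original) comparing the two paths:

 // Hypothetical check that the PCLMULQDQ path and the shift/XOR fallback of
 // compute_quote_mask agree on random inputs.
 using System;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;

 static class QuoteMaskCheck
 {
     static ulong PrefixXorScalar(ulong bits)
     {
         bits ^= bits << 1;  bits ^= bits << 2;  bits ^= bits << 4;
         bits ^= bits << 8;  bits ^= bits << 16; bits ^= bits << 32;
         return bits;
     }

     static void Main()
     {
         if (!Pclmulqdq.IsSupported || !Sse2.X64.IsSupported)
         {
             return;
         }

         var rng = new Random(42);
         Span <byte> buffer = stackalloc byte[8];

         for (int i = 0; i < 1000; i++)
         {
             rng.NextBytes(buffer);
             ulong bits = BitConverter.ToUInt64(buffer);

             ulong viaClmul = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                 Vector128.Create(bits, 0UL), Vector128.Create((byte)0xFF).AsUInt64(), 0));

             if (viaClmul != PrefixXorScalar(bits))
             {
                 throw new Exception($"mismatch for 0x{bits:x16}");
             }
         }
     }
 }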
Example #15
        static void Fold4(ref Vector128 <uint> xmmCRC0, ref Vector128 <uint> xmmCRC1, ref Vector128 <uint> xmmCRC2,
                          ref Vector128 <uint> xmmCRC3)
        {
            Vector128 <uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

            Vector128 <uint> xTmp0 = xmmCRC0;
            Vector128 <uint> xTmp1 = xmmCRC1;
            Vector128 <uint> xTmp2 = xmmCRC2;
            Vector128 <uint> xTmp3 = xmmCRC3;

            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp0   = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC0 = xmmCRC0.AsSingle();
            Vector128 <float> psT0   = xTmp0.AsSingle();
            Vector128 <float> psRes0 = Sse.Xor(psCRC0, psT0);

            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp1   = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC1 = xmmCRC1.AsSingle();
            Vector128 <float> psT1   = xTmp1.AsSingle();
            Vector128 <float> psRes1 = Sse.Xor(psCRC1, psT1);

            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp2   = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC2 = xmmCRC2.AsSingle();
            Vector128 <float> psT2   = xTmp2.AsSingle();
            Vector128 <float> psRes2 = Sse.Xor(psCRC2, psT2);

            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp3   = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC3 = xmmCRC3.AsSingle();
            Vector128 <float> psT3   = xTmp3.AsSingle();
            Vector128 <float> psRes3 = Sse.Xor(psCRC3, psT3);

            xmmCRC0 = psRes0.AsUInt32();
            xmmCRC1 = psRes1.AsUInt32();
            xmmCRC2 = psRes2.AsUInt32();
            xmmCRC3 = psRes3.AsUInt32();
        }
Example #16
        // PolyvalPowersTable updates the POLYVAL value in polyval to include length bytes
        // of data from input, using the precomputed powers of the POLYVAL key given in
        // htbl. If the length is not divisible by 16, input is padded with zeros until
        // it's a multiple of 16 bytes.
        private static void PolyvalPowersTable(byte *polyval, byte *htbl, byte *input, int length)
        {
            if (length == 0)
            {
                return;
            }

            int blocks = Math.DivRem(length, 16, out int remainder16);
            int remainder128 = length % 128 - remainder16;
            Vector128 <ulong> tmp0, tmp1, tmp2, tmp3, tmp4;

            var xhi  = Sse2.SetZeroVector128 <ulong>();
            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));

            if (remainder128 != 0)
            {
                int remainder128Blocks = remainder128 / 16;
                blocks -= remainder128Blocks;

                var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(input)));
                var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - 1) * 16]));

                tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                tmp2 = Sse2.Xor(tmp2, tmp3);

                for (int i = 1; i < remainder128Blocks; ++i)
                {
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&input[i * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - i - 1) * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                }

                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                xhi  = Sse2.Xor(tmp3, tmp1);
                t    = Sse2.Xor(tmp0, tmp2);
            }

            if (blocks != 0)
            {
                var fixedInput = input + remainder128;

                if (remainder128 == 0)
                {
                    var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[7 * 16]));
                    var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

                    tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[6 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[5 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[4 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[3 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[2 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[1 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[0 * 16])));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                    tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                    xhi  = Sse2.Xor(tmp3, tmp1);
                    t    = Sse2.Xor(tmp0, tmp2);
                }

                for (int i = remainder128 == 0 ? 8 : 0; i < blocks; i += 8)
                {
                    var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 7) * 16]));
                    var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

                    tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 6) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 5) * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    t    = Sse2.Xor(t, tmp4);
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 4) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 3) * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    t    = Sse2.Xor(t, tmp4);
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 2) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    t    = Sse2.Xor(t, xhi);
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 1) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[i * 16])));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                    tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                    xhi  = Sse2.Xor(tmp3, tmp1);
                    t    = Sse2.Xor(tmp0, tmp2);
                }
            }

            if (blocks != 0 || remainder128 != 0)
            {
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                t    = Sse2.Xor(xhi, t);
            }

            if (remainder16 != 0)
            {
                byte *b = stackalloc byte[16];
                new Span <byte>(input + length - remainder16, remainder16).CopyTo(new Span <byte>(b, 16));

                var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)));
                var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));

                tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                xhi  = Sse2.Xor(tmp3, tmp1);
                t    = Sse2.Xor(tmp0, tmp2);

                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                t    = Sse2.Xor(xhi, t);
            }

            Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
        }
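A hypothetical end-to-end sketch (assuming it sits next to InitPowersTable and PolyvalPowersTable in the same unsafe class; the method and parameter names are illustrative): build the 8-entry power table once, then fold the message through it.

        private static unsafe void UpdatePolyval(Span <byte> polyvalState, ReadOnlySpan <byte> key, ReadOnlySpan <byte> message)
        {
            // 8 entries of 16 bytes each, matching the 8-block unrolling above.
            byte *htbl = stackalloc byte[8 * 16];

            fixed(byte *polyval = polyvalState)
            fixed(byte *hashKey = key)
            fixed(byte *input = message)
            {
                InitPowersTable(htbl, 8, hashKey);
                PolyvalPowersTable(polyval, htbl, input, message.Length);
            }
        }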
Example #17
        internal static uint Step(byte[] src, long len, uint initialCRC)
        {
            Vector128 <uint> xmmT0, xmmT1, xmmT2;
            Vector128 <uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
            Vector128 <uint> xmmCRC0    = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
            Vector128 <uint> xmmCRC1    = Vector128 <uint> .Zero;
            Vector128 <uint> xmmCRC2    = Vector128 <uint> .Zero;
            Vector128 <uint> xmmCRC3    = Vector128 <uint> .Zero;
            int bufPos = 0;

            bool first = true;

            /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
            Vector128 <uint> xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
            Vector128 <uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

            while ((len -= 64) >= 0)
            {
                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                Vector128 <uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                                          BitConverter.ToUInt32(src, bufPos + 4),
                                                          BitConverter.ToUInt32(src, bufPos + 8),
                                                          BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                if (first)
                {
                    first = false;
                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
                }

                Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);

                xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
            }

            /* fold 512 to 32 */

            /*
             * k1
             */
            Vector128 <uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);

            Vector128 <uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).
                                     AsUInt32();

            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
            xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
            xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);

            Vector128 <uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).
                                     AsUInt32();

            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
            xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
            xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);

            Vector128 <uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).
                                     AsUInt32();

            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);

            /*
             * k5
             */
            crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);

            xmmCRC0 = xmmCRC3;
            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
            xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);

            xmmCRC0 = xmmCRC3;
            xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);

            /*
             * k7
             */
            xmmCRC1 = xmmCRC3;
            xmmCRC2 = xmmCRC3;
            crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);

            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);

            xmmCRC2 = xmmCRC3;
            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);

            /*
             * could just as well write xmm_crc3[2], doing a movaps and truncating, but
             * there is no real advantage - it's a tiny bit slower per call, and no additional
             * CPUs would be supported by requiring only SSSE3 + CLMUL instead of SSE4.1 + CLMUL
             */
            return(~Sse41.Extract(xmmCRC3, 2));
        }
Example #18
File: Program.cs  Project: z77ma/runtime
    static int Main()
    {
        s_success = true;

        // We expect the AOT compiler to generate HW intrinsics with the following characteristics:
        //
        // * TRUE = IsSupported assumed to be true, no runtime check
        // * NULL = IsSupported is a runtime check, code should be behind the check or bad things happen
        // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
        //
        // The test is compiled with multiple defines to test this.
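        //
        // For example, under BASELINE_INTRINSICS below this means: Sse2.IsSupported is treated
        // as compile-time true (no runtime check emitted), Pclmulqdq.IsSupported remains a
        // runtime check, and Avx2.IsSupported is treated as compile-time false.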

#if BASELINE_INTRINSICS
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 16;
        bool? Sse2AndBelow = true;
        bool? Sse3Group    = null;
        bool? AesLzPcl     = null;
        bool? Sse4142      = null;
        bool? PopCnt       = null;
        bool? Avx12        = false;
        bool? FmaBmi12     = false;
        bool? Avxvnni      = false;
#elif NON_VEX_INTRINSICS
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 16;
        bool? Sse2AndBelow = true;
        bool? Sse3Group    = true;
        bool? AesLzPcl     = null;
        bool? Sse4142      = true;
        bool? PopCnt       = null;
        bool? Avx12        = false;
        bool? FmaBmi12     = false;
        bool? Avxvnni      = false;
#elif VEX_INTRINSICS
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 32;
        bool? Sse2AndBelow = true;
        bool? Sse3Group    = true;
        bool? AesLzPcl     = null;
        bool? Sse4142      = true;
        bool? PopCnt       = null;
        bool? Avx12        = true;
        bool? FmaBmi12     = null;
        bool? Avxvnni      = null;
#else
#error Who dis?
#endif

        if (vectorsAccelerated != Vector.IsHardwareAccelerated)
        {
            throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");
        }

        if (byteVectorLength != Vector <byte> .Count)
        {
            throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");
        }

        Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128 <float> .Zero) == 0);

        Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128 <ushort> .Zero, 0) == 0);
        Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128 <double> .Zero) == 0);

        Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);

        Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128 <short> .Zero).Equals(Vector128 <ushort> .Zero));
        Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);

        Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128 <int> .Zero, Vector128 <int> .Zero).Equals(Vector128 <int> .Zero));
        Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128 <long> .Zero, 0) == 0);

        Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
        Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);

        Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128 <byte> .Zero, 0).Equals(Vector128.Create((byte)99)));
        Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);

        Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256 <double> .Zero, Vector256 <double> .Zero).Equals(Vector256 <double> .Zero));
        Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);

        Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256 <int> .Zero).Equals(Vector256 <uint> .Zero));
        Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);

        Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
        Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);

        Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
        Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);

        Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128 <float> .Zero, Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);

        Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
        Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);

        Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128 <long> .Zero, Vector128 <long> .Zero, 0).Equals(Vector128 <long> .Zero));
        Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);

        Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
        Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);

        Check("AvxVnni", Avxvnni, &AvxVnniIsSupported, AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128 <int> .Zero, Vector128 <byte> .Zero, Vector128 <sbyte> .Zero).Equals(Vector128 <int> .Zero));
        Check("AvxVnni.X64", Avxvnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null);

        return(s_success ? 100 : 1);
    }
Example #19
        // Take input from buf and remove useless whitespace; input and output can be
        // the same. The result is null terminated; returns the string length (minus the null terminator).
        public static size_t Minify(uint8_t *buf, size_t len, uint8_t * @out)
        {
            if (!Avx2.IsSupported)
            {
                throw new NotSupportedException("AVX2 is required for SimdJson");
            }

            //C#: load the const vectors once (there is no `const __m256i` in C#)
            Vector256 <byte> lut_cntrl        = s_lut_cntrl;
            Vector256 <byte> low_nibble_mask  = s_low_nibble_mask;
            Vector256 <byte> high_nibble_mask = s_high_nibble_mask;

            fixed(byte *mask128_epi8 = s_mask128_epi8)
            {
                // Useful constant masks
                const uint64_t even_bits = 0x5555555555555555UL;
                const uint64_t odd_bits  = ~even_bits;
                uint8_t *      initout   = @out;
                uint64_t       prev_iter_ends_odd_backslash =
                    0UL;                               // either 0 or 1, but a 64-bit value
                uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones
                size_t   idx = 0;

                if (len >= 64)
                {
                    size_t avxlen = len - 63;

                    for (; idx < avxlen; idx += 64)
                    {
                        Vector256 <byte> input_lo = Avx.LoadVector256((buf + idx + 0));
                        Vector256 <byte> input_hi = Avx.LoadVector256((buf + idx + 32));
                        uint64_t         bs_bits  = cmp_mask_against_input_mini(input_lo, input_hi,
                                                                                Vector256.Create((byte)'\\'));
                        uint64_t start_edges     = bs_bits & ~(bs_bits << 1);
                        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                        uint64_t even_starts     = start_edges & even_start_mask;
                        uint64_t odd_starts      = start_edges & ~even_start_mask;
                        uint64_t even_carries    = bs_bits + even_starts;
                        uint64_t odd_carries;
                        bool     iter_ends_odd_backslash = add_overflow(
                            bs_bits, odd_starts, &odd_carries);
                        odd_carries |= prev_iter_ends_odd_backslash;
                        prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                        uint64_t even_carry_ends    = even_carries & ~bs_bits;
                        uint64_t odd_carry_ends     = odd_carries & ~bs_bits;
                        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                        uint64_t odd_ends           = even_start_odd_end | odd_start_even_end;
                        uint64_t quote_bits         = cmp_mask_against_input_mini(input_lo, input_hi,
                                                                                  Vector256.Create((byte)'"'));
                        quote_bits = quote_bits & ~odd_ends;
                        uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                                                                           Vector128.Create(quote_bits, 0UL).AsUInt64(), Vector128.Create((byte)0xFF).AsUInt64(), 0));
                        quote_mask            ^= prev_iter_inside_quote;
                        prev_iter_inside_quote =
                            (uint64_t)((int64_t)quote_mask >>
                                       63);  // might be undefined behavior, should be fully defined in C++20, ok according to John Regehr from the University of Utah

                        Vector256 <byte> whitespace_shufti_mask = Vector256.Create((byte)0x18);
                        Vector256 <byte> v_lo = Avx2.And(
                            Avx2.Shuffle(low_nibble_mask, input_lo),
                            Avx2.Shuffle(high_nibble_mask,
                                         Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                                                  Vector256.Create((byte)0x7f))));

                        Vector256 <byte> v_hi = Avx2.And(
                            Avx2.Shuffle(low_nibble_mask, input_hi),
                            Avx2.Shuffle(high_nibble_mask,
                                         Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                                                  Vector256.Create((byte)0x7f))));
                        Vector256 <byte> tmp_ws_lo = Avx2.CompareEqual(
                            Avx2.And(v_lo, whitespace_shufti_mask), Vector256.Create((byte)0));
                        Vector256 <byte> tmp_ws_hi = Avx2.CompareEqual(
                            Avx2.And(v_hi, whitespace_shufti_mask), Vector256.Create((byte)0));

                        uint64_t ws_res_0   = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                        uint64_t ws_res_1   = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
                        whitespace &= ~quote_mask;
                        int mask1  = (int)(whitespace & 0xFFFF);
                        int mask2  = (int)((whitespace >> 16) & 0xFFFF);
                        int mask3  = (int)((whitespace >> 32) & 0xFFFF);
                        int mask4  = (int)((whitespace >> 48) & 0xFFFF);
                        int pop1   = hamming((~whitespace) & 0xFFFF);
                        int pop2   = hamming((~whitespace) & (ulong)(0xFFFFFFFF));
                        int pop3   = hamming((~whitespace) & (ulong)(0xFFFFFFFFFFFF));
                        int pop4   = hamming((~whitespace));
                        var vmask1 =
                            _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                                                (ulong *)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                        var vmask2 =
                            _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                                                (ulong *)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                        var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                        var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                        _mm256_storeu2_m128i((@out + pop1), @out, result1);
                        _mm256_storeu2_m128i((@out + pop3), (@out + pop2),
                                             result2);
                        @out += pop4;
                    }
                }

                // we finish off the job... copying and pasting the code is not ideal here,
                // but it gets the job done.
                if (idx < len)
                {
                    uint8_t *buffer = stackalloc uint8_t[64];
                    memset(buffer, 0, 64);
                    memcpy(buffer, buf + idx, len - idx);
                    var      input_lo = Avx.LoadVector256((buffer));
                    var      input_hi = Avx.LoadVector256((buffer + 32));
                    uint64_t bs_bits  =
                        cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
                    uint64_t start_edges     = bs_bits & ~(bs_bits << 1);
                    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                    uint64_t even_starts     = start_edges & even_start_mask;
                    uint64_t odd_starts      = start_edges & ~even_start_mask;
                    uint64_t even_carries    = bs_bits + even_starts;
                    uint64_t odd_carries;
                    //bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);
                    odd_carries |= prev_iter_ends_odd_backslash;
                    //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
                    uint64_t even_carry_ends    = even_carries & ~bs_bits;
                    uint64_t odd_carry_ends     = odd_carries & ~bs_bits;
                    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                    uint64_t odd_ends           = even_start_odd_end | odd_start_even_end;
                    uint64_t quote_bits         =
                        cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
                    quote_bits = quote_bits & ~odd_ends;
                    uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                                                                       Vector128.Create(quote_bits, 0UL), Vector128.Create((byte)0xFF).AsUInt64(), 0));
                    quote_mask ^= prev_iter_inside_quote;
                    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore

                    Vector256 <byte> mask_20 = Vector256.Create((byte)0x20); // c==32
                    Vector256 <byte> mask_70 =
                        Vector256.Create((byte)0x70);                        // adding 0x70 does not check low 4-bits
                    // but moves any value >= 16 above 128

                    Vector256 <byte> tmp_ws_lo = Avx2.Or(
                        Avx2.CompareEqual(mask_20, input_lo),
                        Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_lo)));
                    Vector256 <byte> tmp_ws_hi = Avx2.Or(
                        Avx2.CompareEqual(mask_20, input_hi),
                        Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_hi)));
                    uint64_t ws_res_0   = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                    uint64_t ws_res_1   = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
                    whitespace &= ~quote_mask;

                    if (len - idx < 64)
                    {
                        whitespace |= ((0xFFFFFFFFFFFFFFFF) << (int)(len - idx));
                    }

                    int mask1 = (int)(whitespace & 0xFFFF);
                    int mask2 = (int)((whitespace >> 16) & 0xFFFF);
                    int mask3 = (int)((whitespace >> 32) & 0xFFFF);
                    int mask4 = (int)((whitespace >> 48) & 0xFFFF);
                    int pop1  = hamming((~whitespace) & 0xFFFF);
                    int pop2  = hamming((~whitespace) & 0xFFFFFFFF);
                    int pop3  = hamming((~whitespace) & 0xFFFFFFFFFFFF);
                    int pop4  = hamming((~whitespace));

                    var vmask1 =
                        _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                                            (ulong *)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                    var vmask2 =
                        _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                                            (ulong *)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                    var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                    var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                    _mm256_storeu2_m128i((buffer + pop1), buffer,
                                         result1);
                    _mm256_storeu2_m128i((buffer + pop3), (buffer + pop2),
                                         result2);
                    memcpy(@out, buffer, (size_t)pop4);
                    @out += pop4;
                }

                *@out = (byte)'\0';  // NULL termination
                return((size_t)@out - (size_t)initout);
            }
        }
Example #20
        // PolyvalHorner updates the POLYVAL value in polyval to include length bytes
        // of data from input, given the POLYVAL key in hashKey. If the length is not
        // divisible by 16, input is padded with zeros until it's a multiple of 16 bytes.
        private static void PolyvalHorner(byte *polyval, byte *hashKey, byte *input, int length)
        {
            if (length == 0)
            {
                return;
            }

            int blocks = Math.DivRem(length, 16, out int remainder);
            Vector128 <ulong> tmp1, tmp2, tmp3, tmp4;

            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));
            var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(hashKey));

            for (int i = 0; i < blocks; ++i)
            {
                t    = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&input[i * 16])));
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp3, tmp1);
                tmp4 = Sse2.Xor(tmp4, tmp2);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp3, tmp2);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp3, tmp2);
                t    = Sse2.Xor(tmp4, tmp1);
            }

            if (remainder != 0)
            {
                byte *b = stackalloc byte[16];
                new Span <byte>(input + length - remainder, remainder).CopyTo(new Span <byte>(b, 16));

                t    = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)));
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp3, tmp1);
                tmp4 = Sse2.Xor(tmp4, tmp2);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp3, tmp2);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp3, tmp2);
                t    = Sse2.Xor(tmp4, tmp1);
            }

            Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
        }
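
        // A hedged, hypothetical call-site sketch (not part of the original source):
        // it folds additional data and then the message bytes into a fresh POLYVAL
        // accumulator using PolyvalHorner above. ComputePolyval, hashKey16, aad and
        // msg are illustrative names only.
        private static unsafe byte[] ComputePolyval(byte[] hashKey16, byte[] aad, byte[] msg)
        {
            byte[] result = new byte[16]; // POLYVAL accumulator, starts zeroed

            fixed(byte *polyval = result)
            fixed(byte *hk = hashKey16)
            fixed(byte *a = aad)
            fixed(byte *m = msg)
            {
                PolyvalHorner(polyval, hk, a, aad.Length); // fold in the additional data
                PolyvalHorner(polyval, hk, m, msg.Length); // then the message
            }

            return result;
        }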
Example #21
        private static unsafe uint CalculateSse(uint crc, ReadOnlySpan <byte> buffer)
        {
            int chunksize = buffer.Length & ~ChunksizeMask;
            int length    = chunksize;

            fixed(byte *bufferPtr = buffer)
            fixed(ulong *k05PolyPtr = K05Poly)
            {
                byte *srcPtr = bufferPtr;

                // There's at least one block of 64.
                Vector128 <ulong> x1 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
                Vector128 <ulong> x2 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
                Vector128 <ulong> x3 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
                Vector128 <ulong> x4 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
                Vector128 <ulong> x5;

                x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

                // k1, k2
                Vector128 <ulong> x0 = Sse2.LoadVector128(k05PolyPtr + 0x0);

                srcPtr += 64;
                length -= 64;

                // Parallel fold blocks of 64, if any.
                while (length >= 64)
                {
                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    Vector128 <ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                    Vector128 <ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
                    Vector128 <ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);

                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
                    x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
                    x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);

                    Vector128 <ulong> y5 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
                    Vector128 <ulong> y6 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
                    Vector128 <ulong> y7 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
                    Vector128 <ulong> y8 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));

                    x1 = Sse2.Xor(x1, x5);
                    x2 = Sse2.Xor(x2, x6);
                    x3 = Sse2.Xor(x3, x7);
                    x4 = Sse2.Xor(x4, x8);

                    x1 = Sse2.Xor(x1, y5);
                    x2 = Sse2.Xor(x2, y6);
                    x3 = Sse2.Xor(x3, y7);
                    x4 = Sse2.Xor(x4, y8);

                    srcPtr += 64;
                    length -= 64;
                }

                // Fold into 128-bits.
                // k3, k4
                x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x2);
                x1 = Sse2.Xor(x1, x5);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x3);
                x1 = Sse2.Xor(x1, x5);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x4);
                x1 = Sse2.Xor(x1, x5);

                // Single fold blocks of 16, if any.
                while (length >= 16)
                {
                    x2 = Sse2.LoadVector128((ulong *)srcPtr);

                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                    x1 = Sse2.Xor(x1, x2);
                    x1 = Sse2.Xor(x1, x5);

                    srcPtr += 16;
                    length -= 16;
                }

                // Fold 128 bits to 64 bits.
                x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
                x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
                x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
                x1 = Sse2.Xor(x1, x2);

                // k5, k0
                x0 = Sse2.LoadScalarVector128(k05PolyPtr + 0x4);

                x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
                x1 = Sse2.And(x1, x3);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Sse2.Xor(x1, x2);

                // Barrett reduce to 32 bits.
                // polynomial
                x0 = Sse2.LoadVector128(k05PolyPtr + 0x6);

                x2 = Sse2.And(x1, x3);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
                x2 = Sse2.And(x2, x3);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                x1 = Sse2.Xor(x1, x2);

                crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
                return(buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer[chunksize..]));
            }
        }
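
        // A hedged usage sketch (not part of the original source): dispatch to the
        // PCLMULQDQ path only when the hardware supports it and the input holds at
        // least one 64-byte block. Append and data are illustrative names; the scalar
        // fallback CalculateScalar is assumed to exist in the same type, as above.
        private static uint Append(uint crc, ReadOnlySpan <byte> data)
        {
            if (Pclmulqdq.IsSupported && Sse41.IsSupported && data.Length >= 64)
            {
                return CalculateSse(crc, data);
            }

            return CalculateScalar(crc, data);
        }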
Example #22
        // DecryptPowersTable decrypts ctLen bytes from ct and writes them to pt. While
        // decrypting, it updates the POLYVAL value in polyval. In order to decrypt and
        // update the POLYVAL value, it uses the expanded key from ks and the table of
        // powers in htbl. Decryption processes 6 blocks of data in parallel.
        private static void DecryptPowersTable(byte *ct, int ctLen, byte *pt, byte *polyval, byte *htbl, byte *tag, byte *ks)
        {
            Vector128 <ulong> sCtr1, sCtr2, sCtr3, sCtr4, sCtr5, sCtr6, tmp0, tmp1, tmp2, tmp3, tmp4, h;

            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));
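            // Per the AES-GCM-SIV construction, the initial counter block is the tag
            // with its most significant bit forced to 1; orMask below sets that bit.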

            var orMask = Sse.StaticCast <uint, byte>(Sse2.SetVector128(0x80000000, 0, 0, 0));
            var ctr    = Sse2.Or(Sse2.LoadVector128(tag), orMask);

            var one = Sse2.SetVector128(0, 0, 0, 1);
            var two = Sse2.SetVector128(0, 0, 0, 2);

            int blocks = 0;

            if (ctLen >= 96)
            {
                var ctr1 = ctr;
                var ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                var ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
                var ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
                var ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
                var ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
                ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

                var key = Sse2.LoadVector128(ks);
                ctr1 = Sse2.Xor(ctr1, key);
                ctr2 = Sse2.Xor(ctr2, key);
                ctr3 = Sse2.Xor(ctr3, key);
                ctr4 = Sse2.Xor(ctr4, key);
                ctr5 = Sse2.Xor(ctr5, key);
                ctr6 = Sse2.Xor(ctr6, key);

                for (int i = 1; i < 14; ++i)
                {
                    key  = Sse2.LoadVector128(&ks[i * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);
                }

                key  = Sse2.LoadVector128(&ks[14 * 16]);
                ctr1 = Aes.EncryptLast(ctr1, key);
                ctr2 = Aes.EncryptLast(ctr2, key);
                ctr3 = Aes.EncryptLast(ctr3, key);
                ctr4 = Aes.EncryptLast(ctr4, key);
                ctr5 = Aes.EncryptLast(ctr5, key);
                ctr6 = Aes.EncryptLast(ctr6, key);

                ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[0 * 16]));
                ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[1 * 16]));
                ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[2 * 16]));
                ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[3 * 16]));
                ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[4 * 16]));
                ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[5 * 16]));

                Sse2.Store(&pt[0 * 16], ctr1);
                Sse2.Store(&pt[1 * 16], ctr2);
                Sse2.Store(&pt[2 * 16], ctr3);
                Sse2.Store(&pt[3 * 16], ctr4);
                Sse2.Store(&pt[4 * 16], ctr5);
                Sse2.Store(&pt[5 * 16], ctr6);

                ctLen  -= 96;
                blocks += 6;

                while (ctLen >= 96)
                {
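                    // Software pipelining: sCtr1..sCtr6 capture the previous batch's
                    // decrypted blocks, and their POLYVAL update (using the htbl powers)
                    // is interleaved with the AES rounds of the next 6-block batch so the
                    // carry-less multiplies overlap with the AES latency.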
                    sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
                    sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
                    sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
                    sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
                    sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
                    sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

                    ctr1 = ctr;
                    ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                    ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
                    ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
                    ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
                    ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
                    ctr  = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

                    key  = Sse2.LoadVector128(ks);
                    ctr1 = Sse2.Xor(ctr1, key);
                    ctr2 = Sse2.Xor(ctr2, key);
                    ctr3 = Sse2.Xor(ctr3, key);
                    ctr4 = Sse2.Xor(ctr4, key);
                    ctr5 = Sse2.Xor(ctr5, key);
                    ctr6 = Sse2.Xor(ctr6, key);

                    tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
                    tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
                    tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
                    tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
                    tmp0 = Sse2.Xor(tmp3, tmp0);

                    key  = Sse2.LoadVector128(&ks[1 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[2 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[3 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[4 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[5 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[6 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[7 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    sCtr1 = Sse2.Xor(t, sCtr1);
                    tmp4  = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
                    tmp0  = Sse2.Xor(tmp3, tmp0);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
                    tmp1  = Sse2.Xor(tmp3, tmp1);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
                    tmp2  = Sse2.Xor(tmp3, tmp2);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
                    tmp0  = Sse2.Xor(tmp3, tmp0);

                    key  = Sse2.LoadVector128(&ks[8 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
                    tmp4 = Sse2.Xor(tmp3, tmp1);
                    tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
                    t    = Sse2.Xor(tmp3, tmp2);

                    key  = Sse2.LoadVector128(&ks[9 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse2.Xor(tmp1, t);

                    key  = Sse2.LoadVector128(&ks[10 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[11 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[12 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[13 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[14 * 16]);
                    ctr1 = Aes.EncryptLast(ctr1, key);
                    ctr2 = Aes.EncryptLast(ctr2, key);
                    ctr3 = Aes.EncryptLast(ctr3, key);
                    ctr4 = Aes.EncryptLast(ctr4, key);
                    ctr5 = Aes.EncryptLast(ctr5, key);
                    ctr6 = Aes.EncryptLast(ctr6, key);

                    ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[(blocks + 0) * 16]));
                    ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[(blocks + 1) * 16]));
                    ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[(blocks + 2) * 16]));
                    ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[(blocks + 3) * 16]));
                    ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[(blocks + 4) * 16]));
                    ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[(blocks + 5) * 16]));

                    tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse2.Xor(tmp1, t);
                    t    = Sse2.Xor(tmp4, t);

                    Sse2.Store(&pt[(blocks + 0) * 16], ctr1);
                    Sse2.Store(&pt[(blocks + 1) * 16], ctr2);
                    Sse2.Store(&pt[(blocks + 2) * 16], ctr3);
                    Sse2.Store(&pt[(blocks + 3) * 16], ctr4);
                    Sse2.Store(&pt[(blocks + 4) * 16], ctr5);
                    Sse2.Store(&pt[(blocks + 5) * 16], ctr6);

                    ctLen  -= 96;
                    blocks += 6;
                }

                sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
                sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
                sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
                sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
                sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
                sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

                tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
                tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
                tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
                tmp0 = Sse2.Xor(tmp3, tmp0);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                sCtr1 = Sse2.Xor(t, sCtr1);
                tmp4  = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
                tmp1  = Sse2.Xor(tmp3, tmp1);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
                tmp2  = Sse2.Xor(tmp3, tmp2);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
                tmp0  = Sse2.Xor(tmp3, tmp0);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
                tmp0  = Sse2.Xor(tmp3, tmp0);

                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
                tmp4 = Sse2.Xor(tmp3, tmp1);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
                t    = Sse2.Xor(tmp3, tmp2);

                tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse2.Xor(tmp1, t);
                tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse2.Xor(tmp1, t);
                t    = Sse2.Xor(tmp4, t);
            }

            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));

            while (ctLen >= 16)
            {
                var tmp = ctr;
                ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(ks));

                for (int i = 1; i < 14; ++i)
                {
                    tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
                }

                tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(&ct[blocks * 16]));
                Sse2.Store(&pt[blocks * 16], tmp);

                t    = Sse2.Xor(Sse.StaticCast <byte, ulong>(tmp), t);
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp3, tmp2);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp4 = Sse2.Xor(tmp2, tmp4);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                t    = Sse2.Xor(tmp1, tmp4);

                ctLen -= 16;
                ++blocks;
            }

            if (ctLen > 0)
            {
                byte *b = stackalloc byte[16];
                new Span <byte>(ct + blocks * 16, ctLen).CopyTo(new Span <byte>(b, 16));
                var tmp = Sse2.Xor(ctr, Sse2.LoadVector128(ks));

                for (int i = 1; i < 14; ++i)
                {
                    tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
                }

                tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(b));
                Sse2.Store(b, tmp);

                new Span <byte>(b, ctLen).CopyTo(new Span <byte>(&pt[blocks * 16], ctLen));
                new Span <byte>(b + ctLen, 16 - ctLen).Clear();

                t    = Sse2.Xor(Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)), t);
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp3, tmp2);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp4 = Sse2.Xor(tmp2, tmp4);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                t    = Sse2.Xor(tmp1, tmp4);
            }

            Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
        }
Example #23
        internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj)
        {
            if (len > pj.bytecapacity)
            {
                Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity +
                                  " bytes but you are trying to process " + len + " bytes\n");
                return false;
            }

            uint32_t* base_ptr = pj.structural_indexes;
            uint32_t @base = 0;
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
            var has_error = Vector256<byte>.Zero;
            var previous = new avx_processed_utf_bytes();
            previous.rawbytes = Vector256<byte>.Zero;
            previous.high_nibbles = Vector256<byte>.Zero;
            previous.carried_continuations = Vector256<byte>.Zero;
            var highbit = Vector256.Create((byte)0x80);
#endif

            const uint64_t even_bits = 0x5555555555555555UL;
            const uint64_t odd_bits = ~even_bits;

            // for now, just work in 64-byte chunks
            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones

            // effectively the very first char is considered to follow "whitespace" for the
            // purposes of pseudo-structural character detection
            uint64_t prev_iter_ends_pseudo_pred = 1UL;
            size_t lenminus64 = len < 64 ? 0 : len - 64;
            size_t idx = 0;
            uint64_t structurals = 0;

            // C#: assign static readonly fields to locals before the loop
            Vector256<byte> low_nibble_mask = s_low_nibble_mask;
            Vector256<byte> high_nibble_mask = s_high_nibble_mask;
            Vector256<byte> utf8ValidVec = s_utf8ValidVec;

            var structural_shufti_mask = Vector256.Create((byte)0x7);
            var whitespace_shufti_mask = Vector256.Create((byte)0x18);
            var slashVec = Vector256.Create((bytechar) '\\').AsByte();
            var ffVec = Vector128.Create((byte) 0xFF).AsUInt64();
            var doubleQuoteVec = Vector256.Create((byte)'"');
            var zeroBVec = Vector256.Create((byte) 0);
            var vec7f = Vector256.Create((byte) 0x7f);

            for (; idx < lenminus64; idx += 64)
            {
                var input_lo = Avx.LoadVector256(buf + idx + 0);
                var input_hi = Avx.LoadVector256(buf + idx + 32);
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                {
                    // it is ascii, we just check continuation
                    has_error = Avx2.Or(
                        Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                                        utf8ValidVec).AsByte(), has_error);

                }
                else
                {
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                }
#endif

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////
                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
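                // Carry-less multiplication of the quote bitmask by an all-ones 64-bit
                // operand computes, at every bit position, the XOR of all quote bits at
                // or below it, i.e. a running parity that is 1 between an opening quote
                // and its matching closing quote: the "inside a string" mask.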
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

                uint32_t cnt = (uint32_t) hamming(structurals);
                uint32_t next_base = @base + cnt;
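                // Flatten the previous 64-byte chunk's structural bitmask (hence the
                // idx - 64 offsets) into indexes, extracting 8 set bits per pass;
                // writing past the real count is harmless because @base is then reset
                // to next_base, which uses the true popcount.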
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                }

                @base = next_base;

                quote_mask ^= prev_iter_inside_quote;
                prev_iter_inside_quote =
                    (uint64_t) ((int64_t) quote_mask >>
                                63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regehr from Utah U. says this is fine code



                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                            vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                            vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t) Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t) Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t) Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t) Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before a
                // pseudo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);

                //Console.WriteLine($"Iter: {idx}, satur: {structurals}");

                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
            }

            ////////////////
            /// we use a giant copy-paste which is ugly.
            /// but otherwise the string needs to be properly padded or else we
            /// risk invalidating the UTF-8 checks.
            ////////////
            if (idx < len)
            {
                uint8_t* tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
                Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
                var highbit = Vector256.Create((byte)0x80);
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                {
                    // it is ascii, we just check continuation
                    has_error = Avx2.Or(
                      Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                                      utf8ValidVec).AsByte(), has_error);

                }
                else
                {
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                }
#endif
                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;

                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                //bool iter_ends_odd_backslash =
                add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
                quote_mask ^= prev_iter_inside_quote;

                //BUG? https://github.com/dotnet/coreclr/issues/22813
                //quote_mask = 60;
                //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

                uint32_t cnt = (uint32_t)hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                }
                @base = next_base;
                // How do we build up a user traversable data structure
                // first, do a 'shufti' to detect structural JSON characters
                // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
                // these go into the first 3 buckets of the comparison (1/2/4)

                // we are also interested in the four whitespace characters
                // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
                // these go into the next 2 buckets of the comparison (8/16)

                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                            vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                            vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                // this additional mask and transfer is non-trivially expensive,
                // unfortunately
                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before a
                // pseudo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);
                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
                idx += 64;
            }
            uint32_t cnt2 = (uint32_t)hamming(structurals);
            uint32_t next_base2 = @base + cnt2;
            while (structurals != 0)
            {
                base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                @base += 8;
            }
            @base = next_base2;

            pj.n_structural_indexes = @base;
            if (base_ptr[pj.n_structural_indexes - 1] > len)
            {
                throw new InvalidOperationException("Internal bug");
            }
            if (len != base_ptr[pj.n_structural_indexes - 1])
            {
                // the string might not be NULL terminated, but we add a virtual NULL ending character. 
                base_ptr[pj.n_structural_indexes++] = (uint32_t)len;
            }
            base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
            return Avx.TestZ(has_error, has_error);
#else
            return true;
#endif
        }
Example #24
        static Vector128 <ulong> Fold(Vector128 <ulong> input, Vector128 <ulong> foldConstants) =>
            Sse2.Xor(Pclmulqdq.CarrylessMultiply(input, foldConstants, 0x00),
                     Pclmulqdq.CarrylessMultiply(input, foldConstants, 0x11));
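
        // A hedged illustration (hypothetical names): one folding step of the CRC loops
        // above XORs the folded running state into the next 16-byte block of input, i.e.
        //   Vector128 <ulong> next = Sse2.LoadVector128(ptr);
        //   state = Sse2.Xor(Fold(state, foldConstants), next);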