コード例 #1
    /// <summary>
    /// Transposes, independently inside each 128-bit lane, the 4x4 matrix of 32-bit
    /// elements whose rows are <paramref name="x0"/> .. <paramref name="x3"/>.
    /// </summary>
    /// <remarks>
    /// On return, 32-bit element <c>j</c> of row <c>i</c> holds what used to be element
    /// <c>i</c> of row <c>j</c> in the same lane. All four arguments are overwritten.
    /// </remarks>
    private static void Transpose(ref Vector256 <byte> x0, ref Vector256 <byte> x1, ref Vector256 <byte> x2, ref Vector256 <byte> x3)
    {
        Vector256<uint> row0 = x0.AsUInt32();
        Vector256<uint> row1 = x1.AsUInt32();
        Vector256<uint> row2 = x2.AsUInt32();
        Vector256<uint> row3 = x3.AsUInt32();

        // Step 1: interleave 32-bit elements of neighbouring rows.
        Vector256<ulong> low01  = Avx2.UnpackLow(row0, row1).AsUInt64();
        Vector256<ulong> high01 = Avx2.UnpackHigh(row0, row1).AsUInt64();
        Vector256<ulong> low23  = Avx2.UnpackLow(row2, row3).AsUInt64();
        Vector256<ulong> high23 = Avx2.UnpackHigh(row2, row3).AsUInt64();

        // Step 2: interleave the resulting 64-bit pairs to finish the transpose.
        x0 = Avx2.UnpackLow(low01, low23).AsByte();
        x1 = Avx2.UnpackHigh(low01, low23).AsByte();
        x2 = Avx2.UnpackLow(high01, high23).AsByte();
        x3 = Avx2.UnpackHigh(high01, high23).AsByte();
    }
コード例 #2
            /// <summary>
            /// Decodes the 32 encoded bytes at <paramref name="source"/> into 24 bytes at
            /// <paramref name="dest"/>.
            /// </summary>
            /// <remarks>
            /// 32 is subtracted from every input byte and only the low six bits of each result
            /// are kept. Each group of four six-bit values is packed, first value most
            /// significant, into three consecutive output bytes.
            /// The two 16-byte stores touch <c>dest[0..27]</c>; bytes 24..27 are written as
            /// zero, so the destination needs room for 28 bytes.
            /// </remarks>
            /// <param name="source">Pointer to 32 readable input bytes.</param>
            /// <param name="dest">Pointer to at least 28 writable bytes.</param>
            public static unsafe void Decode32Bytes(byte *source, byte *dest)
            {
                Vector256<byte> encoded = Unsafe.As<byte, Vector256<byte>>(ref source[0]);
                Vector256<uint> biased  = Avx2.Add(encoded, Vector256.Create((sbyte)-32).AsByte()).AsUInt32();

                // Within every little-endian 32-bit group, isolate the six payload bits of each
                // byte and move them to their final position in a 24-bit value:
                //   input byte 0: 00000000 00000000 00000000 00aaaaaa -> 00000000 aaaaaa00 00000000 00000000
                //   input byte 1: 00000000 00000000 00bbbbbb 00000000 -> 00000000 000000bb bbbb0000 00000000
                //   input byte 2: 00000000 00cccccc 00000000 00000000 -> 00000000 00000000 0000cccc cc000000
                //   input byte 3: 00dddddd 00000000 00000000 00000000 -> 00000000 00000000 00000000 00dddddd
                Vector256<uint> first  = Avx2.ShiftLeftLogical(Avx2.And(biased, Vector256.Create((uint)0x0000_003f)), 18);
                Vector256<uint> second = Avx2.ShiftLeftLogical(Avx2.And(biased, Vector256.Create((uint)0x0000_3f00)), 4);
                Vector256<uint> third  = Avx2.ShiftRightLogical(Avx2.And(biased, Vector256.Create((uint)0x003f_0000)), 10);
                Vector256<uint> fourth = Avx2.ShiftRightLogical(Avx2.And(biased, Vector256.Create((uint)0x3f00_0000)), 24);

                // Combined:  00000000 aaaaaabb bbbbcccc ccdddddd
                //            byte 3   byte 2   byte 1   byte 0
                Vector256<byte> packed = Avx2.Or(Avx2.Or(first, second), Avx2.Or(third, fourth)).AsByte();

                // Per 128-bit lane the bytes are now [C,B,A,0, F,E,D,0, I,H,G,0, L,K,J,0].
                // Reorder them to [A,B,C, D,E,F, G,H,I, J,K,L, 0,0,0,0]; a control byte of 0x80
                // produces a zero output byte.
                Vector256<byte> compaction = Vector256.Create(
                    0x02, 0x01, 0x00, 0x06, 0x05, 0x04, 0x0a, 0x09, 0x08, 0x0e, 0x0d, 0x0c,
                    0x80, 0x80, 0x80, 0x80,
                    0x12, 0x11, 0x10, 0x16, 0x15, 0x14, 0x1a, 0x19, 0x18, 0x1e, 0x1d, 0x1c,
                    0x80, 0x80, 0x80, 0x80);

                Vector256<byte> decoded = Avx2.Shuffle(packed, compaction);

                // The second store deliberately overlaps the four zero bytes of the first one,
                // so the order of these two statements matters.
                Sse2.Store(dest, decoded.GetLower());
                Sse2.Store(dest + 12, decoded.GetUpper());
            }
コード例 #3
 /// <summary>
 /// Adds two 256-bit vectors element-wise, selecting the hardware instruction that
 /// matches the element type <typeparamref name="T"/>.
 /// </summary>
 /// <param name="left">First operand.</param>
 /// <param name="right">Second operand.</param>
 /// <returns>The element-wise sum, reinterpreted as <c>Vector256&lt;T&gt;</c>.</returns>
 /// <exception cref="NotSupportedException">
 /// <typeparamref name="T"/> is not one of byte, sbyte, short, ushort, int, uint, long,
 /// ulong, float or double.
 /// </exception>
 public static Vector256 <T> Vector256Add <T>(Vector256 <T> left, Vector256 <T> right) where T : struct
 {
     // Integer element types go through Avx2.Add.
     if (typeof(T) == typeof(byte))
     {
         Vector256<byte> sum = Avx2.Add(left.AsByte(), right.AsByte());
         return sum.As<byte, T>();
     }

     if (typeof(T) == typeof(sbyte))
     {
         Vector256<sbyte> sum = Avx2.Add(left.AsSByte(), right.AsSByte());
         return sum.As<sbyte, T>();
     }

     if (typeof(T) == typeof(short))
     {
         Vector256<short> sum = Avx2.Add(left.AsInt16(), right.AsInt16());
         return sum.As<short, T>();
     }

     if (typeof(T) == typeof(ushort))
     {
         Vector256<ushort> sum = Avx2.Add(left.AsUInt16(), right.AsUInt16());
         return sum.As<ushort, T>();
     }

     if (typeof(T) == typeof(int))
     {
         Vector256<int> sum = Avx2.Add(left.AsInt32(), right.AsInt32());
         return sum.As<int, T>();
     }

     if (typeof(T) == typeof(uint))
     {
         Vector256<uint> sum = Avx2.Add(left.AsUInt32(), right.AsUInt32());
         return sum.As<uint, T>();
     }

     if (typeof(T) == typeof(long))
     {
         Vector256<long> sum = Avx2.Add(left.AsInt64(), right.AsInt64());
         return sum.As<long, T>();
     }

     if (typeof(T) == typeof(ulong))
     {
         Vector256<ulong> sum = Avx2.Add(left.AsUInt64(), right.AsUInt64());
         return sum.As<ulong, T>();
     }

     // Floating-point element types go through Avx.Add.
     if (typeof(T) == typeof(float))
     {
         Vector256<float> sum = Avx.Add(left.AsSingle(), right.AsSingle());
         return sum.As<float, T>();
     }

     if (typeof(T) == typeof(double))
     {
         Vector256<double> sum = Avx.Add(left.AsDouble(), right.AsDouble());
         return sum.As<double, T>();
     }

     throw new NotSupportedException();
 }
コード例 #4
            /// <summary>
            /// Variant of the 32-byte decoder that builds its constants inline and extracts the
            /// upper 128-bit half into a temporary before issuing the two stores.
            /// </summary>
            /// <remarks>
            /// 32 is subtracted from every input byte and only the low six bits of each result
            /// are kept. Each group of four six-bit values is packed, first value most
            /// significant, into three consecutive output bytes.
            /// The two 16-byte stores touch <c>dest[0..27]</c>; bytes 24..27 are written as
            /// zero, so the destination needs room for 28 bytes.
            /// </remarks>
            /// <param name="source">Pointer to 32 readable input bytes.</param>
            /// <param name="dest">Pointer to at least 28 writable bytes.</param>
            public static unsafe void Decode32Bytes_v2_temp_upper(byte *source, byte *dest)
            {
                Vector256 <byte> vecSource  = Unsafe.As <byte, Vector256 <byte> >(ref *source);
                Vector256 <byte> subtracted = Avx2.Add(vecSource, Vector256.Create((sbyte)-32).AsByte());

                Vector256 <byte> a = Avx2.And(subtracted, Vector256.Create((uint)0x0000_003f).AsByte());
                Vector256 <byte> b = Avx2.And(subtracted, Vector256.Create((uint)0x0000_3f00).AsByte());
                Vector256 <byte> c = Avx2.And(subtracted, Vector256.Create((uint)0x003f_0000).AsByte());
                Vector256 <byte> d = Avx2.And(subtracted, Vector256.Create((uint)0x3f00_0000).AsByte());

                a = Avx2.ShiftLeftLogical(a.AsUInt32(), 18).AsByte();                       // 00000000 00000000 00000000 00aaaaaa -> 00000000 aaaaaa00 00000000 00000000
                b = Avx2.ShiftLeftLogical(b.AsUInt32(), 4).AsByte();                        // 00000000 00000000 00bbbbbb 00000000 -> 00000000 000000bb bbbb0000 00000000
                c = Avx2.ShiftRightLogical(c.AsUInt32(), 10).AsByte();                      // 00000000 00cccccc 00000000 00000000 -> 00000000 00000000 0000cccc cc000000
                d = Avx2.ShiftRightLogical(d.AsUInt32(), 24).AsByte();                      // 00dddddd 00000000 00000000 00000000 -> 00000000 00000000 00000000 00dddddd
                // After Or:                                00000000 aaaaaabb bbbbcccc ccdddddd
                //                                          byte 3   byte 2   byte 1   byte 0

                a = Avx2.Or(a, b);
                c = Avx2.Or(c, d);
                a = Avx2.Or(a, c);

                // Per 128-bit lane the bytes are [C,B,A,0, F,E,D,0, I,H,G,0, L,K,J,0]; reorder to
                // [A,B,C, D,E,F, G,H,I, J,K,L, 0,0,0,0]. A control byte of 0x80 yields zero.
                // The control bytes must be wrapped in Vector256.Create: Avx2.Shuffle takes a
                // vector as its second argument, not 32 separate literals.
                var vecBytes2 = Avx2.Shuffle(a, Vector256.Create(
                                                 0x02, 0x01, 0x00, 0x06, 0x05, 0x04, 0x0a, 0x09, 0x08, 0x0e, 0x0d, 0x0c,
                                                 0x80, 0x80, 0x80, 0x80,                                        // 0x03, 0x07, 0x0b, 0x0f
                                                 0x12, 0x11, 0x10, 0x16, 0x15, 0x14, 0x1a, 0x19, 0x18, 0x1e, 0x1d, 0x1c,
                                                 0x80, 0x80, 0x80, 0x80));                                      // 0x13, 0x17, 0x1b, 0x1f

                var upper = vecBytes2.GetUpper();

                // The second store overlaps the four zero bytes of the first one, so the order
                // of these two statements matters.
                Sse2.Store(dest, vecBytes2.GetLower());
                Sse2.Store(dest + 12, upper);
            }
コード例 #5
        // Scans `len` bytes at `buf` in 64-byte chunks, builds a 64-bit mask of structural and
        // pseudo-structural character positions for each chunk, and writes the corresponding
        // byte offsets into pj.structural_indexes; the final count is stored in
        // pj.n_structural_indexes.
        //
        // Returns false (after printing a message to the console) when len exceeds
        // pj.bytecapacity. Otherwise the first return statement yields
        // Avx.TestZ(has_error, has_error), i.e. true only when no UTF-8 error bit was accumulated.
        // Throws InvalidOperationException when the last recorded index is greater than len.
        //
        // NOTE(review): in this excerpt the braces, the `else` keywords and possibly some
        // preprocessor directives are missing, and several statements stop mid-expression.
        // Block boundaries below can only be inferred from indentation; confirm against the
        // original file before relying on any of the notes that describe nesting.
        internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj)
            if (len > pj.bytecapacity)
                Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity +
                                  " bytes but you are trying to process " + len + " bytes\n");
                return false;

            // base_ptr is the output array; @base is the next free slot in it.
            uint32_t* base_ptr = pj.structural_indexes;
            uint32_t @base = 0;
            var has_error = Vector256<byte>.Zero;
            var previous = new avx_processed_utf_bytes();
            previous.rawbytes = Vector256<byte>.Zero;
            previous.high_nibbles = Vector256<byte>.Zero;
            previous.carried_continuations = Vector256<byte>.Zero;
            var highbit = Vector256.Create((byte)0x80);

            const uint64_t even_bits = 0x5555555555555555UL;
            const uint64_t odd_bits = ~even_bits;

            // for now, just work in 64-byte chunks
            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones

            // effectively the very first char is considered to follow "whitespace" for the
            // purposes of pseudo-structural character detection
            uint64_t prev_iter_ends_pseudo_pred = 1UL;
            // The main loop runs while idx < len - 64, so between 1 and 64 bytes are always left
            // for the padded tail block further down (when len > 0).
            size_t lenminus64 = len < 64 ? 0 : len - 64;
            size_t idx = 0;
            // Mask produced by the previous chunk; it is flushed to base_ptr one iteration late.
            uint64_t structurals = 0;

            // C#: assign static readonly fields to locals before the loop
            Vector256<byte> low_nibble_mask = s_low_nibble_mask;
            Vector256<byte> high_nibble_mask = s_high_nibble_mask;
            Vector256<byte> utf8ValidVec = s_utf8ValidVec;

            var structural_shufti_mask = Vector256.Create((byte)0x7);
            var whitespace_shufti_mask = Vector256.Create((byte)0x18);
            var slashVec = Vector256.Create((bytechar) '\\').AsByte();
            var ffVec = Vector128.Create((byte) 0xFF).AsUInt64();
            var doubleQuoteVec = Vector256.Create((byte)'"');
            var zeroBVec = Vector256.Create((byte) 0);
            var vec7f = Vector256.Create((byte) 0x7f);

            for (; idx < lenminus64; idx += 64)
                var input_lo = Avx.LoadVector256(buf + idx + 0);
                var input_hi = Avx.LoadVector256(buf + idx + 32);
                // TestZ is true when no byte of the chunk has its high bit (0x80) set.
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                    // it is ascii, we just check continuation
                    // NOTE(review): the statement below has unbalanced parentheses; part of the
                    // expression appears to be missing from this excerpt.
                    has_error = Avx2.Or(
                        Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(), utf8ValidVec, has_error);

                    // NOTE(review): presumably the `else` branch of the ASCII test above.
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);

                //     Step 1: detect odd sequences of backslashes
                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                //     Step 2: detect insides of quote pairs

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                // Drop quotes that sit right after an odd-length backslash run.
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

                // Flush the mask computed by the PREVIOUS iteration, which is why every offset
                // uses idx - 64. The loop body is unrolled eight times, so it may write up to
                // seven slots past next_base; @base is reset to next_base afterwards.
                // NOTE(review): this relies on base_ptr having spare capacity and on
                // trailingzeroes tolerating a zero argument -- confirm both.
                uint32_t cnt = (uint32_t) hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                    base_ptr[@base + 0] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;

                @base = next_base;

                quote_mask ^= prev_iter_inside_quote;
                prev_iter_inside_quote =
                    (uint64_t) ((int64_t) quote_mask >>
                                63); // the shift on the signed value replicates bit 63, giving all zeros or all ones (comment carried over from the C++ original: well-defined as of C++20)

                // NOTE(review): the two statements below (v_lo and v_hi) stop mid-expression in
                // this excerpt; their closing arguments are missing.
                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                // The low half is cast through uint32_t so it is zero-extended; any sign
                // extension of the high half is shifted out by the << 32 below.
                uint64_t structural_res_0 = (uint32_t) Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t) Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t) Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t) Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before a
                // pseudo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);

                //Console.WriteLine($"Iter: {idx}, satur: {structurals}");

                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;

            /// we use a giant copy-paste which is ugly.
            /// but otherwise the string needs to be properly padded or else we
            /// risk invalidating the UTF-8 checks.
            // Tail: the remaining len - idx bytes are copied into a 64-byte stack buffer that
            // is pre-filled with 0x20 (space) and then processed like a full chunk.
            if (idx < len)
                uint8_t* tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
                Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);
                // NOTE(review): `highbit` is already declared at method scope above. If this
                // block really is nested inside the method body, C# rejects the redeclaration
                // (CS0136) -- confirm against the original file.
                var highbit = Vector256.Create((byte)0x80);
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                    // it is ascii, we just check continuation
                    // NOTE(review): the statement below is truncated in this excerpt; the first
                    // argument of Avx2.Or is incomplete.
                    has_error = Avx2.Or(
                                      utf8ValidVec).AsByte(), has_error);

                    // NOTE(review): presumably the `else` branch of the ASCII test above.
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                //     Step 1: detect odd sequences of backslashes

                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;

                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                // In the tail the carry-out is not stored (the assignment is commented out);
                // the call is kept because it fills odd_carries.
                //bool iter_ends_odd_backslash =
                add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                //     Step 2: detect insides of quote pairs

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                // Unlike the main loop, this goes through ConvertToInt64 plus a cast instead of
                // ConvertToUInt64.
                uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
                quote_mask ^= prev_iter_inside_quote;

                //BUG? https://github.com/dotnet/coreclr/issues/22813
                //quote_mask = 60;
                //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

                // Flush the mask left over from the last main-loop iteration (offsets use
                // idx - 64). Same eight-way unrolled pattern as in the main loop.
                uint32_t cnt = (uint32_t)hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                    base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                @base = next_base;
                // How do we build up a user traversable data structure
                // first, do a 'shufti' to detect structural JSON characters
                // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
                // these go into the first 3 buckets of the comparison (1/2/4)

                // we are also interested in the four whitespace characters
                // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
                // these go into the next 2 buckets of the comparison (8/16)

                // NOTE(review): the two statements below (v_lo and v_hi) stop mid-expression in
                // this excerpt; their closing arguments are missing.
                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                // this additional mask and transfer is non-trivially expensive,
                // unfortunately
                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before a
                // pseudo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);
                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
                idx += 64;
            // Final flush: emit the offsets for the last mask that was computed (offsets again
            // use idx - 64 because idx has already been advanced past that chunk).
            uint32_t cnt2 = (uint32_t)hamming(structurals);
            uint32_t next_base2 = @base + cnt2;
            while (structurals != 0)
                base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                @base += 8;
            @base = next_base2;

            pj.n_structural_indexes = @base;
            // NOTE(review): both checks below read base_ptr[n_structural_indexes - 1]; when no
            // index was recorded that is slot -1 -- confirm callers never reach this with an
            // empty result.
            if (base_ptr[pj.n_structural_indexes - 1] > len)
                throw new InvalidOperationException("Internal bug");
            if (len != base_ptr[pj.n_structural_indexes - 1])
                // the string might not be NULL terminated, but we add a virtual NULL ending character. 
                base_ptr[pj.n_structural_indexes++] = (uint32_t)len;
            base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

            // True only when has_error is all zero bits.
            return Avx.TestZ(has_error, has_error);
            // NOTE(review): unreachable as written; presumably the alternative branch of a
            // conditional-compilation block that is not visible in this excerpt.
            return true;
コード例 #6
        // take input from buf and remove useless whitespace, input and output can be
        // the same, result is null terminated, return the string length (minus the null termination)
        public static size_t Minify(uint8_t *buf, size_t len, uint8_t * @out)
            if (!Avx2.IsSupported)
                throw new NotSupportedException("AVX2 is required form SimdJson");

            //C#: load const vectors once (there is no `const _m256` in C#)
            Vector256 <byte> lut_cntrl        = s_lut_cntrl;
            Vector256 <byte> low_nibble_mask  = s_low_nibble_mask;
            Vector256 <byte> high_nibble_mask = s_high_nibble_mask;

            fixed(byte *mask128_epi8 = s_mask128_epi8)
                // Useful constant masks
                const uint64_t even_bits = 0x5555555555555555UL;
                const uint64_t odd_bits  = ~even_bits;
                uint8_t *      initout   = @out;
                uint64_t       prev_iter_ends_odd_backslash =
                    0UL;                               // either 0 or 1, but a 64-bit value
                uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones
                size_t   idx = 0;

                if (len >= 64)
                    size_t avxlen = len - 63;

                    for (; idx < avxlen; idx += 64)
                        Vector256 <byte> input_lo = Avx.LoadVector256((buf + idx + 0));
                        Vector256 <byte> input_hi = Avx.LoadVector256((buf + idx + 32));
                        uint64_t         bs_bits  = cmp_mask_against_input_mini(input_lo, input_hi,
                        uint64_t start_edges     = bs_bits & ~(bs_bits << 1);
                        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                        uint64_t even_starts     = start_edges & even_start_mask;
                        uint64_t odd_starts      = start_edges & ~even_start_mask;
                        uint64_t even_carries    = bs_bits + even_starts;
                        uint64_t odd_carries;
                        bool     iter_ends_odd_backslash = add_overflow(
                            bs_bits, odd_starts, &odd_carries);
                        odd_carries |= prev_iter_ends_odd_backslash;
                        prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                        uint64_t even_carry_ends    = even_carries & ~bs_bits;
                        uint64_t odd_carry_ends     = odd_carries & ~bs_bits;
                        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                        uint64_t odd_ends           = even_start_odd_end | odd_start_even_end;
                        uint64_t quote_bits         = cmp_mask_against_input_mini(input_lo, input_hi,
                        quote_bits = quote_bits & ~odd_ends;
                        uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                                                                           Vector128.Create(quote_bits, 0UL).AsUInt64(), Vector128.Create((byte)0xFF).AsUInt64(), 0));
                        quote_mask            ^= prev_iter_inside_quote;
                        prev_iter_inside_quote =
                            (uint64_t)((int64_t)quote_mask >>
                                       63);  // might be undefined behavior, should be fully defined in C++20, ok according to John Regher from Utah University

                        Vector256 <byte> whitespace_shufti_mask = Vector256.Create((byte)0x18);
                        Vector256 <byte> v_lo = Avx2.And(
                            Avx2.Shuffle(low_nibble_mask, input_lo),
                                         Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),

                        Vector256 <byte> v_hi = Avx2.And(
                            Avx2.Shuffle(low_nibble_mask, input_hi),
                                         Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                        Vector256 <byte> tmp_ws_lo = Avx2.CompareEqual(
                            Avx2.And(v_lo, whitespace_shufti_mask), Vector256.Create((byte)0));
                        Vector256 <byte> tmp_ws_hi = Avx2.CompareEqual(
                            Avx2.And(v_hi, whitespace_shufti_mask), Vector256.Create((byte)0));

                        uint64_t ws_res_0   = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                        uint64_t ws_res_1   = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
                        whitespace &= ~quote_mask;
                        int mask1  = (int)(whitespace & 0xFFFF);
                        int mask2  = (int)((whitespace >> 16) & 0xFFFF);
                        int mask3  = (int)((whitespace >> 32) & 0xFFFF);
                        int mask4  = (int)((whitespace >> 48) & 0xFFFF);
                        int pop1   = hamming((~whitespace) & 0xFFFF);
                        int pop2   = hamming((~whitespace) & (ulong)(0xFFFFFFFF));
                        int pop3   = hamming((~whitespace) & (ulong)(0xFFFFFFFFFFFF));
                        int pop4   = hamming((~whitespace));
                        var vmask1 =
                            _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                                                (ulong *)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                        var vmask2 =
                            _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                                                (ulong *)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                        var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                        var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                        _mm256_storeu2_m128i((@out + pop1), @out, result1);
                        _mm256_storeu2_m128i((@out + pop3), (@out + pop2),
                        @out += pop4;

                // we finish off the job... copying and pasting the code is not ideal here,
                // but it gets the job done.
                if (idx < len)
                    uint8_t *buffer = stackalloc uint8_t[64];
                    memset(buffer, 0, 64);
                    memcpy(buffer, buf + idx, len - idx);
                    var      input_lo = Avx.LoadVector256((buffer));
                    var      input_hi = Avx.LoadVector256((buffer + 32));
                    uint64_t bs_bits  =
                        cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
                    uint64_t start_edges     = bs_bits & ~(bs_bits << 1);
                    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                    uint64_t even_starts     = start_edges & even_start_mask;
                    uint64_t odd_starts      = start_edges & ~even_start_mask;
                    uint64_t even_carries    = bs_bits + even_starts;
                    uint64_t odd_carries;
                    //bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);
                    odd_carries |= prev_iter_ends_odd_backslash;
                    //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
                    uint64_t even_carry_ends    = even_carries & ~bs_bits;
                    uint64_t odd_carry_ends     = odd_carries & ~bs_bits;
                    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                    uint64_t odd_ends           = even_start_odd_end | odd_start_even_end;
                    uint64_t quote_bits         =
                        cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
                    quote_bits = quote_bits & ~odd_ends;
                    uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                                                                       Vector128.Create(quote_bits, 0UL), Vector128.Create((byte)0xFF).AsUInt64(), 0));
                    quote_mask ^= prev_iter_inside_quote;
                    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore

                    Vector256 <byte> mask_20 = Vector256.Create((byte)0x20); // c==32
                    Vector256 <byte> mask_70 =
                        Vector256.Create((byte)0x70);                        // adding 0x70 does not check low 4-bits
                    // but moves any value >= 16 above 128

                    Vector256 <byte> tmp_ws_lo = Avx2.Or(
                        Avx2.CompareEqual(mask_20, input_lo),
                        Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_lo)));
                    Vector256 <byte> tmp_ws_hi = Avx2.Or(
                        Avx2.CompareEqual(mask_20, input_hi),
                        Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_hi)));
                    uint64_t ws_res_0   = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                    uint64_t ws_res_1   = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
                    whitespace &= ~quote_mask;

                    if (len - idx < 64)
                        whitespace |= ((0xFFFFFFFFFFFFFFFF) << (int)(len - idx));

                    int mask1 = (int)(whitespace & 0xFFFF);
                    int mask2 = (int)((whitespace >> 16) & 0xFFFF);
                    int mask3 = (int)((whitespace >> 32) & 0xFFFF);
                    int mask4 = (int)((whitespace >> 48) & 0xFFFF);
                    int pop1  = hamming((~whitespace) & 0xFFFF);
                    int pop2  = hamming((~whitespace) & 0xFFFFFFFF);
                    int pop3  = hamming((~whitespace) & 0xFFFFFFFFFFFF);
                    int pop4  = hamming((~whitespace));

                    var vmask1 =
                        _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                                            (ulong *)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                    var vmask2 =
                        _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                                            (ulong *)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                    var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                    var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                    _mm256_storeu2_m128i((buffer + pop1), buffer,
                    _mm256_storeu2_m128i((buffer + pop3), (buffer + pop2),
                    memcpy(@out, buffer, (size_t)pop4);
                    @out += pop4;

                *@out = (byte)'\0';  // NULL termination
                return((size_t)@out - (size_t)initout);
コード例 #7
ファイル: XXHash128.cs プロジェクト: shukenmg/Ryujinx
        /// <summary>
        /// Scrambles every 64-bit accumulator in <paramref name="acc"/>: each lane is
        /// xor-folded with itself shifted right by 47 bits, xored with the matching
        /// 64 bits of <paramref name="secret"/>, and multiplied by <c>Prime32_1</c>
        /// (result truncated to 64 bits).
        /// </summary>
        /// <param name="acc">Accumulators, updated in place. The SIMD paths touch <c>StripeLen</c> bytes of it.</param>
        /// <param name="secret">Key material. The SIMD paths read <c>StripeLen</c> bytes from its start.</param>
        /// <remarks>
        /// The SIMD paths go through raw pointers and perform no bounds checks, so both spans
        /// must cover at least <c>StripeLen</c> bytes.
        /// NOTE(review): the scalar path relies on <c>XorShift64(x, 47)</c> being
        /// <c>x ^ (x >> 47)</c> to match the SIMD paths; it is defined outside this view - confirm.
        /// </remarks>
        private static unsafe void Xxh3ScrambleAcc(Span<ulong> acc, ReadOnlySpan<byte> secret)
        {
            if (Avx2.IsSupported)
            {
                fixed (ulong* pAcc = acc)
                fixed (byte* pSecret = secret)
                {
                    Vector256<uint> prime32 = Vector256.Create(Prime32_1);
                    Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
                    Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

                    for (ulong i = 0; i < StripeLen / 32; i++)
                    {
                        // acc ^ (acc >> 47)
                        Vector256<ulong> accVec = xAcc[i];
                        Vector256<ulong> shifted = Avx2.ShiftRightLogical(accVec, 47);
                        Vector256<ulong> dataVec = Avx2.Xor(accVec, shifted);

                        // ... ^ secret
                        Vector256<byte> keyVec = xSecret[i];
                        Vector256<uint> dataKey = Avx2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());

                        // Avx2.Multiply only multiplies the even (low) 32-bit element of each
                        // 64-bit lane, so the high halves are first shuffled down into the
                        // even positions and the two partial products are recombined.
                        Vector256<uint> dataKeyHi = Avx2.Shuffle(dataKey, 0b00110001);
                        Vector256<ulong> prodLo = Avx2.Multiply(dataKey, prime32);
                        Vector256<ulong> prodHi = Avx2.Multiply(dataKeyHi, prime32);

                        xAcc[i] = Avx2.Add(prodLo, Avx2.ShiftLeftLogical(prodHi, 32));
                    }
                }
            }
            else if (Sse2.IsSupported)
            {
                fixed (ulong* pAcc = acc)
                fixed (byte* pSecret = secret)
                {
                    Vector128<uint> prime32 = Vector128.Create(Prime32_1);
                    Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
                    Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

                    for (ulong i = 0; i < StripeLen / 16; i++)
                    {
                        // Same sequence as the AVX2 path, on 128-bit vectors.
                        Vector128<ulong> accVec = xAcc[i];
                        Vector128<ulong> shifted = Sse2.ShiftRightLogical(accVec, 47);
                        Vector128<ulong> dataVec = Sse2.Xor(accVec, shifted);

                        Vector128<byte> keyVec = xSecret[i];
                        Vector128<uint> dataKey = Sse2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());

                        Vector128<uint> dataKeyHi = Sse2.Shuffle(dataKey, 0b00110001);
                        Vector128<ulong> prodLo = Sse2.Multiply(dataKey, prime32);
                        Vector128<ulong> prodHi = Sse2.Multiply(dataKeyHi, prime32);

                        xAcc[i] = Sse2.Add(prodLo, Sse2.ShiftLeftLogical(prodHi, 32));
                    }
                }
            }
            else
            {
                // Portable fallback: one 64-bit lane at a time. It must only run when
                // neither SIMD path did, otherwise the accumulators are scrambled twice.
                for (int i = 0; i < AccNb; i++)
                {
                    ulong key64 = BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
                    ulong acc64 = acc[i];
                    acc64 = XorShift64(acc64, 47);
                    acc64 ^= key64;
                    acc64 *= Prime32_1;
                    acc[i] = acc64;
                }
            }
        }
コード例 #8
ファイル: XXHash128.cs プロジェクト: shukenmg/Ryujinx
        /// <summary>
        /// Folds one stripe of <paramref name="input"/> into the accumulators: for every
        /// 64-bit lane, <c>input ^ secret</c> is split into its two 32-bit halves whose
        /// 64-bit product is added to the lane, and the raw input word is added to the
        /// neighbouring lane (<c>i ^ 1</c>).
        /// </summary>
        /// <param name="acc">Accumulators, updated in place.</param>
        /// <param name="input">Data to consume. The SIMD paths read <c>StripeLen</c> bytes from its start.</param>
        /// <param name="secret">Key material. The SIMD paths read <c>StripeLen</c> bytes from its start.</param>
        /// <remarks>
        /// The SIMD paths go through raw pointers and perform no bounds checks, so all three
        /// spans must cover at least <c>StripeLen</c> bytes.
        /// </remarks>
        private static unsafe void Xxh3Accumulate512(Span<ulong> acc, ReadOnlySpan<byte> input, ReadOnlySpan<byte> secret)
        {
            if (Avx2.IsSupported)
            {
                fixed (ulong* pAcc = acc)
                fixed (byte* pInput = input, pSecret = secret)
                {
                    Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
                    Vector256<byte>* xInput = (Vector256<byte>*)pInput;
                    Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

                    for (ulong i = 0; i < StripeLen / 32; i++)
                    {
                        Vector256<byte> dataVec = xInput[i];
                        Vector256<byte> keyVec = xSecret[i];
                        Vector256<uint> dataKey = Avx2.Xor(dataVec, keyVec).AsUInt32();

                        // Bring the high 32 bits of each 64-bit lane down to the even
                        // position so Avx2.Multiply computes low * high per lane.
                        Vector256<uint> dataKeyHi = Avx2.Shuffle(dataKey, 0b00110001);
                        Vector256<ulong> product = Avx2.Multiply(dataKey, dataKeyHi);

                        // 0b01001110 swaps the two 64-bit words inside each 128-bit half,
                        // which is the vector form of the scalar "acc[i ^ 1] += data".
                        Vector256<uint> dataSwap = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                        Vector256<ulong> sum = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                        xAcc[i] = Avx2.Add(product, sum);
                    }
                }
            }
            else if (Sse2.IsSupported)
            {
                fixed (ulong* pAcc = acc)
                fixed (byte* pInput = input, pSecret = secret)
                {
                    Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
                    Vector128<byte>* xInput = (Vector128<byte>*)pInput;
                    Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

                    for (ulong i = 0; i < StripeLen / 16; i++)
                    {
                        // Same sequence as the AVX2 path, on 128-bit vectors.
                        Vector128<byte> dataVec = xInput[i];
                        Vector128<byte> keyVec = xSecret[i];
                        Vector128<uint> dataKey = Sse2.Xor(dataVec, keyVec).AsUInt32();

                        Vector128<uint> dataKeyHi = Sse2.Shuffle(dataKey, 0b00110001);
                        Vector128<ulong> product = Sse2.Multiply(dataKey, dataKeyHi);

                        Vector128<uint> dataSwap = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                        Vector128<ulong> sum = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                        xAcc[i] = Sse2.Add(product, sum);
                    }
                }
            }
            else
            {
                // Portable fallback. It must only run when neither SIMD path did,
                // otherwise the stripe is accumulated twice.
                for (int i = 0; i < AccNb; i++)
                {
                    ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
                    ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
                    acc[i ^ 1] += dataVal;
                    acc[i] += Mult32To64((uint)dataKey, dataKey >> 32);
                }
            }
        }
コード例 #9
ファイル: Blake2bSse4.cs プロジェクト: pangfd/Spreads
 /// <summary>
 /// Rotates every 64-bit lane of <paramref name="x"/> by 32 bits, i.e. swaps the
 /// two 32-bit halves of each lane. <paramref name="x"/> itself is not modified.
 /// </summary>
 /// <param name="x">Vector whose lanes are rotated.</param>
 /// <returns>A new vector holding the rotated lanes.</returns>
 private static Vector256<ulong> ror64_32_avx(ref Vector256<ulong> x)
 {
     // Per 128-bit half the selector picks source elements 1, 0, 3, 2 for
     // destination elements 0, 1, 2, 3: each adjacent pair of dwords trades places.
     const byte swapAdjacentDwords = 0b_10_11_00_01;

     Vector256<uint> dwords = x.AsUInt32();
     Vector256<uint> swapped = Avx2.Shuffle(dwords, swapAdjacentDwords);
     return swapped.AsUInt64();
 }
コード例 #10
ファイル: Adler32.cs プロジェクト: Erior/sharpcompress
        /// <summary>
        /// Continues an Adler-32 checksum over <paramref name="buffer"/>, consuming the
        /// data 32 bytes at a time with AVX2 and delegating any remaining tail to
        /// <c>HandleLeftOver</c>.
        /// </summary>
        /// <param name="adler">Running checksum: s1 in the low 16 bits, s2 in the high 16 bits.</param>
        /// <param name="buffer">Bytes to fold into the checksum.</param>
        /// <returns>The updated checksum, s1 in the low 16 bits and s2 in the high 16 bits.</returns>
        public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
        {
            uint s1 = adler & 0xFFFF;
            uint s2 = (adler >> 16) & 0xFFFF;
            uint length = (uint)buffer.Length;

            fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
            {
                byte* localBufferPtr = bufferPtr;

                Vector256<byte> zero = Vector256<byte>.Zero;
                var dot3v = Vector256.Create((short)1);

                // Per-byte weights 32..1: the position-dependent factor each byte
                // of a 32-byte block contributes to s2.
                var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);

                // Process n blocks of data. At most NMAX data bytes can be
                // processed before s2 must be reduced modulo BASE.
                var vs1 = Vector256.CreateScalar(s1);
                var vs2 = Vector256.CreateScalar(s2);

                while (length >= 32)
                {
                    // Take at most NMAX bytes, rounded down to whole 32-byte blocks.
                    int k = length < NMAX ? (int)length : (int)NMAX;
                    k -= k % 32;
                    length -= (uint)k;

                    Vector256<uint> vs10 = vs1;
                    Vector256<uint> vs3 = Vector256<uint>.Zero;

                    while (k >= 32)
                    {
                        // Load 32 input bytes.
                        Vector256<byte> block = Avx.LoadVector256(localBufferPtr);

                        // Sum of absolute differences against zero: each 64-bit lane
                        // receives the plain sum of its 8 bytes.
                        Vector256<ushort> vs1sad = Avx2.SumAbsoluteDifferences(block, zero);

                        vs1 = Avx2.Add(vs1, vs1sad.AsUInt32());

                        // Accumulate the s1 value seen *before* this block; it is
                        // multiplied by 32 once, after the loop.
                        vs3 = Avx2.Add(vs3, vs10);

                        // Weighted sum: 32 uint8s to 16 shorts.
                        Vector256<short> vshortsum2 = Avx2.MultiplyAddAdjacent(block, dot2v);

                        // Sum 16 shorts to 8 int32s.
                        Vector256<int> vsum2 = Avx2.MultiplyAddAdjacent(vshortsum2, dot3v);

                        vs2 = Avx2.Add(vsum2.AsUInt32(), vs2);
                        vs10 = vs1;

                        // NOTE(review): the pointer advances by BlockSize while the
                        // counters use the literal 32; this assumes BlockSize == 32 - confirm.
                        localBufferPtr += BlockSize;
                        k -= 32;
                    }

                    // Defer the multiplication with 32 to outside of the loop.
                    vs3 = Avx2.ShiftLeftLogical(vs3, 5);
                    vs2 = Avx2.Add(vs2, vs3);

                    // Collapse the vectors to scalars and reduce modulo BASE before the
                    // next chunk. EvenReduceSum / ReduceSum are defined outside this view;
                    // presumably they add the even-indexed and all lanes respectively.
                    s1 = (uint)Numerics.EvenReduceSum(vs1.AsInt32());
                    s2 = (uint)Numerics.ReduceSum(vs2.AsInt32());

                    s1 %= BASE;
                    s2 %= BASE;

                    vs1 = Vector256.CreateScalar(s1);
                    vs2 = Vector256.CreateScalar(s2);
                }

                // Fewer than 32 bytes remain: finish them outside the vector loop.
                if (length > 0)
                {
                    HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
                }

                return s1 | (s2 << 16);
            }
        }
コード例 #11
            /// <summary>
            /// Multiply-accumulate kernel: for each group of 8 rows <c>i</c> and each column
            /// vector <c>j</c>, accumulates <c>s[(i + r) * B8 + k / 8][l] * t[j * B + k + l]</c>
            /// over all <c>k</c>, <c>l</c> in 64-bit lanes and stores the reduced result in
            /// <c>u[(i + r) * B8 + j]</c> for r = 0..7.
            /// </summary>
            /// <param name="s">Left operand; modified in place by the initial conditional subtraction.</param>
            /// <param name="t">Right operand; modified in place by the initial conditional subtraction.</param>
            /// <param name="u">Destination for the reduced products.</param>
            /// <remarks>
            /// NOTE(review): brace-only lines are missing from this listing, so the loop nesting
            /// described in the comments below is read off the indentation - confirm against the
            /// original file. <c>B</c>, <c>B8</c>, <c>M1</c>, <c>M2</c> and <c>Reduce</c> are defined
            /// outside this view.
            /// </remarks>
            private static void MulSimd(Span <VectorizedStaticModInt <T> > s, Span <VectorizedStaticModInt <T> > t, Span <VectorizedStaticModInt <T> > u)
                // Pre-pass: wherever a 32-bit lane compares greater than M1 (signed compare),
                // subtract M1 from it; lanes that do not are left unchanged.
                for (int i = 0; i < B * B8; i++)
                    var cmpS = Avx2.CompareGreaterThan(s[i].Value.AsInt32(), VectorizedStaticModInt <T> .M1.AsInt32()).AsUInt32();
                    var cmpT = Avx2.CompareGreaterThan(t[i].Value.AsInt32(), VectorizedStaticModInt <T> .M1.AsInt32()).AsUInt32();
                    var difS = Avx2.And(cmpS, VectorizedStaticModInt <T> .M1);
                    var difT = Avx2.And(cmpT, VectorizedStaticModInt <T> .M1);
                    s[i] = Avx2.Subtract(s[i].Value, difS);
                    t[i] = Avx2.Subtract(t[i].Value, difT);

                var m1v = VectorizedStaticModInt <T> .M1.GetElement(0);

                var m2v = VectorizedStaticModInt <T> .M2.GetElement(0);

                // Thresholds viewed as 64-bit lanes. Assuming the 8-argument constructor takes
                // its values in element order, each 64-bit lane holds m1v (resp. m2v) in its
                // upper 32 bits and zero in its lower 32 bits - TODO confirm.
                var zero = new VectorizedStaticModInt <T>().Value;
                var th1  = new VectorizedStaticModInt <T>(0, m1v, 0, m1v, 0, m1v, 0, m1v).Value.AsInt64();
                var th2  = new VectorizedStaticModInt <T>(0, m2v, 0, m2v, 0, m2v, 0, m2v).Value.AsInt64();

                for (int i = 0; i < B; i += 8)
                    for (int j = 0; j < B8; j += 1)
                        // Sixteen 64-bit accumulators: for each of the 8 rows (third digit 0..7),
                        // "02" collects products with the even 32-bit elements of t and "13"
                        // those with the odd elements.
                        Vector256 <ulong> prod0200 = default; Vector256 <ulong> prod1300 = default;
                        Vector256 <ulong> prod0210 = default; Vector256 <ulong> prod1310 = default;
                        Vector256 <ulong> prod0220 = default; Vector256 <ulong> prod1320 = default;
                        Vector256 <ulong> prod0230 = default; Vector256 <ulong> prod1330 = default;
                        Vector256 <ulong> prod0240 = default; Vector256 <ulong> prod1340 = default;
                        Vector256 <ulong> prod0250 = default; Vector256 <ulong> prod1350 = default;
                        Vector256 <ulong> prod0260 = default; Vector256 <ulong> prod1360 = default;
                        Vector256 <ulong> prod0270 = default; Vector256 <ulong> prod1370 = default;
                        for (int k = 0; k < B; k += 8)
                            for (int l = 0; l < 8; l++)
                                // Avx2.Multiply(uint, uint) only multiplies the even 32-bit element
                                // of each 64-bit lane. Shuffle 0xF5 copies elements 1 and 3 of each
                                // 128-bit half into positions 0 and 2 so the odd elements of T0 can
                                // be multiplied as well.
                                Vector256 <uint> T0 = t[j * B + k + l].Value; var T130 = Avx2.Shuffle(T0, 0xF5);
                                // For each row r: broadcast element l of s[(i + r) * B8 + k / 8],
                                // multiply it with both views of T0 and add to that row's accumulators.
                                var S00    = Vector256.Create(s[(i + 0) * B8 + k / 8].Value.GetElement(l));
                                var ST0200 = Avx2.Multiply(S00, T0);
                                var ST1300 = Avx2.Multiply(S00, T130);
                                prod0200 = Avx2.Add(prod0200, ST0200);
                                prod1300 = Avx2.Add(prod1300, ST1300);
                                var S10    = Vector256.Create(s[(i + 1) * B8 + k / 8].Value.GetElement(l));
                                var ST0210 = Avx2.Multiply(S10, T0);
                                var ST1310 = Avx2.Multiply(S10, T130);
                                prod0210 = Avx2.Add(prod0210, ST0210);
                                prod1310 = Avx2.Add(prod1310, ST1310);
                                var S20    = Vector256.Create(s[(i + 2) * B8 + k / 8].Value.GetElement(l));
                                var ST0220 = Avx2.Multiply(S20, T0);
                                var ST1320 = Avx2.Multiply(S20, T130);
                                prod0220 = Avx2.Add(prod0220, ST0220);
                                prod1320 = Avx2.Add(prod1320, ST1320);
                                var S30    = Vector256.Create(s[(i + 3) * B8 + k / 8].Value.GetElement(l));
                                var ST0230 = Avx2.Multiply(S30, T0);
                                var ST1330 = Avx2.Multiply(S30, T130);
                                prod0230 = Avx2.Add(prod0230, ST0230);
                                prod1330 = Avx2.Add(prod1330, ST1330);
                                var S40    = Vector256.Create(s[(i + 4) * B8 + k / 8].Value.GetElement(l));
                                var ST0240 = Avx2.Multiply(S40, T0);
                                var ST1340 = Avx2.Multiply(S40, T130);
                                prod0240 = Avx2.Add(prod0240, ST0240);
                                prod1340 = Avx2.Add(prod1340, ST1340);
                                var S50    = Vector256.Create(s[(i + 5) * B8 + k / 8].Value.GetElement(l));
                                var ST0250 = Avx2.Multiply(S50, T0);
                                var ST1350 = Avx2.Multiply(S50, T130);
                                prod0250 = Avx2.Add(prod0250, ST0250);
                                prod1350 = Avx2.Add(prod1350, ST1350);
                                var S60    = Vector256.Create(s[(i + 6) * B8 + k / 8].Value.GetElement(l));
                                var ST0260 = Avx2.Multiply(S60, T0);
                                var ST1360 = Avx2.Multiply(S60, T130);
                                prod0260 = Avx2.Add(prod0260, ST0260);
                                prod1360 = Avx2.Add(prod1360, ST1360);
                                var S70    = Vector256.Create(s[(i + 7) * B8 + k / 8].Value.GetElement(l));
                                var ST0270 = Avx2.Multiply(S70, T0);
                                var ST1370 = Avx2.Multiply(S70, T130);
                                prod0270 = Avx2.Add(prod0270, ST0270);
                                prod1370 = Avx2.Add(prod1370, ST1370);
                            // Overflow guard (by indentation: once per k step, after the l loop).
                            // A lane that reads as negative when viewed as a signed 64-bit value
                            // (top bit set) has th2 subtracted from it; other lanes are unchanged.
                            var cmp0200 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0200.AsInt64());
                            var cmp1300 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1300.AsInt64());
                            var dif0200 = Avx2.And(cmp0200, th2);
                            var dif1300 = Avx2.And(cmp1300, th2);
                            prod0200 = Avx2.Subtract(prod0200, dif0200.AsUInt64());
                            prod1300 = Avx2.Subtract(prod1300, dif1300.AsUInt64());
                            var cmp0210 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0210.AsInt64());
                            var cmp1310 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1310.AsInt64());
                            var dif0210 = Avx2.And(cmp0210, th2);
                            var dif1310 = Avx2.And(cmp1310, th2);
                            prod0210 = Avx2.Subtract(prod0210, dif0210.AsUInt64());
                            prod1310 = Avx2.Subtract(prod1310, dif1310.AsUInt64());
                            var cmp0220 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0220.AsInt64());
                            var cmp1320 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1320.AsInt64());
                            var dif0220 = Avx2.And(cmp0220, th2);
                            var dif1320 = Avx2.And(cmp1320, th2);
                            prod0220 = Avx2.Subtract(prod0220, dif0220.AsUInt64());
                            prod1320 = Avx2.Subtract(prod1320, dif1320.AsUInt64());
                            var cmp0230 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0230.AsInt64());
                            var cmp1330 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1330.AsInt64());
                            var dif0230 = Avx2.And(cmp0230, th2);
                            var dif1330 = Avx2.And(cmp1330, th2);
                            prod0230 = Avx2.Subtract(prod0230, dif0230.AsUInt64());
                            prod1330 = Avx2.Subtract(prod1330, dif1330.AsUInt64());
                            var cmp0240 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0240.AsInt64());
                            var cmp1340 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1340.AsInt64());
                            var dif0240 = Avx2.And(cmp0240, th2);
                            var dif1340 = Avx2.And(cmp1340, th2);
                            prod0240 = Avx2.Subtract(prod0240, dif0240.AsUInt64());
                            prod1340 = Avx2.Subtract(prod1340, dif1340.AsUInt64());
                            var cmp0250 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0250.AsInt64());
                            var cmp1350 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1350.AsInt64());
                            var dif0250 = Avx2.And(cmp0250, th2);
                            var dif1350 = Avx2.And(cmp1350, th2);
                            prod0250 = Avx2.Subtract(prod0250, dif0250.AsUInt64());
                            prod1350 = Avx2.Subtract(prod1350, dif1350.AsUInt64());
                            var cmp0260 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0260.AsInt64());
                            var cmp1360 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1360.AsInt64());
                            var dif0260 = Avx2.And(cmp0260, th2);
                            var dif1360 = Avx2.And(cmp1360, th2);
                            prod0260 = Avx2.Subtract(prod0260, dif0260.AsUInt64());
                            prod1360 = Avx2.Subtract(prod1360, dif1360.AsUInt64());
                            var cmp0270 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0270.AsInt64());
                            var cmp1370 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1370.AsInt64());
                            var dif0270 = Avx2.And(cmp0270, th2);
                            var dif1370 = Avx2.And(cmp1370, th2);
                            prod0270 = Avx2.Subtract(prod0270, dif0270.AsUInt64());
                            prod1370 = Avx2.Subtract(prod1370, dif1370.AsUInt64());

                        // Final narrowing, repeated identically for each of the 8 rows: two
                        // rounds of "subtract th1 from every lane greater than th1", then hand
                        // the even/odd accumulators to Reduce and store the result in u.
                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0200.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1300.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0200 = Avx2.Subtract(prod0200, dif02.AsUInt64());
                            prod1300 = Avx2.Subtract(prod1300, dif13.AsUInt64());
                        u[(i + 0) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0200.AsUInt32(), prod1300.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0210.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1310.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0210 = Avx2.Subtract(prod0210, dif02.AsUInt64());
                            prod1310 = Avx2.Subtract(prod1310, dif13.AsUInt64());
                        u[(i + 1) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0210.AsUInt32(), prod1310.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0220.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1320.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0220 = Avx2.Subtract(prod0220, dif02.AsUInt64());
                            prod1320 = Avx2.Subtract(prod1320, dif13.AsUInt64());
                        u[(i + 2) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0220.AsUInt32(), prod1320.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0230.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1330.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0230 = Avx2.Subtract(prod0230, dif02.AsUInt64());
                            prod1330 = Avx2.Subtract(prod1330, dif13.AsUInt64());
                        u[(i + 3) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0230.AsUInt32(), prod1330.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0240.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1340.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0240 = Avx2.Subtract(prod0240, dif02.AsUInt64());
                            prod1340 = Avx2.Subtract(prod1340, dif13.AsUInt64());
                        u[(i + 4) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0240.AsUInt32(), prod1340.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0250.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1350.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0250 = Avx2.Subtract(prod0250, dif02.AsUInt64());
                            prod1350 = Avx2.Subtract(prod1350, dif13.AsUInt64());
                        u[(i + 5) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0250.AsUInt32(), prod1350.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0260.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1360.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0260 = Avx2.Subtract(prod0260, dif02.AsUInt64());
                            prod1360 = Avx2.Subtract(prod1360, dif13.AsUInt64());
                        u[(i + 6) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0260.AsUInt32(), prod1360.AsUInt32());

                        for (int _ = 0; _ < 2; _++)
                            var cmp02 = Avx2.CompareGreaterThan(prod0270.AsInt64(), th1);
                            var cmp13 = Avx2.CompareGreaterThan(prod1370.AsInt64(), th1);
                            var dif02 = Avx2.And(cmp02, th1);
                            var dif13 = Avx2.And(cmp13, th1);
                            prod0270 = Avx2.Subtract(prod0270, dif02.AsUInt64());
                            prod1370 = Avx2.Subtract(prod1370, dif13.AsUInt64());
                        u[(i + 7) * B8 + j + 0] = VectorizedStaticModInt <T> .Reduce(prod0270.AsUInt32(), prod1370.AsUInt32());
コード例 #12
 /// <summary>
 /// Rotates every 32-bit element of <paramref name="value"/> left by
 /// <paramref name="offset"/> bits, treating the vector as eight <c>uint</c> lanes
 /// regardless of <typeparamref name="T"/>.
 /// </summary>
 /// <typeparam name="T">Element type of the vector; only its bit pattern is used.</typeparam>
 /// <param name="value">Vector to rotate.</param>
 /// <param name="offset">Rotation count in bits; taken modulo 32.</param>
 /// <returns>The rotated vector, reinterpreted back as <c>Vector256&lt;T&gt;</c>.</returns>
 public static Vector256<T> RotateLeftUInt32<T>(this Vector256<T> value, byte offset) where T : struct
 {
     // Reduce the count modulo 32. Without this, an offset >= 32 zeroes the left
     // shift and makes (32 - offset) wrap to a huge right-shift count, so the
     // result would be all zeros instead of a rotation.
     byte left = (byte)(offset & 31);

     // When left == 0 this is a right shift by 32, which the instruction defines
     // as zero, so the OR below leaves the value unchanged.
     byte right = (byte)(32 - left);

     Vector256<uint> lanes = value.AsUInt32();
     Vector256<uint> rotated = Avx2.Or(
         Avx2.ShiftLeftLogical(lanes, left),
         Avx2.ShiftRightLogical(lanes, right));

     return rotated.As<uint, T>();
 }