// Transposes a 4x4 matrix of 32-bit lanes (per 128-bit half) held in four byte vectors:
// one round of 32-bit unpack interleaves followed by one round of 64-bit unpacks.
private static void Transpose(ref Vector256<byte> x0, ref Vector256<byte> x1, ref Vector256<byte> x2, ref Vector256<byte> x3)
{
    Vector256<ulong> t0 = Avx2.UnpackHigh(x0.AsUInt32(), x1.AsUInt32()).AsUInt64();
    x0 = Avx2.UnpackLow(x0.AsUInt32(), x1.AsUInt32()).AsByte();
    Vector256<ulong> t1 = Avx2.UnpackLow(x2.AsUInt32(), x3.AsUInt32()).AsUInt64();
    x2 = Avx2.UnpackHigh(x2.AsUInt32(), x3.AsUInt32()).AsByte();
    x1 = Avx2.UnpackHigh(x0.AsUInt64(), t1).AsByte();
    x0 = Avx2.UnpackLow(x0.AsUInt64(), t1).AsByte();
    x3 = Avx2.UnpackHigh(t0, x2.AsUInt64()).AsByte();
    x2 = Avx2.UnpackLow(t0, x2.AsUInt64()).AsByte();
}
public static unsafe void Decode32Bytes(byte* source, byte* dest)
{
    Vector256<byte> maskA = Vector256.Create((uint)0x0000_003f).AsByte();
    Vector256<byte> maskB = Vector256.Create((uint)0x0000_3f00).AsByte();
    Vector256<byte> maskC = Vector256.Create((uint)0x003f_0000).AsByte();
    Vector256<byte> maskD = Vector256.Create((uint)0x3f00_0000).AsByte();
    Vector256<byte> offsets = Vector256.Create((sbyte)-32).AsByte();

    Vector256<byte> vecSource = Unsafe.As<byte, Vector256<byte>>(ref source[0]);
    // Adding -32 (0xE0) is a byte-wise subtraction of 32, mapping the ASCII alphabet down to 6-bit values.
    Vector256<byte> subtracted = Avx2.Add(vecSource, offsets);

    Vector256<byte> a = Avx2.And(subtracted, maskA);
    Vector256<byte> b = Avx2.And(subtracted, maskB);
    Vector256<byte> c = Avx2.And(subtracted, maskC);
    Vector256<byte> d = Avx2.And(subtracted, maskD);

    a = Avx2.ShiftLeftLogical(a.AsUInt32(), 18).AsByte();  // 00000000 00000000 00000000 00aaaaaa -> 00000000 aaaaaa00 00000000 00000000
    b = Avx2.ShiftLeftLogical(b.AsUInt32(), 4).AsByte();   // 00000000 00000000 00bbbbbb 00000000 -> 00000000 000000bb bbbb0000 00000000
    c = Avx2.ShiftRightLogical(c.AsUInt32(), 10).AsByte(); // 00000000 00cccccc 00000000 00000000 -> 00000000 00000000 0000cccc cc000000
    d = Avx2.ShiftRightLogical(d.AsUInt32(), 24).AsByte(); // 00dddddd 00000000 00000000 00000000 -> 00000000 00000000 00000000 00dddddd

    // After Or: 00000000 aaaaaabb bbbbcccc ccdddddd
    //           byte 3   byte 2   byte 1   byte 0
    // a uint: 0b00000000_00000000_00000000_00111111
    // b uint: 0b00000000_00000000_00111111_00000000
    // c uint: 0b00000000_00111111_00000000_00000000
    // d uint: 0b00111111_00000000_00000000_00000000
    a = Avx2.Or(a, b);
    c = Avx2.Or(c, d);
    a = Avx2.Or(a, c); // AA BB CC 00 AA BB CC 00

    // a contains: [C,B,A,0, F,E,D,0, I,H,G,0, L,K,J,0]
    // Shuffle bytes so that it becomes: [A,B,C, D,E,F, G,H,I, J,K,L, 0,0,0,0]
    var vecShuffle = Vector256.Create(
        0x02, 0x01, 0x00, 0x06, 0x05, 0x04, 0x0a, 0x09, 0x08, 0x0e, 0x0d, 0x0c, 0x80, 0x80, 0x80, 0x80,  // drops source bytes 0x03, 0x07, 0x0b, 0x0f (0x80 zeroes the destination byte)
        0x12, 0x11, 0x10, 0x16, 0x15, 0x14, 0x1a, 0x19, 0x18, 0x1e, 0x1d, 0x1c, 0x80, 0x80, 0x80, 0x80); // drops source bytes 0x13, 0x17, 0x1b, 0x1f
    var vecBytes2 = Avx2.Shuffle(a, vecShuffle);

    // Each 128-bit half holds 12 payload bytes; the second store overlaps the first by 4 bytes.
    Sse2.Store(dest, vecBytes2.GetLower());
    Sse2.Store(dest + 12, vecBytes2.GetUpper());
}
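// A minimal usage sketch for Decode32Bytes, not part of the original API: the helper
// name, validation, and buffer sizes below are illustrative. 32 encoded ASCII bytes
// decode to 24 payload bytes (4 -> 3), and because the second 16-byte store begins at
// dest + 12, the destination needs at least 28 writable bytes of slack.
public static unsafe byte[] DecodeBlock(ReadOnlySpan<byte> encoded32)
{
    if (encoded32.Length != 32)
        throw new ArgumentException("expected exactly 32 encoded bytes", nameof(encoded32));
    byte[] decoded = new byte[28]; // 24 payload bytes + 4 bytes of store slack
    fixed (byte* src = encoded32)
    fixed (byte* dst = decoded)
    {
        Decode32Bytes(src, dst);
    }
    return decoded.AsSpan(0, 24).ToArray();
}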
public static Vector256<T> Vector256Add<T>(Vector256<T> left, Vector256<T> right) where T : struct
{
    if (typeof(T) == typeof(byte))   return Avx2.Add(left.AsByte(),   right.AsByte()).As<byte, T>();
    if (typeof(T) == typeof(sbyte))  return Avx2.Add(left.AsSByte(),  right.AsSByte()).As<sbyte, T>();
    if (typeof(T) == typeof(short))  return Avx2.Add(left.AsInt16(),  right.AsInt16()).As<short, T>();
    if (typeof(T) == typeof(ushort)) return Avx2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    if (typeof(T) == typeof(int))    return Avx2.Add(left.AsInt32(),  right.AsInt32()).As<int, T>();
    if (typeof(T) == typeof(uint))   return Avx2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    if (typeof(T) == typeof(long))   return Avx2.Add(left.AsInt64(),  right.AsInt64()).As<long, T>();
    if (typeof(T) == typeof(ulong))  return Avx2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    if (typeof(T) == typeof(float))  return Avx.Add(left.AsSingle(),  right.AsSingle()).As<float, T>();
    if (typeof(T) == typeof(double)) return Avx.Add(left.AsDouble(),  right.AsDouble()).As<double, T>();
    throw new NotSupportedException();
}
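// Usage sketch for the generic dispatcher above. For value-type instantiations the
// JIT treats the typeof(T) checks as constants and folds each call down to a single
// vector add. Guard call sites with Avx2.IsSupported on hardware you do not control.
private static void Vector256AddDemo()
{
    Vector256<int> a = Vector256.Create(1);
    Vector256<int> b = Vector256.Create(41);
    Vector256<int> sum = Vector256Add(a, b); // every lane now holds 42
}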
public static unsafe void Decode32Bytes_v2_temp_upper(byte* source, byte* dest)
{
    //Vector256<byte> vecSource = Unsafe.As<byte, Vector256<byte>>(ref source[0]);
    Vector256<byte> vecSource = Unsafe.As<byte, Vector256<byte>>(ref *source);
    // Adding -32 (0xE0) is a byte-wise subtraction of 32.
    Vector256<byte> subtracted = Avx2.Add(vecSource, Vector256.Create((sbyte)-32).AsByte());

    Vector256<byte> a = Avx2.And(subtracted, Vector256.Create((uint)0x0000_003f).AsByte());
    Vector256<byte> b = Avx2.And(subtracted, Vector256.Create((uint)0x0000_3f00).AsByte());
    Vector256<byte> c = Avx2.And(subtracted, Vector256.Create((uint)0x003f_0000).AsByte());
    Vector256<byte> d = Avx2.And(subtracted, Vector256.Create((uint)0x3f00_0000).AsByte());

    a = Avx2.ShiftLeftLogical(a.AsUInt32(), 18).AsByte();  // 00000000 00000000 00000000 00aaaaaa -> 00000000 aaaaaa00 00000000 00000000
    b = Avx2.ShiftLeftLogical(b.AsUInt32(), 4).AsByte();   // 00000000 00000000 00bbbbbb 00000000 -> 00000000 000000bb bbbb0000 00000000
    c = Avx2.ShiftRightLogical(c.AsUInt32(), 10).AsByte(); // 00000000 00cccccc 00000000 00000000 -> 00000000 00000000 0000cccc cc000000
    d = Avx2.ShiftRightLogical(d.AsUInt32(), 24).AsByte(); // 00dddddd 00000000 00000000 00000000 -> 00000000 00000000 00000000 00dddddd

    // After Or: 00000000 aaaaaabb bbbbcccc ccdddddd
    //           byte 3   byte 2   byte 1   byte 0
    // a uint: 0b00000000_00000000_00000000_00111111
    // b uint: 0b00000000_00000000_00111111_00000000
    // c uint: 0b00000000_00111111_00000000_00000000
    // d uint: 0b00111111_00000000_00000000_00000000
    a = Avx2.Or(a, b);
    c = Avx2.Or(c, d);
    a = Avx2.Or(a, c); // AA BB CC 00 AA BB CC 00

    var vecBytes2 = Avx2.Shuffle(a, Vector256.Create(
        0x02, 0x01, 0x00, 0x06, 0x05, 0x04, 0x0a, 0x09, 0x08, 0x0e, 0x0d, 0x0c, 0x80, 0x80, 0x80, 0x80,   // drops source bytes 0x03, 0x07, 0x0b, 0x0f
        0x12, 0x11, 0x10, 0x16, 0x15, 0x14, 0x1a, 0x19, 0x18, 0x1e, 0x1d, 0x1c, 0x80, 0x80, 0x80, 0x80)); // drops source bytes 0x13, 0x17, 0x1b, 0x1f
    var upper = vecBytes2.GetUpper();
    Sse2.Store(dest, vecBytes2.GetLower());
    Sse2.Store(dest + 12, upper);
}
internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj)
{
    if (len > pj.bytecapacity)
    {
        Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity + " bytes but you are trying to process " + len + " bytes");
        return false;
    }

    uint32_t* base_ptr = pj.structural_indexes;
    uint32_t @base = 0;

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
    var has_error = Vector256<byte>.Zero;
    var previous = new avx_processed_utf_bytes();
    previous.rawbytes = Vector256<byte>.Zero;
    previous.high_nibbles = Vector256<byte>.Zero;
    previous.carried_continuations = Vector256<byte>.Zero;
    var highbit = Vector256.Create((byte)0x80);
#endif

    const uint64_t even_bits = 0x5555555555555555UL;
    const uint64_t odd_bits = ~even_bits;

    // for now, just work in 64-byte chunks; we have padded the input out to a
    // 64-byte multiple with the remainder being zeros

    // persistent state across the loop
    uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
    uint64_t prev_iter_inside_quote = 0UL;       // either all zeros or all ones
    // effectively the very first char is considered to follow "whitespace" for the
    // purposes of pseudo-structural character detection
    uint64_t prev_iter_ends_pseudo_pred = 1UL;
    size_t lenminus64 = len < 64 ? 0 : len - 64;
    size_t idx = 0;
    uint64_t structurals = 0;

    // C#: assign static readonly fields to locals before the loop
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;
    Vector256<byte> utf8ValidVec = s_utf8ValidVec;
    var structural_shufti_mask = Vector256.Create((byte)0x7);
    var whitespace_shufti_mask = Vector256.Create((byte)0x18);
    var slashVec = Vector256.Create((bytechar)'\\').AsByte();
    var ffVec = Vector128.Create((byte)0xFF).AsUInt64();
    var doubleQuoteVec = Vector256.Create((byte)'"');
    var zeroBVec = Vector256.Create((byte)0);
    var vec7f = Vector256.Create((byte)0x7f);

    for (; idx < lenminus64; idx += 64)
    {
        var input_lo = Avx.LoadVector256(buf + idx + 0);
        var input_hi = Avx.LoadVector256(buf + idx + 32);

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
        if (Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit))
        {
            // it is ascii, we just check continuation
            has_error = Avx2.Or(
                Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                    utf8ValidVec.AsSByte()).AsByte(),
                has_error);
        }
        else
        {
            // it is not ascii so we have to do heavy work
            previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
            previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
        }
#endif

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // flip lowest if we have an odd-length run at the end of the prior iteration
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // must record the carry-out of our odd-carries out of bit 63; this
        // indicates whether the sense of any edge going to the next iteration
        // should be flipped
        bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);

        odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                                     // if we had an odd-numbered run at the
                                                     // end of the previous iteration
        prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

        // flush the structural bits found in the previous 64-byte block (hence idx - 64)
        // while the carry-less multiply above is still in flight
        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        quote_mask ^= prev_iter_inside_quote;
        prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value is expected to be well-defined and standard compliant as of C++20; John Regehr from Utah U. says this is fine code

        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));
        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        var tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // mask off anything inside quotes
        structurals &= ~quote_mask;
        // add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering
        structurals |= quote_bits;

        // Now, establish "pseudo-structural characters". These are non-whitespace
        // characters that are (a) outside quotes and (b) have a predecessor that's
        // either whitespace or a structural character. This means that subsequent
        // passes will get a chance to encounter the first character of every string
        // of non-whitespace and, if we're parsing an atom like true/false/null or a
        // number, we can stop at the first whitespace or structural character
        // following it.

        // a qualified predecessor is something that can happen 1 position before a
        // pseudo-structural character
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // now, we've used our close quotes all we need to. So let's switch them off:
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);
        //Console.WriteLine($"Iter: {idx}, satur: {structurals}");
        //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
    }

    ////////////////
    // we use a giant copy-paste which is ugly.
    // but otherwise the string needs to be properly padded or else we
    // risk invalidating the UTF-8 checks.
    ////////////
    if (idx < len)
    {
        uint8_t* tmpbuf = stackalloc uint8_t[64];
        memset(tmpbuf, 0x20, 64); // pad the final chunk with spaces
        memcpy(tmpbuf, buf + idx, len - idx);
        Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
        Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
        if (Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit))
        {
            // it is ascii, we just check continuation
            has_error = Avx2.Or(
                Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                    utf8ValidVec.AsSByte()).AsByte(),
                has_error);
        }
        else
        {
            // it is not ascii so we have to do heavy work
            previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
            previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
        }
#endif

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // flip lowest if we have an odd-length run at the end of the prior iteration
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // the carry-out of bit 63 is not needed in this final iteration, but
        // odd_carries itself still has to be computed (and, in C#, definitely assigned)
        add_overflow(bs_bits, odd_starts, &odd_carries);

        odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                                     // if we had an odd-numbered run at the
                                                     // end of the previous iteration
        //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL; // we never use it again
        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
        quote_mask ^= prev_iter_inside_quote;
        //BUG? https://github.com/dotnet/coreclr/issues/22813
        //quote_mask = 60;
        //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value is expected to be well-defined and standard compliant as of C++20

        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        // How do we build up a user-traversable data structure?
        // first, do a 'shufti' to detect structural JSON characters:
        // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
        // these go into the first 3 buckets of the comparison (1/2/4)
        // we are also interested in the four whitespace characters:
        // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
        // these go into the next 2 buckets of the comparison (8/16)
        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));
        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        // this additional mask and transfer is non-trivially expensive, unfortunately
        var tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // mask off anything inside quotes
        structurals &= ~quote_mask;
        // add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering
        structurals |= quote_bits;

        // Now, establish "pseudo-structural characters". These are non-whitespace
        // characters that are (a) outside quotes and (b) have a predecessor that's
        // either whitespace or a structural character. This means that subsequent
        // passes will get a chance to encounter the first character of every string
        // of non-whitespace and, if we're parsing an atom like true/false/null or a
        // number, we can stop at the first whitespace or structural character
        // following it.

        // a qualified predecessor is something that can happen 1 position before a
        // pseudo-structural character
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // now, we've used our close quotes all we need to. So let's switch them off:
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);
        //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
        idx += 64;
    }

    // flush the structural bits found in the very last block
    uint32_t cnt2 = (uint32_t)hamming(structurals);
    uint32_t next_base2 = @base + cnt2;
    while (structurals != 0)
    {
        base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        @base += 8;
    }
    @base = next_base2;

    pj.n_structural_indexes = @base;
    if (base_ptr[pj.n_structural_indexes - 1] > len)
    {
        throw new InvalidOperationException("Internal bug");
    }
    if (len != base_ptr[pj.n_structural_indexes - 1])
    {
        // the string might not be NULL terminated, but we add a virtual NULL ending character
        base_ptr[pj.n_structural_indexes++] = (uint32_t)len;
    }
    base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
    return Avx.TestZ(has_error, has_error);
#else
    return true;
#endif
}
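// Scalar reference for Step 1 above (a sketch, not part of the original source): given
// a bitmask of backslash positions, return the positions immediately after odd-length
// runs of backslashes, i.e. the escaped characters. This single-chunk version ignores
// the cross-chunk carry that prev_iter_ends_odd_backslash handles in the real code.
private static ulong OddBackslashRunEnds(ulong bsBits)
{
    const ulong evenBits = 0x5555555555555555UL;
    const ulong oddBits = ~evenBits;
    ulong startEdges = bsBits & ~(bsBits << 1);            // first backslash of every run
    ulong evenStarts = startEdges & evenBits;              // runs starting at even bit positions
    ulong oddStarts = startEdges & oddBits;                // runs starting at odd bit positions
    ulong evenCarryEnds = (bsBits + evenStarts) & ~bsBits; // bit just past each even-start run
    ulong oddCarryEnds = (bsBits + oddStarts) & ~bsBits;   // bit just past each odd-start run
    // a run has odd length iff its start and its one-past-the-end bit differ in parity
    return (evenCarryEnds & oddBits) | (oddCarryEnds & evenBits);
}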
// Take input from buf and remove useless whitespace; input and output can be the
// same buffer. The result is NULL-terminated; returns the string length (minus the
// NULL termination).
public static size_t Minify(uint8_t* buf, size_t len, uint8_t* @out)
{
    if (!Avx2.IsSupported)
    {
        throw new NotSupportedException("AVX2 is required for SimdJson");
    }

    //C#: load const vectors once (there is no `const __m256` in C#)
    Vector256<byte> lut_cntrl = s_lut_cntrl;
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;

    fixed (byte* mask128_epi8 = s_mask128_epi8)
    {
        // Useful constant masks
        const uint64_t even_bits = 0x5555555555555555UL;
        const uint64_t odd_bits = ~even_bits;
        uint8_t* initout = @out;
        uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
        uint64_t prev_iter_inside_quote = 0UL;       // either all zeros or all ones
        size_t idx = 0;

        if (len >= 64)
        {
            size_t avxlen = len - 63;
            for (; idx < avxlen; idx += 64)
            {
                Vector256<byte> input_lo = Avx.LoadVector256(buf + idx + 0);
                Vector256<byte> input_hi = Avx.LoadVector256(buf + idx + 32);
                uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);
                odd_carries |= prev_iter_ends_odd_backslash;
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
                uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL).AsUInt64(),
                    Vector128.Create((byte)0xFF).AsUInt64(), 0));
                quote_mask ^= prev_iter_inside_quote;
                prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value; should be fully defined in C++20, ok according to John Regehr from Utah University

                Vector256<byte> whitespace_shufti_mask = Vector256.Create((byte)0x18);
                Vector256<byte> v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), Vector256.Create((byte)0x7f))));
                Vector256<byte> v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), Vector256.Create((byte)0x7f))));
                Vector256<byte> tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), Vector256.Create((byte)0));
                Vector256<byte> tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), Vector256.Create((byte)0));

                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
                whitespace &= ~quote_mask;

                // compress the whitespace out, 16 bytes per lane, via shuffles looked
                // up in the mask table (indexed by the low 15 bits of each 16-bit mask)
                int mask1 = (int)(whitespace & 0xFFFF);
                int mask2 = (int)((whitespace >> 16) & 0xFFFF);
                int mask3 = (int)((whitespace >> 32) & 0xFFFF);
                int mask4 = (int)((whitespace >> 48) & 0xFFFF);
                int pop1 = hamming((~whitespace) & 0xFFFF);
                int pop2 = hamming((~whitespace) & (ulong)0xFFFFFFFF);
                int pop3 = hamming((~whitespace) & (ulong)0xFFFFFFFFFFFF);
                int pop4 = hamming(~whitespace);
                var vmask1 = _mm256_loadu2_m128i(
                    (ulong*)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                    (ulong*)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                var vmask2 = _mm256_loadu2_m128i(
                    (ulong*)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                    (ulong*)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                _mm256_storeu2_m128i(@out + pop1, @out, result1);
                _mm256_storeu2_m128i(@out + pop3, @out + pop2, result2);
                @out += pop4;
            }
        }

        // we finish off the job... copying and pasting the code is not ideal here,
        // but it gets the job done.
        if (idx < len)
        {
            uint8_t* buffer = stackalloc uint8_t[64];
            memset(buffer, 0, 64);
            memcpy(buffer, buf + idx, len - idx);
            var input_lo = Avx.LoadVector256(buffer);
            var input_hi = Avx.LoadVector256(buffer + 32);
            uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
            uint64_t start_edges = bs_bits & ~(bs_bits << 1);
            uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
            uint64_t even_starts = start_edges & even_start_mask;
            uint64_t odd_starts = start_edges & ~even_start_mask;
            uint64_t even_carries = bs_bits + even_starts;
            uint64_t odd_carries;
            // the carry-out is not needed in this final iteration, but odd_carries
            // itself still has to be computed (and, in C#, definitely assigned)
            add_overflow(bs_bits, odd_starts, &odd_carries);
            odd_carries |= prev_iter_ends_odd_backslash;
            //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL; // we never use it
            uint64_t even_carry_ends = even_carries & ~bs_bits;
            uint64_t odd_carry_ends = odd_carries & ~bs_bits;
            uint64_t even_start_odd_end = even_carry_ends & odd_bits;
            uint64_t odd_start_even_end = odd_carry_ends & even_bits;
            uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
            uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
            quote_bits = quote_bits & ~odd_ends;
            uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                Vector128.Create(quote_bits, 0UL),
                Vector128.Create((byte)0xFF).AsUInt64(), 0));
            quote_mask ^= prev_iter_inside_quote;
            // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // we don't need this anymore

            Vector256<byte> mask_20 = Vector256.Create((byte)0x20); // c == 32
            Vector256<byte> mask_70 = Vector256.Create((byte)0x70); // adding 0x70 does not check the low 4 bits,
                                                                    // but moves any value >= 16 above 128
            Vector256<byte> tmp_ws_lo = Avx2.Or(
                Avx2.CompareEqual(mask_20, input_lo),
                Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_lo)));
            Vector256<byte> tmp_ws_hi = Avx2.Or(
                Avx2.CompareEqual(mask_20, input_hi),
                Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_hi)));

            uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
            uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
            // unlike the shufti path above, these bits are already set at whitespace
            // positions, so there is no inversion here
            uint64_t whitespace = ws_res_0 | (ws_res_1 << 32);
            whitespace &= ~quote_mask;
            if (len - idx < 64)
            {
                // treat the padding bytes past the end of the input as whitespace
                whitespace |= 0xFFFFFFFFFFFFFFFF << (int)(len - idx);
            }
            int mask1 = (int)(whitespace & 0xFFFF);
            int mask2 = (int)((whitespace >> 16) & 0xFFFF);
            int mask3 = (int)((whitespace >> 32) & 0xFFFF);
            int mask4 = (int)((whitespace >> 48) & 0xFFFF);
            int pop1 = hamming((~whitespace) & 0xFFFF);
            int pop2 = hamming((~whitespace) & 0xFFFFFFFF);
            int pop3 = hamming((~whitespace) & 0xFFFFFFFFFFFF);
            int pop4 = hamming(~whitespace);
            var vmask1 = _mm256_loadu2_m128i(
                (ulong*)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                (ulong*)mask128_epi8 + (mask1 & 0x7FFF) * 2);
            var vmask2 = _mm256_loadu2_m128i(
                (ulong*)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                (ulong*)mask128_epi8 + (mask3 & 0x7FFF) * 2);
            var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
            var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
            _mm256_storeu2_m128i(buffer + pop1, buffer, result1);
            _mm256_storeu2_m128i(buffer + pop3, buffer + pop2, result2);
            memcpy(@out, buffer, (size_t)pop4);
            @out += pop4;
        }

        *@out = (byte)'\0'; // NULL termination
        return (size_t)@out - (size_t)initout;
    }
}
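// A hedged harness for Minify, assuming the file's uint8_t/size_t aliases; the helper
// name is illustrative, not part of the original API. Minified output is never longer
// than the input, plus one byte for the NULL terminator Minify writes.
public static unsafe string MinifyString(string json)
{
    byte[] utf8 = System.Text.Encoding.UTF8.GetBytes(json);
    byte[] output = new byte[utf8.Length + 1];
    fixed (byte* inPtr = utf8)
    fixed (byte* outPtr = output)
    {
        size_t written = Minify(inPtr, (size_t)utf8.Length, outPtr);
        return System.Text.Encoding.UTF8.GetString(outPtr, (int)written);
    }
}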
private static unsafe void Xxh3ScrambleAcc(Span<ulong> acc, ReadOnlySpan<byte> secret)
{
    if (Avx2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        fixed (byte* pSecret = secret)
        {
            Vector256<uint> prime32 = Vector256.Create(Prime32_1);
            Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
            Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

            for (ulong i = 0; i < StripeLen / 32; i++)
            {
                // acc ^= acc >> 47
                Vector256<ulong> accVec = xAcc[i];
                Vector256<ulong> shifted = Avx2.ShiftRightLogical(accVec, 47);
                Vector256<ulong> dataVec = Avx2.Xor(accVec, shifted);
                // acc ^= secret
                Vector256<byte> keyVec = xSecret[i];
                Vector256<uint> dataKey = Avx2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());
                // acc *= Prime32_1: a 64x32 multiply assembled from two 32x32->64 multiplies
                Vector256<uint> dataKeyHi = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                Vector256<ulong> prodLo = Avx2.Multiply(dataKey, prime32);
                Vector256<ulong> prodHi = Avx2.Multiply(dataKeyHi, prime32);
                xAcc[i] = Avx2.Add(prodLo, Avx2.ShiftLeftLogical(prodHi, 32));
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        fixed (byte* pSecret = secret)
        {
            Vector128<uint> prime32 = Vector128.Create(Prime32_1);
            Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
            Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

            for (ulong i = 0; i < StripeLen / 16; i++)
            {
                Vector128<ulong> accVec = xAcc[i];
                Vector128<ulong> shifted = Sse2.ShiftRightLogical(accVec, 47);
                Vector128<ulong> dataVec = Sse2.Xor(accVec, shifted);
                Vector128<byte> keyVec = xSecret[i];
                Vector128<uint> dataKey = Sse2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());
                Vector128<uint> dataKeyHi = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                Vector128<ulong> prodLo = Sse2.Multiply(dataKey, prime32);
                Vector128<ulong> prodHi = Sse2.Multiply(dataKeyHi, prime32);
                xAcc[i] = Sse2.Add(prodLo, Sse2.ShiftLeftLogical(prodHi, 32));
            }
        }
    }
    else
    {
        // scalar fallback
        for (int i = 0; i < AccNb; i++)
        {
            ulong key64 = BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
            ulong acc64 = acc[i];
            acc64 = XorShift64(acc64, 47);
            acc64 ^= key64;
            acc64 *= Prime32_1;
            acc[i] = acc64;
        }
    }
}
private static unsafe void Xxh3Accumulate512(Span<ulong> acc, ReadOnlySpan<byte> input, ReadOnlySpan<byte> secret)
{
    if (Avx2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        fixed (byte* pInput = input, pSecret = secret)
        {
            Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
            Vector256<byte>* xInput = (Vector256<byte>*)pInput;
            Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

            for (ulong i = 0; i < StripeLen / 32; i++)
            {
                Vector256<byte> dataVec = xInput[i];
                Vector256<byte> keyVec = xSecret[i];
                Vector256<byte> dataKey = Avx2.Xor(dataVec, keyVec);
                // swap the 32-bit halves of each 64-bit lane so the multiply below
                // sees the low half of dataKey paired with its high half
                Vector256<uint> dataKeyLo = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                Vector256<ulong> product = Avx2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                // acc += swap(data) + product
                Vector256<uint> dataSwap = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                Vector256<ulong> sum = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                xAcc[i] = Avx2.Add(product, sum);
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        fixed (byte* pInput = input, pSecret = secret)
        {
            Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
            Vector128<byte>* xInput = (Vector128<byte>*)pInput;
            Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

            for (ulong i = 0; i < StripeLen / 16; i++)
            {
                Vector128<byte> dataVec = xInput[i];
                Vector128<byte> keyVec = xSecret[i];
                Vector128<byte> dataKey = Sse2.Xor(dataVec, keyVec);
                Vector128<uint> dataKeyLo = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                Vector128<ulong> product = Sse2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                Vector128<uint> dataSwap = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                Vector128<ulong> sum = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                xAcc[i] = Sse2.Add(product, sum);
            }
        }
    }
    else
    {
        // scalar fallback, matching the XXH3 reference: acc[i ^ 1] gets the unmixed lane
        for (int i = 0; i < AccNb; i++)
        {
            ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
            ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
            acc[i ^ 1] += dataVal;
            acc[i] += Mult32To64((uint)dataKey, dataKey >> 32);
        }
    }
}
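// Note on the vector paths above: Avx2.Multiply / Sse2.Multiply over uint lanes map to
// vpmuludq / pmuludq, which multiply only the even (low) 32-bit lane of each 64-bit
// element into a full 64-bit product. The 0b00110001 shuffle copies the odd lanes into
// even positions so both halves of every 64-bit word get multiplied together, which is
// exactly what the scalar fallback computes per lane (a sketch, not original code):
private static ulong MulLowHalvesDemo(ulong dataKey)
{
    uint lo = (uint)dataKey;         // even lane, fed to the multiply directly
    uint hi = (uint)(dataKey >> 32); // odd lane, exposed by the shuffle
    return (ulong)lo * hi;           // 32x32 -> 64 product, as in Mult32To64
}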
private static Vector256<ulong> ror64_32_avx(ref Vector256<ulong> x)
    => Avx2.Shuffle(x.AsUInt32(), 0b_10_11_00_01).AsUInt64();
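// Rotating a 64-bit lane by exactly 32 bits just swaps its two 32-bit halves, so one
// vpshufd with control 0b_10_11_00_01 (swap adjacent 32-bit pairs) replaces the usual
// two-shifts-plus-or sequence. Scalar equivalent for a single lane, for reference:
private static ulong Ror64By32(ulong x) => (x >> 32) | (x << 32);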
public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;
    uint length = (uint)buffer.Length;

    fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
    {
        byte* localBufferPtr = bufferPtr;

        Vector256<byte> zero = Vector256<byte>.Zero;
        var dot3v = Vector256.Create((short)1);
        var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);

        // Process n blocks of data. At most NMAX data bytes can be
        // processed before s2 must be reduced modulo BASE.
        var vs1 = Vector256.CreateScalar(s1);
        var vs2 = Vector256.CreateScalar(s2);

        while (length >= 32)
        {
            int k = length < NMAX ? (int)length : (int)NMAX;
            k -= k % 32;
            length -= (uint)k;

            Vector256<uint> vs10 = vs1;
            Vector256<uint> vs3 = Vector256<uint>.Zero;

            while (k >= 32)
            {
                // Load 32 input bytes.
                Vector256<byte> block = Avx.LoadVector256(localBufferPtr);

                // Sum of abs diff, resulting in 2 x int32's
                Vector256<ushort> vs1sad = Avx2.SumAbsoluteDifferences(block, zero);
                vs1 = Avx2.Add(vs1, vs1sad.AsUInt32());
                vs3 = Avx2.Add(vs3, vs10);

                // sum 32 uint8s to 16 shorts.
                Vector256<short> vshortsum2 = Avx2.MultiplyAddAdjacent(block, dot2v);

                // sum 16 shorts to 8 uint32s.
                Vector256<int> vsum2 = Avx2.MultiplyAddAdjacent(vshortsum2, dot3v);

                vs2 = Avx2.Add(vsum2.AsUInt32(), vs2);
                vs10 = vs1;

                localBufferPtr += BlockSize;
                k -= 32;
            }

            // Defer the multiplication with 32 to outside of the loop.
            vs3 = Avx2.ShiftLeftLogical(vs3, 5);
            vs2 = Avx2.Add(vs2, vs3);

            s1 = (uint)Numerics.EvenReduceSum(vs1.AsInt32());
            s2 = (uint)Numerics.ReduceSum(vs2.AsInt32());

            s1 %= BASE;
            s2 %= BASE;

            vs1 = Vector256.CreateScalar(s1);
            vs2 = Vector256.CreateScalar(s2);
        }

        if (length > 0)
        {
            HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
        }

        return s1 | (s2 << 16);
    }
}
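// Usage sketch for the AVX2 Adler-32 kernel. BASE (65521, the largest prime below
// 2^16) and NMAX (5552) are the standard zlib constants assumed to be defined on the
// surrounding type; check Avx2.IsSupported before taking this path.
private static uint Adler32Demo(byte[] data)
{
    const uint InitialAdler = 1; // Adler-32 starts with s1 = 1, s2 = 0
    return CalculateAvx2(InitialAdler, data);
}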
private static void MulSimd(Span<VectorizedStaticModInt<T>> s, Span<VectorizedStaticModInt<T>> t, Span<VectorizedStaticModInt<T>> u)
{
    // conditionally subtract M1 from any lane that exceeds it
    for (int i = 0; i < B * B8; i++)
    {
        var cmpS = Avx2.CompareGreaterThan(s[i].Value.AsInt32(), VectorizedStaticModInt<T>.M1.AsInt32()).AsUInt32();
        var cmpT = Avx2.CompareGreaterThan(t[i].Value.AsInt32(), VectorizedStaticModInt<T>.M1.AsInt32()).AsUInt32();
        var difS = Avx2.And(cmpS, VectorizedStaticModInt<T>.M1);
        var difT = Avx2.And(cmpT, VectorizedStaticModInt<T>.M1);
        s[i] = Avx2.Subtract(s[i].Value, difS);
        t[i] = Avx2.Subtract(t[i].Value, difT);
    }

    var m1v = VectorizedStaticModInt<T>.M1.GetElement(0);
    var m2v = VectorizedStaticModInt<T>.M2.GetElement(0);
    var zero = new VectorizedStaticModInt<T>().Value;
    var th1 = new VectorizedStaticModInt<T>(0, m1v, 0, m1v, 0, m1v, 0, m1v).Value.AsInt64();
    var th2 = new VectorizedStaticModInt<T>(0, m2v, 0, m2v, 0, m2v, 0, m2v).Value.AsInt64();

    for (int i = 0; i < B; i += 8)
    {
        for (int j = 0; j < B8; j += 1)
        {
            // prod02x0 accumulates products of the even 32-bit lanes, prod13x0 the
            // odd lanes, for the eight output rows i+0 .. i+7
            Vector256<ulong> prod0200 = default, prod1300 = default;
            Vector256<ulong> prod0210 = default, prod1310 = default;
            Vector256<ulong> prod0220 = default, prod1320 = default;
            Vector256<ulong> prod0230 = default, prod1330 = default;
            Vector256<ulong> prod0240 = default, prod1340 = default;
            Vector256<ulong> prod0250 = default, prod1350 = default;
            Vector256<ulong> prod0260 = default, prod1360 = default;
            Vector256<ulong> prod0270 = default, prod1370 = default;

            for (int k = 0; k < B; k += 8)
            {
                for (int l = 0; l < 8; l++)
                {
                    Vector256<uint> T0 = t[j * B + k + l].Value;
                    var T130 = Avx2.Shuffle(T0, 0xF5); // odd lanes of T0, duplicated into even positions

                    var S00 = Vector256.Create(s[(i + 0) * B8 + k / 8].Value.GetElement(l));
                    prod0200 = Avx2.Add(prod0200, Avx2.Multiply(S00, T0));
                    prod1300 = Avx2.Add(prod1300, Avx2.Multiply(S00, T130));
                    var S10 = Vector256.Create(s[(i + 1) * B8 + k / 8].Value.GetElement(l));
                    prod0210 = Avx2.Add(prod0210, Avx2.Multiply(S10, T0));
                    prod1310 = Avx2.Add(prod1310, Avx2.Multiply(S10, T130));
                    var S20 = Vector256.Create(s[(i + 2) * B8 + k / 8].Value.GetElement(l));
                    prod0220 = Avx2.Add(prod0220, Avx2.Multiply(S20, T0));
                    prod1320 = Avx2.Add(prod1320, Avx2.Multiply(S20, T130));
                    var S30 = Vector256.Create(s[(i + 3) * B8 + k / 8].Value.GetElement(l));
                    prod0230 = Avx2.Add(prod0230, Avx2.Multiply(S30, T0));
                    prod1330 = Avx2.Add(prod1330, Avx2.Multiply(S30, T130));
                    var S40 = Vector256.Create(s[(i + 4) * B8 + k / 8].Value.GetElement(l));
                    prod0240 = Avx2.Add(prod0240, Avx2.Multiply(S40, T0));
                    prod1340 = Avx2.Add(prod1340, Avx2.Multiply(S40, T130));
                    var S50 = Vector256.Create(s[(i + 5) * B8 + k / 8].Value.GetElement(l));
                    prod0250 = Avx2.Add(prod0250, Avx2.Multiply(S50, T0));
                    prod1350 = Avx2.Add(prod1350, Avx2.Multiply(S50, T130));
                    var S60 = Vector256.Create(s[(i + 6) * B8 + k / 8].Value.GetElement(l));
                    prod0260 = Avx2.Add(prod0260, Avx2.Multiply(S60, T0));
                    prod1360 = Avx2.Add(prod1360, Avx2.Multiply(S60, T130));
                    var S70 = Vector256.Create(s[(i + 7) * B8 + k / 8].Value.GetElement(l));
                    prod0270 = Avx2.Add(prod0270, Avx2.Multiply(S70, T0));
                    prod1370 = Avx2.Add(prod1370, Avx2.Multiply(S70, T130));
                }

                // once per k-block: wherever an accumulator is negative as a signed
                // 64-bit value, subtract the masked th2 to pull it back into range
                var cmp0200 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0200.AsInt64()); var cmp1300 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1300.AsInt64());
                prod0200 = Avx2.Subtract(prod0200, Avx2.And(cmp0200, th2).AsUInt64()); prod1300 = Avx2.Subtract(prod1300, Avx2.And(cmp1300, th2).AsUInt64());
                var cmp0210 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0210.AsInt64()); var cmp1310 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1310.AsInt64());
                prod0210 = Avx2.Subtract(prod0210, Avx2.And(cmp0210, th2).AsUInt64()); prod1310 = Avx2.Subtract(prod1310, Avx2.And(cmp1310, th2).AsUInt64());
                var cmp0220 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0220.AsInt64()); var cmp1320 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1320.AsInt64());
                prod0220 = Avx2.Subtract(prod0220, Avx2.And(cmp0220, th2).AsUInt64()); prod1320 = Avx2.Subtract(prod1320, Avx2.And(cmp1320, th2).AsUInt64());
                var cmp0230 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0230.AsInt64()); var cmp1330 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1330.AsInt64());
                prod0230 = Avx2.Subtract(prod0230, Avx2.And(cmp0230, th2).AsUInt64()); prod1330 = Avx2.Subtract(prod1330, Avx2.And(cmp1330, th2).AsUInt64());
                var cmp0240 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0240.AsInt64()); var cmp1340 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1340.AsInt64());
                prod0240 = Avx2.Subtract(prod0240, Avx2.And(cmp0240, th2).AsUInt64()); prod1340 = Avx2.Subtract(prod1340, Avx2.And(cmp1340, th2).AsUInt64());
                var cmp0250 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0250.AsInt64()); var cmp1350 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1350.AsInt64());
                prod0250 = Avx2.Subtract(prod0250, Avx2.And(cmp0250, th2).AsUInt64()); prod1350 = Avx2.Subtract(prod1350, Avx2.And(cmp1350, th2).AsUInt64());
                var cmp0260 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0260.AsInt64()); var cmp1360 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1360.AsInt64());
                prod0260 = Avx2.Subtract(prod0260, Avx2.And(cmp0260, th2).AsUInt64()); prod1360 = Avx2.Subtract(prod1360, Avx2.And(cmp1360, th2).AsUInt64());
                var cmp0270 = Avx2.CompareGreaterThan(zero.AsInt64(), prod0270.AsInt64()); var cmp1370 = Avx2.CompareGreaterThan(zero.AsInt64(), prod1370.AsInt64());
                prod0270 = Avx2.Subtract(prod0270, Avx2.And(cmp0270, th2).AsUInt64()); prod1370 = Avx2.Subtract(prod1370, Avx2.And(cmp1370, th2).AsUInt64());
            }

            // fold each accumulator pair below th1 (two conditional subtractions), then reduce
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0200.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1300.AsInt64(), th1);
                prod0200 = Avx2.Subtract(prod0200, Avx2.And(cmp02, th1).AsUInt64()); prod1300 = Avx2.Subtract(prod1300, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 0) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0200.AsUInt32(), prod1300.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0210.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1310.AsInt64(), th1);
                prod0210 = Avx2.Subtract(prod0210, Avx2.And(cmp02, th1).AsUInt64()); prod1310 = Avx2.Subtract(prod1310, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 1) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0210.AsUInt32(), prod1310.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0220.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1320.AsInt64(), th1);
                prod0220 = Avx2.Subtract(prod0220, Avx2.And(cmp02, th1).AsUInt64()); prod1320 = Avx2.Subtract(prod1320, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 2) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0220.AsUInt32(), prod1320.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0230.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1330.AsInt64(), th1);
                prod0230 = Avx2.Subtract(prod0230, Avx2.And(cmp02, th1).AsUInt64()); prod1330 = Avx2.Subtract(prod1330, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 3) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0230.AsUInt32(), prod1330.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0240.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1340.AsInt64(), th1);
                prod0240 = Avx2.Subtract(prod0240, Avx2.And(cmp02, th1).AsUInt64()); prod1340 = Avx2.Subtract(prod1340, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 4) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0240.AsUInt32(), prod1340.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0250.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1350.AsInt64(), th1);
                prod0250 = Avx2.Subtract(prod0250, Avx2.And(cmp02, th1).AsUInt64()); prod1350 = Avx2.Subtract(prod1350, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 5) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0250.AsUInt32(), prod1350.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0260.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1360.AsInt64(), th1);
                prod0260 = Avx2.Subtract(prod0260, Avx2.And(cmp02, th1).AsUInt64()); prod1360 = Avx2.Subtract(prod1360, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 6) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0260.AsUInt32(), prod1360.AsUInt32());
            for (int _ = 0; _ < 2; _++)
            {
                var cmp02 = Avx2.CompareGreaterThan(prod0270.AsInt64(), th1); var cmp13 = Avx2.CompareGreaterThan(prod1370.AsInt64(), th1);
                prod0270 = Avx2.Subtract(prod0270, Avx2.And(cmp02, th1).AsUInt64()); prod1370 = Avx2.Subtract(prod1370, Avx2.And(cmp13, th1).AsUInt64());
            }
            u[(i + 7) * B8 + j + 0] = VectorizedStaticModInt<T>.Reduce(prod0270.AsUInt32(), prod1370.AsUInt32());
        }
    }
}
public static Vector256<T> RotateLeftUInt32<T>(this Vector256<T> value, byte offset) where T : struct
{
    return Avx2.Or(
        Avx2.ShiftLeftLogical(value.AsUInt32(), offset),
        Avx2.ShiftRightLogical(value.AsUInt32(), (byte)(32 - offset))).As<uint, T>();
}
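// Usage sketch: rotate every 32-bit lane left by 7 bits, as in a ChaCha20 quarter
// round. Without AVX-512 (vprold) a lane rotate costs the two shifts plus an or that
// the helper above emits.
private static Vector256<uint> Rotl7(Vector256<uint> v) => v.RotateLeftUInt32(7);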