public override ulong Run(CancellationToken cancellationToken)
{
    // Without PCLMULQDQ there is nothing to measure.
    if (!Pclmulqdq.IsSupported)
    {
        return 0uL;
    }

    // Two copies of the operand, and a destination buffer seeded with ones.
    var source = new Span<long>(new[] { randomInt, randomInt });
    var destination = new Span<long>(Enumerable.Repeat(1L, 2).ToArray());
    var iterations = 0uL;

    unsafe
    {
        fixed (long* pDestination = destination)
        fixed (long* pSource = source)
        {
            var multiplier = Sse2.LoadVector128(pSource);
            var accumulator = Sse2.LoadVector128(pDestination);

            while (!cancellationToken.IsCancellationRequested)
            {
                // Hot loop: LENGTH dependent back-to-back carry-less multiplies.
                for (var i = 0; i < LENGTH; i++)
                {
                    accumulator = Pclmulqdq.CarrylessMultiply(accumulator, multiplier, 0b00);
                }

                Sse2.Store(pDestination, accumulator);
                iterations++;
            }
        }
    }

    return iterations;
}
// InitPowersTable writes powers 1..size of hashKey to htbl.
// Each 16-byte slot i receives hashKey^(i+1) in GF(2^128): a 128x128-bit
// carry-less multiply followed by a two-round folding reduction using the
// 0xc2000000... constant (the usual PCLMULQDQ-based GHASH/POLYVAL reduction
// constant — presumably this is part of an AES-GCM(-SIV) implementation).
private static void InitPowersTable(byte *htbl, int size, byte *hashKey)
{
    Vector128<ulong> tmp1, tmp2, tmp3, tmp4;

    // Reduction constant for folding the 256-bit product back to 128 bits.
    var poly = Sse.StaticCast<uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));

    var t = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(hashKey));
    var h = t; // h stays fixed at hashKey^1; t accumulates successive powers.

    // Slot 0 holds hashKey itself.
    Sse2.Store(htbl, Sse.StaticCast<ulong, byte>(t));

    for (int i = 1; i < size; ++i)
    {
        // 128x128 -> 256-bit carry-less product t*h via four 64x64 multiplies.
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00); // t.lo * h.lo
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11); // t.hi * h.hi
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10); // t.lo * h.hi
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01); // t.hi * h.lo

        // Combine the two middle partial products and split them across the
        // low (tmp1) and high (tmp4) halves of the 256-bit result.
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp3, tmp1);
        tmp4 = Sse2.Xor(tmp4, tmp2);

        // Two folding rounds reduce the low half modulo the polynomial;
        // the 78 shuffle (0b01001110) swaps the two 64-bit lanes.
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast<uint, ulong>(Sse2.Shuffle(Sse.StaticCast<ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp3, tmp2);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast<uint, ulong>(Sse2.Shuffle(Sse.StaticCast<ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp3, tmp2);

        // t = t * h = hashKey^(i+1); store it in slot i.
        t = Sse2.Xor(tmp4, tmp1);
        Sse2.Store(&htbl[i * 16], Sse.StaticCast<ulong, byte>(t));
    }
}
public void RunStructFldScenario(PclmulqdqOpTest__CarrylessMultiplyInt641 testClass)
{
    // Validate the intrinsic when the operands come from this struct's fields.
    Unsafe.Write(
        testClass._dataTable.outArrayPtr,
        Pclmulqdq.CarrylessMultiply(_fld1, _fld2, 1));

    testClass.ValidateResult(testClass._dataTable.outArrayPtr);
}
// GFMul folds the 16-byte block `x` into the running state (_buffer) and
// multiplies the result by the hash key (_key) in GF(2^128), storing the
// reduced product back into _buffer. The shift/XOR tail matches the classic
// PCLMULQDQ GHASH multiply-and-reduce sequence.
private unsafe void GFMul(ReadOnlySpan<byte> x)
{
    var a = _key.AsUInt64();
    Vector128<ulong> b;

    // Load the input block and byte-reverse it (GHASH bit ordering).
    fixed(byte *p = x)
    {
        var t = Sse2.LoadVector128(p);
        b = t.ReverseEndianness128().AsUInt64();
    }

    // Fold the block into the accumulated state before multiplying.
    b = Sse2.Xor(b.AsByte(), _buffer).AsUInt64();

    // 128x128 -> 256-bit carry-less product a*b via four 64x64 multiplies.
    var tmp3 = Pclmulqdq.CarrylessMultiply(a, b, 0x00).AsUInt32(); // a.lo * b.lo
    var tmp4 = Pclmulqdq.CarrylessMultiply(a, b, 0x10).AsUInt32(); // a.lo * b.hi
    var tmp5 = Pclmulqdq.CarrylessMultiply(a, b, 0x01).AsUInt32(); // a.hi * b.lo
    var tmp6 = Pclmulqdq.CarrylessMultiply(a, b, 0x11).AsUInt32(); // a.hi * b.hi

    // Merge the middle partial products into the low (tmp3) / high (tmp6) halves.
    tmp4 = Sse2.Xor(tmp4, tmp5);
    tmp5 = Sse2.ShiftLeftLogical128BitLane(tmp4, 8);
    tmp4 = Sse2.ShiftRightLogical128BitLane(tmp4, 8);
    tmp3 = Sse2.Xor(tmp3, tmp5);
    tmp6 = Sse2.Xor(tmp6, tmp4);

    // Shift the whole 256-bit product left by one bit (the carry-less product
    // is at most 255 bits; this restores the expected bit alignment).
    var tmp7 = Sse2.ShiftRightLogical(tmp3, 31);
    var tmp8 = Sse2.ShiftRightLogical(tmp6, 31);
    tmp3 = Sse2.ShiftLeftLogical(tmp3, 1);
    tmp6 = Sse2.ShiftLeftLogical(tmp6, 1);
    var tmp9 = Sse2.ShiftRightLogical128BitLane(tmp7, 12);
    tmp8 = Sse2.ShiftLeftLogical128BitLane(tmp8, 4);
    tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 4);
    tmp3 = Sse2.Or(tmp3, tmp7);
    tmp6 = Sse2.Or(tmp6, tmp8);
    tmp6 = Sse2.Or(tmp6, tmp9);

    // Reduction, first phase: multiply the low half by the polynomial's
    // high-order correction terms (shifts by 31/30/25).
    tmp7 = Sse2.ShiftLeftLogical(tmp3, 31);
    tmp8 = Sse2.ShiftLeftLogical(tmp3, 30);
    tmp9 = Sse2.ShiftLeftLogical(tmp3, 25);
    tmp7 = Sse2.Xor(tmp7, tmp8);
    tmp7 = Sse2.Xor(tmp7, tmp9);
    tmp8 = Sse2.ShiftRightLogical128BitLane(tmp7, 4);
    tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 12);
    tmp3 = Sse2.Xor(tmp3, tmp7);

    // Reduction, second phase: shift-right corrections (1/2/7) and final XORs.
    var tmp2 = Sse2.ShiftRightLogical(tmp3, 1);
    tmp4 = Sse2.ShiftRightLogical(tmp3, 2);
    tmp5 = Sse2.ShiftRightLogical(tmp3, 7);
    tmp2 = Sse2.Xor(tmp2, tmp4);
    tmp2 = Sse2.Xor(tmp2, tmp5);
    tmp2 = Sse2.Xor(tmp2, tmp8);
    tmp3 = Sse2.Xor(tmp3, tmp2);
    tmp6 = Sse2.Xor(tmp6, tmp3);

    // The reduced product becomes the new accumulated state.
    _buffer = tmp6.AsByte();
}
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    // Operands come straight from this instance's fields.
    Unsafe.Write(
        _dataTable.outArrayPtr,
        Pclmulqdq.CarrylessMultiply(_fld1, _fld2, 1));

    ValidateResult(_dataTable.outArrayPtr);
}
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    // A freshly constructed test instance supplies the operands.
    var instance = new PclmulqdqOpTest__CarrylessMultiplyInt641();
    Unsafe.Write(
        _dataTable.outArrayPtr,
        Pclmulqdq.CarrylessMultiply(instance._fld1, instance._fld2, 1));

    ValidateResult(_dataTable.outArrayPtr);
}
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    // A local TestStruct supplies the operands.
    var local = TestStruct.Create();
    Unsafe.Write(
        _dataTable.outArrayPtr,
        Pclmulqdq.CarrylessMultiply(local._fld1, local._fld2, 1));

    ValidateResult(_dataTable.outArrayPtr);
}
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Read both operands from memory via Unsafe.Read into locals.
    var op1 = Unsafe.Read<Vector128<UInt64>>(_dataTable.inArray1Ptr);
    var op2 = Unsafe.Read<Vector128<UInt64>>(_dataTable.inArray2Ptr);

    Unsafe.Write(_dataTable.outArrayPtr, Pclmulqdq.CarrylessMultiply(op1, op2, 16));
    ValidateResult(_dataTable.outArrayPtr);
}
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    // Load both operands with aligned vector loads (inherited from Sse2).
    var op1 = Pclmulqdq.LoadAlignedVector128((Int64 *)(_dataTable.inArray1Ptr));
    var op2 = Pclmulqdq.LoadAlignedVector128((Int64 *)(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, Pclmulqdq.CarrylessMultiply(op1, op2, 1));
    ValidateResult(_dataTable.outArrayPtr);
}
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    // Feed the intrinsic directly from unaligned vector loads.
    var product = Pclmulqdq.CarrylessMultiply(
        Pclmulqdq.LoadVector128((Int64 *)(_dataTable.inArray1Ptr)),
        Pclmulqdq.LoadVector128((Int64 *)(_dataTable.inArray2Ptr)),
        1);

    Unsafe.Write(_dataTable.outArrayPtr, product);
    ValidateResult(_dataTable.outArrayPtr);
}
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // Feed the intrinsic directly from Unsafe.Read operands.
    var product = Pclmulqdq.CarrylessMultiply(
        Unsafe.Read<Vector128<Int64>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<Int64>>(_dataTable.inArray2Ptr),
        0);

    Unsafe.Write(_dataTable.outArrayPtr, product);
    ValidateResult(_dataTable.outArrayPtr);
}
// Step computes a CRC-64 over `length` bytes of `data` using PCLMULQDQ
// 128-bit folding followed by a Barrett reduction. `crc` is the running
// (already-inverted) CRC value. The indexing (initial block at offset 0,
// final block at bufPos) appears to require at least 32 bytes of input —
// TODO confirm the caller guarantees this.
internal static ulong Step(ulong crc, byte[] data, uint length)
{
    int bufPos = 16;

    // Folding (k1/k2) and Barrett (mu/pol) constants for this polynomial.
    const ulong k1 = 0xe05dd497ca393ae4;
    const ulong k2 = 0xdabe95afc7875f40;
    const ulong mu = 0x9c3e466c172963d5;
    const ulong pol = 0x92d8af2baf0e1e85;
    Vector128<ulong> foldConstants1 = Vector128.Create(k1, k2);
    Vector128<ulong> foldConstants2 = Vector128.Create(mu, pol);

    // The CRC is kept inverted externally; undo that before mixing it in.
    Vector128<ulong> initialCrc = Vector128.Create(~crc, 0);
    length -= 16;

    // Initial CRC can simply be added to data
    ShiftRight128(initialCrc, 0, out Vector128<ulong> crc0, out Vector128<ulong> crc1);
    Vector128<ulong> accumulator = Sse2.Xor(Fold(Sse2.Xor(crc0, Vector128.Create(BitConverter.ToUInt64(data, 0), BitConverter.ToUInt64(data, 8))), foldConstants1), crc1);

    // Fold one 16-byte block per iteration into the accumulator.
    while (length >= 32)
    {
        accumulator = Fold(Sse2.Xor(Vector128.Create(BitConverter.ToUInt64(data, bufPos), BitConverter.ToUInt64(data, bufPos + 8)), accumulator), foldConstants1);
        length -= 16;
        bufPos += 16;
    }

    // Mix in the final 16-byte block, then reduce 128 -> 64 bits.
    Vector128<ulong> p = Sse2.Xor(accumulator, Vector128.Create(BitConverter.ToUInt64(data, bufPos), BitConverter.ToUInt64(data, bufPos + 8)));
    Vector128<ulong> r = Sse2.Xor(Pclmulqdq.CarrylessMultiply(p, foldConstants1, 0x10), Sse2.ShiftRightLogical128BitLane(p, 8));

    // Final Barrett reduction
    Vector128<ulong> t1 = Pclmulqdq.CarrylessMultiply(r, foldConstants2, 0x00);
    Vector128<ulong> t2 = Sse2.Xor(Sse2.Xor(Pclmulqdq.CarrylessMultiply(t1, foldConstants2, 0x10), Sse2.ShiftLeftLogical128BitLane(t1, 8)), r);

    // The result lives in 32-bit lanes 2 and 3; re-invert before returning.
    return(~(((ulong)Sse41.Extract(t2.AsUInt32(), 3) << 32) | Sse41.Extract(t2.AsUInt32(), 2)));
}
// Computes the "inside quotes" mask: bit i of the result is the XOR of all
// quote bits at positions <= i (a prefix XOR / cumulative parity).
// On CLMUL-capable hardware this is a single carry-less multiplication by an
// all-ones operand; otherwise a log-step scalar fallback gives the same result.
internal static ulong compute_quote_mask(ulong quote_bits)
{
    if (Pclmulqdq.IsSupported)
    {
        // There should be no such thing as a processor supporting AVX2 but
        // not CLMUL, so this is the expected path.
        Vector128<ulong> ones = Vector128.Create((byte)0xFF).AsUInt64();
        Vector128<ulong> product =
            Pclmulqdq.CarrylessMultiply(Vector128.Create(quote_bits, 0UL), ones, 0);
        return Sse2.X64.ConvertToUInt64(product);
    }

    // Scalar fallback: propagate parity in doubling strides.
    ulong mask = quote_bits ^ (quote_bits << 1);
    mask ^= mask << 2;
    mask ^= mask << 4;
    mask ^= mask << 8;
    mask ^= mask << 16;
    mask ^= mask << 32;
    return mask;
}
// Fold4 performs one folding step of a PCLMULQDQ-based CRC-32: each of the
// four running 128-bit CRC lanes is carry-less multiplied by the two 64-bit
// folding constants (imm8 0x01 = lane.hi * k.lo, 0x10 = lane.lo * k.hi) and
// the two partial products are recombined with XOR. The XOR is done through
// Sse.Xor on float views, which is bitwise identical to an integer XOR.
static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3)
{
    // Folding constant pair packed as two 64-bit values.
    Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

    // Keep the original lane values; each lane needs both products.
    Vector128<uint> xTmp0 = xmmCRC0;
    Vector128<uint> xTmp1 = xmmCRC1;
    Vector128<uint> xTmp2 = xmmCRC2;
    Vector128<uint> xTmp3 = xmmCRC3;

    // Lane 0.
    xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp0 = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC0 = xmmCRC0.AsSingle();
    Vector128<float> psT0 = xTmp0.AsSingle();
    Vector128<float> psRes0 = Sse.Xor(psCRC0, psT0);

    // Lane 1.
    xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp1 = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC1 = xmmCRC1.AsSingle();
    Vector128<float> psT1 = xTmp1.AsSingle();
    Vector128<float> psRes1 = Sse.Xor(psCRC1, psT1);

    // Lane 2.
    xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp2 = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC2 = xmmCRC2.AsSingle();
    Vector128<float> psT2 = xTmp2.AsSingle();
    Vector128<float> psRes2 = Sse.Xor(psCRC2, psT2);

    // Lane 3.
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp3 = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC3 = xmmCRC3.AsSingle();
    Vector128<float> psT3 = xTmp3.AsSingle();
    Vector128<float> psRes3 = Sse.Xor(psCRC3, psT3);

    // Publish the folded lanes back through the ref parameters.
    xmmCRC0 = psRes0.AsUInt32();
    xmmCRC1 = psRes1.AsUInt32();
    xmmCRC2 = psRes2.AsUInt32();
    xmmCRC3 = psRes3.AsUInt32();
}
// CalculateSse computes a CRC-32 over the chunk-aligned prefix of `buffer`
// using PCLMULQDQ folding (4-way 64-byte folds, then 16-byte folds, then a
// 128->64->32-bit Barrett reduction driven by the K05Poly constant table),
// delegating any remaining tail bytes to CalculateScalar.
// NOTE(review): the closing braces of the fixed block and method are not
// visible in this chunk — presumably lost in extraction; confirm in source.
private static unsafe uint CalculateSse(uint crc, ReadOnlySpan<byte> buffer)
{
    // Process only whole chunks here; the remainder goes to the scalar path.
    int chunksize = buffer.Length & ~ChunksizeMask;
    int length = chunksize;

    fixed(byte *bufferPtr = buffer)
    fixed(ulong *k05PolyPtr = K05Poly)
    {
        byte *srcPtr = bufferPtr;

        // There's at least one block of 64.
        Vector128<ulong> x1 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
        Vector128<ulong> x2 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
        Vector128<ulong> x3 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
        Vector128<ulong> x4 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
        Vector128<ulong> x5;

        // Mix the incoming CRC into the first lane.
        x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

        // k1, k2
        Vector128<ulong> x0 = Sse2.LoadVector128(k05PolyPtr + 0x0);
        srcPtr += 64;
        length -= 64;

        // Parallel fold blocks of 64, if any.
        while (length >= 64)
        {
            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
            Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
            Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
            x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
            x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);
            Vector128<ulong> y5 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
            Vector128<ulong> y6 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
            Vector128<ulong> y7 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
            Vector128<ulong> y8 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
            x1 = Sse2.Xor(x1, x5);
            x2 = Sse2.Xor(x2, x6);
            x3 = Sse2.Xor(x3, x7);
            x4 = Sse2.Xor(x4, x8);
            x1 = Sse2.Xor(x1, y5);
            x2 = Sse2.Xor(x2, y6);
            x3 = Sse2.Xor(x3, y7);
            x4 = Sse2.Xor(x4, y8);
            srcPtr += 64;
            length -= 64;
        }

        // Fold into 128-bits.
        // k3, k4
        x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x2);
        x1 = Sse2.Xor(x1, x5);
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x3);
        x1 = Sse2.Xor(x1, x5);
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x4);
        x1 = Sse2.Xor(x1, x5);

        // Single fold blocks of 16, if any.
        while (length >= 16)
        {
            x2 = Sse2.LoadVector128((ulong *)srcPtr);
            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x1 = Sse2.Xor(x1, x2);
            x1 = Sse2.Xor(x1, x5);
            srcPtr += 16;
            length -= 16;
        }

        // Fold 128 - bits to 64 - bits.
        x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
        x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
        x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
        x1 = Sse2.Xor(x1, x2);

        // k5, k0
        x0 = Sse2.LoadScalarVector128(k05PolyPtr + 0x4);
        x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
        x1 = Sse2.And(x1, x3);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Sse2.Xor(x1, x2);

        // Barret reduce to 32-bits.
        // polynomial
        x0 = Sse2.LoadVector128(k05PolyPtr + 0x6);
        x2 = Sse2.And(x1, x3);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
        x2 = Sse2.And(x2, x3);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
        x1 = Sse2.Xor(x1, x2);

        // Reduced CRC sits in 32-bit lane 1; hand any tail to the scalar path.
        crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
        return(buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer[chunksize..]));
// Step computes a CRC-32 over `len` bytes of `src` (zlib-style PCLMULQDQ
// implementation): 64-byte blocks are folded across four 128-bit lanes via
// Fold4, then the 512-bit state is folded down to 32 bits using the _crcK
// constant table (k1/k5/k7 stages) and a final Barrett-style reduction.
internal static uint Step(byte[] src, long len, uint initialCRC)
{
    Vector128<uint> xmmT0, xmmT1, xmmT2;
    Vector128<uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
    // Initial lane seed constant used by this implementation.
    Vector128<uint> xmmCRC0 = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
    Vector128<uint> xmmCRC1 = Vector128<uint> .Zero;
    Vector128<uint> xmmCRC2 = Vector128<uint> .Zero;
    Vector128<uint> xmmCRC3 = Vector128<uint> .Zero;
    int bufPos = 0;
    bool first = true;

    /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
    Vector128<uint> xmmMask = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
    Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

    // Consume the input 64 bytes (four 16-byte vectors) at a time.
    while ((len -= 64) >= 0)
    {
        xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4), BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;
        xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4), BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;
        xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4), BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;
        Vector128<uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4), BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;

        // Mix the caller's initial CRC into the very first block only.
        if (first)
        {
            first = false;
            xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
        }

        Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
        xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
        xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
        xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
    }

    /* fold 512 to 32 */
    /*
     * k1
     */
    // Cascade the four lanes into xmmCRC3: each lane is folded by the k1
    // constants and XORed into the next lane.
    Vector128<uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);

    Vector128<uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
    xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
    xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);

    Vector128<uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
    xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
    xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);

    Vector128<uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);

    /*
     * k5
     */
    // Fold 128 bits down to 96/64 bits.
    crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);

    xmmCRC0 = xmmCRC3;
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
    xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);

    xmmCRC0 = xmmCRC3;
    xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
    xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);

    /*
     * k7
     */
    // Final reduction to 32 bits.
    xmmCRC1 = xmmCRC3;
    xmmCRC2 = xmmCRC3;
    crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);

    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
    xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);

    xmmCRC2 = xmmCRC3;
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);

    /*
     * could just as well write xmm_crc3[2], doing a movaps and truncating, but
     * no real advantage - it's a tiny bit slower per call, while no additional CPUs
     * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
     */
    return(~Sse41.Extract(xmmCRC3, 2));
}
// Entry point for the AOT hardware-intrinsic expansion test: depending on the
// compile-time define, each ISA's IsSupported is expected to be constant-true,
// constant-false, or a runtime check, and Check verifies both the expectation
// and (where a lambda is given) a trivial use of the ISA. Returns 100 on
// success, 1 on failure (the usual CoreCLR test convention).
static int Main()
{
    s_success = true;

    // We expect the AOT compiler generated HW intrinsics with the following characteristics:
    //
    // * TRUE  = IsSupported assumed to be true, no runtime check
    // * NULL  = IsSupported is a runtime check, code should be behind the check or bad things happen
    // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
    //
    // The test is compiled with multiple defines to test this.

#if BASELINE_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 16;
    bool? Sse2AndBelow = true;
    bool? Sse3Group = null;
    bool? AesLzPcl = null;
    bool? Sse4142 = null;
    bool? PopCnt = null;
    bool? Avx12 = false;
    bool? FmaBmi12 = false;
    bool? Avxvnni = false;
#elif NON_VEX_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 16;
    bool? Sse2AndBelow = true;
    bool? Sse3Group = true;
    bool? AesLzPcl = null;
    bool? Sse4142 = true;
    bool? PopCnt = null;
    bool? Avx12 = false;
    bool? FmaBmi12 = false;
    bool? Avxvnni = false;
#elif VEX_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 32;
    bool? Sse2AndBelow = true;
    bool? Sse3Group = true;
    bool? AesLzPcl = null;
    bool? Sse4142 = true;
    bool? PopCnt = null;
    bool? Avx12 = true;
    bool? FmaBmi12 = null;
    bool? Avxvnni = null;
#else
#error Who dis?
#endif

    // Sanity-check the Vector<T> acceleration state against the expectation.
    if (vectorsAccelerated != Vector.IsHardwareAccelerated)
    {
        throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");
    }

    if (byteVectorLength != Vector<byte> .Count)
    {
        throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");
    }

    // One Check per ISA (and its X64 nested class); the lambda, when present,
    // exercises a representative intrinsic behind the IsSupported gate.
    Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128<float> .Zero, Vector128<float> .Zero).Equals(Vector128<float> .Zero));
    Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128<float> .Zero) == 0);
    Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128<ushort> .Zero, 0) == 0);
    Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128<double> .Zero) == 0);
    Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128<float> .Zero).Equals(Vector128<float> .Zero));
    Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);
    Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128<short> .Zero).Equals(Vector128<ushort> .Zero));
    Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);
    Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128<int> .Zero, Vector128<int> .Zero).Equals(Vector128<int> .Zero));
    Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128<long> .Zero, 0) == 0);
    Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
    Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);
    Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128<byte> .Zero, 0).Equals(Vector128.Create((byte)99)));
    Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);
    Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256<double> .Zero, Vector256<double> .Zero).Equals(Vector256<double> .Zero));
    Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);
    Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256<int> .Zero).Equals(Vector256<uint> .Zero));
    Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);
    Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
    Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);
    Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
    Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);
    Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128<float> .Zero, Vector128<float> .Zero, Vector128<float> .Zero).Equals(Vector128<float> .Zero));
    Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);
    Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
    Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);
    Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128<long> .Zero, Vector128<long> .Zero, 0).Equals(Vector128<long> .Zero));
    Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);
    Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
    Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);
    Check("AvxVnni", Avxvnni, &AvxVnniIsSupported, AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128<int> .Zero, Vector128<byte> .Zero, Vector128<sbyte> .Zero).Equals(Vector128<int> .Zero));
    Check("AvxVnni.X64", Avxvnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null);

    return(s_success ? 100 : 1);
}
// take input from buf and remove useless whitespace, input and output can be
// the same, result is null terminated, return the string length (minus the null termination)
// Port of simdjson's AVX2 minifier: per 64-byte block it computes a
// backslash-escape mask, a quotes mask (prefix XOR via PCLMULQDQ), and a
// whitespace mask, then compresses the non-whitespace bytes with
// table-driven PSHUFB shuffles (s_mask128_epi8) in 16-byte groups.
public static size_t Minify(uint8_t *buf, size_t len, uint8_t * @out)
{
    if (!Avx2.IsSupported)
    {
        throw new NotSupportedException("AVX2 is required form SimdJson");
    }

    //C#: load const vectors once (there is no `const _m256` in C#)
    Vector256<byte> lut_cntrl = s_lut_cntrl;
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;

    fixed(byte *mask128_epi8 = s_mask128_epi8)
    {
        // Useful constant masks
        const uint64_t even_bits = 0x5555555555555555UL;
        const uint64_t odd_bits = ~even_bits;
        uint8_t * initout = @out;
        uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
        uint64_t prev_iter_inside_quote = 0UL;       // either all zeros or all ones
        size_t idx = 0;

        if (len >= 64)
        {
            size_t avxlen = len - 63;
            for (; idx < avxlen; idx += 64)
            {
                // Load 64 bytes as two 32-byte halves.
                Vector256<byte> input_lo = Avx.LoadVector256((buf + idx + 0));
                Vector256<byte> input_hi = Avx.LoadVector256((buf + idx + 32));

                // Classify backslash runs: which quote characters are escaped.
                uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);
                odd_carries |= prev_iter_ends_odd_backslash;
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                // Unescaped quotes -> "inside string" mask via prefix XOR (CLMUL).
                uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(Vector128.Create(quote_bits, 0UL /*C# reversed*/).AsUInt64(), Vector128.Create((byte)0xFF).AsUInt64(), 0));
                quote_mask ^= prev_iter_inside_quote;
                prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // might be undefined behavior, should be fully defined in C++20, ok according to John Regher from Utah University

                // Shufti-style whitespace classification on both halves.
                Vector256<byte> whitespace_shufti_mask = Vector256.Create((byte)0x18);
                Vector256<byte> v_lo = Avx2.And(Avx2.Shuffle(low_nibble_mask, input_lo), Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), Vector256.Create((byte)0x7f))));
                Vector256<byte> v_hi = Avx2.And(Avx2.Shuffle(low_nibble_mask, input_hi), Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), Vector256.Create((byte)0x7f))));
                Vector256<byte> tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), Vector256.Create((byte)0));
                Vector256<byte> tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), Vector256.Create((byte)0));
                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
                // Whitespace inside strings must be kept.
                whitespace &= ~quote_mask;

                // Compress each 16-byte group using the precomputed shuffle table.
                int mask1 = (int)(whitespace & 0xFFFF);
                int mask2 = (int)((whitespace >> 16) & 0xFFFF);
                int mask3 = (int)((whitespace >> 32) & 0xFFFF);
                int mask4 = (int)((whitespace >> 48) & 0xFFFF);
                int pop1 = hamming((~whitespace) & 0xFFFF);
                int pop2 = hamming((~whitespace) & (ulong)(0xFFFFFFFF));
                int pop3 = hamming((~whitespace) & (ulong)(0xFFFFFFFFFFFF));
                int pop4 = hamming((~whitespace));
                var vmask1 = _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask2 & 0x7FFF) * 2, (ulong *)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                var vmask2 = _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask4 & 0x7FFF) * 2, (ulong *)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                _mm256_storeu2_m128i((@out + pop1), @out, result1);
                _mm256_storeu2_m128i((@out + pop3), (@out + pop2), result2);
                @out += pop4;
            }
        }

        // we finish off the job... copying and pasting the code is not ideal here,
        // but it gets the job done.
        if (idx < len)
        {
            // Copy the tail into a zero-padded 64-byte scratch buffer.
            uint8_t *buffer = stackalloc uint8_t[64];
            memset(buffer, 0, 64);
            memcpy(buffer, buf + idx, len - idx);
            var input_lo = Avx.LoadVector256((buffer));
            var input_hi = Avx.LoadVector256((buffer + 32));
            uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
            uint64_t start_edges = bs_bits & ~(bs_bits << 1);
            uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
            uint64_t even_starts = start_edges & even_start_mask;
            uint64_t odd_starts = start_edges & ~even_start_mask;
            uint64_t even_carries = bs_bits + even_starts;
            uint64_t odd_carries;
            //bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);
            // NOTE(review): odd_carries is read here without a visible
            // assignment (the add_overflow call above is commented out) —
            // presumably the original source initializes it; confirm this
            // compiles and behaves as intended.
            odd_carries |= prev_iter_ends_odd_backslash;
            //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
            uint64_t even_carry_ends = even_carries & ~bs_bits;
            uint64_t odd_carry_ends = odd_carries & ~bs_bits;
            uint64_t even_start_odd_end = even_carry_ends & odd_bits;
            uint64_t odd_start_even_end = odd_carry_ends & even_bits;
            uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
            uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
            quote_bits = quote_bits & ~odd_ends;
            uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(Vector128.Create(quote_bits, 0UL), Vector128.Create((byte)0xFF).AsUInt64(), 0));
            quote_mask ^= prev_iter_inside_quote;
            // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore

            // Tail path uses a different (LUT-based) whitespace classifier.
            Vector256<byte> mask_20 = Vector256.Create((byte)0x20); // c==32
            Vector256<byte> mask_70 = Vector256.Create((byte)0x70); // adding 0x70 does not check low 4-bits
            // but moves any value >= 16 above 128
            Vector256<byte> tmp_ws_lo = Avx2.Or(Avx2.CompareEqual(mask_20, input_lo), Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_lo)));
            Vector256<byte> tmp_ws_hi = Avx2.Or(Avx2.CompareEqual(mask_20, input_hi), Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_hi)));
            uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
            uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
            uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
            whitespace &= ~quote_mask;

            // Treat the padding beyond the real input as whitespace to drop it.
            if (len - idx < 64)
            {
                whitespace |= ((0xFFFFFFFFFFFFFFFF) << (int)(len - idx));
            }

            int mask1 = (int)(whitespace & 0xFFFF);
            int mask2 = (int)((whitespace >> 16) & 0xFFFF);
            int mask3 = (int)((whitespace >> 32) & 0xFFFF);
            int mask4 = (int)((whitespace >> 48) & 0xFFFF);
            int pop1 = hamming((~whitespace) & 0xFFFF);
            int pop2 = hamming((~whitespace) & 0xFFFFFFFF);
            int pop3 = hamming((~whitespace) & 0xFFFFFFFFFFFF);
            int pop4 = hamming((~whitespace));
            var vmask1 = _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask2 & 0x7FFF) * 2, (ulong *)mask128_epi8 + (mask1 & 0x7FFF) * 2);
            var vmask2 = _mm256_loadu2_m128i((ulong *)mask128_epi8 + (mask4 & 0x7FFF) * 2, (ulong *)mask128_epi8 + (mask3 & 0x7FFF) * 2);
            var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
            var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
            // Compact into the scratch buffer, then copy the kept bytes out.
            _mm256_storeu2_m128i((buffer + pop1), buffer, result1);
            _mm256_storeu2_m128i((buffer + pop3), (buffer + pop2), result2);
            memcpy(@out, buffer, (size_t)pop4);
            @out += pop4;
        }

        *@out = (byte)'\0'; // NULL termination
        return((size_t)@out - (size_t)initout);
    }
}
// DecryptPowersTable decrypts ctLen bytes from ct and writes them to pt. While
// decrypting, it updates the POLYVAL value in polyval. In order to decrypt and
// update the POLYVAL value, it uses the expanded key from ks and the table of
// powers in htbl. Decryption processes 6 blocks of data in parallel.
//
// NOTE(review): ks is read at indices 0..14 (15 round keys, consistent with an
// AES-256 key schedule) and htbl at indices 0..5 (at least 6 key powers) —
// confirm these buffer sizes against the callers.
private static void DecryptPowersTable(byte *ct, int ctLen, byte *pt, byte *polyval, byte *htbl, byte *tag, byte *ks)
{
    Vector128 <ulong> sCtr1, sCtr2, sCtr3, sCtr4, sCtr5, sCtr6, tmp0, tmp1, tmp2, tmp3, tmp4, h;
    // POLYVAL reduction constant: 0xc2000000 in the top 32-bit lane, 1 in the bottom.
    var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
    // Running POLYVAL accumulator; stored back to polyval before returning.
    var t = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));
    // Initial counter block = tag with its most significant bit forced to 1.
    var orMask = Sse.StaticCast <uint, byte>(Sse2.SetVector128(0x80000000, 0, 0, 0));
    var ctr = Sse2.Or(Sse2.LoadVector128(tag), orMask);
    // The counter is incremented in its low 32-bit lane.
    var one = Sse2.SetVector128(0, 0, 0, 1);
    var two = Sse2.SetVector128(0, 0, 0, 2);
    int blocks = 0; // number of full 16-byte blocks consumed so far
    if (ctLen >= 96)
    {
        // --- Prologue: decrypt the first 6 blocks only. Their POLYVAL update
        // is deferred to the loop below (software pipelining: the hash of one
        // batch overlaps the AES rounds of the next). ---
        var ctr1 = ctr;
        var ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
        var ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
        var ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
        var ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
        var ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
        ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));
        // AES: whitening round, 13 middle rounds, final round.
        var key = Sse2.LoadVector128(ks);
        ctr1 = Sse2.Xor(ctr1, key);
        ctr2 = Sse2.Xor(ctr2, key);
        ctr3 = Sse2.Xor(ctr3, key);
        ctr4 = Sse2.Xor(ctr4, key);
        ctr5 = Sse2.Xor(ctr5, key);
        ctr6 = Sse2.Xor(ctr6, key);
        for (int i = 1; i < 14; ++i)
        {
            key = Sse2.LoadVector128(&ks[i * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
        }
        key = Sse2.LoadVector128(&ks[14 * 16]);
        ctr1 = Aes.EncryptLast(ctr1, key);
        ctr2 = Aes.EncryptLast(ctr2, key);
        ctr3 = Aes.EncryptLast(ctr3, key);
        ctr4 = Aes.EncryptLast(ctr4, key);
        ctr5 = Aes.EncryptLast(ctr5, key);
        ctr6 = Aes.EncryptLast(ctr6, key);
        // XOR keystream with ciphertext; ctr1..ctr6 now hold plaintext blocks,
        // which are both stored to pt and kept for the deferred POLYVAL update.
        ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[0 * 16]));
        ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[1 * 16]));
        ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[2 * 16]));
        ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[3 * 16]));
        ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[4 * 16]));
        ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[5 * 16]));
        Sse2.Store(&pt[0 * 16], ctr1);
        Sse2.Store(&pt[1 * 16], ctr2);
        Sse2.Store(&pt[2 * 16], ctr3);
        Sse2.Store(&pt[3 * 16], ctr4);
        Sse2.Store(&pt[4 * 16], ctr5);
        Sse2.Store(&pt[5 * 16], ctr6);
        ctLen -= 96;
        blocks += 6;
        while (ctLen >= 96)
        {
            // Hash the 6 plaintext blocks from the previous batch while
            // encrypting the next 6 counter blocks; PCLMULQDQ and AESNI work
            // is interleaved to hide instruction latencies.
            sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
            sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
            sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
            sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
            sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
            sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);
            ctr1 = ctr;
            ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
            ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
            ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
            ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
            ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
            ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));
            key = Sse2.LoadVector128(ks);
            ctr1 = Sse2.Xor(ctr1, key);
            ctr2 = Sse2.Xor(ctr2, key);
            ctr3 = Sse2.Xor(ctr3, key);
            ctr4 = Sse2.Xor(ctr4, key);
            ctr5 = Sse2.Xor(ctr5, key);
            ctr6 = Sse2.Xor(ctr6, key);
            // Schoolbook 128x128 carryless multiply of block 6 by htbl[0];
            // tmp1 = high product, tmp2 = low product, tmp0 = middle terms.
            tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
            tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
            tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
            tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
            tmp0 = Sse2.Xor(tmp3, tmp0);
            key = Sse2.LoadVector128(&ks[1 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // Accumulate block 5 * htbl[1].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            key = Sse2.LoadVector128(&ks[2 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // Accumulate block 4 * htbl[2].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            key = Sse2.LoadVector128(&ks[3 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // Accumulate block 3 * htbl[3].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            key = Sse2.LoadVector128(&ks[4 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // Accumulate block 2 * htbl[4].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            key = Sse2.LoadVector128(&ks[5 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            key = Sse2.LoadVector128(&ks[6 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            key = Sse2.LoadVector128(&ks[7 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // Fold the current accumulator into block 1 (Horner), then
            // accumulate (t ^ block1) * htbl[5].
            sCtr1 = Sse2.Xor(t, sCtr1);
            tmp4 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
            tmp0 = Sse2.Xor(tmp3, tmp0);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
            tmp1 = Sse2.Xor(tmp3, tmp1);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
            tmp2 = Sse2.Xor(tmp3, tmp2);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
            tmp0 = Sse2.Xor(tmp3, tmp0);
            key = Sse2.LoadVector128(&ks[8 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // Split the middle terms (tmp0) into the high (tmp4) and low (t)
            // halves of the 256-bit product.
            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
            tmp4 = Sse2.Xor(tmp3, tmp1);
            tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
            t = Sse2.Xor(tmp3, tmp2);
            key = Sse2.LoadVector128(&ks[9 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            // First of two reduction steps by poly.
            tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse2.Xor(tmp1, t);
            key = Sse2.LoadVector128(&ks[10 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            key = Sse2.LoadVector128(&ks[11 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            key = Sse2.LoadVector128(&ks[12 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            key = Sse2.LoadVector128(&ks[13 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
            key = Sse2.LoadVector128(&ks[14 * 16]);
            ctr1 = Aes.EncryptLast(ctr1, key);
            ctr2 = Aes.EncryptLast(ctr2, key);
            ctr3 = Aes.EncryptLast(ctr3, key);
            ctr4 = Aes.EncryptLast(ctr4, key);
            ctr5 = Aes.EncryptLast(ctr5, key);
            ctr6 = Aes.EncryptLast(ctr6, key);
            ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[(blocks + 0) * 16]));
            ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[(blocks + 1) * 16]));
            ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[(blocks + 2) * 16]));
            ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[(blocks + 3) * 16]));
            ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[(blocks + 4) * 16]));
            ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[(blocks + 5) * 16]));
            // Second reduction step, then add back the high half.
            tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse2.Xor(tmp1, t);
            t = Sse2.Xor(tmp4, t);
            Sse2.Store(&pt[(blocks + 0) * 16], ctr1);
            Sse2.Store(&pt[(blocks + 1) * 16], ctr2);
            Sse2.Store(&pt[(blocks + 2) * 16], ctr3);
            Sse2.Store(&pt[(blocks + 3) * 16], ctr4);
            Sse2.Store(&pt[(blocks + 4) * 16], ctr5);
            Sse2.Store(&pt[(blocks + 5) * 16], ctr6);
            ctLen -= 96;
            blocks += 6;
        }
        // --- Epilogue: hash the 6 plaintext blocks left over from the last
        // batch (same aggregation as above, without any AES interleaving). ---
        sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
        sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
        sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
        sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
        sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
        sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);
        tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
        tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
        tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
        tmp0 = Sse2.Xor(tmp3, tmp0);
        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        sCtr1 = Sse2.Xor(t, sCtr1);
        tmp4 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
        tmp1 = Sse2.Xor(tmp3, tmp1);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
        tmp2 = Sse2.Xor(tmp3, tmp2);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
        tmp0 = Sse2.Xor(tmp3, tmp0);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
        tmp0 = Sse2.Xor(tmp3, tmp0);
        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
        tmp4 = Sse2.Xor(tmp3, tmp1);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
        t = Sse2.Xor(tmp3, tmp2);
        // Two reduction steps, then add the high half.
        tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse2.Xor(tmp1, t);
        tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse2.Xor(tmp1, t);
        t = Sse2.Xor(tmp4, t);
    }
    // Scalar path: one block at a time, Horner POLYVAL update with h = htbl[0].
    h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
    while (ctLen >= 16)
    {
        var tmp = ctr;
        ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
        tmp = Sse2.Xor(tmp, Sse2.LoadVector128(ks));
        for (int i = 1; i < 14; ++i)
        {
            tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
        }
        tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
        tmp = Sse2.Xor(tmp, Sse2.LoadVector128(&ct[blocks * 16]));
        Sse2.Store(&pt[blocks * 16], tmp);
        // POLYVAL Horner step over the plaintext block.
        t = Sse2.Xor(Sse.StaticCast <byte, ulong>(tmp), t);
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
        tmp2 = Sse2.Xor(tmp3, tmp2);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp4 = Sse2.Xor(tmp2, tmp4);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        t = Sse2.Xor(tmp1, tmp4);
        ctLen -= 16;
        ++blocks;
    }
    if (ctLen > 0)
    {
        // Final partial block: decrypt via a 16-byte scratch buffer. Only the
        // first ctLen output bytes are copied to pt; the pad is then zeroed so
        // the POLYVAL update sees the zero-padded plaintext.
        byte *b = stackalloc byte[16];
        new Span <byte>(ct + blocks * 16, ctLen).CopyTo(new Span <byte>(b, 16));
        var tmp = Sse2.Xor(ctr, Sse2.LoadVector128(ks));
        for (int i = 1; i < 14; ++i)
        {
            tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
        }
        tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
        tmp = Sse2.Xor(tmp, Sse2.LoadVector128(b));
        Sse2.Store(b, tmp);
        new Span <byte>(b, ctLen).CopyTo(new Span <byte>(&pt[blocks * 16], ctLen));
        new Span <byte>(b + ctLen, 16 - ctLen).Clear();
        t = Sse2.Xor(Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)), t);
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
        tmp2 = Sse2.Xor(tmp3, tmp2);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp4 = Sse2.Xor(tmp2, tmp4);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        t = Sse2.Xor(tmp1, tmp4);
    }
    Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
}
// PolyvalHorner updates the POLYVAL value in polyval to include length bytes
// of data from input, given the POLYVAL key in hashKey. If the length is not
// divisible by 16, input is padded with zeros until it's a multiple of 16 bytes.
private static void PolyvalHorner(byte *polyval, byte *hashKey, byte *input, int length)
{
    if (length == 0) { return; }
    // blocks = number of full 16-byte blocks; remainder = trailing bytes (< 16).
    int blocks = Math.DivRem(length, 16, out int remainder);
    Vector128 <ulong> tmp1, tmp2, tmp3, tmp4;
    // POLYVAL reduction constant.
    var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
    // Running accumulator; stored back to polyval at the end.
    var t = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));
    var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(hashKey));
    for (int i = 0; i < blocks; ++i)
    {
        // Horner step: t = (t ^ block) * h mod poly.
        t = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&input[i * 16])));
        // Schoolbook 128x128 carryless multiply: low, high, and middle terms.
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp3, tmp1);
        tmp4 = Sse2.Xor(tmp4, tmp2);
        // Two reduction steps fold the 256-bit product back to 128 bits
        // (shuffle control 78 swaps the two 64-bit halves).
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp3, tmp2);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp3, tmp2);
        t = Sse2.Xor(tmp4, tmp1);
    }
    if (remainder != 0)
    {
        // Zero-padded final partial block.
        byte *b = stackalloc byte[16];
        new Span <byte>(input + length - remainder, remainder).CopyTo(new Span <byte>(b, 16));
        // FIX: clear the pad explicitly. The C# spec leaves stackalloc memory
        // undefined; the previous code relied on the JIT's localsinit zeroing,
        // which [SkipLocalsInit] would silently break. This also matches the
        // explicit pad-clearing done in DecryptPowersTable.
        new Span <byte>(b + remainder, 16 - remainder).Clear();
        t = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)));
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp3, tmp1);
        tmp4 = Sse2.Xor(tmp4, tmp2);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp3, tmp2);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp3, tmp2);
        t = Sse2.Xor(tmp4, tmp1);
    }
    Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
}
// PolyvalPowersTable updates the POLYVAL value in polyval to include length bytes
// of data from input, given the POLYVAL key in hashKey. It uses the precomputed
// powers of the key given in htbl. If the length is not divisible by 16, input
// is padded with zeros until it's a multiple of 16 bytes.
//
// NOTE(review): htbl is indexed up to &htbl[7 * 16], so it must hold at least
// 8 powers of the key — confirm against the callers.
private static void PolyvalPowersTable(byte *polyval, byte *htbl, byte *input, int length)
{
    if (length == 0) { return; }
    // blocks = full 16-byte blocks; remainder16 = trailing bytes (< 16);
    // remainder128 = bytes in full blocks that don't fill a group of 8 blocks.
    int blocks = Math.DivRem(length, 16, out int remainder16);
    int remainder128 = length % 128 - remainder16;
    Vector128 <ulong> tmp0, tmp1, tmp2, tmp3, tmp4;
    var xhi = Sse2.SetZeroVector128 <ulong>(); // high half of the unreduced product
    // POLYVAL reduction constant.
    var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
    // Running accumulator; stored back to polyval at the end.
    var t = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));
    if (remainder128 != 0)
    {
        // Handle the leading group of fewer than 8 blocks: block j is
        // multiplied by htbl[remainder128Blocks - 1 - j] so the powers line up.
        int remainder128Blocks = remainder128 / 16;
        blocks -= remainder128Blocks;
        var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(input)));
        var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - 1) * 16]));
        tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
        tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
        tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
        tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        for (int i = 1; i < remainder128Blocks; ++i)
        {
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&input[i * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - i - 1) * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
        }
        // Split the middle terms into the high (xhi) and low (t) halves.
        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        xhi = Sse2.Xor(tmp3, tmp1);
        t = Sse2.Xor(tmp0, tmp2);
    }
    if (blocks != 0)
    {
        var fixedInput = input + remainder128;
        if (remainder128 == 0)
        {
            // First group of 8 blocks when there was no leading partial group:
            // block j is multiplied by htbl[7 - j] (htbl[i] = key^(i+1)), with
            // the accumulator folded into block 0.
            var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[7 * 16]));
            var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));
            tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[6 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[5 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[4 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[3 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
            // FIX: removed a dead `tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);`
            // that stood here — its result was never consumed in this branch
            // (t is folded directly into block 0 below, not via a reduction).
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[2 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[1 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            // Fold the accumulator into block 0 and multiply by the highest power.
            data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[0 * 16])));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
            tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
            xhi = Sse2.Xor(tmp3, tmp1);
            t = Sse2.Xor(tmp0, tmp2);
        }
        // Main loop over groups of 8 blocks. The reduction of the previous
        // group's (t, xhi) is interleaved with this group's multiplications.
        for (int i = remainder128 == 0 ? 8 : 0; i < blocks; i += 8)
        {
            var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 7) * 16]));
            var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));
            tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 6) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 5) * 16]));
            // First reduction step of the previous group's low half.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            t = Sse2.Xor(t, tmp4);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 4) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 3) * 16]));
            // Second reduction step.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            t = Sse2.Xor(t, tmp4);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 2) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            // Previous group's high half joins the now-reduced low half.
            t = Sse2.Xor(t, xhi);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 1) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[i * 16])));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
            tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
            xhi = Sse2.Xor(tmp3, tmp1);
            t = Sse2.Xor(tmp0, tmp2);
        }
    }
    if (blocks != 0 || remainder128 != 0)
    {
        // Final reduction of the last group's (t, xhi) pair.
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        t = Sse2.Xor(xhi, t);
    }
    if (remainder16 != 0)
    {
        // Zero-padded final partial block, hashed with htbl[0] (the key itself).
        byte *b = stackalloc byte[16];
        new Span <byte>(input + length - remainder16, remainder16).CopyTo(new Span <byte>(b, 16));
        // FIX: clear the pad explicitly. The C# spec leaves stackalloc memory
        // undefined; the previous code relied on the JIT's localsinit zeroing,
        // which [SkipLocalsInit] would silently break. This also matches the
        // explicit pad-clearing done in DecryptPowersTable.
        new Span <byte>(b + remainder16, 16 - remainder16).Clear();
        var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)));
        var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
        tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
        tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
        tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
        tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        xhi = Sse2.Xor(tmp3, tmp1);
        t = Sse2.Xor(tmp0, tmp2);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        t = Sse2.Xor(xhi, t);
    }
    Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
}
// Stage 1 of the simdjson port: scans `buf` (expected to be padded out to a
// 64-byte multiple) and records the byte offset of every structural character
// ({ } [ ] : , quotes, and the first character of each atom/number, i.e.
// "pseudo-structural" characters) into pj.structural_indexes, appending a
// virtual index at `len` and a 0 sentinel one past the end.
// Works in 64-byte chunks with AVX2; backslash/quote/predecessor state is
// carried across chunks in the prev_iter_* locals.
//
// Returns false when the document exceeds pj.bytecapacity, otherwise true
// (or the UTF-8 validity verdict when SIMDJSON_UTF8VALIDATE is defined).
internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj)
{
    if (len > pj.bytecapacity)
    {
        Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity + " bytes but you are trying to process " + len + " bytes\n");
        return false;
    }

    uint32_t* base_ptr = pj.structural_indexes;
    uint32_t @base = 0;

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
    var has_error = Vector256<byte>.Zero;
    var previous = new avx_processed_utf_bytes();
    previous.rawbytes = Vector256<byte>.Zero;
    previous.high_nibbles = Vector256<byte>.Zero;
    previous.carried_continuations = Vector256<byte>.Zero;
    var highbit = Vector256.Create((byte)0x80);
#endif

    // Masks of the even/odd bit positions of a 64-bit word; used to classify
    // runs of backslashes by the parity of their starting position.
    const uint64_t even_bits = 0x5555555555555555UL;
    const uint64_t odd_bits = ~even_bits;

    // Persistent state across 64-byte chunks.
    uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
    uint64_t prev_iter_inside_quote = 0UL;       // either all zeros or all ones
    // Effectively the very first char is considered to follow "whitespace" for
    // the purposes of pseudo-structural character detection.
    uint64_t prev_iter_ends_pseudo_pred = 1UL;

    size_t lenminus64 = len < 64 ? 0 : len - 64;
    size_t idx = 0;
    uint64_t structurals = 0;

    // C#: assign static readonly fields to locals before the loop.
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;
    Vector256<byte> utf8ValidVec = s_utf8ValidVec;
    var structural_shufti_mask = Vector256.Create((byte)0x7);
    var whitespace_shufti_mask = Vector256.Create((byte)0x18);
    var slashVec = Vector256.Create((bytechar) '\\').AsByte();
    var ffVec = Vector128.Create((byte) 0xFF).AsUInt64();
    var doubleQuoteVec = Vector256.Create((byte)'"');
    var zeroBVec = Vector256.Create((byte) 0);
    var vec7f = Vector256.Create((byte) 0x7f);

    for (; idx < lenminus64; idx += 64)
    {
        var input_lo = Avx.LoadVector256(buf + idx + 0);
        var input_hi = Avx.LoadVector256(buf + idx + 32);

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
        if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
        {
            // It is ASCII; we just check continuation bytes.
            // FIX(review): the original line had an unbalanced paren and was
            // missing .AsByte(); mirrored from the tail-block version below.
            has_error = Avx2.Or(
                Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                    utf8ValidVec).AsByte(), has_error);
        }
        else
        {
            // It is not ASCII, so we have to do the heavy UTF-8 validation work.
            previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
            previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
        }
#endif

        ////////////////////////////////////////////////////////////////////////
        // Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // Flip lowest if we have an odd-length run at the end of the prior iteration.
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // Must record the carry-out of our odd-carries out of bit 63; this
        // indicates whether the sense of any edge going to the next iteration
        // should be flipped.
        bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);
        // Push in bit zero as a potential end if we had an odd-numbered run at
        // the end of the previous iteration.
        odd_carries |= prev_iter_ends_odd_backslash;
        prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;

        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////
        // Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends; // drop escaped quotes
        // Carry-less multiply by all-ones computes the prefix XOR of
        // quote_bits, i.e. a mask of the bytes between unescaped quote pairs.
        uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

        // Flatten the structural bits of the PREVIOUS iteration into indexes.
        // We always write 8 entries per pass and rewind @base with next_base;
        // unconditional stores are cheaper than a branch per bit.
        uint32_t cnt = (uint32_t) hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        quote_mask ^= prev_iter_inside_quote;
        // Right shift of a signed value expected to be well-defined and
        // standard compliant as of C++20 (John Regehr, Utah U., says this is
        // fine); propagates bit 63 into all 64 bits.
        prev_iter_inside_quote = (uint64_t) ((int64_t) quote_mask >> 63);

        // Step 3: 'shufti' classification. Structural JSON characters
        // ({ 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c) land in buckets 1/2/4 of
        // the nibble lookup; the four whitespace characters (space 0x20,
        // linefeed 0x0a, tab 0x09, carriage return 0x0d) in buckets 8/16.
        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));
        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(Avx2.And(v_hi, structural_shufti_mask), zeroBVec);
        uint64_t structural_res_0 = (uint32_t) Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t) Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        var tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);
        uint64_t ws_res_0 = (uint32_t) Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t) Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // Mask off anything inside quotes.
        structurals &= ~quote_mask;
        // Add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering.
        structurals |= quote_bits;

        // Establish "pseudo-structural characters": non-whitespace characters
        // that are (a) outside quotes and (b) have a predecessor that's either
        // whitespace or a structural character. This means subsequent passes
        // will get a chance to encounter the first character of every string of
        // non-whitespace and, if we're parsing an atom like true/false/null or
        // a number, we can stop at the first whitespace or structural character
        // following it.
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // Now, we've used our close quotes all we need to, so switch them off:
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);
        //Console.WriteLine($"Iter: {idx}, satur: {structurals}");
        //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
    }

    ////////////////
    // Tail: run the same algorithm on the final partial chunk, copied into a
    // space-padded 64-byte scratch buffer; otherwise the string would need to
    // be properly padded or we'd risk invalidating the UTF-8 checks.
    ////////////
    if (idx < len)
    {
        uint8_t* tmpbuf = stackalloc uint8_t[64];
        memset(tmpbuf, 0x20, 64);
        memcpy(tmpbuf, buf + idx, len - idx);
        Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
        Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
        // FIX(review): removed the duplicate `var highbit = ...` declaration
        // that shadowed the function-scope local declared above (CS0136).
        if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
        {
            // It is ASCII; we just check continuation bytes.
            has_error = Avx2.Or(
                Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                    utf8ValidVec).AsByte(), has_error);
        }
        else
        {
            // It is not ASCII, so we have to do the heavy UTF-8 validation work.
            previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
            previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
        }
#endif

        ////////////////////////////////////////////////////////////////////////
        // Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // Flip lowest if we have an odd-length run at the end of the prior iteration.
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // FIX(review): the add_overflow call was commented out, leaving
        // odd_carries read below while definitely unassigned (CS0165). The
        // carry-out bool is still discarded: this is the last chunk, so there
        // is no next iteration whose backslash sense would need flipping
        // (which is why prev_iter_ends_odd_backslash is not updated here).
        add_overflow(bs_bits, odd_starts, &odd_carries);
        odd_carries |= prev_iter_ends_odd_backslash;

        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////
        // Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
        quote_mask ^= prev_iter_inside_quote;
        //BUG? https://github.com/dotnet/coreclr/issues/22813
        //quote_mask = 60;
        //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // not needed: last chunk

        // Flatten the structural bits of the previous chunk into indexes.
        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        // 'shufti' classification, same as in the main loop: structural JSON
        // characters go into buckets 1/2/4, whitespace into buckets 8/16.
        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));
        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(Avx2.And(v_hi, structural_shufti_mask), zeroBVec);
        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        // This additional mask and transfer is non-trivially expensive, unfortunately.
        var tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);
        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // Mask off anything inside quotes.
        structurals &= ~quote_mask;
        // Add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering.
        structurals |= quote_bits;

        // Establish "pseudo-structural characters" exactly as in the main loop.
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // Now, we've used our close quotes all we need to, so switch them off:
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);
        //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
        idx += 64;
    }

    // Flatten the structural bits of the final chunk.
    uint32_t cnt2 = (uint32_t)hamming(structurals);
    uint32_t next_base2 = @base + cnt2;
    while (structurals != 0)
    {
        base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        @base += 8;
    }
    @base = next_base2;

    pj.n_structural_indexes = @base;
    if (base_ptr[pj.n_structural_indexes - 1] > len)
    {
        throw new InvalidOperationException("Internal bug");
    }
    if (len != base_ptr[pj.n_structural_indexes - 1])
    {
        // The string might not be NULL terminated, but we add a virtual NULL
        // ending character.
        base_ptr[pj.n_structural_indexes++] = (uint32_t)len;
    }
    // Make it safe to dereference one beyond this array.
    base_ptr[pj.n_structural_indexes] = 0;

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
    return Avx.TestZ(has_error, has_error);
#else
    return true;
#endif
}
// One CRC folding step: carry-less-multiplies the low 64-bit lane of `input`
// by the low lane of `foldConstants` (imm 0x00) and the high lane by the high
// lane (imm 0x11), then XORs the two 128-bit products together.
static Vector128<ulong> Fold(Vector128<ulong> input, Vector128<ulong> foldConstants)
{
    var lowProduct = Pclmulqdq.CarrylessMultiply(input, foldConstants, 0x00);
    var highProduct = Pclmulqdq.CarrylessMultiply(input, foldConstants, 0x11);
    return Sse2.Xor(lowProduct, highProduct);
}