/// <summary>
/// Entry point: verifies that _MM_SHUFFLE(0, 1, 2, 3) encodes the same shuffle
/// immediate as the hand-written control byte (0 &lt;&lt; 6 | 1 &lt;&lt; 4 | 2 &lt;&lt; 2 | 3)
/// by comparing the two Sse2.Shuffle results. Returns 100 on success, 0 on
/// failure. When SSE2 is unavailable the check is skipped and success is reported.
/// </summary>
static int Main(string[] args)
{
    if (!Sse2.IsSupported)
    {
        return 100;
    }

    Vector128<uint> input =
        Vector128.Create((byte)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16).AsUInt32();

    // Same shuffle expressed two ways: via the helper and via the raw immediate.
    Vector128<uint> viaHelper = Sse2.Shuffle(input, _MM_SHUFFLE(0, 1, 2, 3));
    Vector128<uint> viaImmediate = Sse2.Shuffle(input, (byte)(0 << 6 | 1 << 4 | 2 << 2 | 3));

    return viaHelper.Equals(viaImmediate) ? 100 : 0;
}
/// <summary>
/// Derives AES round key <paramref name="i"/> from round key i-1 using the
/// AESKEYGENASSIST instruction and stores it in <paramref name="keys"/>[i].
/// </summary>
/// <param name="keys">Round-key array; slot i-1 must already be populated.</param>
/// <param name="i">Index of the round key to produce.</param>
/// <param name="rcon">Round constant fed to KeygenAssist for this round.</param>
private static void MakeRoundKey(Vector128<byte>[] keys, int i, byte rcon)
{
    Vector128<byte> s = keys[i - 1];
    Vector128<byte> t = keys[i - 1];

    // SubWord/RotWord + rcon, then broadcast the highest 32-bit word (lane 3,
    // shuffle control 0xFF) across all four lanes.
    t = Aes.KeygenAssist(t, rcon);
    t = Sse2.Shuffle(t.AsUInt32(), 0xFF).AsByte();

    // Prefix-XOR the previous key's words: after s ^= s << 32 and s ^= s << 64,
    // word k of s equals w0 ^ ... ^ wk of the previous round key.
    s = Sse2.Xor(s, Sse2.ShiftLeftLogical128BitLane(s, 4));
    s = Sse2.Xor(s, Sse2.ShiftLeftLogical128BitLane(s, 8));

    keys[i] = Sse2.Xor(s, t);
}
/// <summary>
/// One key-schedule round operating on three 128-bit states. Looks like an
/// AES-256-style expansion step where <paramref name="b"/> carries a
/// KeygenAssist result on entry — TODO confirm at the call site.
/// </summary>
/// <param name="a">Main key state; updated to its prefix-XOR combined with b.</param>
/// <param name="b">Scratch/assist vector; on exit holds lane 3 of the new a broadcast to all lanes.</param>
/// <param name="c">Secondary key state; updated from its own shift and the new b.</param>
private static void KeyRound(ref Vector128<byte> a, ref Vector128<byte> b, ref Vector128<byte> c)
{
    var t = Sse2.ShiftLeftLogical128BitLane(a, 4);
    // Broadcast lane 1 of b to all four 32-bit lanes (control 0b01_01_01_01).
    b = Sse2.Shuffle(b.AsUInt32(), 0b01_01_01_01).AsByte();
    // a ^= a<<32 ^ a<<64 ^ a<<96: prefix-XOR of a's 32-bit words.
    a = Sse2.Xor(a, t);
    t = Sse2.ShiftLeftLogical128BitLane(t, 4);
    a = Sse2.Xor(a, t);
    t = Sse2.ShiftLeftLogical128BitLane(t, 4);
    a = Sse2.Xor(a, t);
    a = Sse2.Xor(a, b);
    // Broadcast the highest word of the new a (lane 3) for the c update.
    b = Sse2.Shuffle(a.AsUInt32(), 0b11_11_11_11).AsByte();
    // c ^= c<<32, then fold in the broadcast word. Note c gets only a single
    // shift stage, unlike a's three — intentional per the algorithm's structure.
    t = Sse2.ShiftLeftLogical128BitLane(c, 4);
    c = Sse2.Xor(c, t);
    c = Sse2.Xor(c, b);
}
/// <summary>
/// Element-wise addition of two 128-bit vectors, dispatching on the element
/// type <typeparamref name="T"/> to the matching SSE/SSE2 Add intrinsic.
/// Integer lanes wrap on overflow, per the underlying PADD* instructions.
/// </summary>
/// <exception cref="NotSupportedException">
/// <typeparamref name="T"/> is not one of the ten supported primitive element types.
/// </exception>
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    // Floating-point lanes go through SSE/SSE2 ADDPS/ADDPD.
    if (typeof(T) == typeof(float))
    {
        return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
    }
    if (typeof(T) == typeof(double))
    {
        return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
    }

    // Integer lanes, narrowest to widest.
    if (typeof(T) == typeof(byte))
    {
        return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    if (typeof(T) == typeof(sbyte))
    {
        return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
    }
    if (typeof(T) == typeof(short))
    {
        return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
    }
    if (typeof(T) == typeof(ushort))
    {
        return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    }
    if (typeof(T) == typeof(int))
    {
        return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
    }
    if (typeof(T) == typeof(uint))
    {
        return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    }
    if (typeof(T) == typeof(long))
    {
        return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
    }
    if (typeof(T) == typeof(ulong))
    {
        return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    }

    throw new NotSupportedException();
}
/// <summary>
/// One 4-way parallel CRC folding step: each 128-bit accumulator is folded
/// forward using two PCLMULQDQ carry-less products XORed together. The fold
/// constants (0x1_c6e41596 / 0x1_54442bd4) match Intel's CRC-32 fold-by-4
/// kernel — TODO confirm against the polynomial used elsewhere in this file.
/// </summary>
/// <param name="xmmCRC0">First accumulator; replaced by its folded value.</param>
/// <param name="xmmCRC1">Second accumulator; replaced by its folded value.</param>
/// <param name="xmmCRC2">Third accumulator; replaced by its folded value.</param>
/// <param name="xmmCRC3">Fourth accumulator; replaced by its folded value.</param>
static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3)
{
    Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

    // Folds one accumulator: (crc * K, imm 0x01) ^ (crc * K, imm 0x10).
    // imm 0x01 multiplies crc's high qword by K's low qword; imm 0x10
    // multiplies crc's low qword by K's high qword. The XOR is performed in
    // the single-precision domain (XORPS), exactly as the original code did.
    static Vector128<uint> FoldOne(Vector128<uint> crc, Vector128<uint> fold)
    {
        Vector128<uint> a = Pclmulqdq.CarrylessMultiply(crc.AsUInt64(), fold.AsUInt64(), 0x01).AsUInt32();
        Vector128<uint> b = Pclmulqdq.CarrylessMultiply(crc.AsUInt64(), fold.AsUInt64(), 0x10).AsUInt32();
        return Sse.Xor(a.AsSingle(), b.AsSingle()).AsUInt32();
    }

    // The original repeated the same three-instruction sequence four times;
    // the local function removes the copy-paste duplication without changing
    // the per-accumulator instruction order.
    xmmCRC0 = FoldOne(xmmCRC0, xmmFold4);
    xmmCRC1 = FoldOne(xmmCRC1, xmmFold4);
    xmmCRC2 = FoldOne(xmmCRC2, xmmFold4);
    xmmCRC3 = FoldOne(xmmCRC3, xmmFold4);
}
/// <summary>
/// Computes an Adler-32-style rolling checksum over <paramref name="buffer"/>
/// using SSE2/SSSE3, starting from the running sums <paramref name="s1"/> and
/// <paramref name="s2"/>, and returns the combined value (s1 | s2 &lt;&lt; 16).
/// Assumes BLOCK_SIZE is 32 (two 16-byte loads per iteration) and that MOD32
/// is the checksum modulus (presumably 65521) and NMAX32 the maximum byte
/// count before a modulo is required — all defined elsewhere; TODO confirm.
/// </summary>
/// <param name="buffer">Input bytes.</param>
/// <param name="s1">Running byte sum.</param>
/// <param name="s2">Running sum-of-sums.</param>
internal static unsafe uint GetSse(ReadOnlySpan<byte> buffer, uint s1, uint s2)
{
    uint len = (uint)buffer.Length;
    uint blocks = len / BLOCK_SIZE;
    len = len - blocks * BLOCK_SIZE; // tail bytes handled scalar below

    // Weights [32..17] and [16..1]: each byte's multiplier is its distance
    // from the end of the 32-byte block, as required by the s2 update.
    Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    Vector128<byte> zero = Vector128<byte>.Zero;
    Vector128<short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

    fixed (byte* bufPtr = &MemoryMarshal.GetReference(buffer))
    {
        var buf = bufPtr;

        while (blocks != 0)
        {
            uint n = NMAX32 / BLOCK_SIZE;
            if (n > blocks)
            {
                n = blocks;
            }
            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.Create(0, 0, 0, s1 * n); // s1*n: each block contributes s1 to s2 n times
            Vector128<uint> v_s2 = Vector128.Create(0, 0, 0, s2);
            Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                Vector128<byte> bytes2 = Sse2.LoadVector128(&buf[16]);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                Vector128<ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero); // PSADBW vs zero = byte sum
                v_s1 = Sse2.Add(v_s1, sad1.AsUInt32());
                Vector128<short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);   // PMADDUBSW
                Vector128<int> mad12 = Sse2.MultiplyAddAdjacent(mad11, ones);       // PMADDWD widens to 32-bit
                v_s2 = Sse2.Add(v_s2, mad12.AsUInt32());
                Vector128<ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                v_s1 = Sse2.Add(v_s1, sad2.AsUInt32());
                Vector128<short> mad21 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                Vector128<int> mad22 = Sse2.MultiplyAddAdjacent(mad21, ones);
                v_s2 = Sse2.Add(v_s2, mad22.AsUInt32());

                buf += BLOCK_SIZE;
                n--;
            } while (n != 0);

            // v_ps holds the per-block prefix sums; each counts 32x (one block)
            // toward s2, hence the << 5.
            var shift = Sse2.ShiftLeftLogical(v_ps, 5);
            v_s2 = Sse2.Add(v_s2, shift);

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            // A B C D -> B A D C
            const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;
            // A B C D -> C D A B
            const int S1032 = 1 << 6 | 0 << 4 | 3 << 2 | 2;

            // Two shuffle+add passes leave the horizontal total in every lane;
            // ConvertToUInt32 then reads lane 0 (MOVD).
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += Sse2.ConvertToUInt32(v_s1);
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = Sse2.ConvertToUInt32(v_s2); // v_s2 was seeded with s2, so assign (not +=)

            s1 %= MOD32;
            s2 %= MOD32;
        }

        // Scalar tail: fewer than BLOCK_SIZE bytes remain.
        if (len > 0)
        {
            if (len >= 16)
            {
                // Unrolled 16-byte chunk of the classic s1/s2 recurrence.
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                len -= 16;
            }

            while (len-- > 0)
            {
                s2 += (s1 += *buf++);
            }

            // s1 can exceed MOD32 by at most one modulus here, so a single
            // conditional subtract suffices; s2 needs the full modulo.
            if (s1 >= MOD32)
            {
                s1 -= MOD32;
            }
            s2 %= MOD32;
        }

        return (s1 | (s2 << 16));
    }
}
/// <summary>
/// XXH3 accumulator scramble step: for each 64-bit accumulator lane computes
/// acc = (acc ^ (acc &gt;&gt; 47) ^ secret) * Prime32_1, using AVX2 when available,
/// falling back to SSE2, then to scalar code. The 64x32 multiply is built from
/// two 32x32 PMULUDQ products (low half, and high half shifted back up).
/// </summary>
/// <param name="acc">Accumulator lanes; updated in place.</param>
/// <param name="secret">Secret bytes XORed into the accumulators; must cover the stripe length.</param>
private unsafe static void Xxh3ScrambleAcc(Span<ulong> acc, ReadOnlySpan<byte> secret)
{
    if (Avx2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        {
            fixed (byte* pSecret = secret)
            {
                Vector256<uint> prime32 = Vector256.Create(Prime32_1);
                Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
                Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

                for (ulong i = 0; i < StripeLen / 32; i++)
                {
                    // acc ^= acc >> 47 (xorshift), then acc ^= secret.
                    Vector256<ulong> accVec = xAcc[i];
                    Vector256<ulong> shifted = Avx2.ShiftRightLogical(accVec, 47);
                    Vector256<ulong> dataVec = Avx2.Xor(accVec, shifted);
                    Vector256<byte> keyVec = xSecret[i];
                    Vector256<uint> dataKey = Avx2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());
                    // Shuffle 0b00110001 moves each qword's high dword into the
                    // even lane so PMULUDQ (Multiply) can read it.
                    Vector256<uint> dataKeyHi = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                    Vector256<ulong> prodLo = Avx2.Multiply(dataKey, prime32);
                    Vector256<ulong> prodHi = Avx2.Multiply(dataKeyHi, prime32);
                    // 64-bit product = lo32*prime + ((hi32*prime) << 32).
                    xAcc[i] = Avx2.Add(prodLo, Avx2.ShiftLeftLogical(prodHi, 32));
                }
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        {
            fixed (byte* pSecret = secret)
            {
                Vector128<uint> prime32 = Vector128.Create(Prime32_1);
                Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
                Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

                for (ulong i = 0; i < StripeLen / 16; i++)
                {
                    // Same algorithm as the AVX2 path, two lanes at a time.
                    Vector128<ulong> accVec = xAcc[i];
                    Vector128<ulong> shifted = Sse2.ShiftRightLogical(accVec, 47);
                    Vector128<ulong> dataVec = Sse2.Xor(accVec, shifted);
                    Vector128<byte> keyVec = xSecret[i];
                    Vector128<uint> dataKey = Sse2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());
                    Vector128<uint> dataKeyHi = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                    Vector128<ulong> prodLo = Sse2.Multiply(dataKey, prime32);
                    Vector128<ulong> prodHi = Sse2.Multiply(dataKeyHi, prime32);
                    xAcc[i] = Sse2.Add(prodLo, Sse2.ShiftLeftLogical(prodHi, 32));
                }
            }
        }
    }
    else
    {
        // Scalar fallback: identical math, one 64-bit lane at a time.
        for (int i = 0; i < AccNb; i++)
        {
            ulong key64 = BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
            ulong acc64 = acc[i];
            acc64 = XorShift64(acc64, 47);
            acc64 ^= key64;
            acc64 *= Prime32_1;
            acc[i] = acc64;
        }
    }
}
/// <summary>
/// XXH3 512-bit accumulation step: for each 64-bit accumulator lane computes
/// acc += swap64(input) + (lo32(input ^ secret) * hi32(input ^ secret)),
/// using AVX2 when available, falling back to SSE2, then to scalar code.
/// </summary>
/// <param name="acc">Accumulator lanes; updated in place.</param>
/// <param name="input">One stripe of input data.</param>
/// <param name="secret">Secret bytes XORed with the input before multiplying.</param>
private unsafe static void Xxh3Accumulate512(Span<ulong> acc, ReadOnlySpan<byte> input, ReadOnlySpan<byte> secret)
{
    if (Avx2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        {
            fixed (byte* pInput = input, pSecret = secret)
            {
                Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
                Vector256<byte>* xInput = (Vector256<byte>*)pInput;
                Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

                for (ulong i = 0; i < StripeLen / 32; i++)
                {
                    Vector256<byte> dataVec = xInput[i];
                    Vector256<byte> keyVec = xSecret[i];
                    Vector256<byte> dataKey = Avx2.Xor(dataVec, keyVec);
                    // NOTE(review): despite the name, shuffle 0b00110001 puts each
                    // qword's HIGH dword into the even lane; PMULUDQ (Multiply)
                    // then computes lo32 * hi32 per 64-bit lane.
                    Vector256<uint> dataKeyLo = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                    Vector256<ulong> product = Avx2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                    // Shuffle 0b01001110 swaps the two dwords of each qword pair,
                    // i.e. swaps the 32-bit halves within each 64-bit lane.
                    Vector256<uint> dataSwap = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                    Vector256<ulong> sum = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                    xAcc[i] = Avx2.Add(product, sum);
                }
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        {
            fixed (byte* pInput = input, pSecret = secret)
            {
                Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
                Vector128<byte>* xInput = (Vector128<byte>*)pInput;
                Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

                for (ulong i = 0; i < StripeLen / 16; i++)
                {
                    // Same algorithm as the AVX2 path, two lanes at a time.
                    Vector128<byte> dataVec = xInput[i];
                    Vector128<byte> keyVec = xSecret[i];
                    Vector128<byte> dataKey = Sse2.Xor(dataVec, keyVec);
                    Vector128<uint> dataKeyLo = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                    Vector128<ulong> product = Sse2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                    Vector128<uint> dataSwap = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                    Vector128<ulong> sum = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                    xAcc[i] = Sse2.Add(product, sum);
                }
            }
        }
    }
    else
    {
        // Scalar fallback. acc[i ^ 1] += dataVal implements the vector paths'
        // 32-bit-half swap at lane-pair granularity.
        for (int i = 0; i < AccNb; i++)
        {
            ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
            ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
            acc[i ^ 1] += dataVal;
            acc[i] += Mult32To64((uint)dataKey, dataKey >> 32);
        }
    }
}
/// <summary>
/// Rotates each 64-bit lane of <paramref name="x"/> by 32 bits by swapping its
/// two 32-bit halves (PSHUFD, control 0xB1). The input vector itself is not
/// modified; the rotated copy is returned.
/// </summary>
private static Vector128<ulong> ror64_32(ref Vector128<ulong> x)
{
    // (w1, w0, w3, w2) <- (w0, w1, w2, w3): swap the dword halves of each qword.
    const byte swapHalves = 0b_10_11_00_01;
    Vector128<uint> words = x.AsUInt32();
    return Sse2.Shuffle(words, swapHalves).AsUInt64();
}
/// <summary>
/// Scans <paramref name="lengthInBytes"/> bytes at <paramref name="pData"/> and
/// returns the index of the first byte that requires escaping according to
/// <c>_allowedAsciiCodePoints</c>, or <paramref name="lengthInBytes"/> if none
/// does. Uses SSSE3 PSHUFB-based lookups in 16/8/4-byte chunks with a scalar
/// tail. Requires SSSE3 and a little-endian host (asserted below).
/// </summary>
/// <param name="pData">Pointer to the (untrusted) input bytes.</param>
/// <param name="lengthInBytes">Number of bytes to scan.</param>
private unsafe nuint GetIndexOfFirstByteToEncodeSsse3(byte* pData, nuint lengthInBytes)
{
    Debug.Assert(Ssse3.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);

    Vector128<byte> vecZero = Vector128<byte>.Zero;
    Vector128<byte> vec0x7 = Vector128.Create((byte)0x7);
    Vector128<byte> vecPowersOfTwo = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
    Vector128<byte> allowedCodePoints = _allowedAsciiCodePoints.AsVector;
    int pmovmskb;
    nuint i = 0;

    if (lengthInBytes >= 16)
    {
        // Round length down to a multiple of 16 for the vectorized loop.
        nuint lastLegalIterationFor16CharRead = lengthInBytes & unchecked((nuint)(nint)~0xF);

        do
        {
            // Read 16 bytes at a time into a single 128-bit vector.
            Vector128<byte> packed = Sse2.LoadVector128(pData + i); // unaligned read

            // Each element of the packed vector corresponds to a byte of untrusted source data. It will
            // have the format [ ..., 0xYZ, ... ]. We use the low nibble of each byte to index into
            // the 'allowedCodePoints' vector, and we use the high nibble of each byte to select a bit
            // from the corresponding element in the 'allowedCodePoints' vector.
            //
            // Example: let packed := [ ..., 0x6D ('m'), ... ]
            // The final 'result' vector will contain a non-zero value in the corresponding space iff the
            // 0xD element in the 'allowedCodePoints' vector has its 1 << 0x6 bit set.
            //
            // We rely on the fact that the pshufb operation will turn each non-ASCII byte (high bit set)
            // into 0x00 in the resulting 'shuffled' vector. That results in the corresponding element
            // in the 'result' vector also being 0x00, meaning that escaping is required.
            var allowedCodePointsShuffled = Ssse3.Shuffle(allowedCodePoints, packed);
            var vecPowersOfTwoShuffled = Ssse3.Shuffle(vecPowersOfTwo, Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), vec0x7));
            var result = Sse2.And(allowedCodePointsShuffled, vecPowersOfTwoShuffled);

            // Now, each element of 'result' contains a non-zero value if the corresponding element in
            // 'packed' is allowed; and it contains a zero value if the corresponding element in 'packed'
            // is disallowed. We'll compare 'result' against an all-zero vector to normalize 0x00 -> 0xFF
            // and (anything other than 0x00) -> 0x00. Then 'pmovmskb' will have its nth bit set iff
            // the nth entry in 'packed' requires escaping. An all-zero pmovmskb means no escaping is required.
            pmovmskb = Sse2.MoveMask(Sse2.CompareEqual(result, vecZero));
            if ((pmovmskb & 0xFFFF) != 0)
            {
                goto MaskContainsDataWhichRequiresEscaping;
            }
        } while ((i += 16) < lastLegalIterationFor16CharRead);
    }

    if ((lengthInBytes & 8) != 0)
    {
        // Read 8 bytes at a time into a single 128-bit vector.
        // Same logic as the 16-byte case, but we only care about the low byte of the final pmovmskb value.
        // Everything except the low byte of pmovksmb contains garbage and must be discarded.
        var packed = Sse2.LoadScalarVector128((/* unaligned */ ulong*)(pData + i)).AsByte();
        var allowedCodePointsShuffled = Ssse3.Shuffle(allowedCodePoints, packed);
        var vecPowersOfTwoShuffled = Ssse3.Shuffle(vecPowersOfTwo, Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), vec0x7));
        var result = Sse2.And(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
        pmovmskb = Sse2.MoveMask(Sse2.CompareEqual(result, vecZero));
        if ((byte)pmovmskb != 0)
        {
            goto MaskContainsDataWhichRequiresEscaping;
        }
        i += 8;
    }

    if ((lengthInBytes & 4) != 0)
    {
        // Read 4 bytes at a time into a single 128-bit vector.
        // Same logic as the 16-byte case, but we only care about the low nibble of the final pmovmskb value.
        // Everything except the low nibble of pmovksmb contains garbage and must be discarded.
        var packed = Sse2.LoadScalarVector128((/* unaligned */ uint*)(pData + i)).AsByte();
        var allowedCodePointsShuffled = Ssse3.Shuffle(allowedCodePoints, packed);
        var vecPowersOfTwoShuffled = Ssse3.Shuffle(vecPowersOfTwo, Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), vec0x7));
        var result = Sse2.And(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
        pmovmskb = Sse2.MoveMask(Sse2.CompareEqual(result, vecZero));
        if ((pmovmskb & 0xF) != 0)
        {
            goto MaskContainsDataWhichRequiresEscaping;
        }
        i += 4;
    }

    // Beyond this point, vectorization isn't worthwhile. Just do a normal loop.
    if ((lengthInBytes & 3) != 0)
    {
        Debug.Assert(lengthInBytes - i <= 3);

        do
        {
            if (!_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
            {
                break;
            }
        } while (++i != lengthInBytes);
    }

Return:

    return (i);

MaskContainsDataWhichRequiresEscaping:

    Debug.Assert(pmovmskb != 0);
    i += (uint)BitOperations.TrailingZeroCount(pmovmskb); // location of lowest set bit is where we must begin escaping
    goto Return;
}
/// <summary>
/// Generic software fallback for AndNot: reinterprets both operands as
/// <see cref="uint"/> lanes, delegates to the uint overload, and reinterprets
/// the result back to <typeparamref name="T"/>.
/// </summary>
public static Vector128<T> AndNot_Software<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    Vector128<uint> asBits = AndNot_Software(left.AsUInt32(), right.AsUInt32());
    return asBits.As<uint, T>();
}
/// <summary>
/// Generic software fallback for bitwise NOT: reinterprets the operand as
/// <see cref="uint"/> lanes, delegates to the uint overload, and reinterprets
/// the result back to <typeparamref name="T"/>.
/// </summary>
public static Vector128<T> Not_Software<T>(Vector128<T> vector) where T : struct
{
    Vector128<uint> inverted = Not_Software(vector.AsUInt32());
    return inverted.As<uint, T>();
}
/// <summary>
/// Figure 8. Code Sample - Performing GHASH Using an Aggregated Reduction Method.
/// Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel.
/// Computes the GHASH aggregated product (h1*x1 ^ h2*x2 ^ h3*x3 ^ h4*x4) in
/// GF(2^128) and reduces the 256-bit result modulo the GCM polynomial, using
/// Karatsuba-style carry-less multiplication (CLMUL).
/// </summary>
public static Vector128<ulong> Reduce4(
    Vector128<ulong> h1, Vector128<ulong> h2, Vector128<ulong> h3, Vector128<ulong> h4,
    Vector128<ulong> x1, Vector128<ulong> x2, Vector128<ulong> x3, Vector128<ulong> x4)
{
    Vector128<ulong> h1x1Lo, h1x1Hi, h2x2Lo, h2x2Hi, h3x3Lo, h3x3Hi, h4x4Lo, h4x4Hi, lo, hi;
    Vector128<ulong> tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

    // Low 128 bits of each product: lo(h) * lo(x) (imm 0x00), accumulated by XOR.
    h1x1Lo = CarrylessMultiply(h1, x1, 0x00);
    h2x2Lo = CarrylessMultiply(h2, x2, 0x00);
    h3x3Lo = CarrylessMultiply(h3, x3, 0x00);
    h4x4Lo = CarrylessMultiply(h4, x4, 0x00);
    lo = Xor(h1x1Lo, h2x2Lo);
    lo = Xor(lo, h3x3Lo);
    lo = Xor(lo, h4x4Lo);

    // High 128 bits of each product: hi(h) * hi(x) (imm 0x11), accumulated by XOR.
    h1x1Hi = CarrylessMultiply(h1, x1, 0x11);
    h2x2Hi = CarrylessMultiply(h2, x2, 0x11);
    h3x3Hi = CarrylessMultiply(h3, x3, 0x11);
    h4x4Hi = CarrylessMultiply(h4, x4, 0x11);
    hi = Xor(h1x1Hi, h2x2Hi);
    hi = Xor(hi, h3x3Hi);
    hi = Xor(hi, h4x4Hi);

    // Karatsuba middle terms: (hi(h) ^ lo(h)) and (hi(x) ^ lo(x)) per operand.
    // Shuffle control 78 (0x4E) swaps the two 64-bit halves of the register.
    tmp0 = Shuffle(h1.AsUInt32(), 78).AsUInt64();
    tmp4 = Shuffle(x1.AsUInt32(), 78).AsUInt64();
    tmp0 = Xor(tmp0, h1);
    tmp4 = Xor(tmp4, x1);
    tmp1 = Shuffle(h2.AsUInt32(), 78).AsUInt64();
    tmp5 = Shuffle(x2.AsUInt32(), 78).AsUInt64();
    tmp1 = Xor(tmp1, h2);
    tmp5 = Xor(tmp5, x2);
    tmp2 = Shuffle(h3.AsUInt32(), 78).AsUInt64();
    tmp6 = Shuffle(x3.AsUInt32(), 78).AsUInt64();
    tmp2 = Xor(tmp2, h3);
    tmp6 = Xor(tmp6, x3);
    tmp3 = Shuffle(h4.AsUInt32(), 78).AsUInt64();
    tmp7 = Shuffle(x4.AsUInt32(), 78).AsUInt64();
    tmp3 = Xor(tmp3, h4);
    tmp7 = Xor(tmp7, x4);

    // Middle products, combined, then the lo/hi correction that recovers the
    // true middle term: mid ^ lo ^ hi.
    tmp0 = CarrylessMultiply(tmp0, tmp4, 0x00);
    tmp1 = CarrylessMultiply(tmp1, tmp5, 0x00);
    tmp2 = CarrylessMultiply(tmp2, tmp6, 0x00);
    tmp3 = CarrylessMultiply(tmp3, tmp7, 0x00);
    tmp0 = Xor(tmp0, lo);
    tmp0 = Xor(tmp0, hi);
    tmp0 = Xor(tmp1, tmp0);
    tmp0 = Xor(tmp2, tmp0);
    tmp0 = Xor(tmp3, tmp0);

    // Split the 128-bit middle term across the 256-bit (lo, hi) result.
    tmp4 = ShiftLeftLogical128BitLane(tmp0, 8);
    tmp0 = ShiftRightLogical128BitLane(tmp0, 8);
    lo = Xor(tmp4, lo);
    hi = Xor(tmp0, hi);

    // --- Reduction modulo the GCM polynomial ---
    // First, shift the 256-bit value (hi:lo) left by one bit. SSE has no
    // 128-bit shift-by-bits, so shift each 32-bit lane and OR in the carries
    // moved over with byte-lane shifts; tmp9 carries lo's top bit into hi.
    tmp3 = lo;
    tmp6 = hi;
    tmp7 = ShiftRightLogical(tmp3.AsUInt32(), 31).AsUInt64();
    tmp8 = ShiftRightLogical(tmp6.AsUInt32(), 31).AsUInt64();
    tmp3 = ShiftLeftLogical(tmp3.AsUInt32(), 1).AsUInt64();
    tmp6 = ShiftLeftLogical(tmp6.AsUInt32(), 1).AsUInt64();
    tmp9 = ShiftRightLogical128BitLane(tmp7, 12);
    tmp8 = ShiftLeftLogical128BitLane(tmp8, 4);
    tmp7 = ShiftLeftLogical128BitLane(tmp7, 4);
    tmp3 = Or(tmp3, tmp7);
    tmp6 = Or(tmp6, tmp8);
    tmp6 = Or(tmp6, tmp9);

    // Multiply the low half by x^31 ^ x^30 ^ x^25 (per-lane left shifts) ...
    tmp7 = ShiftLeftLogical(tmp3.AsUInt32(), 31).AsUInt64();
    tmp8 = ShiftLeftLogical(tmp3.AsUInt32(), 30).AsUInt64();
    tmp9 = ShiftLeftLogical(tmp3.AsUInt32(), 25).AsUInt64();
    tmp7 = Xor(tmp7, tmp8);
    tmp7 = Xor(tmp7, tmp9);
    // ... keep the spill-over (tmp8) and fold the in-lane part back into tmp3.
    tmp8 = ShiftRightLogical128BitLane(tmp7, 4);
    tmp7 = ShiftLeftLogical128BitLane(tmp7, 12);
    tmp3 = Xor(tmp3, tmp7);

    // Second reduction stage: x^1 ^ x^2 ^ x^7 terms, then fold everything
    // into the high half and return the reduced 128-bit value.
    tmp2 = ShiftRightLogical(tmp3.AsUInt32(), 1).AsUInt64();
    tmp4 = ShiftRightLogical(tmp3.AsUInt32(), 2).AsUInt64();
    tmp5 = ShiftRightLogical(tmp3.AsUInt32(), 7).AsUInt64();
    tmp2 = Xor(tmp2, tmp4);
    tmp2 = Xor(tmp2, tmp5);
    tmp2 = Xor(tmp2, tmp8);
    tmp3 = Xor(tmp3, tmp2);
    tmp6 = Xor(tmp6, tmp3);

    return (tmp6);
}
/// <summary>
/// Rotates every 32-bit lane of <paramref name="value"/> left by
/// <paramref name="offset"/> bits, built from a left shift, a complementary
/// right shift, and an OR (SSE2 has no per-lane rotate instruction).
/// </summary>
/// <param name="value">Vector whose 32-bit lanes are rotated.</param>
/// <param name="offset">Rotation amount in bits.</param>
public static Vector128<T> RotateLeftUInt32<T>(this Vector128<T> value, byte offset) where T : struct
{
    Vector128<uint> lanes = value.AsUInt32();
    Vector128<uint> upper = Sse2.ShiftLeftLogical(lanes, offset);
    Vector128<uint> wrapped = Sse2.ShiftRightLogical(lanes, (byte)(32 - offset));
    return Sse2.Or(upper, wrapped).As<uint, T>();
}