/// <summary>
/// Local-variable scenario: both operands are first loaded into locals with
/// <c>Sse2.LoadVector128</c>, then combined by <c>Ssse3.AlignRight</c> using an
/// immediate of 2. The result is written to the output buffer and validated
/// against the two locals.
/// </summary>
public void RunLclVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

    // Keep the operands in locals: that is what this scenario exercises.
    var firstOp = Sse2.LoadVector128((short*)_dataTable.inArray1Ptr);
    var secondOp = Sse2.LoadVector128((short*)_dataTable.inArray2Ptr);

    var aligned = Ssse3.AlignRight(firstOp, secondOp, 2);
    Unsafe.Write(_dataTable.outArrayPtr, aligned);

    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: both operands are read into locals with
/// <c>Unsafe.Read</c> as 64-bit-lane vectors, then combined by
/// <c>Ssse3.AlignRight</c> using an immediate of 8. The result is written to
/// the output buffer and validated against the two locals.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Keep the operands in locals: that is what this scenario exercises.
    var firstOp = Unsafe.Read<Vector128<long>>(_dataTable.inArray1Ptr);
    var secondOp = Unsafe.Read<Vector128<long>>(_dataTable.inArray2Ptr);

    var aligned = Ssse3.AlignRight(firstOp, secondOp, 8);
    Unsafe.Write(_dataTable.outArrayPtr, aligned);

    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
/// <summary>
/// Class-variable scenario: the operands are the <c>_clsVar1</c> and
/// <c>_clsVar2</c> fields, passed directly to <c>Ssse3.AlignRight</c> with an
/// immediate of 0. The result is written to the output buffer and validated
/// against the same two fields.
/// </summary>
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    var aligned = Ssse3.AlignRight(_clsVar1, _clsVar2, 0);
    Unsafe.Write(_dataTable.outArrayPtr, aligned);

    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: both operands are produced by
/// <c>Sse2.LoadAlignedVector128</c> directly in the argument list of
/// <c>Ssse3.AlignRight</c> (immediate 0). The result is written to the output
/// buffer and validated against the raw input buffers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

    // The loads stay nested inside the call; no intermediate locals are introduced.
    var aligned = Ssse3.AlignRight(
        Sse2.LoadAlignedVector128((ushort*)_dataTable.inArray1Ptr),
        Sse2.LoadAlignedVector128((ushort*)_dataTable.inArray2Ptr),
        0);
    Unsafe.Write(_dataTable.outArrayPtr, aligned);

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: both operands are produced by <c>Unsafe.Read</c> directly
/// in the argument list of <c>Ssse3.AlignRight</c> (immediate 0). The result
/// is written to the output buffer and validated against the raw input
/// buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // The reads stay nested inside the call; no intermediate locals are introduced.
    var aligned = Ssse3.AlignRight(
        Unsafe.Read<Vector128<ushort>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<ushort>>(_dataTable.inArray2Ptr),
        0);
    Unsafe.Write(_dataTable.outArrayPtr, aligned);

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Rotates rows 2, 3 and 4 of a state whose rows are each held as a pair of
/// two-lane vectors (<c>rowNl</c> = lanes 0-1, <c>rowNh</c> = lanes 2-3).
/// Viewing a row as the four 64-bit lanes [l0, l1, h0, h1]:
/// row 2 becomes [l1, h0, h1, l0], row 3 becomes [h0, h1, l0, l1] and
/// row 4 becomes [h1, l0, l1, h0]. Row 1 is not modified.
/// </summary>
/// <param name="b0">Scratch vector; on return it holds the original value of <paramref name="row3l"/>.</param>
private static void diagonalize(
    ref Vector128<ulong> row1l, ref Vector128<ulong> row2l, ref Vector128<ulong> row3l, ref Vector128<ulong> row4l,
    ref Vector128<ulong> row1h, ref Vector128<ulong> row2h, ref Vector128<ulong> row3h, ref Vector128<ulong> row4h,
    ref Vector128<ulong> b0)
{
    // AlignRight(a, b, 8) yields [b.hi, a.lo]: the upper lane of b followed by the lower lane of a.
    Vector128<ulong> row2LowNext = Ssse3.AlignRight(row2h.AsSByte(), row2l.AsSByte(), 8).AsUInt64();
    Vector128<ulong> row2HighNext = Ssse3.AlignRight(row2l.AsSByte(), row2h.AsSByte(), 8).AsUInt64();
    row2l = row2LowNext;
    row2h = row2HighNext;

    // Row 3: swap the two halves, using the caller-supplied scratch slot.
    b0 = row3l;
    row3l = row3h;
    row3h = b0;

    // Row 4: same two byte-alignments as row 2, but the results land in the opposite halves.
    Vector128<ulong> row4HighNext = Ssse3.AlignRight(row4h.AsSByte(), row4l.AsSByte(), 8).AsUInt64();
    Vector128<ulong> row4LowNext = Ssse3.AlignRight(row4l.AsSByte(), row4h.AsSByte(), 8).AsUInt64();
    row4l = row4LowNext;
    row4h = row4HighNext;
}
// PolyvalPowersTable updates the POLYVAL value in polyval to include length bytes
// of data from input. The key is supplied only through htbl, a table of eight
// 16-byte entries (presumably htbl[k] = H^(k+1) -- confirm against the code that
// builds the table). If the length is not divisible by 16, the final partial
// block is padded with zeros to 16 bytes.
//
// Layout of the computation:
//   1. Up to seven leading blocks (length % 128, whole blocks only) are multiplied
//      and accumulated without reduction.
//   2. The remaining whole blocks are consumed eight at a time. Inside the loop the
//      reduction of the previous group's result is interleaved with the
//      multiplications of the current group.
//   3. A final two-step reduction with poly folds the accumulator to 128 bits.
//   4. A trailing partial block, if any, is zero-padded, multiplied by htbl[0]
//      and reduced.
//
// tmp0 / tmp1 accumulate the low / high products and tmp2 the two cross products;
// xhi carries the high half between groups.
private static void PolyvalPowersTable(byte* polyval, byte* htbl, byte* input, int length)
{
    if (length == 0)
    {
        return;
    }

    int blocks = Math.DivRem(length, 16, out int remainder16);
    int remainder128 = length % 128 - remainder16;
    Vector128<ulong> tmp0, tmp1, tmp2, tmp3, tmp4;

    var xhi = Sse2.SetZeroVector128<ulong>();
    var poly = Sse.StaticCast<uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
    var t = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(polyval));

    if (remainder128 != 0)
    {
        int remainder128Blocks = remainder128 / 16;
        blocks -= remainder128Blocks;

        // The running POLYVAL value is folded into the first block only.
        var data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(input)));
        var h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - 1) * 16]));

        tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
        tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
        tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
        tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
        tmp2 = Sse2.Xor(tmp2, tmp3);

        for (int i = 1; i < remainder128Blocks; ++i)
        {
            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&input[i * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - i - 1) * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
        }

        // Split the cross products across the low and high halves.
        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        xhi = Sse2.Xor(tmp3, tmp1);
        t = Sse2.Xor(tmp0, tmp2);
    }

    if (blocks != 0)
    {
        var fixedInput = input + remainder128;

        if (remainder128 == 0)
        {
            // First group of eight when there were no leading blocks: t is still the
            // caller's POLYVAL value and xhi is zero, so no reduction is interleaved here.
            var data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[7 * 16]));
            var h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

            tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[6 * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[5 * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[4 * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            // A multiply of t by poly used to be issued here into tmp4; its result was
            // never read in this branch and was overwritten before use in the loop below.
            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[3 * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[2 * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[1 * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            // The running POLYVAL value is folded into the first block of the group.
            data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[0 * 16])));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
            tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
            xhi = Sse2.Xor(tmp3, tmp1);
            t = Sse2.Xor(tmp0, tmp2);
        }

        // Remaining groups of eight blocks. The group handled above (if any) is skipped.
        for (int i = remainder128 == 0 ? 8 : 0; i < blocks; i += 8)
        {
            var data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 7) * 16]));
            var h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

            tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 6) * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 5) * 16]));

            // First reduction step of the previous group's t, interleaved with the multiplies.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));

            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            t = Sse2.Xor(t, tmp4);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 4) * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 3) * 16]));

            // Second reduction step of the previous group's t.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));

            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            t = Sse2.Xor(t, tmp4);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 2) * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            // Fold in the high half carried over from the previous group.
            t = Sse2.Xor(t, xhi);

            data = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 1) * 16]));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            // The now fully reduced t is folded into the first block of this group.
            data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&fixedInput[i * 16])));
            h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);

            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
            tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
            xhi = Sse2.Xor(tmp3, tmp1);
            t = Sse2.Xor(tmp0, tmp2);
        }
    }

    if (blocks != 0 || remainder128 != 0)
    {
        // Final two-step reduction of the last accumulated group.
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);

        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);

        t = Sse2.Xor(xhi, t);
    }

    if (remainder16 != 0)
    {
        byte* b = stackalloc byte[16];
        var padded = new Span<byte>(b, 16);

        // stackalloc memory has undefined contents per the language specification
        // (and is not zeroed under SkipLocalsInit), so the zero padding is made explicit.
        padded.Clear();
        new Span<byte>(input + length - remainder16, remainder16).CopyTo(padded);

        var data = Sse2.Xor(t, Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(b)));
        var h = Sse.StaticCast<byte, ulong>(Sse2.LoadVector128(htbl));

        tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
        tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
        tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
        tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
        tmp2 = Sse2.Xor(tmp2, tmp3);

        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        xhi = Sse2.Xor(tmp3, tmp1);
        t = Sse2.Xor(tmp0, tmp2);

        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);

        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast<sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast<ulong, sbyte>(t), Sse.StaticCast<ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);

        t = Sse2.Xor(xhi, t);
    }

    Sse2.Store(polyval, Sse.StaticCast<ulong, byte>(t));
}
/// <summary>
/// Exercises <c>Ssse3.AlignRight</c> on <c>sbyte</c> vectors with immediates
/// 27, 5, 250 and 228, and once more through reflection with 27. The left
/// operand holds 16..31 and the right operand holds 0..15. Every mismatch
/// prints the output array and marks the run as failed.
/// </summary>
/// <returns><c>Pass</c> when every check succeeds or SSSE3 is unavailable; otherwise <c>Fail</c>.</returns>
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    if (!Ssse3.IsSupported)
    {
        return testResult;
    }

    sbyte[] leftInput = new sbyte[16] { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
    sbyte[] rightInput = new sbyte[16] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

    // Expected outputs, element by element.
    sbyte[] expectedShift27 = new sbyte[16] { 27, 28, 29, 30, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    sbyte[] expectedShift5 = new sbyte[16] { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
    sbyte[] expectedAllZero = new sbyte[16];

    using (TestTable<sbyte> sbyteTable = new TestTable<sbyte>(leftInput, rightInput, new sbyte[16]))
    {
        // Compares the table's current output with the expectation, in index order,
        // and reports a failure exactly as each hand-written check did.
        void ExpectOutput(sbyte[] expected)
        {
            bool matches = sbyteTable.CheckResult((x, y, z) =>
            {
                for (int i = 0; i < 16; i++)
                {
                    if (z[i] != expected[i])
                    {
                        return false;
                    }
                }

                return true;
            });

            if (matches)
            {
                return;
            }

            Console.WriteLine("SSE AlignRight failed on sbyte:");
            foreach (var item in sbyteTable.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            testResult = Fail;
        }

        var vf1 = Unsafe.Read<Vector128<sbyte>>(sbyteTable.inArray1Ptr);
        var vf2 = Unsafe.Read<Vector128<sbyte>>(sbyteTable.inArray2Ptr);

        // The immediates stay literal at each call site.
        var vf3 = Ssse3.AlignRight(vf1, vf2, 27);
        Unsafe.Write(sbyteTable.outArrayPtr, vf3);
        ExpectOutput(expectedShift27);

        vf3 = Ssse3.AlignRight(vf1, vf2, 5);
        Unsafe.Write(sbyteTable.outArrayPtr, vf3);
        ExpectOutput(expectedShift5);

        vf3 = Ssse3.AlignRight(vf1, vf2, 250);
        Unsafe.Write(sbyteTable.outArrayPtr, vf3);
        ExpectOutput(expectedAllZero);

        vf3 = Ssse3.AlignRight(vf1, vf2, 228);
        Unsafe.Write(sbyteTable.outArrayPtr, vf3);
        ExpectOutput(expectedAllZero);

        // Same operation as the first check, invoked through reflection.
        var alignRight = typeof(Ssse3).GetMethod(
            nameof(Ssse3.AlignRight),
            new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) });
        vf3 = (Vector128<sbyte>)alignRight.Invoke(null, new object[] { vf1, vf2, (byte)(27) });
        Unsafe.Write(sbyteTable.outArrayPtr, vf3);
        ExpectOutput(expectedShift27);
    }

    return testResult;
}
//
// NOTE(casey): Single block version
//
/// <summary>
/// Hashes <paramref name="SourceInit"/> in one call and returns the 128-bit result.
/// </summary>
/// <param name="Seed128Init">Seed material; the first 128 bytes initialise the eight accumulation lanes.</param>
/// <param name="SourceInit">The bytes to hash. May be empty.</param>
/// <returns>The folded 128-bit hash value.</returns>
/// <exception cref="ArgumentException"><paramref name="Seed128Init"/> holds fewer than 128 bytes.</exception>
public static unsafe Vector128<byte> Hash(ReadOnlySpan<byte> Seed128Init, ReadOnlySpan<byte> SourceInit)
{
    // Eight 16-byte loads at offsets 0x00..0x70 are issued against the seed below;
    // reject anything shorter instead of reading past the end of the span.
    if (Seed128Init.Length < 128)
    {
        throw new ArgumentException("The seed must contain at least 128 bytes.", nameof(Seed128Init));
    }

    Vector128<byte> xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
    Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

    int Len = SourceInit.Length;

    fixed (byte* sourceInitPtr = SourceInit)
    fixed (byte* seedInitPtr = Seed128Init)
    {
        byte* rax = sourceInitPtr;
        byte* rcx = seedInitPtr;

        //
        // NOTE(casey): Seed the eight hash registers
        //
        xmm0 = Sse2.LoadVector128(rcx + 0x00);
        xmm1 = Sse2.LoadVector128(rcx + 0x10);
        xmm2 = Sse2.LoadVector128(rcx + 0x20);
        xmm3 = Sse2.LoadVector128(rcx + 0x30);
        xmm4 = Sse2.LoadVector128(rcx + 0x40);
        xmm5 = Sse2.LoadVector128(rcx + 0x50);
        xmm6 = Sse2.LoadVector128(rcx + 0x60);
        xmm7 = Sse2.LoadVector128(rcx + 0x70);

        // MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);

        //
        // NOTE(casey): Hash all full 256-byte blocks
        //
        int BlockCount = (SourceInit.Length >> 8);
        if (BlockCount > MEOW_PREFETCH_LIMIT)
        {
            // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
            while (BlockCount-- > 0)
            {
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x00);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x40);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x80);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0xc0);

                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                rax += 0x100;
            }
        }
        else
        {
            // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
            while (BlockCount-- > 0)
            {
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                rax += 0x100;
            }
        }

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Load any less-than-32-byte residual
        //
        xmm9 = Vector128<byte>.Zero;
        xmm11 = Vector128<byte>.Zero;

        //
        // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
        // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
        // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
        // to the & 0xf on the align computation.
        //

        // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
        byte* Last = (byte*)sourceInitPtr + (Len & ~0xf);
        int Len8 = (Len & 0xf);
        if (Len8 > 0)
        {
            // NOTE(casey): Load the mask early
            fixed (byte* MeowMaskLen = s_meowMaskLen)
            {
                xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
            }

            // Last address from which a 16-byte load still ends inside the page holding the final input byte.
            byte* LastOk = (byte*)((((ulong)(((byte*)sourceInitPtr) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
            int Align = (Last > LastOk) ? ((int)(ulong)Last) & 0xf : 0;

            fixed (byte* MeowShiftAdjust = s_meowShiftAdjust)
            {
                xmm10 = Sse2.LoadVector128(&MeowShiftAdjust[Align]);
            }

            // Load from the backed-off address, then shuffle the wanted bytes back into position.
            xmm9 = Sse2.LoadVector128(Last - Align);
            xmm9 = Ssse3.Shuffle(xmm9, xmm10);

            // NOTE(jeffr): and off the extra bytes
            xmm9 = Sse2.And(xmm9, xmm8);
        }

        // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
        if ((Len & 0x10) != 0)
        {
            xmm11 = xmm9;
            xmm9 = Sse2.LoadVector128(Last - 0x10);
        }

        //
        // NOTE(casey): Construct the residual and length ingests
        //
        xmm8 = xmm9;
        xmm10 = xmm9;
        xmm8 = Ssse3.AlignRight(xmm8, xmm11, 15);
        xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

        // NOTE(casey): We have room for a 128-bit nonce and a 64-bit nonce here, but
        // the decision was made to leave them zero'd so as not to confuse people
        // about how to use them or what security implications they had.
        xmm12 = Vector128<byte>.Zero;
        xmm13 = Vector128<byte>.Zero;
        xmm14 = Vector128<byte>.Zero;
        xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
        xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
        xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

        // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
        MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

        // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
        MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Hash all full 32-byte blocks
        //
        int LaneCount = (Len >> 5) & 0x7;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

        //
        // NOTE(casey): Mix the eight lanes down to one 128-bit hash
        //
    MixDown:

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
        MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
        MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
        MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
        MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        // Fold the lanes pairwise into xmm0.
        xmm0 = AddQ(xmm0, xmm2);
        xmm1 = AddQ(xmm1, xmm3);
        xmm4 = AddQ(xmm4, xmm6);
        xmm5 = AddQ(xmm5, xmm7);
        xmm0 = Sse2.Xor(xmm0, xmm1);
        xmm4 = Sse2.Xor(xmm4, xmm5);
        xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        return xmm0;
    }
}
/// <summary>
/// Finishes a streaming hash: absorbs the buffered residual and the total
/// length into a copy of the lanes held in <paramref name="state"/>, mixes
/// the lanes down and returns the 128-bit result. The state's lanes are read
/// but not written back.
/// </summary>
/// <param name="state">Streaming state providing the eight lanes, the residual buffer and the total length.</param>
/// <param name="store128">
/// Optional destination for the eight lanes as they stand before the final
/// fold (128 bytes). Pass an empty span to skip the store.
/// </param>
/// <returns>The folded 128-bit hash value.</returns>
/// <exception cref="ArgumentException"><paramref name="store128"/> is non-empty but shorter than 128 bytes.</exception>
public static unsafe Vector128<byte> End(ref State state, Span<byte> store128)
{
    // Eight 16-byte stores are issued against store128 below. The previous
    // "store128 != null" test only excluded the default span, so any shorter
    // span would have been overrun.
    if (!store128.IsEmpty && store128.Length < 128)
    {
        throw new ArgumentException("The lane store must be empty or hold at least 128 bytes.", nameof(store128));
    }

    long Len = state.TotalLengthInBytes;

    Vector128<byte> xmm0 = state.xmm0;
    Vector128<byte> xmm1 = state.xmm1;
    Vector128<byte> xmm2 = state.xmm2;
    Vector128<byte> xmm3 = state.xmm3;
    Vector128<byte> xmm4 = state.xmm4;
    Vector128<byte> xmm5 = state.xmm5;
    Vector128<byte> xmm6 = state.xmm6;
    Vector128<byte> xmm7 = state.xmm7;
    Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

    fixed (byte* rax = state.Buffer)
    {
        xmm9 = Vector128<byte>.Zero;
        xmm11 = Vector128<byte>.Zero;

        // Offset of the last 16-byte-aligned chunk, taken from the low byte of the total length.
        byte* Last = (byte*)rax + (Len & 0xf0);
        long Len8 = (Len & 0xf);
        if (Len8 > 0)
        {
            fixed (byte* MeowMaskLen = s_meowMaskLen)
            {
                xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
            }

            // Load the partial chunk and mask off the bytes beyond the data.
            xmm9 = Sse2.LoadVector128(Last);
            xmm9 = Sse2.And(xmm9, xmm8);
        }

        if ((Len & 0x10) != 0)
        {
            xmm11 = xmm9;
            xmm9 = Sse2.LoadVector128(Last - 0x10);
        }

        // Construct the residual and length ingests.
        xmm8 = xmm9;
        xmm10 = xmm9;
        xmm8 = Ssse3.AlignRight(xmm8, xmm11, 15);
        xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

        xmm12 = Vector128<byte>.Zero;
        xmm13 = Vector128<byte>.Zero;
        xmm14 = Vector128<byte>.Zero;
        xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
        xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
        xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
        MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

        // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
        MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

        // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
        MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Hash all full 32-byte blocks
        //
        long LaneCount = (Len >> 5) & 0x7;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

        //
        // NOTE(casey): Mix the eight lanes down to one 128-bit hash
        //
    MixDown:

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
        MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
        MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
        MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
        MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        // Optionally hand the eight mixed lanes to the caller before they are folded.
        if (!store128.IsEmpty)
        {
            fixed (byte* store128Ptr = store128)
            {
                Sse2.Store(store128Ptr + 0x00, xmm0);
                Sse2.Store(store128Ptr + 0x10, xmm1);
                Sse2.Store(store128Ptr + 0x20, xmm2);
                Sse2.Store(store128Ptr + 0x30, xmm3);
                Sse2.Store(store128Ptr + 0x40, xmm4);
                Sse2.Store(store128Ptr + 0x50, xmm5);
                Sse2.Store(store128Ptr + 0x60, xmm6);
                Sse2.Store(store128Ptr + 0x70, xmm7);
            }
        }

        // Fold the lanes pairwise into xmm0.
        xmm0 = AddQ(xmm0, xmm2);
        xmm1 = AddQ(xmm1, xmm3);
        xmm4 = AddQ(xmm4, xmm6);
        xmm5 = AddQ(xmm5, xmm7);
        xmm0 = Sse2.Xor(xmm0, xmm1);
        xmm4 = Sse2.Xor(xmm4, xmm5);
        xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        return xmm0;
    }
}
/// <summary>
/// Byte-wise <c>Ssse3.AlignRight</c> for vectors of 64-bit lanes: the operands
/// are reinterpreted as signed bytes, aligned with a byte offset of
/// <paramref name="m"/>, and the result is reinterpreted back as 64-bit lanes.
/// </summary>
/// <param name="x">Left operand of <c>AlignRight</c>.</param>
/// <param name="y">Right operand of <c>AlignRight</c>.</param>
/// <param name="m">Byte offset passed to <c>AlignRight</c>.</param>
private static Vector128<ulong> alignr_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
{
    Vector128<sbyte> leftBytes = x.AsSByte();
    Vector128<sbyte> rightBytes = y.AsSByte();

    Vector128<sbyte> alignedBytes = Ssse3.AlignRight(leftBytes, rightBytes, m);
    return alignedBytes.AsUInt64();
}
/// <summary>
/// Thin wrapper that forwards to <c>Ssse3.AlignRight</c> for vectors of
/// 32-bit unsigned lanes.
/// </summary>
/// <param name="left">Left operand of <c>AlignRight</c>.</param>
/// <param name="right">Right operand of <c>AlignRight</c>.</param>
/// <param name="mask">Byte offset passed to <c>AlignRight</c>.</param>
public static Vector128<uint> _mm_alignr_epi8(Vector128<uint> left, Vector128<uint> right, byte mask)
    => Ssse3.AlignRight(left, right, mask);
/// <summary>
/// Byte-wise <c>Ssse3.AlignRight</c> for vectors of 64-bit lanes: the operands
/// are reinterpreted as signed bytes via <c>As&lt;sbyte&gt;()</c>, aligned with
/// a byte offset of <paramref name="m"/>, and the result is reinterpreted back
/// via <c>As&lt;ulong&gt;()</c>.
/// </summary>
/// <param name="x">Left operand of <c>AlignRight</c>.</param>
/// <param name="y">Right operand of <c>AlignRight</c>.</param>
/// <param name="m">Byte offset passed to <c>AlignRight</c>.</param>
private static Vector128<ulong> alignr_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
{
    var leftBytes = x.As<sbyte>();
    var rightBytes = y.As<sbyte>();

    var alignedBytes = Ssse3.AlignRight(leftBytes, rightBytes, m);
    return alignedBytes.As<ulong>();
}
// DecryptPowersTable decrypts ctLen bytes from ct and writes them to pt. While
// decrypting, it updates the POLYVAL value in polyval. In order to decrypt and
// update the POLYVAL value, it uses the expanded key from ks and the table of
// powers in htbl. Decryption processes 6 blocks of data in parallel.
//
// Parameters, as used by the code below:
//   ct      - ciphertext input; read in 16-byte blocks plus an optional short tail.
//   ctLen   - number of ciphertext bytes; consumed 96, then 16, then < 16 bytes at a time.
//   pt      - plaintext output; exactly the bytes consumed from ct are written.
//   polyval - 16-byte running hash value; loaded on entry, stored back on exit.
//   htbl    - table of 16-byte entries; entries 0..5 are read.
//   tag     - 16 bytes from which the initial counter block is derived.
//   ks      - round keys; 15 entries of 16 bytes (offsets 0..14 * 16) are read.
//
// NOTE(review): Sse.StaticCast and Sse2.SetVector128 look like a pre-release
// hardware-intrinsics API surface; confirm the target framework before porting.
private static void DecryptPowersTable(byte *ct, int ctLen, byte *pt, byte *polyval, byte *htbl, byte *tag, byte *ks)
{
    Vector128 <ulong> sCtr1, sCtr2, sCtr3, sCtr4, sCtr5, sCtr6, tmp0, tmp1, tmp2, tmp3, tmp4, h;

    // Constant used as the multiplier in every reduction step below.
    var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));

    // t is the running hash accumulator, seeded from the caller's polyval buffer.
    var t = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));

    // The initial counter block is the tag with one bit forced on
    // (presumably the most significant bit of the last byte; confirm SetVector128 element order).
    var orMask = Sse.StaticCast <uint, byte>(Sse2.SetVector128(0x80000000, 0, 0, 0));
    var ctr = Sse2.Or(Sse2.LoadVector128(tag), orMask);

    // Counter increments, applied with a 32-bit lane-wise add
    // (presumably affecting only the lowest 32-bit element; confirm SetVector128 element order).
    var one = Sse2.SetVector128(0, 0, 0, 1);
    var two = Sse2.SetVector128(0, 0, 0, 2);

    // Number of whole 16-byte blocks processed so far; indexes ct and pt.
    int blocks = 0;

    if (ctLen >= 96)
    {
        // First batch of six blocks: decrypt only. The resulting plaintext is
        // hashed later, either interleaved with the next batch or after the loop.
        var ctr1 = ctr;
        var ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
        var ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
        var ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
        var ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
        var ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
        ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

        // Initial round-key XOR.
        var key = Sse2.LoadVector128(ks);
        ctr1 = Sse2.Xor(ctr1, key);
        ctr2 = Sse2.Xor(ctr2, key);
        ctr3 = Sse2.Xor(ctr3, key);
        ctr4 = Sse2.Xor(ctr4, key);
        ctr5 = Sse2.Xor(ctr5, key);
        ctr6 = Sse2.Xor(ctr6, key);

        // Rounds 1..13 with round keys ks[1..13].
        for (int i = 1; i < 14; ++i)
        {
            key = Sse2.LoadVector128(&ks[i * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);
        }

        // Final round with round key ks[14].
        key = Sse2.LoadVector128(&ks[14 * 16]);
        ctr1 = Aes.EncryptLast(ctr1, key);
        ctr2 = Aes.EncryptLast(ctr2, key);
        ctr3 = Aes.EncryptLast(ctr3, key);
        ctr4 = Aes.EncryptLast(ctr4, key);
        ctr5 = Aes.EncryptLast(ctr5, key);
        ctr6 = Aes.EncryptLast(ctr6, key);

        // Keystream XOR ciphertext gives plaintext; ctr1..ctr6 now hold plaintext blocks.
        ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[0 * 16]));
        ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[1 * 16]));
        ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[2 * 16]));
        ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[3 * 16]));
        ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[4 * 16]));
        ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[5 * 16]));

        Sse2.Store(&pt[0 * 16], ctr1);
        Sse2.Store(&pt[1 * 16], ctr2);
        Sse2.Store(&pt[2 * 16], ctr3);
        Sse2.Store(&pt[3 * 16], ctr4);
        Sse2.Store(&pt[4 * 16], ctr5);
        Sse2.Store(&pt[5 * 16], ctr6);

        ctLen -= 96;
        blocks += 6;

        // Steady state: each iteration hashes the previous batch's six plaintext
        // blocks (sCtr1..sCtr6) while running the AES rounds for the next six
        // counter blocks. The hash steps are interleaved between the rounds.
        while (ctLen >= 96)
        {
            // Save the previous batch's plaintext before ctr1..ctr6 are reused.
            sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
            sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
            sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
            sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
            sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
            sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

            // Next six counter blocks.
            ctr1 = ctr;
            ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
            ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
            ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
            ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
            ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
            ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

            key = Sse2.LoadVector128(ks);
            ctr1 = Sse2.Xor(ctr1, key);
            ctr2 = Sse2.Xor(ctr2, key);
            ctr3 = Sse2.Xor(ctr3, key);
            ctr4 = Sse2.Xor(ctr4, key);
            ctr5 = Sse2.Xor(ctr5, key);
            ctr6 = Sse2.Xor(ctr6, key);

            // Block 6 times htbl[0]. Accumulators: tmp1 = high*high (0x11),
            // tmp2 = low*low (0x00), tmp0 = sum of the two cross products.
            tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
            tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
            tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
            tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
            tmp0 = Sse2.Xor(tmp3, tmp0);

            key = Sse2.LoadVector128(&ks[1 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // Block 5 times htbl[1], added into the same three accumulators.
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);

            key = Sse2.LoadVector128(&ks[2 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // Block 4 times htbl[2].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);

            key = Sse2.LoadVector128(&ks[3 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // Block 3 times htbl[3].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);

            key = Sse2.LoadVector128(&ks[4 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // Block 2 times htbl[4].
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
            tmp0 = Sse2.Xor(tmp0, tmp3);

            key = Sse2.LoadVector128(&ks[5 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            key = Sse2.LoadVector128(&ks[6 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            key = Sse2.LoadVector128(&ks[7 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // The running hash t is folded into block 1 before it is multiplied by htbl[5].
            sCtr1 = Sse2.Xor(t, sCtr1);
            tmp4 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
            tmp0 = Sse2.Xor(tmp3, tmp0);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
            tmp1 = Sse2.Xor(tmp3, tmp1);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
            tmp2 = Sse2.Xor(tmp3, tmp2);
            tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
            tmp0 = Sse2.Xor(tmp3, tmp0);

            key = Sse2.LoadVector128(&ks[8 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // Split the cross-term accumulator tmp0: its upper 8 bytes go into the
            // high accumulator (result in tmp4), its lower 8 bytes into the low
            // accumulator (result in t).
            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
            tmp4 = Sse2.Xor(tmp3, tmp1);
            tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
            t = Sse2.Xor(tmp3, tmp2);

            key = Sse2.LoadVector128(&ks[9 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            // First of two reduction steps: swap the 64-bit halves of t (AlignRight
            // of t with itself by 8 bytes) and XOR with a carry-less product of t and poly.
            tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse2.Xor(tmp1, t);

            key = Sse2.LoadVector128(&ks[10 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            key = Sse2.LoadVector128(&ks[11 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            key = Sse2.LoadVector128(&ks[12 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            key = Sse2.LoadVector128(&ks[13 * 16]);
            ctr1 = Aes.Encrypt(ctr1, key);
            ctr2 = Aes.Encrypt(ctr2, key);
            ctr3 = Aes.Encrypt(ctr3, key);
            ctr4 = Aes.Encrypt(ctr4, key);
            ctr5 = Aes.Encrypt(ctr5, key);
            ctr6 = Aes.Encrypt(ctr6, key);

            key = Sse2.LoadVector128(&ks[14 * 16]);
            ctr1 = Aes.EncryptLast(ctr1, key);
            ctr2 = Aes.EncryptLast(ctr2, key);
            ctr3 = Aes.EncryptLast(ctr3, key);
            ctr4 = Aes.EncryptLast(ctr4, key);
            ctr5 = Aes.EncryptLast(ctr5, key);
            ctr6 = Aes.EncryptLast(ctr6, key);

            // Keystream XOR ciphertext for this batch.
            ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[(blocks + 0) * 16]));
            ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[(blocks + 1) * 16]));
            ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[(blocks + 2) * 16]));
            ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[(blocks + 3) * 16]));
            ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[(blocks + 4) * 16]));
            ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[(blocks + 5) * 16]));

            // Second reduction step, then combine with the high accumulator tmp4.
            // t now covers every batch before this one.
            tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse2.Xor(tmp1, t);
            t = Sse2.Xor(tmp4, t);

            Sse2.Store(&pt[(blocks + 0) * 16], ctr1);
            Sse2.Store(&pt[(blocks + 1) * 16], ctr2);
            Sse2.Store(&pt[(blocks + 2) * 16], ctr3);
            Sse2.Store(&pt[(blocks + 3) * 16], ctr4);
            Sse2.Store(&pt[(blocks + 4) * 16], ctr5);
            Sse2.Store(&pt[(blocks + 5) * 16], ctr6);

            ctLen -= 96;
            blocks += 6;
        }

        // Hash the last six-block batch, which was decrypted but not yet hashed.
        // Same multiply/accumulate/reduce sequence as in the loop, without the
        // interleaved AES rounds.
        sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
        sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
        sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
        sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
        sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
        sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

        tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
        tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
        tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
        tmp0 = Sse2.Xor(tmp3, tmp0);

        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);

        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);

        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);

        h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
        tmp0 = Sse2.Xor(tmp0, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
        tmp0 = Sse2.Xor(tmp0, tmp3);

        sCtr1 = Sse2.Xor(t, sCtr1);
        tmp4 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
        tmp1 = Sse2.Xor(tmp3, tmp1);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
        tmp2 = Sse2.Xor(tmp3, tmp2);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
        tmp0 = Sse2.Xor(tmp3, tmp0);
        tmp3 = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
        tmp0 = Sse2.Xor(tmp3, tmp0);

        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
        tmp4 = Sse2.Xor(tmp3, tmp1);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
        t = Sse2.Xor(tmp3, tmp2);

        // Two reduction steps back to back, then combine with the high accumulator.
        tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse2.Xor(tmp1, t);
        tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse2.Xor(tmp1, t);
        t = Sse2.Xor(tmp4, t);
    }

    // Remaining whole blocks are handled one at a time using only htbl[0].
    h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));

    while (ctLen >= 16)
    {
        // Encrypt the current counter block, then advance the counter by one.
        var tmp = ctr;
        ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
        tmp = Sse2.Xor(tmp, Sse2.LoadVector128(ks));

        for (int i = 1; i < 14; ++i)
        {
            tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
        }

        tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
        tmp = Sse2.Xor(tmp, Sse2.LoadVector128(&ct[blocks * 16]));
        Sse2.Store(&pt[blocks * 16], tmp);

        // t = (t XOR plaintext) times h, followed by the two-step reduction with poly.
        t = Sse2.Xor(Sse.StaticCast <byte, ulong>(tmp), t);
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
        tmp2 = Sse2.Xor(tmp3, tmp2);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp4 = Sse2.Xor(tmp2, tmp4);

        // Shuffle with control 78 (0b01001110) swaps the two 64-bit halves of tmp1.
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        t = Sse2.Xor(tmp1, tmp4);

        ctLen -= 16;
        ++blocks;
    }

    // Final partial block (1..15 bytes), processed through a 16-byte stack buffer.
    if (ctLen > 0)
    {
        byte *b = stackalloc byte[16];
        new Span <byte>(ct + blocks * 16, ctLen).CopyTo(new Span <byte>(b, 16));

        var tmp = Sse2.Xor(ctr, Sse2.LoadVector128(ks));

        for (int i = 1; i < 14; ++i)
        {
            tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
        }

        tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
        tmp = Sse2.Xor(tmp, Sse2.LoadVector128(b));
        Sse2.Store(b, tmp);

        // Only the ctLen valid plaintext bytes are written to pt.
        new Span <byte>(b, ctLen).CopyTo(new Span <byte>(&pt[blocks * 16], ctLen));

        // Zero the bytes past ctLen so the hash sees the plaintext padded with zeros.
        new Span <byte>(b + ctLen, 16 - ctLen).Clear();

        t = Sse2.Xor(Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)), t);
        tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
        tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
        tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
        tmp2 = Sse2.Xor(tmp3, tmp2);
        tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp1 = Sse2.Xor(tmp1, tmp3);
        tmp4 = Sse2.Xor(tmp2, tmp4);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
        tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
        tmp1 = Sse2.Xor(tmp2, tmp3);
        t = Sse2.Xor(tmp1, tmp4);
    }

    // Write the updated hash value back to the caller's buffer.
    Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
}