/// <summary>
/// Divides every signed 16-bit lane of <paramref name="dividend"/> by 10,
/// truncating toward zero.
/// </summary>
/// <remarks>
/// Uses the multiply-and-shift identity
/// <c>x / 10 == ((x * 26215) >> 18) - (x >> 15)</c>, where 26215 is
/// 2^18 / 10 rounded up. The arithmetic shift floors the product, and
/// subtracting <c>x >> 15</c> (0 for non-negative x, -1 for negative x) adds 1
/// for negative inputs so the result truncates toward zero.
/// Requires SSE2; the wider paths are used when SSE4.1 / AVX2 are available.
/// </remarks>
/// <param name="dividend">Eight signed 16-bit values.</param>
/// <returns>The eight quotients, lane for lane.</returns>
public static Vector128<short> DivideBy10(this Vector128<short> dividend)
{
    if (!Sse41.IsSupported)
    {
        // SSE2-only path: Sse41.MultiplyLow and Sse41.Blend below are not available.
        // pmulhw yields (x * 26215) >> 16 directly in 16-bit lanes (26215 fits in a
        // short and the 32-bit product cannot overflow); two more arithmetic shift
        // steps give the same floor((x * 26215) / 2^18) as the widened computation.
        Vector128<short> scaled = Sse2.MultiplyHigh(dividend, Vector128.Create((short)26215));
        Vector128<short> floored = Sse2.ShiftRightArithmetic(scaled, 2);
        Vector128<short> sign = Sse2.ShiftRightArithmetic(dividend, 15);
        return Sse2.Subtract(floored, sign);
    }

    // Convert to two 32-bit integers: a_hi holds the sign-extended odd 16-bit lanes,
    // a_lo the sign-extended even 16-bit lanes.
    Vector128<int> a_hi = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128<int> a_lo = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    a_lo = Sse2.ShiftRightArithmetic(a_lo, 16);

    Vector128<int> div10_hi;
    Vector128<int> div10_lo;

    if (Avx2.IsSupported)
    {
        // Process both halves at once in a single 256-bit register (lo in the lower half).
        Vector256<int> a = Vector256.Create(a_lo, a_hi);
        Vector256<int> s0 = Avx2.ShiftRightArithmetic(a, 15);
        Vector256<int> factor = Vector256.Create(26215);
        Vector256<int> mul = Avx2.MultiplyLow(a, factor);
        Vector256<int> s1 = Avx2.ShiftRightArithmetic(mul, 18);
        Vector256<int> div10 = Avx2.Subtract(s1, s0);
        div10_hi = div10.GetUpper();
        div10_lo = div10.GetLower();
    }
    else
    {
        Vector128<int> s0_hi = Sse2.ShiftRightArithmetic(a_hi, 15);
        Vector128<int> s0_lo = Sse2.ShiftRightArithmetic(a_lo, 15);
        Vector128<int> factor = Vector128.Create(26215);
        Vector128<int> mul_hi = Sse41.MultiplyLow(a_hi, factor);
        Vector128<int> mul_lo = Sse41.MultiplyLow(a_lo, factor);
        Vector128<int> s1_hi = Sse2.ShiftRightArithmetic(mul_hi, 18);
        Vector128<int> s1_lo = Sse2.ShiftRightArithmetic(mul_lo, 18);
        div10_hi = Sse2.Subtract(s1_hi, s0_hi);
        div10_lo = Sse2.Subtract(s1_lo, s0_lo);
    }

    // Shift the whole register left by 2 bytes so the low 16 bits of every
    // high-lane quotient land in the odd 16-bit positions; the even positions
    // receive spill-over from the neighbouring element, but the blend mask
    // 0xAA only takes the odd positions from div10_hi.
    div10_hi = Sse2.ShiftLeftLogical128BitLane(div10_hi, 2);
    return Sse41.Blend(div10_lo.AsInt16(), div10_hi.AsInt16(), 0xAA);
}
/// <summary>
/// Exercises <c>Sse41.Blend</c> on float, double, short and ushort vectors, both
/// through direct calls and through a reflection-based invocation, and compares
/// each result lane against the expected input lane.
/// </summary>
/// <remarks>
/// In the case comments, each letter describes one result lane starting at lane 0:
/// <c>S</c> means the lane is expected to come from the second operand and
/// <c>D</c> means it is expected to come from the first operand.
/// </remarks>
/// <param name="args">Unused.</param>
/// <returns>
/// <c>Pass</c> when every check succeeds (or SSE4.1 is not supported, in which
/// case nothing is checked); <c>Fail</c> when any check fails.
/// </returns>
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    if (Sse41.IsSupported)
    {
        using (TestTable<float> floatTable = new TestTable<float>(new float[4] { 1, -5, 100, 0 }, new float[4] { 22, -1, -50, 0 }, new float[4]))
        {
            var vf1 = Unsafe.Read<Vector128<float>>(floatTable.inArray1Ptr);
            var vf2 = Unsafe.Read<Vector128<float>>(floatTable.inArray2Ptr);

            // SDDD
            var vf3 = Sse41.Blend(vf1, vf2, 1);
            Unsafe.Write(floatTable.outArrayPtr, vf3);
            if (!floatTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3])))
            {
                ReportBlendFailure("float", floatTable.outArray);
                testResult = Fail;
            }

            // DSDD
            vf3 = Sse41.Blend(vf1, vf2, 2);
            Unsafe.Write(floatTable.outArrayPtr, vf3);
            if (!floatTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]) && (z[2] == x[2]) && (z[3] == x[3])))
            {
                ReportBlendFailure("float", floatTable.outArray);
                testResult = Fail;
            }

            // DDSD
            vf3 = Sse41.Blend(vf1, vf2, 4);
            Unsafe.Write(floatTable.outArrayPtr, vf3);
            if (!floatTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3])))
            {
                ReportBlendFailure("float", floatTable.outArray);
                testResult = Fail;
            }

            // SDSD
            vf3 = Sse41.Blend(vf1, vf2, 85);
            Unsafe.Write(floatTable.outArrayPtr, vf3);
            if (!floatTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3])))
            {
                ReportBlendFailure("float", floatTable.outArray);
                testResult = Fail;
            }

            // SDDD (invoked through reflection)
            vf3 = (Vector128<float>)typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) })
                .Invoke(null, new object[] { vf1, vf2, (byte)(1) });
            Unsafe.Write(floatTable.outArrayPtr, vf3);
            if (!floatTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3])))
            {
                ReportBlendFailure("float", floatTable.outArray);
                testResult = Fail;
            }
        }

        using (TestTable<double> doubleTable = new TestTable<double>(new double[2] { 1, -5 }, new double[2] { 22, -1 }, new double[2]))
        {
            var vf1 = Unsafe.Read<Vector128<double>>(doubleTable.inArray1Ptr);
            var vf2 = Unsafe.Read<Vector128<double>>(doubleTable.inArray2Ptr);

            // DD
            var vf3 = Sse41.Blend(vf1, vf2, 0);
            Unsafe.Write(doubleTable.outArrayPtr, vf3);
            if (!doubleTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1])))
            {
                ReportBlendFailure("double", doubleTable.outArray);
                testResult = Fail;
            }

            // SD
            vf3 = Sse41.Blend(vf1, vf2, 1);
            Unsafe.Write(doubleTable.outArrayPtr, vf3);
            if (!doubleTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1])))
            {
                ReportBlendFailure("double", doubleTable.outArray);
                testResult = Fail;
            }

            // DS
            vf3 = Sse41.Blend(vf1, vf2, 2);
            Unsafe.Write(doubleTable.outArrayPtr, vf3);
            if (!doubleTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1])))
            {
                ReportBlendFailure("double", doubleTable.outArray);
                testResult = Fail;
            }

            // SS
            vf3 = Sse41.Blend(vf1, vf2, 51);
            Unsafe.Write(doubleTable.outArrayPtr, vf3);
            if (!doubleTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == y[1])))
            {
                ReportBlendFailure("double", doubleTable.outArray);
                testResult = Fail;
            }

            // DD (invoked through reflection; mask 0 keeps both lanes of the first operand)
            vf3 = (Vector128<double>)typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) })
                .Invoke(null, new object[] { vf1, vf2, (byte)(0) });
            Unsafe.Write(doubleTable.outArrayPtr, vf3);
            if (!doubleTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1])))
            {
                ReportBlendFailure("double", doubleTable.outArray);
                testResult = Fail;
            }
        }

        using (TestTable<short> shortTable = new TestTable<short>(new short[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new short[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new short[8]))
        {
            var vf1 = Unsafe.Read<Vector128<short>>(shortTable.inArray1Ptr);
            var vf2 = Unsafe.Read<Vector128<short>>(shortTable.inArray2Ptr);

            // SDDD DDDD
            var vf3 = Sse41.Blend(vf1, vf2, 1);
            Unsafe.Write(shortTable.outArrayPtr, vf3);
            if (!shortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                                                     (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("short", shortTable.outArray);
                testResult = Fail;
            }

            // DSDD DDDD
            vf3 = Sse41.Blend(vf1, vf2, 2);
            Unsafe.Write(shortTable.outArrayPtr, vf3);
            if (!shortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                                                     (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("short", shortTable.outArray);
                testResult = Fail;
            }

            // DDSD DDDD
            vf3 = Sse41.Blend(vf1, vf2, 4);
            Unsafe.Write(shortTable.outArrayPtr, vf3);
            if (!shortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3]) &&
                                                     (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("short", shortTable.outArray);
                testResult = Fail;
            }

            // SDSD SDSD
            vf3 = Sse41.Blend(vf1, vf2, 85);
            Unsafe.Write(shortTable.outArrayPtr, vf3);
            if (!shortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3]) &&
                                                     (z[4] == y[4]) && (z[5] == x[5]) && (z[6] == y[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("short", shortTable.outArray);
                testResult = Fail;
            }

            // SDDD DDDD (invoked through reflection)
            vf3 = (Vector128<short>)typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) })
                .Invoke(null, new object[] { vf1, vf2, (byte)(1) });
            Unsafe.Write(shortTable.outArrayPtr, vf3);
            if (!shortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                                                     (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("short", shortTable.outArray);
                testResult = Fail;
            }
        }

        using (TestTable<ushort> ushortTable = new TestTable<ushort>(new ushort[8] { 1, 5, 100, 0, 1, 5, 100, 0 }, new ushort[8] { 22, 1, 50, 0, 22, 1, 50, 0 }, new ushort[8]))
        {
            var vf1 = Unsafe.Read<Vector128<ushort>>(ushortTable.inArray1Ptr);
            var vf2 = Unsafe.Read<Vector128<ushort>>(ushortTable.inArray2Ptr);

            // SDDD DDDD
            var vf3 = Sse41.Blend(vf1, vf2, 1);
            Unsafe.Write(ushortTable.outArrayPtr, vf3);
            if (!ushortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                                                      (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("ushort", ushortTable.outArray);
                testResult = Fail;
            }

            // DSDD DDDD
            vf3 = Sse41.Blend(vf1, vf2, 2);
            Unsafe.Write(ushortTable.outArrayPtr, vf3);
            if (!ushortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                                                      (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("ushort", ushortTable.outArray);
                testResult = Fail;
            }

            // DDSD DDDD
            vf3 = Sse41.Blend(vf1, vf2, 4);
            Unsafe.Write(ushortTable.outArrayPtr, vf3);
            if (!ushortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3]) &&
                                                      (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("ushort", ushortTable.outArray);
                testResult = Fail;
            }

            // SDSD SDSD
            vf3 = Sse41.Blend(vf1, vf2, 85);
            Unsafe.Write(ushortTable.outArrayPtr, vf3);
            if (!ushortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3]) &&
                                                      (z[4] == y[4]) && (z[5] == x[5]) && (z[6] == y[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("ushort", ushortTable.outArray);
                testResult = Fail;
            }

            // SDDD DDDD (invoked through reflection)
            vf3 = (Vector128<ushort>)typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) })
                .Invoke(null, new object[] { vf1, vf2, (byte)(1) });
            Unsafe.Write(ushortTable.outArrayPtr, vf3);
            if (!ushortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                                                      (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
            {
                ReportBlendFailure("ushort", ushortTable.outArray);
                testResult = Fail;
            }
        }
    }

    return testResult;
}

/// <summary>
/// Prints the failure banner for one element type followed by the actual output
/// lanes, each suffixed with ", ", and a terminating newline.
/// </summary>
/// <param name="typeName">Element type name to show in the banner (e.g. "float").</param>
/// <param name="outArray">The output lanes that failed verification.</param>
private static void ReportBlendFailure<T>(string typeName, System.Collections.Generic.IEnumerable<T> outArray)
{
    Console.WriteLine("SSE41 Blend failed on " + typeName + ":");
    foreach (var item in outArray)
    {
        Console.Write(item + ", ");
    }
    Console.WriteLine();
}
/// <summary>
/// Divides each signed 16-bit lane of <paramref name="dividend"/> by the
/// corresponding lane of <paramref name="divisor"/> using single-precision
/// reciprocal multiplication, converting the quotient back with truncation.
/// </summary>
/// <remarks>
/// Based on https://stackoverflow.com/a/51458507/347870.
/// The lanes are widened to 32-bit, converted to float, multiplied by a
/// refined approximate reciprocal of the divisor, and narrowed again.
/// FMA and SSE4.1 are used when available; otherwise plain SSE/SSE2
/// instructions are used.
/// </remarks>
/// <param name="dividend">Eight signed 16-bit numerators.</param>
/// <param name="divisor">Eight signed 16-bit denominators.</param>
/// <returns>The eight quotients, lane for lane.</returns>
public static Vector128<short> Divide(this Vector128<short> dividend, Vector128<short> divisor)
{
    // Convert to two 32-bit integers: *_hi holds the sign-extended odd 16-bit
    // lanes, *_lo the sign-extended even 16-bit lanes.
    Vector128<int> a_hi_epi32 = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32 = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);
    Vector128<int> b_hi_epi32 = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32 = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

    // Convert to 32-bit floats
    Vector128<float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
    Vector128<float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
    Vector128<float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
    Vector128<float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

    // Calculate the (approximate) reciprocal
    Vector128<float> b_hi_rcp = Sse.Reciprocal(b_hi);
    Vector128<float> b_lo_rcp = Sse.Reciprocal(b_lo);

    // One refinement step: rcp' = rcp * (two - rcp * b).
    // The constant is slightly above 2; it is taken from the referenced answer.
    Vector128<float> b_hi_inv_1;
    Vector128<float> b_lo_inv_1;
    Vector128<float> two = Vector128.Create(2.00000051757f);
    if (Fma.IsSupported)
    {
        // -(rcp * b) + two in a single fused operation
        b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
        b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
    }
    else
    {
        Vector128<float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
        Vector128<float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
        b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
        b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
    }

    // Compensate for the loss
    Vector128<float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
    Vector128<float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

    // Perform the division by multiplication
    Vector128<float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
    Vector128<float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

    // Convert back to integers
    Vector128<int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
    Vector128<int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

    // Move the high-lane quotients into the upper 16 bits of each 32-bit element
    Vector128<int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

    // Blend the bits, and return
    if (Sse41.IsSupported)
    {
        // 0xAA takes the odd 16-bit lanes from hi and the even ones from lo.
        return Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA);
    }
    else
    {
        // Keep only the low 16 bits of each low-lane quotient. The mask must be
        // 0x0000FFFF per 32-bit element: a negative quotient is sign-extended,
        // and its upper 16 bits would otherwise be OR-ed into the high-lane result.
        Vector128<int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
        return Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16();
    }
}
/// <summary>
/// C-intrinsic-style alias for <c>Sse41.Blend</c> on four single-precision lanes.
/// For each lane i, the result takes the lane from <paramref name="right"/> when
/// bit i of <paramref name="control"/> is set and from <paramref name="left"/> otherwise.
/// </summary>
/// <param name="left">Lanes used where the control bit is clear.</param>
/// <param name="right">Lanes used where the control bit is set.</param>
/// <param name="control">Per-lane selection bits (bit 0 controls lane 0).</param>
/// <returns>The blended vector.</returns>
public static Vector128<float> _mm_blend_ps(Vector128<float> left, Vector128<float> right, byte control)
    => Sse41.Blend(left, right, control);
/// <summary>
/// C-intrinsic-style alias for <c>Sse41.Blend</c> on two double-precision lanes.
/// For each lane i, the result takes the lane from <paramref name="right"/> when
/// bit i of <paramref name="control"/> is set and from <paramref name="left"/> otherwise.
/// </summary>
/// <param name="left">Lanes used where the control bit is clear.</param>
/// <param name="right">Lanes used where the control bit is set.</param>
/// <param name="control">Per-lane selection bits (bit 0 controls lane 0).</param>
/// <returns>The blended vector.</returns>
public static Vector128<double> _mm_blend_pd(Vector128<double> left, Vector128<double> right, byte control)
    => Sse41.Blend(left, right, control);
/// <summary>
/// Blends two 64-bit-lane vectors at 16-bit granularity: the operands are
/// reinterpreted as eight 16-bit words, word i is taken from <paramref name="y"/>
/// when bit i of <paramref name="m"/> is set and from <paramref name="x"/>
/// otherwise, and the result is reinterpreted back as two 64-bit lanes.
/// </summary>
/// <param name="x">Source of words whose mask bit is clear (not modified).</param>
/// <param name="y">Source of words whose mask bit is set (not modified).</param>
/// <param name="m">Eight selection bits, one per 16-bit word.</param>
/// <returns>The blended vector viewed as two 64-bit lanes.</returns>
private static Vector128<ulong> blend_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
{
    Vector128<ushort> keepWords = x.AsUInt16();
    Vector128<ushort> takeWords = y.AsUInt16();
    Vector128<ushort> mixed = Sse41.Blend(keepWords, takeWords, m);
    return mixed.AsUInt64();
}
/// <summary>
/// C-intrinsic-style alias for <c>Sse41.Blend</c> on eight unsigned 16-bit lanes.
/// For each lane i, the result takes the lane from <paramref name="right"/> when
/// bit i of <paramref name="control"/> is set and from <paramref name="left"/> otherwise.
/// </summary>
/// <param name="left">Lanes used where the control bit is clear.</param>
/// <param name="right">Lanes used where the control bit is set.</param>
/// <param name="control">Per-lane selection bits (bit 0 controls lane 0).</param>
/// <returns>The blended vector.</returns>
public static Vector128<ushort> _mm_blend_epi16(Vector128<ushort> left, Vector128<ushort> right, byte control)
    => Sse41.Blend(left, right, control);
/// <summary>
/// Blends two 64-bit-lane vectors at 16-bit granularity: both operands are
/// reinterpreted as 16-bit words, word i is taken from <paramref name="y"/>
/// when bit i of <paramref name="m"/> is set and from <paramref name="x"/>
/// otherwise, and the result is reinterpreted back as 64-bit lanes.
/// </summary>
/// <param name="x">Source of words whose mask bit is clear (not modified).</param>
/// <param name="y">Source of words whose mask bit is set (not modified).</param>
/// <param name="m">Eight selection bits, one per 16-bit word.</param>
/// <returns>The blended vector viewed as 64-bit lanes.</returns>
private static Vector128<ulong> blend_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
{
    var keepWords = x.As<ushort>();
    var takeWords = y.As<ushort>();
    var mixed = Sse41.Blend(keepWords, takeWords, m);
    return mixed.As<ulong>();
}
/// <summary>
/// Converts one line of 4-channel float pixels to 4-channel byte pixels,
/// dividing the first three channels of every pixel by its fourth channel and
/// scaling all four channels by 255.
/// </summary>
/// <remarks>
/// In the scalar path, a pixel whose fourth channel is below 0.5/255 is written
/// as four zero bytes. The vector paths clamp the fourth channel to 0.5/255 and
/// zero the lanes where the clamped value equals that minimum.
/// </remarks>
/// <param name="ipstart">Start of the input line; read as <c>float</c> values.</param>
/// <param name="opstart">Start of the output line; written as bytes.</param>
/// <param name="cb">Length of the input line in bytes.</param>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    float* ip = (float*)ipstart, ipe = (float*)(ipstart + cb);
    byte* op = opstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var vzero = Vector256<float>.Zero;
        var vmin = Vector256.Create(0.5f / byte.MaxValue);
        var vscale = Vector256.Create((float)byte.MaxValue);
        // Permutation indices used after the packs below; the table is defined in HWIntrinsics.
        var vmaskp = Avx.LoadVector256((int*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

        // Each iteration consumes Vector256<byte>.Count (32) floats, i.e. four
        // 8-float vectors. Pull the end pointer back so that `ip <= ipe` means
        // a full 32-float group is still available.
        ipe -= Vector256<byte>.Count;
        while (ip <= ipe)
        {
            var vf0 = Avx.LoadVector256(ip);
            var vf1 = Avx.LoadVector256(ip + Vector256<float>.Count);
            var vf2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
            var vf3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
            ip += Vector256<byte>.Count;

            // NOTE(review): ShuffleMaskAlpha presumably broadcasts each pixel's
            // fourth channel across that pixel's lanes — mask is defined in HWIntrinsics; confirm.
            var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
            var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
            var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
            var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

            // Clamp the divisor to the minimum so the reciprocal is never taken of a smaller value.
            vfa0 = Avx.Max(vfa0, vmin);
            vfa1 = Avx.Max(vfa1, vmin);
            vfa2 = Avx.Max(vfa2, vmin);
            vfa3 = Avx.Max(vfa3, vmin);

            // Divide by multiplying with the approximate reciprocal.
            vf0 = Avx.Multiply(vf0, Avx.Reciprocal(vfa0));
            vf1 = Avx.Multiply(vf1, Avx.Reciprocal(vfa1));
            vf2 = Avx.Multiply(vf2, Avx.Reciprocal(vfa2));
            vf3 = Avx.Multiply(vf3, Avx.Reciprocal(vfa3));

            // NOTE(review): BlendMaskAlpha presumably selects the fourth lane of each
            // pixel, restoring the (clamped) divisor value there — confirm against HWIntrinsics.
            vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
            vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
            vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
            vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

            // Zero every lane whose clamped divisor equals the minimum.
            vf0 = Avx.BlendVariable(vf0, vzero, HWIntrinsics.AvxCompareEqual(vfa0, vmin));
            vf1 = Avx.BlendVariable(vf1, vzero, HWIntrinsics.AvxCompareEqual(vfa1, vmin));
            vf2 = Avx.BlendVariable(vf2, vzero, HWIntrinsics.AvxCompareEqual(vfa2, vmin));
            vf3 = Avx.BlendVariable(vf3, vzero, HWIntrinsics.AvxCompareEqual(vfa3, vmin));

            // Scale to the byte range.
            vf0 = Avx.Multiply(vf0, vscale);
            vf1 = Avx.Multiply(vf1, vscale);
            vf2 = Avx.Multiply(vf2, vscale);
            vf3 = Avx.Multiply(vf3, vscale);

            // Non-truncating float -> int32 conversion.
            var vi0 = Avx.ConvertToVector256Int32(vf0);
            var vi1 = Avx.ConvertToVector256Int32(vf1);
            var vi2 = Avx.ConvertToVector256Int32(vf2);
            var vi3 = Avx.ConvertToVector256Int32(vf3);

            // Saturating narrow: int32 -> int16 -> uint8.
            var vs0 = Avx2.PackSignedSaturate(vi0, vi1);
            var vs1 = Avx2.PackSignedSaturate(vi2, vi3);

            var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1);
            // The 256-bit packs work on each 128-bit half independently, so the
            // 32-bit groups are reordered with the permutation loaded above.
            vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte();

            Avx.Store(op, vb0);
            op += Vector256<byte>.Count;
        }
        // Restore the real end pointer for the scalar tail below.
        ipe += Vector256<byte>.Count;
    }
    else if (Sse41.IsSupported)
    {
        var vzero = Vector128<float>.Zero;
        var vmin = Vector128.Create(0.5f / byte.MaxValue);
        var vscale = Vector128.Create((float)byte.MaxValue);

        // Same scheme as above with 128-bit vectors: Vector128<byte>.Count (16)
        // floats, i.e. four 4-float vectors, per iteration.
        ipe -= Vector128<byte>.Count;
        while (ip <= ipe)
        {
            var vf0 = Sse.LoadVector128(ip);
            var vf1 = Sse.LoadVector128(ip + Vector128<float>.Count);
            var vf2 = Sse.LoadVector128(ip + Vector128<float>.Count * 2);
            var vf3 = Sse.LoadVector128(ip + Vector128<float>.Count * 3);
            ip += Vector128<byte>.Count;

            var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
            var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
            var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
            var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

            vfa0 = Sse.Max(vfa0, vmin);
            vfa1 = Sse.Max(vfa1, vmin);
            vfa2 = Sse.Max(vfa2, vmin);
            vfa3 = Sse.Max(vfa3, vmin);

            vf0 = Sse.Multiply(vf0, Sse.Reciprocal(vfa0));
            vf1 = Sse.Multiply(vf1, Sse.Reciprocal(vfa1));
            vf2 = Sse.Multiply(vf2, Sse.Reciprocal(vfa2));
            vf3 = Sse.Multiply(vf3, Sse.Reciprocal(vfa3));

            vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
            vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
            vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
            vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

            vf0 = Sse41.BlendVariable(vf0, vzero, Sse.CompareEqual(vfa0, vmin));
            vf1 = Sse41.BlendVariable(vf1, vzero, Sse.CompareEqual(vfa1, vmin));
            vf2 = Sse41.BlendVariable(vf2, vzero, Sse.CompareEqual(vfa2, vmin));
            vf3 = Sse41.BlendVariable(vf3, vzero, Sse.CompareEqual(vfa3, vmin));

            vf0 = Sse.Multiply(vf0, vscale);
            vf1 = Sse.Multiply(vf1, vscale);
            vf2 = Sse.Multiply(vf2, vscale);
            vf3 = Sse.Multiply(vf3, vscale);

            var vi0 = Sse2.ConvertToVector128Int32(vf0);
            var vi1 = Sse2.ConvertToVector128Int32(vf1);
            var vi2 = Sse2.ConvertToVector128Int32(vf2);
            var vi3 = Sse2.ConvertToVector128Int32(vf3);

            var vs0 = Sse2.PackSignedSaturate(vi0, vi1);
            var vs1 = Sse2.PackSignedSaturate(vi2, vi3);

            var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1);

            Sse2.Store(op, vb0);
            op += Vector128<byte>.Count;
        }
        ipe += Vector128<byte>.Count;
    }
#endif

    // Scalar path: handles whatever the vector loops left over, or the whole
    // line when no vector path ran.
    // NOTE(review): the constants are read back through Vector4 rather than
    // written as plain literals; the reason is not visible in this block.
    float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

    while (ip < ipe)
    {
        float f3 = ip[3];
        if (f3 < fmin)
        {
            // Fourth channel too small to divide by: write all four output bytes as zero.
            *(uint*)op = 0;
        }
        else
        {
            // Fold the division and the 255 scale into one factor; adding
            // fround before the truncating cast rounds to nearest.
            float f3i = fmax / f3;
            byte o0 = ClampToByte((int)(ip[0] * f3i + fround));
            byte o1 = ClampToByte((int)(ip[1] * f3i + fround));
            byte o2 = ClampToByte((int)(ip[2] * f3i + fround));
            byte o3 = ClampToByte((int)(f3 * fmax + fround));
            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op[3] = o3;
        }

        ip += 4;
        op += 4;
    }
}
/// <summary>
/// Converts one line of 4-channel byte pixels to 4-channel float pixels,
/// multiplying the first three channels of every pixel by its fourth channel.
/// </summary>
/// <remarks>
/// The scalar path maps each byte to a float through <c>LookupTables.Alpha</c>;
/// the vector paths instead multiply the byte value by 1/255.
/// NOTE(review): these agree only if the table holds i/255 for index i — confirm.
/// </remarks>
/// <param name="ipstart">Start of the input line (bytes).</param>
/// <param name="opstart">Start of the output line; written as <c>float</c> values.</param>
/// <param name="cb">Length of the input line in bytes.</param>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    // Pin the lookup table so it can be indexed through a raw pointer in the scalar loop.
    fixed (float* atstart = &LookupTables.Alpha[0])
    {
        byte* ip = ipstart, ipe = ipstart + cb;
        float* op = (float*)opstart, at = atstart;

#if HWINTRINSICS
        if (Avx2.IsSupported)
        {
            var vscale = Vector256.Create(1f / byte.MaxValue);

            // Each iteration consumes Vector256<byte>.Count (32) input bytes and
            // produces 32 floats. Pull the end pointer back so that `ip <= ipe`
            // means a full 32-byte group is still available.
            ipe -= Vector256<byte>.Count;
            while (ip <= ipe)
            {
                // Widen 8 bytes at a time to eight 32-bit integers.
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 2);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 3);
                ip += Vector256<byte>.Count;

                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);

                // Scale byte values by 1/255.
                vf0 = Avx.Multiply(vf0, vscale);
                vf1 = Avx.Multiply(vf1, vscale);
                vf2 = Avx.Multiply(vf2, vscale);
                vf3 = Avx.Multiply(vf3, vscale);

                // NOTE(review): ShuffleMaskAlpha presumably broadcasts each pixel's
                // fourth channel across that pixel's lanes — mask is defined in HWIntrinsics; confirm.
                var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                vf0 = Avx.Multiply(vf0, vfa0);
                vf1 = Avx.Multiply(vf1, vfa1);
                vf2 = Avx.Multiply(vf2, vfa2);
                vf3 = Avx.Multiply(vf3, vfa3);

                // NOTE(review): BlendMaskAlpha presumably selects the fourth lane of each
                // pixel, so that lane keeps its un-multiplied value — confirm against HWIntrinsics.
                vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                Avx.Store(op, vf0);
                Avx.Store(op + Vector256<int>.Count, vf1);
                Avx.Store(op + Vector256<int>.Count * 2, vf2);
                Avx.Store(op + Vector256<int>.Count * 3, vf3);
                op += Vector256<byte>.Count;
            }
            // Restore the real end pointer for the scalar tail below.
            ipe += Vector256<byte>.Count;
        }
        else if (Sse41.IsSupported)
        {
            var vscale = Vector128.Create(1f / byte.MaxValue);

            // Same scheme with 128-bit vectors: Vector128<byte>.Count (16) input
            // bytes per iteration, widened 4 at a time.
            ipe -= Vector128<byte>.Count;
            while (ip <= ipe)
            {
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 2);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 3);
                ip += Vector128<byte>.Count;

                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);

                vf0 = Sse.Multiply(vf0, vscale);
                vf1 = Sse.Multiply(vf1, vscale);
                vf2 = Sse.Multiply(vf2, vscale);
                vf3 = Sse.Multiply(vf3, vscale);

                var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                vf0 = Sse.Multiply(vf0, vfa0);
                vf1 = Sse.Multiply(vf1, vfa1);
                vf2 = Sse.Multiply(vf2, vfa2);
                vf3 = Sse.Multiply(vf3, vfa3);

                vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                Sse.Store(op, vf0);
                Sse.Store(op + Vector128<int>.Count, vf1);
                Sse.Store(op + Vector128<int>.Count * 2, vf2);
                Sse.Store(op + Vector128<int>.Count * 3, vf3);
                op += Vector128<byte>.Count;
            }
            ipe += Vector128<byte>.Count;
        }
#endif

        // Scalar path: handles whatever the vector loops left over, or the whole
        // line when no vector path ran. One 4-byte pixel per iteration.
        while (ip < ipe)
        {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            float o3 = at[(uint)ip[3]];
            ip += 4;

            // The first three channels are multiplied by the fourth; the fourth is stored as-is.
            op[0] = o0 * o3;
            op[1] = o1 * o3;
            op[2] = o2 * o3;
            op[3] = o3;
            op += 4;
        }
    }
}
/// <summary>
/// C-intrinsic-style alias that forwards to <c>Sse41.Blend</c>: for each lane i,
/// the result takes the lane from <paramref name="right"/> when bit i of
/// <paramref name="control"/> is set and from <paramref name="left"/> otherwise.
/// </summary>
/// <remarks>
/// NOTE(review): <c>__m128</c> is declared outside this block — presumably an
/// alias for <c>Vector128&lt;float&gt;</c>; confirm.
/// </remarks>
/// <param name="left">Lanes used where the control bit is clear.</param>
/// <param name="right">Lanes used where the control bit is set.</param>
/// <param name="control">Per-lane selection bits (bit 0 controls lane 0).</param>
/// <returns>The blended vector.</returns>
public static __m128 _mm_blend_ps(__m128 left, __m128 right, byte control)
{
    return Sse41.Blend(left, right, control);
}
unsafe private static void mixSse41(Blake2sContext *s, uint *m) { var row1 = Sse2.LoadVector128(s->h); var row2 = Sse2.LoadVector128(s->h + 4); var row3 = v128iv0; var row4 = v128iv1; row4 = Sse2.Xor(row4, Sse2.LoadVector128(s->t)); // reads into f[] as well var m0 = Sse2.LoadVector128(m); var m1 = Sse2.LoadVector128(m + 4); var m2 = Sse2.LoadVector128(m + 8); var m3 = Sse2.LoadVector128(m + 12); var r16 = v128rm0; var r8 = v128rm1; //ROUND 1 #if OLD_INTRINSICS var b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_10_00_10_00)); #else var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_11_01_11_01)); #else b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, 
float>(m3), 0b_10_00_10_00)); #else b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_10_00_10_00).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, float>(m3), 0b_11_01_11_01)); #else b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_11_01_11_01).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 2 #if OLD_INTRINSICS var t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_11_00)); #else var t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif var t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); #if OLD_INTRINSICS var t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00)); #else var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = 
Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.Shuffle(m2, 0b_00_00_10_00); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00)); #else t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00)); #else t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 
= Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 3 t0 = Sse2.UnpackHigh(m2, m3); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11)); #else t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = 
Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackLow(m2, m0); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m0), 0b_11_11_00_00)); #else t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8); #if OLD_INTRINSICS b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t2), 0b_11_00_00_00)); #else b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_11_00)); #else t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m1, 12); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_00_11_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else 
row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_11_00_11)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_00_01_10_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 4 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.UnpackHigh(t0, m2); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_00)); #else t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), 
Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_11)); t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00)); #else t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_11_00_01_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 
= Sse2.UnpackLow(m0, m2); t1 = Sse2.UnpackHigh(m1, m2); #if OLD_INTRINSICS b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t1), Sse.StaticCast <uint, ulong>(t0))); #else b0 = Sse2.UnpackLow(t1.AsUInt64(), t0.AsUInt64()).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 5 #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2))); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11)); #else t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, 
ulong>(m3))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m1))); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11)); #else t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32(); b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m3), Sse.StaticCast <uint, ulong>(m1))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m2), Sse.StaticCast <uint, ulong>(m0))); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_11)); #else t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32(); b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), 
Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11)); #else t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11)); #else t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_10_00_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 6 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.UnpackLow(m0, m2); #if OLD_INTRINSICS b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1))); #else b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftRightLogical128BitLane(m2, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, 
ushort>(m3), 0b_00_00_00_11)); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_11_00)); #else t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32(); b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00)); #else t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m3, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_00)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_10_11_00); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2))); #else t0 = 
Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); #endif t1 = Sse2.Shuffle(m3, 0b_00_10_00_01); #if OLD_INTRINSICS b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11)); #else b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 7 t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_11)); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32(); b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00)); #else t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 
0b_00_11_00_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_01_11_00); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2))); #else t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); #if OLD_INTRINSICS b0 = Sse2.Shuffle(Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00)), 0b_10_11_01_00); #else b0 = Sse2.Shuffle(Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(), 0b_10_11_01_00); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackHigh(m1, m2); #if OLD_INTRINSICS t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), 
Sse.StaticCast <uint, ulong>(t0))); #else t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_11_00_01_10); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 8 t0 = Sse2.UnpackHigh(m0, m1); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11)); #else t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_10_00_11_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_00)); #else t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m0, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_00_10_11); //G2 row1 = 
Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m3))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2))); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_11_00)); #else t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_00_10_11_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackLow(m0, m1); t1 = Sse2.UnpackHigh(m1, m2); #if OLD_INTRINSICS b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1))); #else b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, 
uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 9 t0 = Sse2.UnpackHigh(m1, m3); #if OLD_INTRINSICS t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(m0))); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00)); b0 = Sse.StaticCast <ushort, uint>(Sse2.ShuffleHigh(Sse.StaticCast <uint, ushort>(t2), 0b_01_00_11_10)); #else t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackHigh(m0, m3); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_11_11_00_00)); #else t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_00_10_01_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), 
r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00)); #else t0 = Sse41.Blend(m2.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif t1 = Sse2.ShiftLeftLogical128BitLane(t0, 4); #if OLD_INTRINSICS b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11)); #else b0 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_11_00_00)); #else t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t0, 0b_01_00_11_10); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 
0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 10 #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11)); t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_00_11_11)); #else t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_11_00_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_01_10_00_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = 
Sse2.Shuffle(row2, 0b_00_11_10_01); t0 = Sse2.UnpackHigh(m0, m3); t1 = Sse2.UnpackLow(m2, m3); #if OLD_INTRINSICS t2 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1))); #else t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_11_00_10_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00)); #else t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif t1 = Sse2.UnpackLow(m0, m3); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_00_01_10_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); row1 = Sse2.Xor(row1, row3); row2 = Sse2.Xor(row2, row4); row1 = Sse2.Xor(row1, Sse2.LoadVector128(s->h)); row2 = Sse2.Xor(row2, Sse2.LoadVector128(s->h + 4)); 
Sse2.Store(s->h, row1); Sse2.Store(s->h + 4, row2); }