/// <summary>
/// Applies <c>MathV.Exp</c> to <paramref name="power"/> and returns zero in every lane selected by <paramref name="exponentMask"/>.
/// </summary>
/// <param name="power">Per-lane exponents.</param>
/// <param name="exponentMask">Blend control, truncated to a byte. A set bit marks the corresponding lane as masked out.</param>
/// <returns><c>MathV.Exp</c> of the unmasked lanes; masked lanes are zero.</returns>
public static Vector128<float> MaskExp(Vector128<float> power, int exponentMask)
{
    byte blendControl = (byte)exponentMask;

    // Masked lanes are fed 1.0 instead of their real value before exponentiation.
    // NOTE(review): presumably this keeps out-of-range inputs away from MathV.Exp — confirm.
    Vector128<float> ones = AvxExtensions.BroadcastScalarToVector128(1.0F);
    Vector128<float> safePower = Avx.Blend(power, ones, blendControl);

    // The value computed for a masked lane is discarded and replaced with zero.
    Vector128<float> exponentiated = MathV.Exp(safePower);
    return Avx.Blend(exponentiated, Vector128<float>.Zero, blendControl);
}
/// <summary>
/// Returns the transpose of a 4x4 <c>RtMatrix</c>: element (row r, column c) of the result is element (row c, column r) of <paramref name="matrix"/>.
/// </summary>
/// <param name="matrix">Matrix to transpose; received by value and not modified.</param>
/// <returns>A new matrix holding the transposed elements.</returns>
/// <remarks>
/// The AVX2 path loads and stores four elements at a time starting at M11, M21, M31 and M41.
/// NOTE(review): that relies on each row's four fields being contiguous in memory — confirm against the RtMatrix layout.
/// </remarks>
public static unsafe RtMatrix Transpose(RtMatrix matrix)
{
    if (!Avx2.IsSupported || !useIntrinsics)
    {
        // Scalar path: swap the row and column index of every element.
        return new RtMatrix
        {
            M11 = matrix.M11, M12 = matrix.M21, M13 = matrix.M31, M14 = matrix.M41,
            M21 = matrix.M12, M22 = matrix.M22, M23 = matrix.M32, M24 = matrix.M42,
            M31 = matrix.M13, M32 = matrix.M23, M33 = matrix.M33, M34 = matrix.M43,
            M41 = matrix.M14, M42 = matrix.M24, M43 = matrix.M34, M44 = matrix.M44,
        };
    }

    const byte SwapHalves = 0x4E;        // Permute4x64 control: exchange the lower and upper element pairs
    const byte UpperFromSecond = 0x0C;   // Blend control: elements 2 and 3 come from the second operand

    RtMatrix transposed = new RtMatrix();

    var r1 = Avx.LoadVector256(&matrix.M11);
    var r2 = Avx.LoadVector256(&matrix.M21);
    var r3 = Avx.LoadVector256(&matrix.M31);
    var r4 = Avx.LoadVector256(&matrix.M41);

    // Interleave row pairs: "even" holds columns 0 and 2 of both rows, "odd" holds columns 1 and 3.
    var even12 = Avx.UnpackLow(r1, r2);
    var odd12 = Avx.UnpackHigh(r1, r2);
    var even34 = Avx.UnpackLow(r3, r4);
    var odd34 = Avx.UnpackHigh(r3, r4);

    // Each output row takes its first two elements from the rows-1/2 vector and its last two
    // from the rows-3/4 vector; the half swap moves the wanted pair into the position Blend keeps.
    Avx.Store(&transposed.M11, Avx.Blend(even12, Avx2.Permute4x64(even34, SwapHalves), UpperFromSecond));
    Avx.Store(&transposed.M21, Avx.Blend(odd12, Avx2.Permute4x64(odd34, SwapHalves), UpperFromSecond));
    Avx.Store(&transposed.M31, Avx.Blend(Avx2.Permute4x64(even12, SwapHalves), even34, UpperFromSecond));
    Avx.Store(&transposed.M41, Avx.Blend(Avx2.Permute4x64(odd12, SwapHalves), odd34, UpperFromSecond));

    return transposed;
}
/// <summary>
/// Approximates 2^<paramref name="power"/> per lane by splitting each input into its nearest integer and the
/// remaining fraction, building a scale factor from the integer and evaluating a quartic polynomial on the fraction.
/// </summary>
/// <param name="power">
/// Per-lane exponents. Debug builds assert that no lane exceeds <c>MathV.FloatMaximumPower</c>.
/// </param>
/// <returns>
/// The product of the integer-derived scale and the polynomial, with lanes whose input is below
/// <c>-MathV.FloatMaximumPower</c> replaced by zero.
/// </returns>
public unsafe static Vector128<float> Exp2(Vector128<float> power)
{
    // Debug-only precondition: no ordered (non-NaN) lane may be greater than the maximum supported power.
    Debug.Assert(
        Avx.MoveMask(
            Avx.And(
                Avx.CompareGreaterThan(power, AvxExtensions.BroadcastScalarToVector128(MathV.FloatMaximumPower)),
                Avx.CompareOrdered(power, power))) == 0);

    // One bit per lane whose input is below the negative limit; those lanes are zeroed at the end.
    Vector128<float> lowerLimit = AvxExtensions.BroadcastScalarToVector128(-MathV.FloatMaximumPower);
    byte underflowLanes = (byte)Avx.MoveMask(Avx.CompareLessThan(power, lowerLimit));

    // Integer part: add the MathV.FloatMantissaZero128 constant, shift left by MathV.FloatMantissaBits and
    // reinterpret the bits as float.
    // NOTE(review): presumably this writes a biased exponent field, yielding 2^wholePower — confirm the constants.
    Vector128<float> wholePower = Avx.RoundToNearestInteger(power);
    Vector128<int> shiftedInteger = Avx.Add(Avx.ConvertToVector128Int32(wholePower), MathV.FloatMantissaZero128);
    Vector128<float> integerScale = Avx.ShiftLeftLogical(shiftedInteger, MathV.FloatMantissaBits).AsSingle();

    // Fractional part: x = power - round(power), with its powers up to x^4.
    Vector128<float> x = Avx.Subtract(power, wholePower);
    Vector128<float> xSquared = Avx.Multiply(x, x);
    Vector128<float> xCubed = Avx.Multiply(xSquared, x);
    Vector128<float> xFourth = Avx.Multiply(xCubed, x);

    Vector128<float> beta1 = AvxExtensions.BroadcastScalarToVector128(MathV.Exp2Beta1);
    Vector128<float> beta2 = AvxExtensions.BroadcastScalarToVector128(MathV.Exp2Beta2);
    Vector128<float> beta3 = AvxExtensions.BroadcastScalarToVector128(MathV.Exp2Beta3);
    Vector128<float> beta4 = AvxExtensions.BroadcastScalarToVector128(MathV.Exp2Beta4);

    // MathV.One + beta1*x + beta2*x^2 + beta3*x^3 + beta4*x^4, accumulated term by term from lowest degree.
    Vector128<float> polynomial = AvxExtensions.BroadcastScalarToVector128(MathV.One);
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta1, x));
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta2, xSquared));
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta3, xCubed));
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta4, xFourth));

    Vector128<float> result = Avx.Multiply(integerScale, polynomial);

    // Inputs below the negative limit are returned as exactly zero instead of the value computed above.
    return underflowLanes == 0
        ? result
        : Avx.Blend(result, Vector128<float>.Zero, underflowLanes);
}
/// <summary>
/// Checks <c>Avx.Blend</c> on 256-bit float and double vectors for several control bytes, once through direct
/// calls and once through reflection, printing the produced lanes for every mismatch.
/// </summary>
/// <param name="args">Unused.</param>
/// <returns><c>Pass</c> when every check succeeds or AVX is unavailable; otherwise <c>Fail</c>.</returns>
/// <remarks>
/// Lane legend used in the comments below, lowest lane first:
/// S = lane taken from the second operand (vf2), D = lane kept from the first operand (vf1).
/// </remarks>
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    // Prints the failure banner and every produced lane, then marks the whole run as failed.
    void ReportFailure<T>(string message, System.Collections.Generic.IEnumerable<T> produced)
    {
        Console.WriteLine(message);
        foreach (T item in produced)
        {
            Console.Write(item + ", ");
        }
        Console.WriteLine();
        testResult = Fail;
    }

    if (!Avx.IsSupported)
    {
        return testResult;
    }

    using (TestTable<float> floatTable = new TestTable<float>(
        new float[8] { 1, -5, 100, 0, 1, -5, 100, 0 },
        new float[8] { 22, -1, -50, 0, 22, -1, -50, 0 },
        new float[8]))
    {
        var vf1 = Unsafe.Read<Vector256<float>>(floatTable.inArray1Ptr);
        var vf2 = Unsafe.Read<Vector256<float>>(floatTable.inArray2Ptr);

        // SDDD DDDD
        var vf3 = Avx.Blend(vf1, vf2, 1);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) =>
                (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
        {
            ReportFailure("Avx Blend failed on float:", floatTable.outArray);
        }

        // DSDD DDDD
        vf3 = Avx.Blend(vf1, vf2, 2);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) =>
                (z[0] == x[0]) && (z[1] == y[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
        {
            ReportFailure("Avx Blend failed on float:", floatTable.outArray);
        }

        // DDSD DDDD
        vf3 = Avx.Blend(vf1, vf2, 4);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) =>
                (z[0] == x[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3]) &&
                (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
        {
            ReportFailure("Avx Blend failed on float:", floatTable.outArray);
        }

        // SDSD SDSD
        vf3 = Avx.Blend(vf1, vf2, 85);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) =>
                (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == y[2]) && (z[3] == x[3]) &&
                (z[4] == y[4]) && (z[5] == x[5]) && (z[6] == y[6]) && (z[7] == x[7])))
        {
            ReportFailure("Avx Blend failed on float:", floatTable.outArray);
        }

        // SDDD DDDD, invoked through reflection rather than a direct call
        vf3 = (Vector256<float>)typeof(Avx)
            .GetMethod(nameof(Avx.Blend), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) })
            .Invoke(null, new object[] { vf1, vf2, (byte)(1) });
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) =>
                (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3]) &&
                (z[4] == x[4]) && (z[5] == x[5]) && (z[6] == x[6]) && (z[7] == x[7])))
        {
            ReportFailure("Avx Blend failed on float:", floatTable.outArray);
        }
    }

    using (TestTable<double> doubleTable = new TestTable<double>(
        new double[4] { 1, -5, 100, 0 },
        new double[4] { 22, -1, -50, 0 },
        new double[4]))
    {
        var vf1 = Unsafe.Read<Vector256<double>>(doubleTable.inArray1Ptr);
        var vf2 = Unsafe.Read<Vector256<double>>(doubleTable.inArray2Ptr);

        // DD DD
        var vf3 = Avx.Blend(vf1, vf2, 0);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) =>
                (z[0] == x[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3])))
        {
            ReportFailure("Avx Blend failed on double:", doubleTable.outArray);
        }

        // SD DD
        vf3 = Avx.Blend(vf1, vf2, 1);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) =>
                (z[0] == y[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3])))
        {
            ReportFailure("Avx Blend failed on double:", doubleTable.outArray);
        }

        // DS DD
        vf3 = Avx.Blend(vf1, vf2, 2);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) =>
                (z[0] == x[0]) && (z[1] == y[1]) && (z[2] == x[2]) && (z[3] == x[3])))
        {
            ReportFailure("Avx Blend failed on double:", doubleTable.outArray);
        }

        // SS DD — control 51 also has bits 4 and 5 set, but a four-lane vector has no lanes for them
        vf3 = Avx.Blend(vf1, vf2, 51);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) =>
                (z[0] == y[0]) && (z[1] == y[1]) && (z[2] == x[2]) && (z[3] == x[3])))
        {
            ReportFailure("Avx Blend failed on double:", doubleTable.outArray);
        }

        // DD DD, invoked through reflection rather than a direct call
        vf3 = (Vector256<double>)typeof(Avx)
            .GetMethod(nameof(Avx.Blend), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) })
            .Invoke(null, new object[] { vf1, vf2, (byte)(0) });
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) =>
                (z[0] == x[0]) && (z[1] == x[1]) && (z[2] == x[2]) && (z[3] == x[3])))
        {
            ReportFailure("Avx Blend failed on double:", doubleTable.outArray);
        }
    }

    return testResult;
}
/// <summary>
/// Converts one line of four-channel float pixels to four-channel byte pixels. Per the scalar path, the first
/// three channels are divided by the fourth and scaled to the byte range, and the fourth channel is scaled directly.
/// </summary>
/// <param name="ipstart">Start of the input line, read as floats.</param>
/// <param name="opstart">Start of the output line; one byte is written per input float.</param>
/// <param name="cb">Length of the input line in bytes.</param>
/// <remarks>
/// When HWINTRINSICS is defined, an AVX2 or SSE4.1 loop handles 32 or 16 floats per iteration and the scalar loop
/// finishes whatever is left. The vector loops depend on the <c>HWIntrinsics</c> shuffle/blend/permute constants,
/// which are declared elsewhere.
/// NOTE(review): presumably ShuffleMaskAlpha broadcasts channel 3 within each pixel and BlendMaskAlpha selects
/// channel 3 — confirm against HWIntrinsics.
/// </remarks>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    float* src = (float*)ipstart, srcEnd = (float*)(ipstart + cb);
    byte* dst = opstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var zero = Vector256<float>.Zero;
        var alphaFloor = Vector256.Create(0.5f / byte.MaxValue);
        var byteScale = Vector256.Create((float)byte.MaxValue);
        var laneOrder = Avx.LoadVector256((int*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

        // Pull the end back by one full iteration so the loop never reads past the line.
        srcEnd -= Vector256<byte>.Count;
        for (; src <= srcEnd; src += Vector256<byte>.Count, dst += Vector256<byte>.Count)
        {
            var px0 = Avx.LoadVector256(src);
            var px1 = Avx.LoadVector256(src + Vector256<float>.Count);
            var px2 = Avx.LoadVector256(src + Vector256<float>.Count * 2);
            var px3 = Avx.LoadVector256(src + Vector256<float>.Count * 3);

            // Per vector: clamp the shuffled alpha up to the floor, multiply by its reciprocal, blend the
            // alpha back in, zero the lanes whose clamped alpha equals the floor, then scale to 0..255.
            var a0 = Avx.Max(Avx.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px0 = Avx.Multiply(px0, Avx.Reciprocal(a0));
            px0 = Avx.Blend(px0, a0, HWIntrinsics.BlendMaskAlpha);
            px0 = Avx.BlendVariable(px0, zero, HWIntrinsics.AvxCompareEqual(a0, alphaFloor));
            px0 = Avx.Multiply(px0, byteScale);

            var a1 = Avx.Max(Avx.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px1 = Avx.Multiply(px1, Avx.Reciprocal(a1));
            px1 = Avx.Blend(px1, a1, HWIntrinsics.BlendMaskAlpha);
            px1 = Avx.BlendVariable(px1, zero, HWIntrinsics.AvxCompareEqual(a1, alphaFloor));
            px1 = Avx.Multiply(px1, byteScale);

            var a2 = Avx.Max(Avx.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px2 = Avx.Multiply(px2, Avx.Reciprocal(a2));
            px2 = Avx.Blend(px2, a2, HWIntrinsics.BlendMaskAlpha);
            px2 = Avx.BlendVariable(px2, zero, HWIntrinsics.AvxCompareEqual(a2, alphaFloor));
            px2 = Avx.Multiply(px2, byteScale);

            var a3 = Avx.Max(Avx.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px3 = Avx.Multiply(px3, Avx.Reciprocal(a3));
            px3 = Avx.Blend(px3, a3, HWIntrinsics.BlendMaskAlpha);
            px3 = Avx.BlendVariable(px3, zero, HWIntrinsics.AvxCompareEqual(a3, alphaFloor));
            px3 = Avx.Multiply(px3, byteScale);

            // Narrow float -> int32 -> int16 -> byte with saturation, then reorder the 32-bit groups
            // with the permute mask loaded above before storing.
            var shorts01 = Avx2.PackSignedSaturate(Avx.ConvertToVector256Int32(px0), Avx.ConvertToVector256Int32(px1));
            var shorts23 = Avx2.PackSignedSaturate(Avx.ConvertToVector256Int32(px2), Avx.ConvertToVector256Int32(px3));
            var packed = Avx2.PackUnsignedSaturate(shorts01, shorts23);
            Avx.Store(dst, Avx2.PermuteVar8x32(packed.AsInt32(), laneOrder).AsByte());
        }
        srcEnd += Vector256<byte>.Count;
    }
    else if (Sse41.IsSupported)
    {
        var zero = Vector128<float>.Zero;
        var alphaFloor = Vector128.Create(0.5f / byte.MaxValue);
        var byteScale = Vector128.Create((float)byte.MaxValue);

        srcEnd -= Vector128<byte>.Count;
        for (; src <= srcEnd; src += Vector128<byte>.Count, dst += Vector128<byte>.Count)
        {
            var px0 = Sse.LoadVector128(src);
            var px1 = Sse.LoadVector128(src + Vector128<float>.Count);
            var px2 = Sse.LoadVector128(src + Vector128<float>.Count * 2);
            var px3 = Sse.LoadVector128(src + Vector128<float>.Count * 3);

            // Same per-vector sequence as the AVX2 path, one pixel per vector.
            var a0 = Sse.Max(Sse.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px0 = Sse.Multiply(px0, Sse.Reciprocal(a0));
            px0 = Sse41.Blend(px0, a0, HWIntrinsics.BlendMaskAlpha);
            px0 = Sse41.BlendVariable(px0, zero, Sse.CompareEqual(a0, alphaFloor));
            px0 = Sse.Multiply(px0, byteScale);

            var a1 = Sse.Max(Sse.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px1 = Sse.Multiply(px1, Sse.Reciprocal(a1));
            px1 = Sse41.Blend(px1, a1, HWIntrinsics.BlendMaskAlpha);
            px1 = Sse41.BlendVariable(px1, zero, Sse.CompareEqual(a1, alphaFloor));
            px1 = Sse.Multiply(px1, byteScale);

            var a2 = Sse.Max(Sse.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px2 = Sse.Multiply(px2, Sse.Reciprocal(a2));
            px2 = Sse41.Blend(px2, a2, HWIntrinsics.BlendMaskAlpha);
            px2 = Sse41.BlendVariable(px2, zero, Sse.CompareEqual(a2, alphaFloor));
            px2 = Sse.Multiply(px2, byteScale);

            var a3 = Sse.Max(Sse.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha), alphaFloor);
            px3 = Sse.Multiply(px3, Sse.Reciprocal(a3));
            px3 = Sse41.Blend(px3, a3, HWIntrinsics.BlendMaskAlpha);
            px3 = Sse41.BlendVariable(px3, zero, Sse.CompareEqual(a3, alphaFloor));
            px3 = Sse.Multiply(px3, byteScale);

            var shorts01 = Sse2.PackSignedSaturate(Sse2.ConvertToVector128Int32(px0), Sse2.ConvertToVector128Int32(px1));
            var shorts23 = Sse2.PackSignedSaturate(Sse2.ConvertToVector128Int32(px2), Sse2.ConvertToVector128Int32(px3));
            Sse2.Store(dst, Sse2.PackUnsignedSaturate(shorts01, shorts23));
        }
        srcEnd += Vector128<byte>.Count;
    }
#endif

    // Scalar path: finishes the pixels left by a vector loop, or handles the whole line when none ran.
    float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

    for (; src < srcEnd; src += 4, dst += 4)
    {
        float alpha = src[3];
        if (alpha < fmin)
        {
            // Alpha too small to survive rounding: write all four output bytes as zero.
            *(uint*)dst = 0;
            continue;
        }

        // Read and convert every channel before writing any output byte.
        float alphaInverse = fmax / alpha;
        byte b0 = ClampToByte((int)(src[0] * alphaInverse + fround));
        byte b1 = ClampToByte((int)(src[1] * alphaInverse + fround));
        byte b2 = ClampToByte((int)(src[2] * alphaInverse + fround));
        byte b3 = ClampToByte((int)(alpha * fmax + fround));

        dst[0] = b0;
        dst[1] = b1;
        dst[2] = b2;
        dst[3] = b3;
    }
}
/// <summary>
/// Converts one line of four-channel byte pixels to four-channel float pixels. Per the scalar path, each channel
/// is mapped through <c>LookupTables.Alpha</c>, the first three channels are multiplied by the mapped fourth
/// channel, and the fourth channel is written as mapped.
/// </summary>
/// <param name="ipstart">Start of the input line, read as bytes.</param>
/// <param name="opstart">Start of the output line; one float is written per input byte.</param>
/// <param name="cb">Length of the input line in bytes.</param>
/// <remarks>
/// When HWINTRINSICS is defined, an AVX2 or SSE4.1 loop handles 32 or 16 bytes per iteration, scaling by 1/255
/// instead of using the lookup table, and the scalar loop finishes whatever is left.
/// NOTE(review): presumably ShuffleMaskAlpha broadcasts channel 3 within each pixel and BlendMaskAlpha selects
/// channel 3 — confirm against HWIntrinsics.
/// </remarks>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    fixed (float* atstart = &LookupTables.Alpha[0])
    {
        byte* src = ipstart, srcEnd = ipstart + cb;
        float* dst = (float*)opstart, table = atstart;

#if HWINTRINSICS
        if (Avx2.IsSupported)
        {
            var unitScale = Vector256.Create(1f / byte.MaxValue);

            // Pull the end back by one full iteration so the loop never reads past the line.
            srcEnd -= Vector256<byte>.Count;
            for (; src <= srcEnd; src += Vector256<byte>.Count, dst += Vector256<byte>.Count)
            {
                // Widen 8 bytes at a time to int32, convert to float and scale by 1/255.
                var px0 = Avx.Multiply(Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src)), unitScale);
                var px1 = Avx.Multiply(Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + Vector256<int>.Count)), unitScale);
                var px2 = Avx.Multiply(Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 2)), unitScale);
                var px3 = Avx.Multiply(Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 3)), unitScale);

                // Multiply every channel by the shuffled alpha, then blend the plain alpha back in.
                var a0 = Avx.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
                px0 = Avx.Blend(Avx.Multiply(px0, a0), a0, HWIntrinsics.BlendMaskAlpha);

                var a1 = Avx.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
                px1 = Avx.Blend(Avx.Multiply(px1, a1), a1, HWIntrinsics.BlendMaskAlpha);

                var a2 = Avx.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
                px2 = Avx.Blend(Avx.Multiply(px2, a2), a2, HWIntrinsics.BlendMaskAlpha);

                var a3 = Avx.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);
                px3 = Avx.Blend(Avx.Multiply(px3, a3), a3, HWIntrinsics.BlendMaskAlpha);

                Avx.Store(dst, px0);
                Avx.Store(dst + Vector256<int>.Count, px1);
                Avx.Store(dst + Vector256<int>.Count * 2, px2);
                Avx.Store(dst + Vector256<int>.Count * 3, px3);
            }
            srcEnd += Vector256<byte>.Count;
        }
        else if (Sse41.IsSupported)
        {
            var unitScale = Vector128.Create(1f / byte.MaxValue);

            srcEnd -= Vector128<byte>.Count;
            for (; src <= srcEnd; src += Vector128<byte>.Count, dst += Vector128<byte>.Count)
            {
                // Same sequence as the AVX2 path, 4 bytes (one pixel) per vector.
                var px0 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src)), unitScale);
                var px1 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src + Vector128<int>.Count)), unitScale);
                var px2 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 2)), unitScale);
                var px3 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 3)), unitScale);

                var a0 = Sse.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
                px0 = Sse41.Blend(Sse.Multiply(px0, a0), a0, HWIntrinsics.BlendMaskAlpha);

                var a1 = Sse.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
                px1 = Sse41.Blend(Sse.Multiply(px1, a1), a1, HWIntrinsics.BlendMaskAlpha);

                var a2 = Sse.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
                px2 = Sse41.Blend(Sse.Multiply(px2, a2), a2, HWIntrinsics.BlendMaskAlpha);

                var a3 = Sse.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);
                px3 = Sse41.Blend(Sse.Multiply(px3, a3), a3, HWIntrinsics.BlendMaskAlpha);

                Sse.Store(dst, px0);
                Sse.Store(dst + Vector128<int>.Count, px1);
                Sse.Store(dst + Vector128<int>.Count * 2, px2);
                Sse.Store(dst + Vector128<int>.Count * 3, px3);
            }
            srcEnd += Vector128<byte>.Count;
        }
#endif

        // Scalar path: finishes the pixels left by a vector loop, or handles the whole line when none ran.
        for (; src < srcEnd; src += 4, dst += 4)
        {
            // Look up all four channels before writing any output.
            float c0 = table[(uint)src[0]];
            float c1 = table[(uint)src[1]];
            float c2 = table[(uint)src[2]];
            float alpha = table[(uint)src[3]];

            dst[0] = c0 * alpha;
            dst[1] = c1 * alpha;
            dst[2] = c2 * alpha;
            dst[3] = alpha;
        }
    }
}