/// <summary>
/// Truncates <paramref name="x"/> toward zero using the SSE4.1 ROUNDSS instruction.
/// Caller must ensure <see cref="Sse41.IsSupported"/> is true.
/// </summary>
private static float TruncateSse41(float x)
{
    var f = Vector128.CreateScalarUnsafe(x);
    // Use the scalar rounding form: CreateScalarUnsafe leaves the upper three
    // lanes undefined, so the packed RoundToZero would also round garbage lanes.
    // This also matches the sibling helpers (RoundSse41 / CeilingSse41), which
    // already use the *Scalar variants.
    var r = Sse41.RoundToZeroScalar(f);
    return r.ToScalar();
}
/// <summary>
/// Returns the smaller of two ints by performing the comparison in a SIMD lane.
/// Only lane 0 is meaningful: CreateScalarUnsafe leaves the upper lanes
/// undefined, and ToScalar reads just lane 0.
/// </summary>
/// <remarks>
/// NOTE(review): the BCL <c>Avx</c> class has no <c>Min</c> overload taking
/// <c>Vector128&lt;int&gt;</c> (the 128-bit integer min intrinsic is
/// <c>Sse41.Min</c>); presumably <c>Avx</c> here is a project-local wrapper —
/// confirm against the rest of the project.
/// </remarks>
public static int AVXMin(int x, int y)
{
    var v1 = Vector128.CreateScalarUnsafe(x);
    var v2 = Vector128.CreateScalarUnsafe(y);
    return (Avx.Min(v1, v2).ToScalar());
}
/// <summary>
/// Rounds <paramref name="x"/> with SSE4.1 ROUNDSS using the CPU's current
/// MXCSR rounding mode (round-to-nearest-even by default).
/// Caller must ensure <see cref="Sse41.IsSupported"/> is true.
/// </summary>
private static float RoundSse41(float x)
{
    var scalar = Vector128.CreateScalarUnsafe(x);
    return Sse41.RoundCurrentDirectionScalar(scalar).ToScalar();
}
/// <summary>
/// Rounds <paramref name="x"/> toward positive infinity (ceiling) via the
/// SSE4.1 scalar ceiling instruction. Requires <see cref="Sse41.IsSupported"/>.
/// </summary>
private static float CeilingSse41(float x)
{
    var scalar = Vector128.CreateScalarUnsafe(x);
    return Sse41.CeilingScalar(scalar).ToScalar();
}
/// <summary>
/// Computes 1 / sqrt(a) at full precision using the SSE scalar square-root and
/// divide instructions (as opposed to the fast approximate ReciprocalSqrtScalar).
/// </summary>
public float AccurateSse(float a)
{
    var root = Sse.SqrtScalar(Vector128.CreateScalarUnsafe(a));
    var one = Vector128.CreateScalarUnsafe(1f);
    return Sse.DivideScalar(one, root).ToScalar();
}
/// <summary>
/// Replicates <paramref name="value"/> into all four float lanes of a Vector128.
/// Could be implemented with Avx.BroadcastScalarToVector128(&amp;value)
/// (_mm256_broadcast_ps), but vbroadcastf128 forces a memory thunk, so a
/// register-only shuffle is used instead.
/// </summary>
public unsafe static Vector128<float> BroadcastScalarToVector128(float value)
{
    Vector128<float> lane0 = Vector128.CreateScalarUnsafe(value);
    return Avx.Shuffle(lane0, lane0, Constant.Simd128x4.Broadcast0toAll);
}
/// <summary>
/// Validates Fma.MultiplyAdd (float, all three factors equal to b) against the
/// managed reference implementation.
/// </summary>
static void TestExplicitFmaUsage6(ref Vector128<float> a, float b)
{
    float expected = ReferenceMultiplyAdd(b, b, b);
    // The three different Vector128 factory calls appear intentional,
    // presumably to exercise each creation form in the JIT — confirm.
    float actual = Fma.MultiplyAdd(
        Vector128.CreateScalarUnsafe(b),
        Vector128.CreateScalar(b),
        Vector128.Create(b)).ToScalar();
    CompareFloats(expected, actual);
}
/// <summary>
/// Validates Fma.MultiplyAdd (double, negated operands with constant addend
/// -333.0) against the managed reference implementation.
/// </summary>
static void TestExplicitFmaUsage5(ref Vector128<double> a, double b)
{
    double expected = ReferenceMultiplyAdd(-b, -b, -333.0);
    double actual = Fma.MultiplyAdd(
        Vector128.CreateScalarUnsafe(-b),
        Vector128.CreateScalarUnsafe(-b),
        Vector128.CreateScalarUnsafe(-333.0)).ToScalar();
    CompareDoubles(expected, actual);
}
/// <summary>
/// Validates Fma.MultiplyAdd (double, all three factors equal to b) against the
/// managed reference implementation.
/// </summary>
static void TestExplicitFmaUsage6(ref Vector128<double> a, double b)
{
    double expected = ReferenceMultiplyAdd(b, b, b);
    // Mixed factory calls appear intentional, presumably to exercise each
    // Vector128 creation form in the JIT — confirm.
    double actual = Fma.MultiplyAdd(
        Vector128.CreateScalarUnsafe(b),
        Vector128.CreateScalar(b),
        Vector128.Create(b)).ToScalar();
    CompareDoubles(expected, actual);
}
/// <summary>
/// Returns the larger of two floats, preferring the SSE MAXSS instruction when
/// available and falling back to MathF.Max otherwise.
/// </summary>
public static f32 Max_f32(f32 a, f32 b)
{
    if (!Sse.IsSupported)
    {
        return MathF.Max(a, b);
    }
    var va = Vector128.CreateScalarUnsafe(a);
    var vb = Vector128.CreateScalarUnsafe(b);
    return Sse.MaxScalar(va, vb).ToScalar();
}
/// <summary>
/// Clamps <paramref name="value"/> into [min, max] using SSE2 scalar min/max
/// (around 2x faster than managed, benchmarked on i7-4720HQ @ 2.6GHz).
/// Requires <see cref="Sse2.IsSupported"/>.
/// </summary>
private static double Sse2Clamp(double value, double min, double max)
{
    var v = Vector128.CreateScalarUnsafe(value);
    var lo = Vector128.CreateScalarUnsafe(min);
    var hi = Vector128.CreateScalarUnsafe(max);
    // max(lo, min(v, hi)) — same operand order as the original, which matters
    // for MINSD/MAXSD NaN semantics (the second operand wins on NaN).
    var upperBounded = Sse2.MinScalar(v, hi);
    return Sse2.MaxScalar(lo, upperBounded).ToScalar();
}
/// <summary>
/// Clamps <paramref name="value"/> into [min, max] using SSE scalar min/max
/// (around 2x faster than managed, benchmarked on i7-4720HQ @ 2.6GHz).
/// Requires <see cref="Sse.IsSupported"/>.
/// </summary>
private static float SseClamp(float value, float min, float max)
{
    var v = Vector128.CreateScalarUnsafe(value);
    var lo = Vector128.CreateScalarUnsafe(min);
    var hi = Vector128.CreateScalarUnsafe(max);
    // max(lo, min(v, hi)) — operand order preserved for MINSS/MAXSS NaN semantics.
    var upperBounded = Sse.MinScalar(v, hi);
    return Sse.MaxScalar(lo, upperBounded).ToScalar();
}
/// <summary>
/// AVX version of Avx2.BroadcastScalarToVector128(int) (_mm_broadcastd_epi32()).
/// Same code as Vector128.Create(int) in the runtime
/// (https://github.com/dotnet/runtime/blob/master/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs)
/// but without the CPU dispatch, signalling to the compiler that VEX encoding
/// can be used.
/// </summary>
public static Vector128<int> BroadcastScalarToVector128(int value)
{
    // Reinterpret cast without upper zeroing.
    Vector128<int> lane0 = Vector128.CreateScalarUnsafe(value);
    return Avx.Shuffle(lane0, Constant.Simd128x4.Broadcast0toAll);
}
/// <summary>
/// Basic scenario: build a Vector128 from a random UInt16 via CreateScalarUnsafe
/// and validate the produced vector against the input.
/// </summary>
public void RunBasicScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

    UInt16 input = TestLibrary.Generator.GetUInt16();
    Vector128<UInt16> vector = Vector128.CreateScalarUnsafe(input);

    ValidateResult(vector, input);
}
/// <summary>
/// Rounds <paramref name="x"/> toward negative infinity (floor). Uses SSE4.1
/// ROUNDSS when available; otherwise MathF.Round with the directed
/// ToNegativeInfinity mode (equivalent to floor).
/// </summary>
private static float RoundDown(float x)
{
    if (Sse41.IsSupported)
    {
        // Scalar form: CreateScalarUnsafe leaves the upper three lanes undefined,
        // so the packed RoundToNegativeInfinity would needlessly round garbage lanes.
        return Sse41.RoundToNegativeInfinityScalar(Vector128.CreateScalarUnsafe(x)).ToScalar();
    }
    return MathF.Round(x, MidpointRounding.ToNegativeInfinity);
}
/// <summary>
/// Returns a value with the magnitude of <paramref name="x"/> and the sign of
/// <paramref name="y"/>. On SSE/AdvSimd hardware this is a single bitwise
/// select: -0.0f is the sign-bit mask, so that bit is taken from y and every
/// other bit from x.
/// </summary>
public static float CopySign(float x, float y)
{
    if (Sse.IsSupported || AdvSimd.IsSupported)
    {
        return VectorMath.ConditionalSelectBitwise(
            Vector128.CreateScalarUnsafe(-0.0f),
            Vector128.CreateScalarUnsafe(y),
            Vector128.CreateScalarUnsafe(x)).ToScalar();
    }
    else
    {
        return SoftwareFallback(x, y);
    }
} // BUG FIX: this closing brace was missing — the method was never terminated.
/// <summary>
/// Builds the vector [a, 0, a, 0]: element 0 duplicated into element 2 with the
/// odd lanes zero.
/// </summary>
public static Vector128<uint> CreateTwoUInt(uint a)
{
    if (!Sse2.IsSupported)
    {
        return Vector128.Create(a, 0, a, 0);
    }
    // View the scalar as one 64-bit lane and duplicate it with UnpackLow.
    // NOTE(review): CreateScalarUnsafe leaves bits 32..127 formally undefined;
    // in practice the JIT emits a zeroing movd so the odd lanes come out 0 —
    // confirm this assumption is acceptable here.
    var wide = Vector128.CreateScalarUnsafe(a).AsUInt64();
    return Sse2.UnpackLow(wide, wide).AsUInt32();
}
/// <summary>
/// Reinterprets the raw bits of <paramref name="x"/> as a double (bit cast, no
/// numeric conversion).
/// </summary>
static unsafe double asdouble(ulong x)
{
#if SSE
    if (Sse.IsSupported)
    {
        // ToScalar "relies" on Sse (the fallback is garbage).
        return Vector128.CreateScalarUnsafe(x).AsDouble().ToScalar();
    }
    else
#endif
    // Pointer punning fallback; this produces bad codegen on < net5.
    return *(double*)&x;
}
/// <summary>
/// Packs the four ASCII chars held in <paramref name="value"/> (one UTF-16 char
/// per 16 bits) into four bytes and writes them to <paramref name="outputBuffer"/>.
/// Precondition (asserted): every 16-bit char in <paramref name="value"/> is ASCII.
/// </summary>
private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
{
    Debug.Assert(AllCharsInUInt64AreAscii(value));

#if NETCOREAPP3_1
    if (Bmi2.X64.IsSupported)
    {
        // BMI2 will work regardless of the processor's endianness:
        // PEXT gathers the low byte of each 16-bit char into one 32-bit value.
        Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
    }
#else
    if (Sse2.X64.IsSupported)
    {
        // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
        // [ b0 b1 b2 b3 b0 b1 b2 b3 ], then writes 4 bytes (32 bits) to the destination.
        Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16();
        Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32();
        Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow));
    }
    else if (AdvSimd.IsSupported)
    {
        // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
        // [ b0 b1 b2 b3 * * * * ], then writes 4 bytes (32 bits) to the destination.
        Vector128<short> vecWide = Vector128.CreateScalarUnsafe(value).AsInt16();
        Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(vecWide);
        Unsafe.WriteUnaligned<uint>(ref outputBuffer, lower.AsUInt32().ToScalar());
    }
#endif
    else
    {
        // Scalar fallback: peel the low byte of each 16-bit char, writing in
        // whichever order matches the machine's endianness.
        if (BitConverter.IsLittleEndian)
        {
            outputBuffer = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 1) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 2) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 3) = (byte)value;
        }
        else
        {
            Unsafe.Add(ref outputBuffer, 3) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 2) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 1) = (byte)value;
            value >>= 16;
            outputBuffer = (byte)value;
        }
    }
}
/// <summary>
/// Fast approximate reciprocal 1/a. Uses the SSE RCPSS approximation when
/// available; otherwise a bit-trick estimate squared:
/// pow(pow(x, -0.5), 2) = pow(x, -1) = 1.0 / x.
/// </summary>
public static f32 Reciprocal_f32(f32 a)
{
    if (Sse.IsSupported)
    {
        return Sse.ReciprocalScalar(Vector128.CreateScalarUnsafe(a)).ToScalar();
    }
    f32 estimate = Casti32_f32((int)(0xbe6eb3beU - (uint)Castf32_i32(a)) >> 1);
    return estimate * estimate;
}
/// <summary>
/// Reinterprets the bits of an int as a float.
/// Workaround for https://github.com/dotnet/runtime/issues/11413
/// </summary>
public static unsafe float Int32BitsToSingle(int value)
{
    if (Sse2.IsSupported)
    {
        // Bit cast inside a SIMD register; only lane 0 is read back.
        return Vector128.CreateScalarUnsafe(value).AsSingle().ToScalar();
    }
    return *(float*)&value;
}
/// <summary>
/// Reinterprets the raw bits of <paramref name="x"/> as a float (bit cast, no
/// numeric conversion).
/// </summary>
static unsafe float asfloat(uint x)
{
#if SSE
    if (Sse.IsSupported)
    {
        // ToScalar "relies" on Sse (the fallback is garbage).
        return Vector128.CreateScalarUnsafe(x).AsSingle().ToScalar();
    }
    else
#endif
    // Pointer punning fallback; this produces bad codegen on < net5.
    return *(float*)&x;
}
/// <summary>
/// Reinterprets the bits of a long as a double.
/// Workaround for https://github.com/dotnet/runtime/issues/11413
/// </summary>
public static unsafe double Int64BitsToDouble(long value)
{
    if (Sse2.X64.IsSupported)
    {
        // Bit cast inside a SIMD register; only lane 0 is read back.
        return Vector128.CreateScalarUnsafe(value).AsDouble().ToScalar();
    }
    return *(double*)&value;
}
/// <summary>
/// Reinterprets the bits of a float as an int.
/// Workaround for https://github.com/dotnet/runtime/issues/11413
/// </summary>
public static unsafe int SingleToInt32Bits(float value)
{
    if (Sse2.IsSupported)
    {
        Vector128<int> asInt = Vector128.CreateScalarUnsafe(value).AsInt32();
        // MOVD the low lane straight out of the register.
        return Sse2.ConvertToInt32(asInt);
    }
    return *(int*)&value;
}
/// <summary>
/// Reinterprets the bits of a double as a long.
/// Workaround for https://github.com/dotnet/runtime/issues/11413
/// </summary>
public static unsafe long DoubleToInt64Bits(double value)
{
    if (Sse2.X64.IsSupported)
    {
        Vector128<long> asLong = Vector128.CreateScalarUnsafe(value).AsInt64();
        // MOVQ the low lane straight out of the register.
        return Sse2.X64.ConvertToInt64(asLong);
    }
    return *(long*)&value;
}
/// <summary>
/// Bitwise equality of two floats via SSE2 integer compare: identical bit
/// patterns compare equal (so NaN == NaN for the same payload, and +0 != -0).
/// </summary>
public static bool IntrinsicEquality(this float left, float right)
{
    var vLeft = Vector128.CreateScalarUnsafe(left).AsInt32();
    var vRight = Vector128.CreateScalarUnsafe(right).AsInt32();
    vLeft = Sse2.CompareEqual(vLeft, vRight);
    int mask = Sse.MoveMask(vLeft.AsSingle());
    // BUG FIX: MoveMask over Vector128<float> sets only the low 4 bits (0..15),
    // so the previous "mask == -1" test could never be true and the method
    // always returned false. Only lane 0 holds real data (CreateScalarUnsafe
    // leaves lanes 1..3 undefined), so test bit 0 alone.
    return (mask & 1) != 0;
}
/// <summary>
/// Hex-encodes <paramref name="bytes"/> into <paramref name="chars"/> (two hex
/// chars per input byte, upper- or lower-case per <paramref name="casing"/>),
/// processing four input bytes per SSSE3 iteration and finishing any remainder
/// with the scalar ToCharsBuffer helper. Requires bytes.Length >= 4 (asserted).
/// </summary>
private static void EncodeToUtf16_Ssse3(ReadOnlySpan<byte> bytes, Span<char> chars, Casing casing)
{
    Debug.Assert(bytes.Length >= 4);
    nint pos = 0;

    // Spreads the 4 source bytes into lanes 2, 6, 10, 14 (0xFF lanes become 0).
    Vector128<byte> shuffleMask = Vector128.Create(
        0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 1, 0xFF,
        0xFF, 0xFF, 2, 0xFF, 0xFF, 0xFF, 3, 0xFF);

    // Nibble-to-ASCII lookup table, case chosen once up front.
    Vector128<byte> asciiTable = (casing == Casing.Upper) ?
        Vector128.Create((byte)'0', (byte)'1', (byte)'2', (byte)'3',
                         (byte)'4', (byte)'5', (byte)'6', (byte)'7',
                         (byte)'8', (byte)'9', (byte)'A', (byte)'B',
                         (byte)'C', (byte)'D', (byte)'E', (byte)'F') :
        Vector128.Create((byte)'0', (byte)'1', (byte)'2', (byte)'3',
                         (byte)'4', (byte)'5', (byte)'6', (byte)'7',
                         (byte)'8', (byte)'9', (byte)'a', (byte)'b',
                         (byte)'c', (byte)'d', (byte)'e', (byte)'f');

    do
    {
        // Read 32bits from "bytes" span at "pos" offset
        uint block = Unsafe.ReadUnaligned<uint>(
            ref Unsafe.Add(ref MemoryMarshal.GetReference(bytes), pos));

        // Calculate nibbles
        Vector128<byte> lowNibbles = Ssse3.Shuffle(
            Vector128.CreateScalarUnsafe(block).AsByte(), shuffleMask);
        // Shift each source byte down 4 bits (via a 2-byte lane shift + 32-bit
        // logical shift) to line the high nibbles up one lane before the low ones.
        Vector128<byte> highNibbles = Sse2.ShiftRightLogical(
            Sse2.ShiftRightLogical128BitLane(lowNibbles, 2).AsInt32(), 4).AsByte();

        // Lookup the hex values at the positions of the indices
        Vector128<byte> indices = Sse2.And(
            Sse2.Or(lowNibbles, highNibbles), Vector128.Create((byte)0xF));
        Vector128<byte> hex = Ssse3.Shuffle(asciiTable, indices);

        // The high bytes (0x00) of the chars have also been converted
        // to ascii hex '0', so clear them out.
        hex = Sse2.And(hex, Vector128.Create((ushort)0xFF).AsByte());

        // Save to "chars" at pos*2 offset
        Unsafe.WriteUnaligned(
            ref Unsafe.As<char, byte>(
                ref Unsafe.Add(ref MemoryMarshal.GetReference(chars), pos * 2)),
            hex);

        pos += 4;
    } while (pos < bytes.Length - 3);

    // Process trailing elements (bytes.Length % 4)
    for (; pos < bytes.Length; pos++)
    {
        ToCharsBuffer(Unsafe.Add(ref MemoryMarshal.GetReference(bytes), pos), chars, (int)pos * 2, casing);
    }
}
/// <summary>
/// Builds the vector [a, 0, b, 0]: the two scalars in even lanes with the odd
/// lanes zero.
/// </summary>
public static Vector128<uint> CreateTwoUInt(uint a, uint b)
{
    if (!Sse2.IsSupported)
    {
        return Vector128.Create(a, 0, b, 0);
    }
    // Interleave the two scalars as 64-bit lanes.
    // NOTE(review): CreateScalarUnsafe leaves the upper bits formally undefined;
    // in practice the JIT's zeroing movd makes the odd lanes 0 — confirm.
    var loA = Vector128.CreateScalarUnsafe(a);
    var loB = Vector128.CreateScalarUnsafe(b);
    return Sse2.UnpackLow(loA.AsUInt64(), loB.AsUInt64()).AsUInt32();
}
/// <summary>
/// Widens this packed RGBA8 color to an RgbaColor32, using the SSE4.1 byte-to-int
/// zero-extension when available and per-component construction otherwise.
/// </summary>
public RgbaColor32 GetColor32()
{
    if (!Sse41.IsSupported)
    {
        return new RgbaColor32(R, G, B, A);
    }
    // Reinterpret the 4 packed bytes as one uint, load it into lane 0, then
    // zero-extend each byte into its own 32-bit lane.
    uint packed = Unsafe.As<RgbaColor8, uint>(ref this);
    Vector128<byte> asBytes = Vector128.CreateScalarUnsafe(packed).AsByte();
    return new RgbaColor32(Sse41.ConvertToVector128Int32(asBytes));
}
/// <summary>
/// Rounds <paramref name="x"/> to an integer using SSE4.1 ROUNDSD with the mode
/// selected by <paramref name="mpr"/>. Requires <see cref="Sse41.IsSupported"/>.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException">Unknown rounding mode.</exception>
private static double RoundSse41(double x, MidpointRounding mpr)
{
    var f = Vector128.CreateScalarUnsafe(x);
    return (mpr switch
    {
        MidpointRounding.ToEven => Sse41.RoundToNearestIntegerScalar(f),
        // BUG FIX: SSE4.1 has no away-from-zero rounding mode; the previous
        // RoundCurrentDirectionScalar used the MXCSR mode, which is
        // nearest-even by default (2.5 -> 2, not 3). Use the managed rounding
        // for this one mode instead.
        MidpointRounding.AwayFromZero => Vector128.CreateScalarUnsafe(Math.Round(x, MidpointRounding.AwayFromZero)),
        MidpointRounding.ToZero => Sse41.RoundToZeroScalar(f),
        MidpointRounding.ToNegativeInfinity => Sse41.RoundToNegativeInfinityScalar(f),
        MidpointRounding.ToPositiveInfinity => Sse41.RoundToPositiveInfinityScalar(f),
        _ => throw new ArgumentOutOfRangeException(nameof(mpr), mpr, "Midpoint Rounding must be a valid value.")
    }).ToScalar();