public static Vector256<double> DotProduct2D(Vector256<double> left, Vector256<double> right)
{
    // SSE4.1 has a native dot product instruction, dppd
    if (Sse41.IsSupported)
    {
        // This multiplies the first 2 elems of each and broadcasts it into each element of the returning vector
        const byte control = 0b_0011_1111;
        Vector2D dp = Sse41.DotProduct(left.GetLower(), right.GetLower(), control);
        return Helpers.DuplicateToVector256(dp);
    }
    else if (Sse3.IsSupported)
    {
        Vector2D tmp = Sse2.Multiply(left.GetLower(), right.GetLower());
        return Helpers.DuplicateToVector256(Sse3.HorizontalAdd(tmp, tmp));
    }
    else if (Sse2.IsSupported)
    {
        Vector2D tmp = Sse2.Multiply(left.GetLower(), right.GetLower());
        Vector2D shuf = Sse2.Shuffle(tmp, tmp, ShuffleValues.YXYX);
        var dot = Sse2.Add(tmp, shuf);
        return dot.ToVector256Unsafe().WithUpper(dot);
    }

    return DotProduct2D_Software(left, right);
}
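For reference, a minimal scalar sketch of what a software fallback such as DotProduct2D_Software presumably computes (the actual fallback is not shown in this snippet): the 2D dot product of the two low lanes, broadcast into every element of the result.

static Vector256<double> DotProduct2D_ScalarSketch(Vector256<double> left, Vector256<double> right)
{
    double dp = left.GetElement(0) * right.GetElement(0)
              + left.GetElement(1) * right.GetElement(1);
    return Vector256.Create(dp); // broadcast, matching the hardware paths above
}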
internal static unsafe float AbsMaxAvx(this ReadOnlySpan<float> array)
{
    const int StepSize = 8; // Vector256<float>.Count
    Debug.Assert(array.Length >= StepSize, "Input can't be smaller than the vector size.");

    // Constant used to get the absolute value of a Vector256<float>
    Vector256<float> neg = Vector256.Create(-0.0f);

    int len = array.Length;
    int rem = len % StepSize;
    int fit = len - rem;

    fixed (float* p = array)
    {
        Vector256<float> maxVec = Avx.AndNot(neg, Avx.LoadVector256(p));
        for (int i = StepSize; i < fit; i += StepSize)
        {
            maxVec = Avx.Max(maxVec, Avx.AndNot(neg, Avx.LoadVector256(p + i)));
        }

        // Handle the remainder with one overlapping load from the end of the span.
        if (rem != 0)
        {
            maxVec = Avx.Max(maxVec, Avx.AndNot(neg, Avx.LoadVector256(p + len - StepSize)));
        }

        Vector128<float> maxVec128 = Avx.Max(maxVec.GetLower(), maxVec.GetUpper());
        maxVec128 = Avx.Max(maxVec128, Avx.Permute(maxVec128, 0b00001110));
        maxVec128 = Avx.Max(maxVec128, Avx.Permute(maxVec128, 0b00000001));
        return maxVec128.GetElement(0);
    }
}
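A plain scalar version (not part of the original) is a useful sketch for cross-checking the vectorized reduction; it assumes the same non-empty input.

internal static float AbsMaxScalar(ReadOnlySpan<float> array)
{
    // Note: NaN handling may differ from the AVX Max path.
    float max = Math.Abs(array[0]);
    for (int i = 1; i < array.Length; i++)
    {
        max = Math.Max(max, Math.Abs(array[i]));
    }

    return max;
}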
public static Vector256<float> Multiply(Vector256<float> left, Vector256<float> right)
{
    if (Avx.IsSupported)
    {
        return Avx.Multiply(left, right);
    }

    return FromLowHigh(Multiply(left.GetLower(), right.GetLower()), Multiply(left.GetUpper(), right.GetUpper()));
}
public static Vector256<float> Add(Vector256<float> left, Vector256<float> right)
{
    if (Avx.IsSupported)
    {
        return Avx.Add(left, right);
    }

    return FromLowHigh(Add(left.GetLower(), right.GetLower()), Add(left.GetUpper(), right.GetUpper()));
}
public static Vector256<float> Subtract(Vector256<float> left, Vector256<float> right)
{
    if (Avx.IsSupported)
    {
        return Avx.Subtract(left, right);
    }

    return FromLowHigh(Subtract(left.GetLower(), right.GetLower()), Subtract(left.GetUpper(), right.GetUpper()));
}
public static int EvenReduceSum(Vector256<int> accumulator)
{
    Vector128<int> vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper()); // add upper lane to lower lane
    vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));                      // add high to low

    // Vector128<int>.ToScalar() isn't optimized pre-net5.0 https://github.com/dotnet/runtime/pull/37882
    return Sse2.ConvertToInt32(vsum);
}
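Reading the add/shuffle pair above, the helper ends up summing the even-indexed lanes of the 256-bit vector; a hypothetical scalar equivalent for illustration:

public static int EvenReduceSumScalar(Vector256<int> accumulator)
{
    // (a0 + a4) + (a2 + a6): lanes 0, 2, 4 and 6.
    return accumulator.GetElement(0) + accumulator.GetElement(2)
         + accumulator.GetElement(4) + accumulator.GetElement(6);
}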
public static Vector256<float> Divide(Vector256<float> dividend, Vector256<float> divisor)
{
    if (Avx.IsSupported)
    {
        return Avx.Divide(dividend, divisor);
    }

    return FromLowHigh(Divide(dividend.GetLower(), divisor.GetLower()), Divide(dividend.GetUpper(), divisor.GetUpper()));
}
public static Vector256<float> CompareEqual(Vector256<float> left, Vector256<float> right)
{
    if (Avx.IsSupported)
    {
        return Avx.Compare(left, right, FloatComparisonMode.UnorderedEqualNonSignaling);
    }

    return FromLowHigh(CompareEqual(left.GetLower(), right.GetLower()), CompareEqual(left.GetUpper(), right.GetUpper()));
}
public unsafe static Vector256<byte> Reverse(this Vector256<byte> source)
{
    var shuffleMask = stackalloc byte[] { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    var shuffleMaskVector = Avx2.LoadVector128(shuffleMask);

    return JoinMask(
        Avx2.Shuffle(source.GetUpper(), shuffleMaskVector),
        Avx2.Shuffle(source.GetLower(), shuffleMaskVector));
}
public static int ReduceSum(Vector256<int> accumulator)
{
    // Add upper lane to lower lane.
    Vector128<int> vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper());

    // Add odd to even.
    vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_11_01_01));

    // Add high to low.
    vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));

    return Sse2.ConvertToInt32(vsum);
}
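The same reduction expressed as a scalar loop, shown only as a sketch of what the three add/shuffle steps compute:

public static int ReduceSumScalar(Vector256<int> accumulator)
{
    int sum = 0;
    for (int i = 0; i < Vector256<int>.Count; i++)
    {
        sum += accumulator.GetElement(i);
    }

    return sum;
}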
public static double ReduceSum(this Vector<double> vector)
{
#if NETCOREAPP3_0
    if (Avx.IsSupported)
    {
        Vector256<double> a = Unsafe.As<Vector<double>, Vector256<double>>(ref vector);
        Vector256<double> tmp = Avx.HorizontalAdd(a, a);
        Vector128<double> hi128 = tmp.GetUpper();
        Vector128<double> lo128 = tmp.GetLower();
        Vector128<double> s = Sse2.Add(lo128, hi128);
        return s.ToScalar();
    }
#endif
    return Vector.Dot(Vector<double>.One, vector);
}
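A hypothetical call site for the extension above; the input values are made up for illustration.

public static double ReduceSumExample()
{
    // Assumes Vector<double>.Count <= 4 on this machine; the constructor ignores extra array elements.
    var v = new Vector<double>(new double[] { 1.0, 2.0, 3.0, 4.0 });
    return v.ReduceSum(); // 10.0 when Vector<double> has four lanes (AVX)
}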
// A little look into C# intrinsics, .NET Core 3+ only
private static void AdjustVolumeAllSamples(float* inbuffer, float* outbuffer, int length, float volume)
{
    int i = 0;

    if (Avx.IsSupported)
    {
        Vector256<float> volVec = Vector256.Create(volume);
        while (length - i >= Vector256<float>.Count)
        {
            Vector256<float> tmp = Avx.Multiply(volVec, Avx.LoadVector256(inbuffer + i)); // Load from input, multiply by volume
            Avx.Store(outbuffer + i, tmp);                                                // Store in output
            i += Vector256<float>.Count;                                                  // Increment index by the number of vector elements
        }

        if (length - i >= Vector128<float>.Count)
        {
            Vector128<float> tmp = Sse.Multiply(volVec.GetLower(), Sse.LoadVector128(inbuffer + i));
            Sse.Store(outbuffer + i, tmp);
            i += Vector128<float>.Count;
        }
    }
    else if (Sse.IsSupported)
    {
        Vector128<float> volVec = Vector128.Create(volume); // Broadcast the volume value across all vector elements
        while (length - i >= Vector128<float>.Count)
        {
            Vector128<float> tmp = Sse.Multiply(volVec, Sse.LoadVector128(inbuffer + i)); // Load from input, multiply by volume
            Sse.Store(outbuffer + i, tmp);                                                // Store in output
            i += Vector128<float>.Count;
        }
    }

    // Process remaining samples, if any
    while (i < length)
    {
        outbuffer[i] = volume * inbuffer[i];
        i += 1;
    }
}
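A hypothetical caller showing how the pointer-based API above might be fed from managed arrays (the wrapper and buffer names are made up for this example):

static unsafe void ApplyVolume(float[] input, float[] output, float volume)
{
    fixed (float* pin = input)
    fixed (float* pout = output)
    {
        AdjustVolumeAllSamples(pin, pout, input.Length, volume);
    }
}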
public static Vector256<double> Add(Vector256<double> first, Vector256<double> second)
{
    return System.Runtime.Intrinsics.X86.Avx.IsSupported
        ? System.Runtime.Intrinsics.X86.Avx.Add(first, second)
        : System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.IsSupported
            ? Vector256.Create(
                System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Add(first.GetLower(), second.GetLower()),
                System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Add(first.GetUpper(), second.GetUpper()))
            : Vector256.Create(
                first.GetElement(0) + second.GetElement(0),
                first.GetElement(1) + second.GetElement(1),
                first.GetElement(2) + second.GetElement(2),
                first.GetElement(3) + second.GetElement(3));
}
public static Vector256<double> Scale(this Vector256<double> value, double scale)
{
    return System.Runtime.Intrinsics.X86.Avx.IsSupported
        ? System.Runtime.Intrinsics.X86.Avx.Multiply(value, Vector256.Create(scale))
        : System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.IsSupported
            ? Vector256.Create(
                System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Multiply(value.GetLower(), Vector128.Create(scale)),
                System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Multiply(value.GetUpper(), Vector128.Create(scale)))
            : Vector256.Create(
                value.GetElement(0) * scale,
                value.GetElement(1) * scale,
                value.GetElement(2) * scale,
                value.GetElement(3) * scale);
}
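An illustrative combination of the Add and Scale helpers above, assuming both live in the same static class; the wrapper name is made up:

public static Vector256<double> AddScaledExample(Vector256<double> x, Vector256<double> y, double a)
{
    // y + a * x, using whichever path (AVX, AdvSimd or scalar) the helpers select at runtime.
    return Add(y, x.Scale(a));
}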
public static Vector128<short> DivideBy10(this Vector128<short> dividend)
{
    // Sign-extend the 16-bit lanes into two vectors of 32-bit integers (odd lanes in a_hi, even lanes in a_lo)
    Vector128<int> a_hi = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128<int> a_lo = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    a_lo = Sse2.ShiftRightArithmetic(a_lo, 16);

    Vector128<int> div10_hi;
    Vector128<int> div10_lo;
    if (Avx2.IsSupported)
    {
        Vector256<int> a = Vector256.Create(a_lo, a_hi);
        Vector256<int> s0 = Avx2.ShiftRightArithmetic(a, 15);
        Vector256<int> factor = Vector256.Create(26215);
        Vector256<int> mul = Avx2.MultiplyLow(a, factor);
        Vector256<int> s1 = Avx2.ShiftRightArithmetic(mul, 18);
        Vector256<int> div10 = Avx2.Subtract(s1, s0);
        div10_hi = div10.GetUpper();
        div10_lo = div10.GetLower();
    }
    else
    {
        Vector128<int> s0_hi = Sse2.ShiftRightArithmetic(a_hi, 15);
        Vector128<int> s0_lo = Sse2.ShiftRightArithmetic(a_lo, 15);
        Vector128<int> factor = Vector128.Create(26215);
        Vector128<int> mul_hi = Sse41.MultiplyLow(a_hi, factor);
        Vector128<int> mul_lo = Sse41.MultiplyLow(a_lo, factor);
        Vector128<int> s1_hi = Sse2.ShiftRightArithmetic(mul_hi, 18);
        Vector128<int> s1_lo = Sse2.ShiftRightArithmetic(mul_lo, 18);
        div10_hi = Sse2.Subtract(s1_hi, s0_hi);
        div10_lo = Sse2.Subtract(s1_lo, s0_lo);
    }

    //div10_hi = Sse2.ShiftLeftLogical(div10_hi, 16);
    div10_hi = Sse2.ShiftLeftLogical128BitLane(div10_hi, 2);
    return Sse41.Blend(div10_lo.AsInt16(), div10_hi.AsInt16(), 0xAA);
}
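A scalar sketch of the fixed-point trick used above (my derivation, not from the original source): 26215 / 2^18 ≈ 0.100002, so (x * 26215) >> 18 approximates x / 10 for any 16-bit input, and subtracting x >> 15 corrects the arithmetic shift's rounding toward negative infinity into truncation toward zero for negative values.

static short DivideBy10Scalar(short x)
{
    int s0 = x >> 15;            // -1 for negative x, 0 otherwise
    int s1 = (x * 26215) >> 18;  // ≈ floor(x / 10)
    return (short)(s1 - s0);     // truncated x / 10, matching C# integer division by 10
}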
public (double near, double far) IntersectAVX(Ray ray)
{
    Vector256<double> origin = (Vector256<double>)ray.Origin;
    Vector256<double> direction = (Vector256<double>)ray.Direction;
    Vector256<double> zeroes = new Vector256<double>();
    Vector256<double> min = (Vector256<double>)Minimum;
    Vector256<double> max = (Vector256<double>)Maximum;

    // Replace slabs that won't be checked (0 direction axis) with infinity so that NaN doesn't propagate
    Vector256<double> dirInfMask = Avx.And(
        Avx.Compare(direction, zeroes, FloatComparisonMode.OrderedEqualNonSignaling),
        Avx.And(
            Avx.Compare(origin, min, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling),
            Avx.Compare(origin, max, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling)));
    min = Avx.BlendVariable(min, SIMDHelpers.BroadcastScalar4(double.NegativeInfinity), dirInfMask);
    max = Avx.BlendVariable(max, SIMDHelpers.BroadcastScalar4(double.PositiveInfinity), dirInfMask);

    // Flip slabs in direction axes that are negative (using direction as mask takes the most significant bit, the sign; probably includes -0)
    Vector256<double> minMasked = Avx.BlendVariable(min, max, direction);
    Vector256<double> maxMasked = Avx.BlendVariable(max, min, direction);

    direction = Avx.Divide(Vector256.Create(1D), direction);
    Vector256<double> near4 = Avx.Multiply(Avx.Subtract(minMasked, origin), direction);
    Vector256<double> far4 = Avx.Multiply(Avx.Subtract(maxMasked, origin), direction);

    Vector128<double> near2 = Sse2.Max(near4.GetLower(), near4.GetUpper());
    near2 = Sse2.MaxScalar(near2, SIMDHelpers.Swap(near2));
    Vector128<double> far2 = Sse2.Min(far4.GetLower(), far4.GetUpper());
    far2 = Sse2.MinScalar(far2, SIMDHelpers.Swap(far2));

    if (Sse2.CompareScalarOrderedGreaterThan(near2, far2) | Sse2.CompareScalarOrderedLessThan(far2, new Vector128<double>()))
    {
        return (double.NaN, double.NaN);
    }

    return (near2.ToScalar(), far2.ToScalar());
}
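For comparison, a scalar sketch of the classic slab test that IntersectAVX vectorizes (a hypothetical helper that ignores the zero-direction masking handled above):

static (double near, double far) IntersectSlabsScalar(double[] origin, double[] invDir, double[] min, double[] max)
{
    double near = double.NegativeInfinity, far = double.PositiveInfinity;
    for (int axis = 0; axis < 3; axis++)
    {
        double t0 = (min[axis] - origin[axis]) * invDir[axis];
        double t1 = (max[axis] - origin[axis]) * invDir[axis];
        if (t0 > t1) (t0, t1) = (t1, t0); // flip the slab for negative direction axes

        near = Math.Max(near, t0);
        far = Math.Min(far, t1);
    }

    return near > far || far < 0 ? (double.NaN, double.NaN) : (near, far);
}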
public void RunBasicScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

    Double[] values = new Double[ElementCount];

    for (int i = 0; i < ElementCount; i++)
    {
        values[i] = TestLibrary.Generator.GetDouble();
    }

    Vector256<Double> value = Vector256.Create(values[0], values[1], values[2], values[3]);

    Vector128<Double> lowerResult = value.GetLower();
    Vector128<Double> upperResult = value.GetUpper();
    ValidateGetResult(lowerResult, upperResult, values);

    Vector256<Double> result = value.WithLower(upperResult);
    result = result.WithUpper(lowerResult);
    ValidateWithResult(result, values);
}
private static double MinMaxCore(Vector<double> vector, bool doMin)
{
    Vector256<double> vec256 = Unsafe.As<Vector<double>, Vector256<double>>(ref vector);
    Vector128<double> hi128 = vec256.GetUpper();
    Vector128<double> lo128 = vec256.GetLower();
    Vector128<double> tmp1 = Avx.Permute(hi128, 0b_01);
    Vector128<double> tmp2 = Avx.Permute(lo128, 0b_01);

    if (doMin)
    {
        hi128 = Sse2.Min(hi128, tmp1);
        lo128 = Sse2.Min(lo128, tmp2);
        lo128 = Sse2.Min(lo128, hi128);
    }
    else
    {
        hi128 = Sse2.Max(hi128, tmp1);
        lo128 = Sse2.Max(lo128, tmp2);
        lo128 = Sse2.Max(lo128, hi128);
    }

    return lo128.ToScalar();
}
public void RunBasicScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

    Int16[] values = new Int16[ElementCount];

    for (int i = 0; i < ElementCount; i++)
    {
        values[i] = TestLibrary.Generator.GetInt16();
    }

    Vector256<Int16> value = Vector256.Create(
        values[0], values[1], values[2], values[3], values[4], values[5], values[6], values[7],
        values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15]);

    Vector128<Int16> lowerResult = value.GetLower();
    Vector128<Int16> upperResult = value.GetUpper();
    ValidateGetResult(lowerResult, upperResult, values);

    Vector256<Int16> result = value.WithLower(upperResult);
    result = result.WithUpper(lowerResult);
    ValidateWithResult(result, values);
}
public void RunBasicScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

    Byte[] values = new Byte[ElementCount];

    for (int i = 0; i < ElementCount; i++)
    {
        values[i] = TestLibrary.Generator.GetByte();
    }

    Vector256<Byte> value = Vector256.Create(
        values[0], values[1], values[2], values[3], values[4], values[5], values[6], values[7],
        values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15],
        values[16], values[17], values[18], values[19], values[20], values[21], values[22], values[23],
        values[24], values[25], values[26], values[27], values[28], values[29], values[30], values[31]);

    Vector128<Byte> lowerResult = value.GetLower();
    Vector128<Byte> upperResult = value.GetUpper();
    ValidateGetResult(lowerResult, upperResult, values);

    Vector256<Byte> result = value.WithLower(upperResult);
    result = result.WithUpper(lowerResult);
    ValidateWithResult(result, values);
}
public static void GetLowHigh<T>(Vector256<T> vector, out Vector128<T> low, out Vector128<T> high)
    where T : struct
{
    low = vector.GetLower();
    high = vector.GetUpper();
}
private static void _mm256_storeu2_m128i(byte* hiaddr, byte* loaddr, Vector256<byte> a)
{
    Sse2.Store(loaddr, a.GetLower());
    Sse2.Store(hiaddr, Avx.ExtractVector128(a, 0x1));
}
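The helper mirrors the C intrinsic of the same name: the lower 128 bits go to loaddr and the upper 128 bits to hiaddr. A sketch of an equivalent formulation using GetUpper() instead of the explicit extract:

private static unsafe void _mm256_storeu2_m128i_alt(byte* hiaddr, byte* loaddr, Vector256<byte> a)
{
    Sse2.Store(loaddr, a.GetLower());
    Sse2.Store(hiaddr, a.GetUpper()); // same result as Avx.ExtractVector128(a, 1)
}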
private static float SumVector256(Vector256<float> v)
{
    v = Avx.HorizontalAdd(v, v); // 0+1, 2+3, .., .., 4+5, 6+7, .., ..
    v = Avx.HorizontalAdd(v, v); // 0+1+2+3, .., .., .., 4+5+6+7, .., .., ..
    return v.GetUpper().ToScalar() + v.GetLower().ToScalar();
}
public static void GetLowHigh(Vector256<double> vector, out Vector128<double> low, out Vector128<double> high)
{
    low = vector.GetLower();
    high = vector.GetUpper();
}
// This function implements Algorithm 1 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf.
// Compute the output value of the field-aware factorization machine, as the sum of the linear part and the latent part.
// The linear part is the inner product of linearWeights and featureValues.
// The latent part is the sum of all intra-field interactions in one field f, for all fields possible.
public static unsafe void CalculateIntermediateVariables(int* fieldIndices, int* featureIndices, float* featureValues,
    float* linearWeights, float* latentWeights, float* latentSum, float* response, int fieldCount, int latentDim, int count)
{
    Contracts.Assert(Avx.IsSupported);

    // The number of all possible fields.
    int m = fieldCount;
    int d = latentDim;
    int c = count;
    int* pf = fieldIndices;
    int* pi = featureIndices;
    float* px = featureValues;
    float* pw = linearWeights;
    float* pv = latentWeights;
    float* pq = latentSum;
    float linearResponse = 0;
    float latentResponse = 0;

    Unsafe.InitBlock(pq, 0, (uint)(m * m * d * sizeof(float)));

    Vector256<float> y = Vector256<float>.Zero;
    Vector256<float> tmp = Vector256<float>.Zero;

    for (int i = 0; i < c; i++)
    {
        int f = pf[i];
        int j = pi[i];
        linearResponse += pw[j] * px[i];

        Vector256<float> x = Avx.BroadcastScalarToVector256(px + i);
        Vector256<float> xx = Avx.Multiply(x, x);

        // tmp -= <v_j,f, v_j,f> * x * x
        int vBias = j * m * d + f * d;

        // j-th feature's latent vector in the f-th field hidden space.
        float* vjf = pv + vBias;

        for (int k = 0; k + 8 <= d; k += 8)
        {
            Vector256<float> vjfBuffer = Avx.LoadVector256(vjf + k);
            tmp = MultiplyAddNegated(Avx.Multiply(vjfBuffer, vjfBuffer), xx, tmp);
        }

        for (int fprime = 0; fprime < m; fprime++)
        {
            vBias = j * m * d + fprime * d;
            int qBias = f * m * d + fprime * d;
            float* vjfprime = pv + vBias;
            float* qffprime = pq + qBias;

            // q_f,f' += v_j,f' * x
            for (int k = 0; k + 8 <= d; k += 8)
            {
                Vector256<float> vjfprimeBuffer = Avx.LoadVector256(vjfprime + k);
                Vector256<float> q = Avx.LoadVector256(qffprime + k);
                q = MultiplyAdd(vjfprimeBuffer, x, q);
                Avx.Store(qffprime + k, q);
            }
        }
    }

    for (int f = 0; f < m; f++)
    {
        // tmp += <q_f,f, q_f,f>
        float* qff = pq + f * m * d + f * d;

        for (int k = 0; k + 8 <= d; k += 8)
        {
            Vector256<float> qffBuffer = Avx.LoadVector256(qff + k);

            // Intra-field interactions.
            tmp = MultiplyAdd(qffBuffer, qffBuffer, tmp);
        }

        // y += <q_f,f', q_f',f>, f != f'
        // This loop handles inter-field interactions because f != f'.
        for (int fprime = f + 1; fprime < m; fprime++)
        {
            float* qffprime = pq + f * m * d + fprime * d;
            float* qfprimef = pq + fprime * m * d + f * d;

            for (int k = 0; k + 8 <= d; k += 8)
            {
                // Inter-field interaction.
                Vector256<float> qffprimeBuffer = Avx.LoadVector256(qffprime + k);
                Vector256<float> qfprimefBuffer = Avx.LoadVector256(qfprimef + k);
                y = MultiplyAdd(qffprimeBuffer, qfprimefBuffer, y);
            }
        }
    }

    y = MultiplyAdd(_point5, tmp, y);
    tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1));
    tmp = Avx.HorizontalAdd(tmp, tmp);
    y = Avx.HorizontalAdd(tmp, tmp);
    Sse.StoreScalar(&latentResponse, y.GetLower()); // The lowest slot is the response value.
    *response = linearResponse + latentResponse;
}
unsafe void IConvolver.ConvolveSourceLine(byte* istart, byte* tstart, int cb, byte* mapxstart, int smapx, int smapy)
{
    float* tp = (float*)tstart, tpe = (float*)(tstart + cb);
    float* pmapx = (float*)mapxstart;
    int kstride = smapx * channels;
    int tstride = smapy * 4;
    int vcnt = smapx / Vector128<float>.Count;

    while (tp < tpe)
    {
        int ix = *(int*)pmapx++;
        int lcnt = vcnt;

        float* ip = (float*)istart + ix * channels;
        float* mp = pmapx;
        pmapx += kstride;

        Vector128<float> av0, av1, av2;

        if (Avx.IsSupported && lcnt >= 2)
        {
            Vector256<float> ax0 = Vector256<float>.Zero, ax1 = ax0, ax2 = ax0;

            for (; lcnt >= 2; lcnt -= 2)
            {
                var iv0 = Avx.LoadVector256(ip);
                var iv1 = Avx.LoadVector256(ip + Vector256<float>.Count);
                var iv2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
                ip += Vector256<int>.Count * channels;

                if (Fma.IsSupported)
                {
                    ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                    ax1 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256<float>.Count), iv1, ax1);
                    ax2 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256<float>.Count * 2), iv2, ax2);
                }
                else
                {
                    ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                    ax1 = Avx.Add(ax1, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256<float>.Count)));
                    ax2 = Avx.Add(ax2, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256<float>.Count * 2)));
                }

                mp += Vector256<float>.Count * channels;
            }

            av0 = Sse.Add(ax0.GetLower(), ax1.GetUpper());
            av1 = Sse.Add(ax0.GetUpper(), ax2.GetLower());
            av2 = Sse.Add(ax1.GetLower(), ax2.GetUpper());
        }
        else
        {
            av0 = av1 = av2 = Vector128<float>.Zero;
        }

        for (; lcnt != 0; lcnt--)
        {
            var iv0 = Sse.LoadVector128(ip);
            var iv1 = Sse.LoadVector128(ip + Vector128<float>.Count);
            var iv2 = Sse.LoadVector128(ip + Vector128<float>.Count * 2);
            ip += Vector128<float>.Count * channels;

            if (Fma.IsSupported)
            {
                av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
                av1 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128<float>.Count), iv1, av1);
                av2 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128<float>.Count * 2), iv2, av2);
            }
            else
            {
                av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));
                av1 = Sse.Add(av1, Sse.Multiply(iv1, Sse.LoadVector128(mp + Vector128<float>.Count)));
                av2 = Sse.Add(av2, Sse.Multiply(iv2, Sse.LoadVector128(mp + Vector128<float>.Count * 2)));
            }

            mp += Vector128<float>.Count * channels;
        }

        var avs0 = Sse.Add(Sse.Add(
            Sse.Shuffle(av0, av0, 0b_00_10_01_11),
            Sse.Shuffle(av1, av1, 0b_00_01_11_10)),
            Sse.Shuffle(av2, av2, 0b_00_11_10_01));

        var avs1 = Sse3.IsSupported ? Sse3.MoveHighAndDuplicate(avs0) : Sse.Shuffle(avs0, avs0, 0b_11_11_01_01);
        var avs2 = Sse.UnpackHigh(avs0, avs0);

        tp[0] = Sse.AddScalar(av0, avs0).ToScalar();
        tp[1] = Sse.AddScalar(av1, avs1).ToScalar();
        tp[2] = Sse.AddScalar(av2, avs2).ToScalar();
        tp += tstride;
    }
}