/// <summary>
/// Multiplies two matrices: element (r, c) of the result is the sum over k of
/// value1[r, k] * value2[k, c].
/// </summary>
/// <param name="value1">Left-hand matrix.</param>
/// <param name="value2">Right-hand matrix.</param>
/// <returns>The product value1 * value2.</returns>
public static unsafe RtMatrix operator *(RtMatrix value1, RtMatrix value2)
{
    if (Avx2.IsSupported && useIntrinsics)
    {
        // value1 is a by-value copy, so each of its rows is overwritten with the matching product
        // row and the copy itself is returned. A product row only depends on the same row of
        // value1, so updating row by row is safe.
        MultiplyRowInPlace(&value1.M11, &value2.M11, &value2.M21, &value2.M31, &value2.M41);
        MultiplyRowInPlace(&value1.M21, &value2.M11, &value2.M21, &value2.M31, &value2.M41);
        MultiplyRowInPlace(&value1.M31, &value2.M11, &value2.M21, &value2.M31, &value2.M41);
        MultiplyRowInPlace(&value1.M41, &value2.M11, &value2.M21, &value2.M31, &value2.M41);
        return value1;
    }

    RtMatrix product;

    // First row
    product.M11 = value1.M11 * value2.M11 + value1.M12 * value2.M21 + value1.M13 * value2.M31 + value1.M14 * value2.M41;
    product.M12 = value1.M11 * value2.M12 + value1.M12 * value2.M22 + value1.M13 * value2.M32 + value1.M14 * value2.M42;
    product.M13 = value1.M11 * value2.M13 + value1.M12 * value2.M23 + value1.M13 * value2.M33 + value1.M14 * value2.M43;
    product.M14 = value1.M11 * value2.M14 + value1.M12 * value2.M24 + value1.M13 * value2.M34 + value1.M14 * value2.M44;

    // Second row
    product.M21 = value1.M21 * value2.M11 + value1.M22 * value2.M21 + value1.M23 * value2.M31 + value1.M24 * value2.M41;
    product.M22 = value1.M21 * value2.M12 + value1.M22 * value2.M22 + value1.M23 * value2.M32 + value1.M24 * value2.M42;
    product.M23 = value1.M21 * value2.M13 + value1.M22 * value2.M23 + value1.M23 * value2.M33 + value1.M24 * value2.M43;
    product.M24 = value1.M21 * value2.M14 + value1.M22 * value2.M24 + value1.M23 * value2.M34 + value1.M24 * value2.M44;

    // Third row
    product.M31 = value1.M31 * value2.M11 + value1.M32 * value2.M21 + value1.M33 * value2.M31 + value1.M34 * value2.M41;
    product.M32 = value1.M31 * value2.M12 + value1.M32 * value2.M22 + value1.M33 * value2.M32 + value1.M34 * value2.M42;
    product.M33 = value1.M31 * value2.M13 + value1.M32 * value2.M23 + value1.M33 * value2.M33 + value1.M34 * value2.M43;
    product.M34 = value1.M31 * value2.M14 + value1.M32 * value2.M24 + value1.M33 * value2.M34 + value1.M34 * value2.M44;

    // Fourth row
    product.M41 = value1.M41 * value2.M11 + value1.M42 * value2.M21 + value1.M43 * value2.M31 + value1.M44 * value2.M41;
    product.M42 = value1.M41 * value2.M12 + value1.M42 * value2.M22 + value1.M43 * value2.M32 + value1.M44 * value2.M42;
    product.M43 = value1.M41 * value2.M13 + value1.M42 * value2.M23 + value1.M43 * value2.M33 + value1.M44 * value2.M43;
    product.M44 = value1.M41 * value2.M14 + value1.M42 * value2.M24 + value1.M43 * value2.M34 + value1.M44 * value2.M44;

    return product;
}

/// <summary>
/// Overwrites the four values at <paramref name="lhsRow"/> with that row multiplied by the matrix
/// whose rows start at the four <c>rhsRow</c> pointers.
/// </summary>
private static unsafe void MultiplyRowInPlace(double* lhsRow, double* rhsRow1, double* rhsRow2, double* rhsRow3, double* rhsRow4)
{
    var lhs = Avx.LoadVector256(lhsRow);

    // 0x00 is _MM_SHUFFLE(0,0,0,0), 0x55 is _MM_SHUFFLE(1,1,1,1), etc.
    // Each permute broadcasts one element of the left-hand row to all four lanes.
    // TODO: Replace with a method once it's added to the API.
    var term1 = Avx.Multiply(Avx2.Permute4x64(lhs, 0x00), Avx.LoadVector256(rhsRow1));
    var term2 = Avx.Multiply(Avx2.Permute4x64(lhs, 0x55), Avx.LoadVector256(rhsRow2));
    var term3 = Avx.Multiply(Avx2.Permute4x64(lhs, 0xAA), Avx.LoadVector256(rhsRow3));
    var term4 = Avx.Multiply(Avx2.Permute4x64(lhs, 0xFF), Avx.LoadVector256(rhsRow4));

    // Same pairing of additions as before: (t1 + t2) + (t3 + t4).
    Avx.Store(lhsRow, Avx.Add(Avx.Add(term1, term2), Avx.Add(term3, term4)));
}
/// <summary>
/// Adds two 256-bit vectors of doubles lane by lane with <c>Avx.Add</c>.
/// </summary>
/// <param name="left">First addend.</param>
/// <param name="right">Second addend.</param>
/// <returns>A vector whose lanes are the pairwise sums of the inputs.</returns>
public static Vector256<double> op_Addition(Vector256<double> left, Vector256<double> right)
{
    Vector256<double> sum = Avx.Add(left, right);
    return sum;
}
/// <summary>
/// Returns a new <see cref="VectorArg256"/> built from this instance's <c>_rgb</c> vector with
/// <paramref name="f"/> added to every lane.
/// </summary>
/// <param name="f">Value added to each lane.</param>
/// <returns>A new <see cref="VectorArg256"/> wrapping the sum; this instance is not modified.</returns>
public VectorArg256 Change(float f)
{
    // Vector256.Create(float) broadcasts the scalar to all eight lanes. It replaces the
    // pre-release Avx.SetAllVector256, which is not part of the shipped intrinsics API.
    Vector256<float> offset = Vector256.Create(f);
    return new VectorArg256(Avx.Add(offset, _rgb));
}
// Generic math

/// <summary>
/// Adds two <c>f32</c> vectors lane by lane with <c>Avx.Add</c>.
/// </summary>
/// <param name="lhs">First addend.</param>
/// <param name="rhs">Second addend.</param>
/// <returns>The lane-wise sum.</returns>
public static f32 Add(f32 lhs, f32 rhs)
{
    // NOTE(review): f32 is presumably an alias for a float vector type accepted by Avx.Add - confirm.
    return Avx.Add(lhs, rhs);
}
/// <summary>
/// Computes, for four tree heights at once, the growth effective age and the potential height growth
/// over one time step from the coefficients in <paramref name="site"/>.
/// </summary>
/// <param name="site">Supplies B1, B2, X1, X2toB2 and SiteIndexFromGround.</param>
/// <param name="timeStepInYears">Length of the time step added to the growth effective age.</param>
/// <param name="treeHeight">Four tree heights (NOTE(review): presumably in the same units as SiteIndexFromGround - confirm).</param>
/// <param name="potentialHeightGrowth">Receives potential height after the time step minus <paramref name="treeHeight"/>.</param>
/// <returns>
/// Growth effective age per lane: XX1^(1/B2) - X1 where XX1 = ln(treeHeight / SiteIndexFromGround) / B1 + X2toB2,
/// or 500 in lanes where XX1 is less than or equal to zero.
/// </returns>
public static Vector128<float> GetBrucePsmeAbgrGrowthEffectiveAge(SiteConstants site, float timeStepInYears, Vector128<float> treeHeight, out Vector128<float> potentialHeightGrowth)
{
    Vector128<float> B1 = AvxExtensions.BroadcastScalarToVector128(site.B1);
    Vector128<float> B2 = AvxExtensions.BroadcastScalarToVector128(site.B2);
    Vector128<float> X2toB2 = AvxExtensions.BroadcastScalarToVector128(site.X2toB2);
    Vector128<float> siteIndexFromGround128 = AvxExtensions.BroadcastScalarToVector128(site.SiteIndexFromGround);
    Vector128<float> X1 = AvxExtensions.BroadcastScalarToVector128(site.X1);

    // Exact 1/B2. The previous Avx.Reciprocal maps to rcpps, which is only a ~12 bit approximation
    // and is not bit-identical across CPU vendors; used as an exponent that error is amplified.
    Vector128<float> oneOverB2 = Avx.Divide(AvxExtensions.BroadcastScalarToVector128(1.0F), B2);

    Vector128<float> XX1 = Avx.Add(Avx.Divide(MathV.Ln(Avx.Divide(treeHeight, siteIndexFromGround128)), B1), X2toB2);
    Vector128<float> xx1NotPositive = Avx.CompareLessThanOrEqual(XX1, Vector128<float>.Zero);

    Vector128<float> growthEffectiveAge = Avx.Subtract(MathV.Pow(XX1, oneOverB2), X1);
    // Lanes with XX1 <= 0 cannot be raised to a fractional power; they are fixed at 500.
    growthEffectiveAge = Avx.BlendVariable(growthEffectiveAge, AvxExtensions.BroadcastScalarToVector128(500.0F), xx1NotPositive);

    Vector128<float> timeStepInYearsPlusX1 = AvxExtensions.BroadcastScalarToVector128(timeStepInYears + site.X1);
    Vector128<float> potentialHeightPower = Avx.Multiply(B1, Avx.Subtract(MathV.Pow(Avx.Add(growthEffectiveAge, timeStepInYearsPlusX1), B2), X2toB2));
    Vector128<float> potentialHeight = Avx.Multiply(siteIndexFromGround128, MathV.Exp(potentialHeightPower));

    potentialHeightGrowth = Avx.Subtract(potentialHeight, treeHeight);
    return growthEffectiveAge;
}
/// <summary>
/// Computes the moving average of <paramref name="values"/> over a window of
/// <c>2 * halfWindow + 1</c> consecutive samples.
/// </summary>
/// <param name="values">Input samples.</param>
/// <param name="halfWindow">Half width of the window; must not be negative.</param>
/// <returns>
/// An array of <c>values.Length - windowSize + 1</c> elements where element i is the average of
/// <c>values[i .. i + windowSize - 1]</c>, or <c>null</c> when the window does not fit into the input.
/// </returns>
/// <exception cref="System.ArgumentOutOfRangeException"><paramref name="halfWindow"/> is negative.</exception>
public unsafe override double[] Applay(double[] values, int halfWindow)
{
    if (halfWindow < 0)
    {
        // A negative window would make the loops below read outside of the input array.
        throw new System.ArgumentOutOfRangeException(nameof(halfWindow), halfWindow, "The half window must not be negative.");
    }

    // Computed as long so a very large halfWindow cannot overflow into a negative size.
    long window = 2L * halfWindow + 1;
    long outputCount = values.Length - window + 1;
    if (outputCount <= 0)
    {
        // Window longer than the input: nothing to average (previously only the == 0 case was handled).
        return null;
    }

    int windowSize = (int)window;
    int resultSize = (int)outputCount;
    var a = new double[resultSize];

    fixed (double* valueStart = values, aStart = a)
    {
        // Sum of the first window; it seeds the running average.
        var sum = 0d;
        for (int k = 0; k < windowSize; k++)
        {
            sum += valueStart[k];
        }

        // Pass 1: a[i] = (values[i - 1 + windowSize] - values[i - 1]) / windowSize for i >= 1,
        // i.e. the change of the average when the window slides by one sample.
        double* entering = valueStart + windowSize - 1;
        double* leaving = valueStart - 1;
        int i = 1;
        if (Avx.IsSupported)
        {
            var vWindowSize = Vector256.Create((double)windowSize);
            for (; i <= resultSize - Vector256<double>.Count; i += Vector256<double>.Count)
            {
                Avx.Store(
                    aStart + i,
                    Avx.Divide(
                        Avx.Subtract(Avx.LoadVector256(entering + i), Avx.LoadVector256(leaving + i)),
                        vWindowSize));
            }
        }

        for (; i < resultSize; i++)
        {
            aStart[i] = (entering[i] - leaving[i]) / windowSize;
        }

        // Pass 2: prefix sum turns the per-step changes into the averages themselves.
        aStart[0] = sum / windowSize;
        for (i = 1; i < resultSize; i++)
        {
            aStart[i] += aStart[i - 1];
        }
    }

    return a;
}
/// <summary>
/// Produces <paramref name="ow"/> output floats, one per column starting at <paramref name="ox"/>.
/// Each output accumulates the products of a run of intermediate floats with the weights at
/// <paramref name="pmapy"/> and reduces the accumulator with <c>HorizontalAdd()</c>.
/// </summary>
/// <param name="tstart">Intermediate buffer, read as floats; column x starts at float offset x * smapy.</param>
/// <param name="ostart">Output buffer, written as consecutive floats.</param>
/// <param name="ox">First column to produce.</param>
/// <param name="ow">Number of columns to produce.</param>
/// <param name="pmapy">Weights, read as floats; the same weights are used for every column.</param>
/// <param name="smapy">Number of weights per column; processed in groups of four floats.</param>
unsafe void IConvolver.WriteDestLine(byte* tstart, byte* ostart, int ox, int ow, byte* pmapy, int smapy)
{
    float* dest = (float*)ostart;
    int columnEnd = ox + ow;
    int columnStride = smapy;
    int blocksPerColumn = smapy / Vector128<float>.Count;

    for (int column = ox; column < columnEnd; column++)
    {
        int blocksLeft = blocksPerColumn;
        float* samples = (float*)tstart + column * columnStride;
        float* weights = (float*)pmapy;

        Vector128<float> sum = Vector128<float>.Zero;
        if (Avx.IsSupported && blocksLeft >= 2)
        {
            Vector256<float> wideSum = Vector256<float>.Zero;

            // Two 256-bit vectors (four 128-bit blocks) per iteration.
            while (blocksLeft >= 4)
            {
                Vector256<float> s0 = Avx.LoadVector256(samples);
                Vector256<float> s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                samples += Vector256<float>.Count * 2;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                }

                weights += Vector256<float>.Count * 2;
                blocksLeft -= 4;
            }

            // One more 256-bit vector when at least two blocks remain.
            if (blocksLeft >= 2)
            {
                blocksLeft -= 2;

                Vector256<float> s0 = Avx.LoadVector256(samples);
                samples += Vector256<float>.Count;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                }

                weights += Vector256<float>.Count;
            }

            // Fold the upper half onto the lower half so the 128-bit tail can keep accumulating.
            sum = Sse.Add(wideSum.GetLower(), wideSum.GetUpper());
        }

        while (blocksLeft != 0)
        {
            Vector128<float> s0 = Sse.LoadVector128(samples);
            samples += Vector128<float>.Count;

            if (Fma.IsSupported)
            {
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights), s0, sum);
            }
            else
            {
                sum = Sse.Add(sum, Sse.Multiply(s0, Sse.LoadVector128(weights)));
            }

            weights += Vector128<float>.Count;
            blocksLeft--;
        }

        *dest = sum.HorizontalAdd();
        dest++;
    }
}
/// <summary>
/// Blends <paramref name="nextPicture"/> into <paramref name="currentPicture"/> in place, byte by byte:
/// <c>current = next * factor + current * (1 - factor)</c>, where factor is <c>Attack</c> when the
/// next value is greater than the current one and <c>Decay</c> otherwise. The result is truncated to a byte.
/// </summary>
/// <param name="currentPicture">Image whose <c>Data</c> is updated in place.</param>
/// <param name="nextPicture">Image whose <c>Data</c> is blended in; it is only read.</param>
public unsafe void Process(MutableByteImage currentPicture, MutableByteImage nextPicture)
{
    // NOTE(review): both buffers are presumably the same length; the shorter one bounds the work so
    // that neither buffer can be overrun - confirm that is the intended handling of a mismatch.
    int length = System.Math.Min(currentPicture.Data.Length, nextPicture.Data.Length);

    fixed (byte* currentPicPtr = currentPicture.Data)
    fixed (byte* nextPicPtr = nextPicture.Data)
    {
        byte* currentPxPtr = currentPicPtr;
        byte* nextPxPtr = nextPicPtr;
        int processed = 0;

        if (Sse41.IsSupported)
        {
            // Loop invariants, broadcast once instead of once per iteration.
            var attack = System.Runtime.Intrinsics.Vector128.Create((float)Attack);
            var decay = System.Runtime.Intrinsics.Vector128.Create((float)Decay);
            var maxFactor = System.Runtime.Intrinsics.Vector128.Create(1f);

            // Only whole groups of four bytes go through the vector path. The previous bound
            // (i < length) processed a partial group and wrote past the end of the buffer.
            int vectorEnd = length - (length % 4);
            for (; processed < vectorEnd; processed += 4)
            {
                var next = Sse41.ConvertToVector128Int32(nextPxPtr);
                var current = Sse41.ConvertToVector128Int32(currentPxPtr);

                // The compare result is an all-ones / all-zeros bit mask. It must be reinterpreted,
                // not numerically converted to float, for the bitwise select below to be exact.
                var nextIsGreater = System.Runtime.Intrinsics.Vector128.AsSingle(Sse2.CompareGreaterThan(next, current));
                var pixelFactor = Sse.Or(
                    Sse.And(nextIsGreater, attack),
                    Sse.AndNot(nextIsGreater, decay));

                var result = Sse.Add(
                    Sse.Multiply(Sse.Subtract(maxFactor, pixelFactor), Sse2.ConvertToVector128Single(current)),
                    Sse.Multiply(pixelFactor, Sse2.ConvertToVector128Single(next)));

                // TODO improve Store
                currentPxPtr[0] = (byte)Sse41.Extract(result, 0);
                currentPxPtr[1] = (byte)Sse41.Extract(result, 1);
                currentPxPtr[2] = (byte)Sse41.Extract(result, 2);
                currentPxPtr[3] = (byte)Sse41.Extract(result, 3);

                currentPxPtr += 4;
                nextPxPtr += 4;
            }
        }

        // Scalar path: the tail that does not fill a vector, or everything without SSE4.1.
        for (; processed < length; processed++)
        {
            var currentColor = *nextPxPtr;
            var workingDataColor = *currentPxPtr;

            var newPixelFactor = workingDataColor < currentColor ? Attack : Decay;
            var newPixelValue = (byte)((currentColor * newPixelFactor) + (workingDataColor * (1 - newPixelFactor)));
            *currentPxPtr = newPixelValue;

            currentPxPtr++;
            nextPxPtr++;
        }
    }
}
/// <summary>
/// Convolves one line of input floats with the per-output kernels stored at <paramref name="mapxstart"/>
/// and writes one float per output position into the intermediate buffer.
/// </summary>
/// <param name="istart">Input line, read as floats.</param>
/// <param name="tstart">Intermediate buffer, written as floats; consecutive outputs are smapy floats apart.</param>
/// <param name="cb">Size in bytes of the region of <paramref name="tstart"/> to fill.</param>
/// <param name="mapxstart">Kernel map: each entry is one int (start index into the input) followed by smapx float weights.</param>
/// <param name="smapx">Number of weights per map entry; processed in groups of four floats.</param>
/// <param name="smapy">Stride, in floats, between consecutive outputs.</param>
unsafe void IConvolver.ConvolveSourceLine(byte* istart, byte* tstart, int cb, byte* mapxstart, int smapx, int smapy)
{
    float* dest = (float*)tstart;
    float* destEnd = (float*)(tstart + cb);
    float* map = (float*)mapxstart;
    int weightsPerEntry = smapx;
    int destStride = smapy;
    int blocksPerEntry = smapx / Vector128<float>.Count;

    while (dest < destEnd)
    {
        // Each map entry starts with the input index, stored as an int in a float-sized slot.
        int startIndex = *(int*)map;
        map++;

        int blocksLeft = blocksPerEntry;
        float* samples = (float*)istart + startIndex;
        float* weights = map;
        map += weightsPerEntry;

        Vector128<float> sum = Vector128<float>.Zero;
        if (Avx.IsSupported && blocksLeft >= 2)
        {
            Vector256<float> wideSum = Vector256<float>.Zero;

            // Four 256-bit vectors (eight 128-bit blocks) per iteration.
            while (blocksLeft >= 8)
            {
                Vector256<float> s0 = Avx.LoadVector256(samples);
                Vector256<float> s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                Vector256<float> s2 = Avx.LoadVector256(samples + Vector256<float>.Count * 2);
                Vector256<float> s3 = Avx.LoadVector256(samples + Vector256<float>.Count * 3);
                samples += Vector256<float>.Count * 4;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 2), s2, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 3), s3, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s2, Avx.LoadVector256(weights + Vector256<float>.Count * 2)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s3, Avx.LoadVector256(weights + Vector256<float>.Count * 3)));
                }

                weights += Vector256<float>.Count * 4;
                blocksLeft -= 8;
            }

            // Fewer than eight blocks remain: consume three, two or one 256-bit vectors.
            if (blocksLeft >= 6)
            {
                blocksLeft -= 6;

                Vector256<float> s0 = Avx.LoadVector256(samples);
                Vector256<float> s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                Vector256<float> s2 = Avx.LoadVector256(samples + Vector256<float>.Count * 2);
                samples += Vector256<float>.Count * 3;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 2), s2, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s2, Avx.LoadVector256(weights + Vector256<float>.Count * 2)));
                }

                weights += Vector256<float>.Count * 3;
            }
            else if (blocksLeft >= 4)
            {
                blocksLeft -= 4;

                Vector256<float> s0 = Avx.LoadVector256(samples);
                Vector256<float> s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                samples += Vector256<float>.Count * 2;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                }

                weights += Vector256<float>.Count * 2;
            }
            else if (blocksLeft >= 2)
            {
                blocksLeft -= 2;

                Vector256<float> s0 = Avx.LoadVector256(samples);
                samples += Vector256<float>.Count;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                }

                weights += Vector256<float>.Count;
            }

            // Fold the upper half onto the lower half so the 128-bit tail can keep accumulating.
            sum = Sse.Add(wideSum.GetLower(), wideSum.GetUpper());
        }

        while (blocksLeft != 0)
        {
            Vector128<float> s0 = Sse.LoadVector128(samples);
            samples += Vector128<float>.Count;

            if (Fma.IsSupported)
            {
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights), s0, sum);
            }
            else
            {
                sum = Sse.Add(sum, Sse.Multiply(s0, Sse.LoadVector128(weights)));
            }

            weights += Vector128<float>.Count;
            blocksLeft--;
        }

        dest[0] = sum.HorizontalAdd();
        dest += destStride;
    }
}
/// <summary>
/// Multi-channel variant: convolves one line of interleaved input floats with the per-output kernels
/// stored at <paramref name="mapxstart"/> and writes four floats per output position.
/// </summary>
/// <param name="istart">Input line, read as floats; the start index from the map is scaled by <c>channels</c>.</param>
/// <param name="tstart">Intermediate buffer, written as floats; consecutive outputs are smapy * channels floats apart.</param>
/// <param name="cb">Size in bytes of the region of <paramref name="tstart"/> to fill.</param>
/// <param name="mapxstart">Kernel map: each entry is one int (start index) followed by smapx * channels float weights.</param>
/// <param name="smapx">Kernel size per map entry, before scaling by <c>channels</c>.</param>
/// <param name="smapy">Output stride, before scaling by <c>channels</c>.</param>
unsafe void IConvolver.ConvolveSourceLine(byte* istart, byte* tstart, int cb, byte* mapxstart, int smapx, int smapy)
{
    // NOTE(review): the four stores per output and the fixed 16/32-float steps below only line up
    // when channels == 4 - confirm against the declaration of channels.
    float* dest = (float*)tstart;
    float* destEnd = (float*)(tstart + cb);
    float* map = (float*)mapxstart;
    int weightsPerEntry = smapx * channels;
    int destStride = smapy * channels;
    int blocksPerEntry = smapx / Vector128<float>.Count;

    while (dest < destEnd)
    {
        // Each map entry starts with the input index, stored as an int in a float-sized slot.
        int startIndex = *(int*)map;
        map++;

        int blocksLeft = blocksPerEntry;
        float* samples = (float*)istart + startIndex * channels;
        float* weights = map;
        map += weightsPerEntry;

        Vector128<float> sum = Vector128<float>.Zero;
        if (Avx.IsSupported && blocksLeft >= 2)
        {
            Vector256<float> wideSum = Vector256<float>.Zero;

            // Two blocks per iteration, consumed as four 256-bit vectors.
            while (blocksLeft >= 2)
            {
                Vector256<float> s0 = Avx.LoadVector256(samples);
                Vector256<float> s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                Vector256<float> s2 = Avx.LoadVector256(samples + Vector256<float>.Count * 2);
                Vector256<float> s3 = Avx.LoadVector256(samples + Vector256<float>.Count * 3);
                samples += Vector256<float>.Count * channels;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 2), s2, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 3), s3, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s2, Avx.LoadVector256(weights + Vector256<float>.Count * 2)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s3, Avx.LoadVector256(weights + Vector256<float>.Count * 3)));
                }

                weights += Vector256<float>.Count * channels;
                blocksLeft -= 2;
            }

            // Fold the upper half onto the lower half so the 128-bit tail can keep accumulating.
            sum = Sse.Add(wideSum.GetLower(), wideSum.GetUpper());
        }

        // One block per iteration, consumed as four 128-bit vectors.
        while (blocksLeft != 0)
        {
            Vector128<float> s0 = Sse.LoadVector128(samples);
            Vector128<float> s1 = Sse.LoadVector128(samples + Vector128<float>.Count);
            Vector128<float> s2 = Sse.LoadVector128(samples + Vector128<float>.Count * 2);
            Vector128<float> s3 = Sse.LoadVector128(samples + Vector128<float>.Count * 3);
            samples += Vector128<float>.Count * channels;

            if (Fma.IsSupported)
            {
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights), s0, sum);
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights + Vector128<float>.Count), s1, sum);
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights + Vector128<float>.Count * 2), s2, sum);
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights + Vector128<float>.Count * 3), s3, sum);
            }
            else
            {
                sum = Sse.Add(sum, Sse.Multiply(s0, Sse.LoadVector128(weights)));
                sum = Sse.Add(sum, Sse.Multiply(s1, Sse.LoadVector128(weights + Vector128<float>.Count)));
                sum = Sse.Add(sum, Sse.Multiply(s2, Sse.LoadVector128(weights + Vector128<float>.Count * 2)));
                sum = Sse.Add(sum, Sse.Multiply(s3, Sse.LoadVector128(weights + Vector128<float>.Count * 3)));
            }

            weights += Vector128<float>.Count * channels;
            blocksLeft--;
        }

        // Write the four accumulator lanes; each shuffle/unpack moves lane 1, 2 or 3 into lane 0.
        dest[0] = sum.ToScalar();
        dest[1] = Sse.Shuffle(sum, sum, 0b_11_10_01_01).ToScalar();
        dest[2] = Sse.UnpackHigh(sum, sum).ToScalar();
        dest[3] = Sse.Shuffle(sum, sum, 0b_11_10_01_11).ToScalar();
        dest += destStride;
    }
}
/// <summary>
/// Multi-channel variant: produces <paramref name="ow"/> output positions starting at column
/// <paramref name="ox"/>, writing four floats per position. Each position accumulates the products of a
/// run of intermediate floats with the weights at <paramref name="pmapy"/>.
/// </summary>
/// <param name="tstart">Intermediate buffer, read as floats; column x starts at float offset x * smapy * channels.</param>
/// <param name="ostart">Output buffer, written as floats; advanced by <c>channels</c> floats per column.</param>
/// <param name="ox">First column to produce.</param>
/// <param name="ow">Number of columns to produce.</param>
/// <param name="pmapy">Weights, read as floats; the same weights are used for every column.</param>
/// <param name="smapy">Kernel size per column, before scaling by <c>channels</c>.</param>
unsafe void IConvolver.WriteDestLine(byte* tstart, byte* ostart, int ox, int ow, byte* pmapy, int smapy)
{
    // NOTE(review): the four stores per output and the fixed 16/32-float steps below only line up
    // when channels == 4 - confirm against the declaration of channels.
    float* dest = (float*)ostart;
    int columnEnd = ox + ow;
    int columnStride = smapy * channels;
    int blocksPerColumn = smapy / Vector128<float>.Count;

    for (int column = ox; column < columnEnd; column++)
    {
        int blocksLeft = blocksPerColumn;
        float* samples = (float*)tstart + column * columnStride;
        float* weights = (float*)pmapy;

        Vector128<float> sum = Vector128<float>.Zero;
        if (Avx.IsSupported && blocksLeft >= 2)
        {
            Vector256<float> wideSum = Vector256<float>.Zero;

            // Two blocks per iteration, consumed as four 256-bit vectors.
            while (blocksLeft >= 2)
            {
                Vector256<float> s0 = Avx.LoadVector256(samples);
                Vector256<float> s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                Vector256<float> s2 = Avx.LoadVector256(samples + Vector256<float>.Count * 2);
                Vector256<float> s3 = Avx.LoadVector256(samples + Vector256<float>.Count * 3);
                samples += Vector256<float>.Count * channels;

                if (Fma.IsSupported)
                {
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 2), s2, wideSum);
                    wideSum = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count * 3), s3, wideSum);
                }
                else
                {
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s2, Avx.LoadVector256(weights + Vector256<float>.Count * 2)));
                    wideSum = Avx.Add(wideSum, Avx.Multiply(s3, Avx.LoadVector256(weights + Vector256<float>.Count * 3)));
                }

                weights += Vector256<float>.Count * channels;
                blocksLeft -= 2;
            }

            // Fold the upper half onto the lower half so the 128-bit tail can keep accumulating.
            sum = Sse.Add(wideSum.GetLower(), wideSum.GetUpper());
        }

        // One block per iteration, consumed as four 128-bit vectors.
        while (blocksLeft != 0)
        {
            Vector128<float> s0 = Sse.LoadVector128(samples);
            Vector128<float> s1 = Sse.LoadVector128(samples + Vector128<float>.Count);
            Vector128<float> s2 = Sse.LoadVector128(samples + Vector128<float>.Count * 2);
            Vector128<float> s3 = Sse.LoadVector128(samples + Vector128<float>.Count * 3);
            samples += Vector128<float>.Count * channels;

            if (Fma.IsSupported)
            {
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights), s0, sum);
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights + Vector128<float>.Count), s1, sum);
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights + Vector128<float>.Count * 2), s2, sum);
                sum = Fma.MultiplyAdd(Sse.LoadVector128(weights + Vector128<float>.Count * 3), s3, sum);
            }
            else
            {
                sum = Sse.Add(sum, Sse.Multiply(s0, Sse.LoadVector128(weights)));
                sum = Sse.Add(sum, Sse.Multiply(s1, Sse.LoadVector128(weights + Vector128<float>.Count)));
                sum = Sse.Add(sum, Sse.Multiply(s2, Sse.LoadVector128(weights + Vector128<float>.Count * 2)));
                sum = Sse.Add(sum, Sse.Multiply(s3, Sse.LoadVector128(weights + Vector128<float>.Count * 3)));
            }

            weights += Vector128<float>.Count * channels;
            blocksLeft--;
        }

        // Write the four accumulator lanes; each shuffle/unpack moves lane 1, 2 or 3 into lane 0.
        dest[0] = sum.ToScalar();
        dest[1] = Sse.Shuffle(sum, sum, 0b_11_10_01_01).ToScalar();
        dest[2] = Sse.UnpackHigh(sum, sum).ToScalar();
        dest[3] = Sse.Shuffle(sum, sum, 0b_11_10_01_11).ToScalar();
        dest += channels;
    }
}
/// <summary>
/// Sums, over all trees, the live expansion factor times the tree's Scribner board foot volume
/// (6 inch top, 32 foot logs), processing four trees per iteration.
/// </summary>
/// <param name="trees">Trees to total. Only Douglas-fir in English units is supported.</param>
/// <returns>The expansion-factor weighted sum of per-tree volumes.</returns>
/// <exception cref="NotSupportedException">
/// The species is not <c>FiaCode.PseudotsugaMenziesii</c> or the units are not <c>Units.English</c>.
/// </exception>
public static unsafe float GetScribnerBoardFeetPerAcre(Trees trees)
{
    // for now, assume all trees are of the same species
    if (trees.Species != FiaCode.PseudotsugaMenziesii)
    {
        throw new NotSupportedException();
    }
    if (trees.Units != Units.English)
    {
        throw new NotSupportedException();
    }

    // Douglas-fir
    #if DEBUG
    Vector128<float> v6p8 = AvxExtensions.BroadcastScalarToVector128(6.8F);
    Vector128<float> v10k = AvxExtensions.BroadcastScalarToVector128(10.0F * 1000.0F);
    #endif

    // constants
    Vector128<float> forestersEnglish = AvxExtensions.BroadcastScalarToVector128(Constant.ForestersEnglish);
    Vector128<float> one = AvxExtensions.BroadcastScalarToVector128(1.0F);
    Vector128<float> six = AvxExtensions.BroadcastScalarToVector128(6.0F);

    // cvtsl
    Vector128<float> vm3p21809 = AvxExtensions.BroadcastScalarToVector128(-3.21809F);
    Vector128<float> v0p04948 = AvxExtensions.BroadcastScalarToVector128(0.04948F);
    Vector128<float> vm0p15664 = AvxExtensions.BroadcastScalarToVector128(-0.15664F);
    Vector128<float> v2p02132 = AvxExtensions.BroadcastScalarToVector128(2.02132F);
    Vector128<float> v1p63408 = AvxExtensions.BroadcastScalarToVector128(1.63408F);
    Vector128<float> vm0p16184 = AvxExtensions.BroadcastScalarToVector128(-0.16184F);

    // b4 and cv4
    Vector128<float> v1p033 = AvxExtensions.BroadcastScalarToVector128(1.033F);
    Vector128<float> v1p382937 = AvxExtensions.BroadcastScalarToVector128(1.382937F);
    Vector128<float> vm0p4015292 = AvxExtensions.BroadcastScalarToVector128(-0.4015292F);
    Vector128<float> v0p087266 = AvxExtensions.BroadcastScalarToVector128(0.087266F);
    Vector128<float> vm0p174533 = AvxExtensions.BroadcastScalarToVector128(-0.174533F);

    // rc6: ln(0.62) = -0.4780358. The natural log is required because MathV.Exp is used as e^x;
    // the previous constant was log2(0.62), which made rc6 decay with the wrong base.
    Vector128<float> v0p993 = AvxExtensions.BroadcastScalarToVector128(0.993F);
    Vector128<float> vm0p4780358 = AvxExtensions.BroadcastScalarToVector128(-0.4780358F);

    // rs616
    Vector128<float> v0p174439 = AvxExtensions.BroadcastScalarToVector128(0.174439F);
    Vector128<float> v0p117594 = AvxExtensions.BroadcastScalarToVector128(0.117594F);
    Vector128<float> vm8p210585 = AvxExtensions.BroadcastScalarToVector128(-8.210585F);
    Vector128<float> v0p236693 = AvxExtensions.BroadcastScalarToVector128(0.236693F);
    Vector128<float> v0p00001345 = AvxExtensions.BroadcastScalarToVector128(0.00001345F);
    Vector128<float> v0p00001937 = AvxExtensions.BroadcastScalarToVector128(0.00001937F);

    // rs632
    Vector128<float> v1p001491 = AvxExtensions.BroadcastScalarToVector128(1.001491F);
    Vector128<float> vm6p924097 = AvxExtensions.BroadcastScalarToVector128(-6.924097F);
    Vector128<float> v0p912733 = AvxExtensions.BroadcastScalarToVector128(0.912733F);
    Vector128<float> v0p00001351 = AvxExtensions.BroadcastScalarToVector128(0.00001351F);

    fixed (float* dbh = &trees.Dbh[0], expansionFactors = &trees.LiveExpansionFactor[0], height = &trees.Height[0])
    {
        Vector128<float> standBoardFeetPerAcre = Vector128<float>.Zero;

        // NOTE(review): four values are loaded per step even when trees.Count is not a multiple of the
        // SIMD width, so the arrays are presumably padded to a multiple of it - confirm.
        for (int treeIndex = 0; treeIndex < trees.Count; treeIndex += Constant.Simd128x4.Width)
        {
            Vector128<float> dbhInInches = Avx.LoadVector128(dbh + treeIndex);
            Vector128<float> heightInFeet = Avx.LoadVector128(height + treeIndex);

            Vector128<float> logDbhInInches = MathV.Log10(dbhInInches);
            Vector128<float> logHeightInFeet = MathV.Log10(heightInFeet);

            // FiaCode.PseudotsugaMenziesii => -3.21809F + 0.04948F * logHeightInFeet * logDbhInInches - 0.15664F * logDbhInInches * logDbhInInches +
            //                                  2.02132F * logDbhInInches + 1.63408F * logHeightInFeet - 0.16184F * logHeightInFeet * logHeightInFeet,
            Vector128<float> cvtsl = Avx.Add(vm3p21809, Avx.Multiply(v0p04948, Avx.Multiply(logHeightInFeet, logDbhInInches)));
            cvtsl = Avx.Add(cvtsl, Avx.Multiply(vm0p15664, Avx.Multiply(logDbhInInches, logDbhInInches)));
            cvtsl = Avx.Add(cvtsl, Avx.Multiply(v2p02132, logDbhInInches));
            cvtsl = Avx.Add(cvtsl, Avx.Multiply(v1p63408, logHeightInFeet));
            cvtsl = Avx.Add(cvtsl, Avx.Multiply(vm0p16184, Avx.Multiply(logHeightInFeet, logHeightInFeet)));
            Vector128<float> cubicFeet = MathV.Exp10(cvtsl);

            Vector128<float> dbhSquared = Avx.Multiply(dbhInInches, dbhInInches); // could be consolidated by merging other scaling constants with Forester's constant for basal area
            Vector128<float> basalAreaInSquareFeet = Avx.Multiply(forestersEnglish, dbhSquared);

            // b4 = cubicFeet / (1.033F * (1.0F + 1.382937F * MathV.Exp(-4.015292F * dbhInInches / 10.0F)) * (basalAreaInSquareFeet + 0.087266F) - 0.174533F);
            Vector128<float> b4 = Avx.Divide(
                cubicFeet,
                Avx.Add(
                    Avx.Multiply(
                        v1p033,
                        Avx.Multiply(
                            Avx.Add(one, Avx.Multiply(v1p382937, MathV.Exp(Avx.Multiply(vm0p4015292, dbhInInches)))),
                            Avx.Add(basalAreaInSquareFeet, v0p087266))),
                    vm0p174533));
            Vector128<float> cv4 = Avx.Multiply(b4, Avx.Subtract(basalAreaInSquareFeet, v0p087266));

            // conversion to Scribner volumes for 32 foot trees
            // Waddell 2014:32
            // rc6 = 0.993F * (1.0F - MathF.Pow(0.62F, dbhInInches - 6.0F));
            // 0.62^x is evaluated as e^(ln(0.62) * x).
            Vector128<float> rc6 = Avx.Multiply(v0p993, Avx.Subtract(one, MathV.Exp(Avx.Multiply(vm0p4780358, Avx.Subtract(dbhInInches, six)))));
            Vector128<float> cv6 = Avx.Multiply(rc6, cv4);

            Vector128<float> logB4 = MathV.Log10(b4);
            // float rs616 = MathF.Pow(10.0F, 0.174439F + 0.117594F * logDbhInInches * logB4 - 8.210585F / (dbhInInches * dbhInInches) + 0.236693F * logB4 - 0.00001345F * b4 * b4 - 0.00001937F * dbhInInches * dbhInInches);
            Vector128<float> rs616l = Avx.Add(v0p174439, Avx.Multiply(v0p117594, Avx.Multiply(logDbhInInches, logB4)));
            rs616l = Avx.Add(rs616l, Avx.Divide(vm8p210585, dbhSquared));
            rs616l = Avx.Add(rs616l, Avx.Multiply(v0p236693, logB4));
            rs616l = Avx.Subtract(rs616l, Avx.Multiply(v0p00001345, Avx.Multiply(b4, b4)));
            rs616l = Avx.Subtract(rs616l, Avx.Multiply(v0p00001937, dbhSquared));
            Vector128<float> rs616 = MathV.Exp10(rs616l);
            Vector128<float> sv616 = Avx.Multiply(rs616, cv6); // Scribner board foot volume to a 6 inch top for 16 foot logs

            // float rs632 = 1.001491F - 6.924097F / tarif + 0.00001351F * dbhInInches * dbhInInches;
            Vector128<float> rs632 = Avx.Add(v1p001491, Avx.Divide(vm6p924097, Avx.Multiply(v0p912733, b4)));
            rs632 = Avx.Add(rs632, Avx.Multiply(v0p00001351, dbhSquared));

            // Lanes with a diameter of six inches or less contribute no volume.
            Vector128<float> zeroVolumeMask = Avx.CompareLessThanOrEqual(dbhInInches, six);
            Vector128<float> sv632 = Avx.Multiply(rs632, sv616); // Scribner board foot volume to a 6 inch top for 32 foot logs
            sv632 = Avx.BlendVariable(sv632, Vector128<float>.Zero, zeroVolumeMask);

            #if DEBUG
            DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(rc6, Vector128<float>.Zero, zeroVolumeMask), Vector128<float>.Zero));
            DebugV.Assert(Avx.CompareLessThanOrEqual(rc6, one));
            DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(rs616, one, zeroVolumeMask), one));
            DebugV.Assert(Avx.CompareLessThanOrEqual(Avx.BlendVariable(rs616, Vector128<float>.Zero, zeroVolumeMask), v6p8));
            DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(rs632, Vector128<float>.Zero, zeroVolumeMask), Vector128<float>.Zero));
            DebugV.Assert(Avx.CompareLessThanOrEqual(Avx.BlendVariable(rs632, Vector128<float>.Zero, zeroVolumeMask), one));
            DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(sv632, Vector128<float>.Zero, zeroVolumeMask), Vector128<float>.Zero));
            DebugV.Assert(Avx.CompareLessThanOrEqual(Avx.BlendVariable(sv632, Vector128<float>.Zero, zeroVolumeMask), v10k));
            #endif

            Vector128<float> expansionFactor = Avx.LoadVector128(expansionFactors + treeIndex);
            standBoardFeetPerAcre = Avx.Add(standBoardFeetPerAcre, Avx.Multiply(expansionFactor, sv632));
        }

        // Two horizontal adds leave the sum of all four lanes in every lane.
        standBoardFeetPerAcre = Avx.HorizontalAdd(standBoardFeetPerAcre, standBoardFeetPerAcre);
        standBoardFeetPerAcre = Avx.HorizontalAdd(standBoardFeetPerAcre, standBoardFeetPerAcre);
        return standBoardFeetPerAcre.ToScalar();
    }
}
/// <summary>
/// Combines 32 complex inputs into 32 complex outputs using the twiddle tables
/// <c>omegas[3]</c>, <c>omegas[4]</c> and <c>omegas[5]</c>.
/// NOTE(review): presumably a hard-coded 32-point FFT butterfly - confirm against the caller.
/// </summary>
/// <param name="i">Input values; elements 0..31 are read.</param>
/// <param name="omegas">
/// Twiddle-factor tables. Entries [3][1], [3][3], [4][1..7] are read individually and
/// 32 floats (16 complex values) are loaded from the start of <c>omegas[5]</c>.
/// </param>
/// <returns>A newly allocated array of 32 values.</returns>
/// <remarks>
/// The first stage fills <c>tmp[0..15]</c> from <c>i[0..15]</c> and <c>tmp[16..31]</c> from
/// <c>i[16..31]</c> with the same formulas shifted by 16. <c>tmp[20]</c> and <c>tmp[21]</c>
/// previously used <c>i[28]</c>/<c>i[22]</c> where the mirrored formulas require
/// <c>i[18]</c>/<c>i[28]</c>, so <c>i[18]</c> was never read; that is corrected here.
/// The vector stage requires AVX2 and FMA; no IsSupported check is made in this method.
/// Assumes ComplexFloat is laid out as two consecutive floats (real, imaginary) - TODO confirm.
/// </remarks>
public static unsafe ComplexFloat[] Kernel32(ComplexFloat[] i, ref ComplexFloat[][] omegas)
{
    ComplexFloat[] result = new ComplexFloat[32];
    // 48 complex values: slots 0..31 hold the first-stage outputs, slots 32..47 are extra scratch.
    ComplexFloat[] tmp = new ComplexFloat[48];

    // Shared sub-expressions for the lower (i[0..15]) and upper (i[16..31]) halves.
    ComplexFloat ami = i[0] - i[8];
    ComplexFloat api = i[0] + i[8];
    ComplexFloat fmn = i[5] - i[13];
    ComplexFloat fpn = i[5] + i[13];
    ComplexFloat xami = i[16] - i[24];
    ComplexFloat xapi = i[16] + i[24];
    ComplexFloat xfmn = i[21] - i[29];
    ComplexFloat xfpn = i[21] + i[29];

    // Lower half, even-indexed inputs.
    tmp[0] = api + i[2] + i[4] + i[6] + i[10] + i[12] + i[14];
    tmp[1] = ami + (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
    tmp[2] = api - i[4] - i[12] + (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
    tmp[3] = ami - (i[4] - i[12]).TimesMinusI() - (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()) * omegas[3][3];
    tmp[4] = api - i[2] + i[4] - i[6] - i[10] + i[12] - i[14];
    tmp[5] = ami - (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
    tmp[6] = api - i[4] - i[12] - (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
    tmp[7] = ami - (i[4] - i[12]).TimesMinusI() + (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()).TimesMinusI();

    // Lower half, odd-indexed inputs.
    tmp[8] = i[1] + i[3] + fpn + i[7] + i[9] + i[11] + i[15];
    tmp[9] = omegas[4][1] * (i[1] - i[9] + (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI());
    tmp[10] = omegas[4][2] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() + i[1] - fpn + i[9]);
    tmp[11] = omegas[4][3] * (omegas[3][3] * (i[11] - i[3] + (i[7] - i[15]).TimesMinusI()) - i[1] + i[9] + (fmn).TimesMinusI());
    tmp[12] = (i[1] - i[3] + fpn - i[7] + i[9] - i[11] - i[15]).TimesMinusI();
    tmp[13] = (i[1] - i[9] - (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI()) * omegas[4][5];
    tmp[14] = omegas[4][6] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() - i[1] + fpn - i[9]);
    tmp[15] = omegas[4][7] * ((i[11] - i[3] + (i[7] - i[15]).TimesMinusI()).TimesMinusI() + i[1] - i[9] - (fmn).TimesMinusI());

    // Upper half, even-indexed inputs: same formulas as tmp[0..7] with every input index shifted by 16.
    tmp[16] = xapi + i[18] + i[20] + i[22] + i[28] + i[26] + i[30];
    tmp[17] = xami + (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
    tmp[18] = xapi - i[20] - i[28] + (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
    tmp[19] = xami - (i[20] - i[28]).TimesMinusI() - (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()) * omegas[3][3];
    // Fixed: the first subtracted term was i[28] (cancelling the later + i[28]); tmp[4] shifted by 16 needs i[18].
    tmp[20] = xapi - i[18] + i[20] - i[22] - i[26] + i[28] - i[30];
    // Fixed: was (i[28] - i[26] ...) and (i[20] - i[22]); tmp[5] shifted by 16 needs i[18] and i[28].
    tmp[21] = xami - (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
    tmp[22] = xapi - i[20] - i[28] - (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
    tmp[23] = xami - (i[20] - i[28]).TimesMinusI() + (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()).TimesMinusI();

    // Upper half, odd-indexed inputs.
    tmp[24] = i[17] + i[19] + xfpn + i[23] + i[25] + i[27] + i[31];
    tmp[25] = omegas[4][1] * (i[17] - i[25] + (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI());
    tmp[26] = omegas[4][2] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() + i[17] - xfpn + i[25]);
    tmp[27] = omegas[4][3] * (omegas[3][3] * (i[27] - i[19] + (i[23] - i[31]).TimesMinusI()) - i[17] + i[25] + (xfmn).TimesMinusI());
    tmp[28] = (i[17] - i[19] + xfpn - i[23] + i[25] - i[27] - i[31]).TimesMinusI();
    tmp[29] = (i[17] - i[25] - (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI()) * omegas[4][5];
    tmp[30] = omegas[4][6] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() - i[17] + xfpn - i[25]);
    tmp[31] = omegas[4][7] * ((i[27] - i[19] + (i[23] - i[31]).TimesMinusI()).TimesMinusI() + i[17] - i[25] - (xfmn).TimesMinusI());

    // 32 complex floats = 64 floats, split into four parts A, B, C, D of 8 complex (16 floats) each.
    // One 256-bit vector holds 8 floats, so every part is processed in two halves.
    // tmp is addressed as six 16-float regions.
    fixed (ComplexFloat* entry = result, om5 = omegas[5], t = tmp)
    {
        Vector256<float> a;
        Vector256<float> b;
        Vector256<float> bSwap;
        Vector256<float> aIm;
        Vector256<float> aRe;
        Vector256<float> aIM_bSwap;

        float* partA = (float*)entry;
        float* partB = partA + 16;
        float* partC = partA + 32;
        float* partD = partA + 48;

        float* omPart1 = (float*)om5;
        float* omPart2 = omPart1 + 16;

        float* tmpPart1 = (float*)t;
        float* tmpPart2 = tmpPart1 + 16;
        float* tmpPart3 = tmpPart1 + 32;
        float* tmpPart4 = tmpPart1 + 48;
        float* tmpPart5 = tmpPart1 + 64;
        float* tmpPart6 = tmpPart1 + 80;

        // Seed the result: A = t1 + t2, B = t1 - t2, C = t3 + t4, D = t3 - t4 (both 8-float halves).
        for (int half = 0; half < 16; half += 8)
        {
            Avx2.Store(partA + half, Avx2.Add(Avx2.LoadVector256(tmpPart1 + half), Avx2.LoadVector256(tmpPart2 + half)));
            Avx2.Store(partB + half, Avx2.Subtract(Avx2.LoadVector256(tmpPart1 + half), Avx2.LoadVector256(tmpPart2 + half)));
            Avx2.Store(partC + half, Avx2.Add(Avx2.LoadVector256(tmpPart3 + half), Avx2.LoadVector256(tmpPart4 + half)));
            Avx2.Store(partD + half, Avx2.Subtract(Avx2.LoadVector256(tmpPart3 + half), Avx2.LoadVector256(tmpPart4 + half)));
        }

        // Process the first, then the second 8-float half of every part. From here on the
        // first 8 floats of each tmp region are reused as scratch; the values they held
        // were already consumed by the seeding loop above.
        for (int half = 0; half < 16; half += 8)
        {
            // Tmp[0] = A + B
            Avx2.Store(tmpPart1, Avx2.Add(Avx2.LoadVector256(partA + half), Avx2.LoadVector256(partB + half)));
            // Tmp[1] = A - B
            Avx2.Store(tmpPart2, Avx2.Subtract(Avx2.LoadVector256(partA + half), Avx2.LoadVector256(partB + half)));
            // Tmp[2] = C + D
            Avx2.Store(tmpPart3, Avx2.Add(Avx2.LoadVector256(partC + half), Avx2.LoadVector256(partD + half)));
            // Tmp[3] = C - D
            Avx2.Store(tmpPart4, Avx2.Subtract(Avx2.LoadVector256(partC + half), Avx2.LoadVector256(partD + half)));

            // Complex multiplication based on:
            // https://www.researchgate.net/figure/Vectorized-complex-multiplication-using-AVX-2_fig2_337532904
            // Tmp[4] = omega * (C + D)
            a = Avx2.LoadVector256(tmpPart3);
            b = Avx2.LoadVector256(omPart1 + half);
            bSwap = Avx2.Shuffle(b, b, imm8bShuffle);
            aIm = Avx2.Shuffle(a, a, imm8aImShuffle);
            aRe = Avx2.Shuffle(a, a, imm8aReShuffle);
            aIM_bSwap = Avx.Multiply(aIm, bSwap);
            Avx2.Store(tmpPart5, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

            // Tmp[5] = omega * (C - D)
            a = Avx2.LoadVector256(tmpPart4);
            b = Avx2.LoadVector256(omPart2 + half);
            bSwap = Avx2.Shuffle(b, b, imm8bShuffle);
            aIm = Avx2.Shuffle(a, a, imm8aImShuffle);
            aRe = Avx2.Shuffle(a, a, imm8aReShuffle);
            aIM_bSwap = Avx.Multiply(aIm, bSwap);
            Avx2.Store(tmpPart6, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

            // (A + B) + (C + D)
            Avx2.Store(partA + half, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart3)));
            // (A - B) + (C - D)
            Avx2.Store(partB + half, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart4)));
            // (A + B) + omega * (C + D)
            Avx2.Store(partC + half, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart5)));
            // (A - B) + omega * (C - D)
            Avx2.Store(partD + half, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart6)));
        }
    }

    return result;
}
/// <summary>
/// Sums the series x^-1/1 + x^-3/3 + x^-5/5 + ... four terms at a time using AVX.
/// </summary>
/// <param name="x">Series argument.</param>
/// <param name="stepThreshold">
/// Summation stops once the absolute value of the last (smallest-power) term of a
/// four-term batch drops below this value.
/// </param>
/// <param name="maxN">Upper bound on the number of terms; checked before each batch.</param>
/// <returns>
/// The accumulated sum, or <see cref="double.NaN"/> when AVX is unavailable
/// (in which case <c>Status</c> is set to <c>NotSupported</c>).
/// </returns>
/// <remarks>
/// <c>N</c> is left at the number of terms consumed. <c>Status</c> is
/// <c>TooManyIterations</c> whenever <c>N &gt;= maxN</c> on exit, otherwise <c>Success</c>.
/// </remarks>
protected override unsafe double CalculateImpl(double x, double stepThreshold, int maxN)
{
    if (!Avx.IsSupported)
    {
        Status = TaylorSeriesStatus.NotSupported;
        return double.NaN;
    }

    // Number of doubles in one 256-bit vector.
    const int lanes = 256 / 8 / sizeof(double);

    // Every batch advances each denominator by 8: (1,3,5,7) -> (9,11,13,15) -> ...
    double eight = 8.0;
    Vector256<double> denominatorStep = Avx.BroadcastScalarToVector256(&eight);

    // ... and divides each numerator by x^8, obtained by squaring x three times.
    Vector256<double> xToThe8th = Avx.BroadcastScalarToVector256(&x);
    for (int squaring = 0; squaring < 3; squaring++)
    {
        xToThe8th = Avx.Multiply(xToThe8th, xToThe8th);
    }

    // Initial numerators (x^-1, x^-3, x^-5, x^-7).
    double* numeratorSeed = stackalloc double[lanes];
    double oddPower = 1 / x;
    for (int lane = 0; lane < lanes; lane++)
    {
        numeratorSeed[lane] = oddPower;
        oddPower /= x * x;
    }
    Vector256<double> numerators = Avx.LoadVector256(numeratorSeed);

    // Initial denominators (1, 3, 5, 7).
    double* denominatorSeed = stackalloc double[lanes] { 1, 3, 5, 7 };
    Vector256<double> denominators = Avx.LoadVector256(denominatorSeed);

    Vector256<double> total = Vector256<double>.Zero;

    N = 0;
    while (N < maxN)
    {
        Vector256<double> terms = Avx.Divide(numerators, denominators);
        total = Avx.Add(total, terms);
        N += lanes;

        // The highest lane holds the smallest-power term of this batch; it decides convergence.
        if (Math.Abs(terms.GetElement(lanes - 1)) < stepThreshold)
        {
            break;
        }

        numerators = Avx.Divide(numerators, xToThe8th);
        denominators = Avx.Add(denominators, denominatorStep);
    }

    Status = N >= maxN ? TaylorSeriesStatus.TooManyIterations : TaylorSeriesStatus.Success;

    // Horizontal sum, lanes added left to right.
    return total.GetElement(0) + total.GetElement(1) + total.GetElement(2) + total.GetElement(3);
}
}
/// <summary>
/// Lazily adds two vector sequences pairwise with <see cref="Avx.Add(Vector256{double}, Vector256{double})"/>.
/// Enumeration ends as soon as either sequence is exhausted.
/// </summary>
/// <param name="this">Left-hand operands.</param>
/// <param name="other">Right-hand operands.</param>
/// <returns>A deferred sequence of pairwise sums.</returns>
public static IEnumerable<Vector256<double>> Add(
    this IEnumerable<Vector256<double>> @this,
    IEnumerable<Vector256<double>> other)
{
    return @this.Zip(other, (left, right) => Avx.Add(left, right));
}
/// <summary>
/// Smoke test for <c>Avx.Add</c> on <c>Vector256&lt;float&gt;</c> and <c>Vector256&lt;double&gt;</c>,
/// exercised both through a direct call and through reflection.
/// </summary>
/// <returns><c>Pass</c> when AVX is unsupported or every check succeeds; otherwise <c>Fail</c>.</returns>
static unsafe int Main(string[] args)
{
    // Nothing to verify on hardware without AVX.
    if (!Avx.IsSupported)
    {
        return Pass;
    }

    int outcome = Pass;

    using (TestTable<float> singles = new TestTable<float>(new float[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new float[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new float[8]))
    using (TestTable<double> doubles = new TestTable<double>(new double[4] { 1, -5, 100, 0 }, new double[4] { 22, -1, -50, 0 }, new double[4]))
    {
        // --- float, direct call ---
        var leftF = Unsafe.Read<Vector256<float>>(singles.inArray1Ptr);
        var rightF = Unsafe.Read<Vector256<float>>(singles.inArray2Ptr);
        var sumF = Avx.Add(leftF, rightF);
        Unsafe.Write(singles.outArrayPtr, sumF);

        if (!singles.CheckResult((x, y, z) => x + y == z))
        {
            Console.WriteLine("AVX Add failed on float:");
            foreach (var item in singles.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            outcome = Fail;
        }

        // --- float, via reflection ---
        var addSingles = typeof(Avx).GetMethod(nameof(Avx.Add), new Type[] { leftF.GetType(), rightF.GetType() });
        sumF = (Vector256<float>)addSingles.Invoke(null, new object[] { leftF, rightF });
        Unsafe.Write(singles.outArrayPtr, sumF);

        if (!singles.CheckResult((x, y, z) => x + y == z))
        {
            Console.WriteLine("AVX Add failed via reflection on float:");
            foreach (var item in singles.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            outcome = Fail;
        }

        // --- double, direct call ---
        var leftD = Unsafe.Read<Vector256<double>>(doubles.inArray1Ptr);
        var rightD = Unsafe.Read<Vector256<double>>(doubles.inArray2Ptr);
        var sumD = Avx.Add(leftD, rightD);
        Unsafe.Write(doubles.outArrayPtr, sumD);

        if (!doubles.CheckResult((x, y, z) => x + y == z))
        {
            Console.WriteLine("AVX Add failed on double:");
            foreach (var item in doubles.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            outcome = Fail;
        }

        // --- double, via reflection ---
        var addDoubles = typeof(Avx).GetMethod(nameof(Avx.Add), new Type[] { leftD.GetType(), rightD.GetType() });
        sumD = (Vector256<double>)addDoubles.Invoke(null, new object[] { leftD, rightD });
        Unsafe.Write(doubles.outArrayPtr, sumD);

        if (!doubles.CheckResult((x, y, z) => x + y == z))
        {
            Console.WriteLine("AVX Add failed via reflection on double:");
            foreach (var item in doubles.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            outcome = Fail;
        }
    }

    return outcome;
}
/// <summary>
/// Verifies that each x86 ISA class reports <c>IsSupported</c> the way the build
/// configuration expects, delegating the individual checks to <c>Check</c>.
/// </summary>
/// <returns>100 when <c>s_success</c> is still true after all checks, otherwise 1.</returns>
static int Main()
{
    s_success = true;

    // Expected state of each ISA group as produced by the AOT compiler:
    //
    //   true  - IsSupported is assumed true, no runtime check is emitted
    //   null  - IsSupported is a runtime check; code must sit behind it or bad things happen
    //   false - IsSupported is assumed false, no runtime check; use throws PlatformNotSupportedException
    //
    // The test is compiled several times with different defines to cover these cases.
#if BASELINE_INTRINSICS
    bool expectAccelerated = true;
    int expectedByteCount = 16;
    bool? sse2AndBelow = true;
    bool? sse3Group = null;
    bool? aesLzPcl = null;
    bool? sse4142 = null;
    bool? popCnt = null;
    bool? avx12 = false;
    bool? fmaBmi12 = false;
#elif NON_VEX_INTRINSICS
    bool expectAccelerated = true;
    int expectedByteCount = 16;
    bool? sse2AndBelow = true;
    bool? sse3Group = true;
    bool? aesLzPcl = null;
    bool? sse4142 = true;
    bool? popCnt = null;
    bool? avx12 = false;
    bool? fmaBmi12 = false;
#elif VEX_INTRINSICS
    bool expectAccelerated = true;
    int expectedByteCount = 32;
    bool? sse2AndBelow = true;
    bool? sse3Group = true;
    bool? aesLzPcl = null;
    bool? sse4142 = true;
    bool? popCnt = null;
    bool? avx12 = true;
    bool? fmaBmi12 = null;
#else
#error Who dis?
#endif

    if (expectAccelerated != Vector.IsHardwareAccelerated)
    {
        throw new Exception($"Vectors HW acceleration state unexpected - expected {expectAccelerated}, got {Vector.IsHardwareAccelerated}");
    }

    if (expectedByteCount != Vector<byte>.Count)
    {
        throw new Exception($"Unexpected vector length - expected {expectedByteCount}, got {Vector<byte>.Count}");
    }

    // SSE / SSE2
    Check("Sse", sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128<float>.Zero, Vector128<float>.Zero).Equals(Vector128<float>.Zero));
    Check("Sse.X64", sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128<float>.Zero) == 0);

    Check("Sse2", sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128<ushort>.Zero, 0) == 0);
    Check("Sse2.X64", sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128<double>.Zero) == 0);

    // SSE3 / SSSE3
    Check("Sse3", sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128<float>.Zero).Equals(Vector128<float>.Zero));
    Check("Sse3.X64", sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);

    Check("Ssse3", sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128<short>.Zero).Equals(Vector128<ushort>.Zero));
    Check("Ssse3.X64", sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);

    // SSE4.1 / SSE4.2
    Check("Sse41", sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128<int>.Zero, Vector128<int>.Zero).Equals(Vector128<int>.Zero));
    Check("Sse41.X64", sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128<long>.Zero, 0) == 0);

    Check("Sse42", sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
    Check("Sse42.X64", sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);

    // AES
    Check("Aes", aesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128<byte>.Zero, 0).Equals(Vector128.Create((byte)99)));
    Check("Aes.X64", aesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);

    // AVX / AVX2
    Check("Avx", avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256<double>.Zero, Vector256<double>.Zero).Equals(Vector256<double>.Zero));
    Check("Avx.X64", avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);

    Check("Avx2", avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256<int>.Zero).Equals(Vector256<uint>.Zero));
    Check("Avx2.X64", avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);

    // BMI1 / BMI2 / FMA
    Check("Bmi1", fmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
    Check("Bmi1.X64", fmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);

    Check("Bmi2", fmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
    Check("Bmi2.X64", fmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);

    Check("Fma", fmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128<float>.Zero, Vector128<float>.Zero, Vector128<float>.Zero).Equals(Vector128<float>.Zero));
    Check("Fma.X64", fmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);

    // LZCNT / PCLMULQDQ / POPCNT
    Check("Lzcnt", aesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
    Check("Lzcnt.X64", aesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);

    Check("Pclmulqdq", aesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128<long>.Zero, Vector128<long>.Zero, 0).Equals(Vector128<long>.Zero));
    Check("Pclmulqdq.X64", aesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);

    Check("Popcnt", popCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
    Check("Popcnt.X64", popCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);

    if (s_success)
    {
        return 100;
    }

    return 1;
}
// Component-wise sum: each of x, y and z in the new AvxVec3 is Avx.Add of the in-scope value and other's matching member.
return(new AvxVec3() { x = Avx.Add(x, other.x), y = Avx.Add(y, other.y), z = Avx.Add(z, other.z) });
/// <summary>
/// Computes the dot product of two float buffers using 256-bit AVX loads over pinned pointers.
/// </summary>
/// <param name="vector1">First operand.</param>
/// <param name="vector2">Second operand.</param>
/// <returns>
/// The sum of <c>vector1[k] * vector2[k]</c> over the common length. When the lengths
/// differ, the remaining elements of the longer buffer are added to the result unmultiplied.
/// Two empty buffers yield 0.
/// </returns>
/// <remarks>
/// The vector path calls <c>Avx</c> intrinsics without checking <c>Avx.IsSupported</c>;
/// it is only reached when the common length is at least <c>Vector256&lt;float&gt;.Count</c>.
/// </remarks>
public static float DotMultiplyIntrinsicWAvxWSpanPtr(ref Memory<float> vector1, ref Memory<float> vector2)
{
    var span1 = vector1.Span;
    var span2 = vector2.Span;
    var cnt = Math.Min(span1.Length, span2.Length);

    // The accumulator must start with every lane at zero. Vector256.CreateScalarUnsafe(0f)
    // only defines lane 0 and leaves the other seven lanes undefined, yet all eight lanes
    // are summed below.
    var v3 = Vector256<float>.Zero;
    var vectLen = Vector256<float>.Count;
    var vectCnt = cnt / vectLen;
    var total = 0f;

#if TEST
    var file = Path.GetTempFileName();
    using var writer = new StreamWriter(file);
    Console.WriteLine($"Intrinsic with AvxWPtr Mult. results will be written into {file}");
#endif

    unsafe
    {
        // Pin both buffers for as long as raw pointers are in use so the GC cannot move
        // them. Pinning an empty span yields a null pointer, which is never dereferenced
        // because every loop below is bounded by cnt.
        fixed (float* base1 = span1)
        fixed (float* base2 = span2)
        {
            float* ptr1 = base1;
            float* ptr2 = base2;

            for (var block = 0; block < vectCnt; block++)
            {
                var v1 = Avx.LoadVector256(ptr1);
                var v2 = Avx.LoadVector256(ptr2);
                v3 = Avx.Add(v3, Avx.Multiply(v1, v2));
                ptr1 += vectLen;
                ptr2 += vectLen;
#if TEST
                writer.WriteLine($"{v1.ToString()}\t{v2.ToString()}\t{v3.ToString()}");
#endif
            }

            // Horizontal sum of the accumulator lanes.
            for (var lane = 0; lane < vectLen; lane++)
            {
                total += v3.GetElement(lane);
            }

            // Scalar tail: ptr1/ptr2 already point at element vectCnt * vectLen.
            for (var i = vectCnt * vectLen; i < cnt; i++)
            {
                total += *ptr1++ * *ptr2++;
            }
        }
    }

    // Lengths differ: add the unmatched trailing elements of the longer buffer as-is.
    if (vector1.Length != vector2.Length)
    {
        var h = vector1.Length > vector2.Length ? span1 : span2;
        for (var j = cnt; j < h.Length; j++)
        {
            total += h[j];
        }
    }

    return total;
}
/// <summary>
/// Sharpens one line of float samples: for every sample the difference between the
/// <paramref name="ystart"/> and <paramref name="bstart"/> lines is scaled by
/// <paramref name="amt"/> and added to the input sample.
/// </summary>
/// <param name="cstart">Input line (floats); reading starts at sample <c>ox * channels</c>.</param>
/// <param name="ystart">First difference operand (floats); reading starts at sample <c>ox</c>.</param>
/// <param name="bstart">Second difference operand (floats); read from its start.</param>
/// <param name="ostart">Output line (floats); written from its start.</param>
/// <param name="ox">Starting offset into the input and <paramref name="ystart"/> lines.</param>
/// <param name="ow">Number of input elements to process, in units of <c>channels</c> floats.</param>
/// <param name="amt">Multiplier applied to the difference.</param>
/// <param name="thresh">
/// When greater than zero, differences whose absolute value does not exceed it are ignored.
/// </param>
/// <param name="gamma">
/// When true, the difference is added to the square root of the (non-negative clamped)
/// input and the clamped sum is squared again.
/// </param>
unsafe void IConvolver.SharpenLine(byte* cstart, byte* ystart, byte* bstart, byte* ostart, int ox, int ow, float amt, float thresh, bool gamma)
{
    float* inPtr = (float*)cstart + (uint)ox * channels;
    float* yPtr = (float*)ystart + (uint)ox;
    float* bPtr = (float*)bstart;
    float* outPtr = (float*)ostart;
    float* inEnd = inPtr + (uint)ow * channels;

    bool useThreshold = thresh > 0f;

    if (Avx.IsSupported && inPtr <= inEnd - VectorAvx.Count)
    {
        var limit = Vector256.Create(useThreshold ? thresh : -1f);
        var absMask = Vector256.Create(0x7fffffff).AsSingle();
        var gain = Vector256.Create(amt);
        var floor = VectorAvx.Zero;

        // Pull the end back by one vector so the loop condition guarantees a full load;
        // the guard above ensures at least one iteration runs.
        inEnd -= VectorAvx.Count;
        while (inPtr <= inEnd)
        {
            var delta = Avx.Subtract(Avx.LoadVector256(yPtr), Avx.LoadVector256(bPtr));
            yPtr += VectorAvx.Count;
            bPtr += VectorAvx.Count;

            if (useThreshold)
            {
                // Zero the lanes whose |delta| does not exceed the threshold.
                var keep = HWIntrinsics.AvxCompareGreaterThan(Avx.And(delta, absMask), limit);
                delta = Avx.And(delta, keep);
            }
            delta = Avx.Multiply(delta, gain);

            var pixel = Avx.LoadVector256(inPtr);
            inPtr += VectorAvx.Count;

            if (gamma)
            {
                // sqrt(v) computed as v * rsqrt(v), add, clamp, then square again.
                pixel = Avx.Max(pixel, floor);
                pixel = Avx.Multiply(pixel, Avx.ReciprocalSqrt(pixel));
                pixel = Avx.Add(pixel, delta);
                pixel = Avx.Max(pixel, floor);
                pixel = Avx.Multiply(pixel, pixel);
            }
            else
            {
                pixel = Avx.Add(pixel, delta);
            }

            Avx.Store(outPtr, pixel);
            outPtr += VectorAvx.Count;
        }
        inEnd += VectorAvx.Count;
    }
    else if (inPtr <= inEnd - VectorSse.Count)
    {
        var limit = Vector128.Create(useThreshold ? thresh : -1f);
        var absMask = Vector128.Create(0x7fffffff).AsSingle();
        var gain = Vector128.Create(amt);
        var floor = VectorSse.Zero;

        inEnd -= VectorSse.Count;
        while (inPtr <= inEnd)
        {
            var delta = Sse.Subtract(Sse.LoadVector128(yPtr), Sse.LoadVector128(bPtr));
            yPtr += VectorSse.Count;
            bPtr += VectorSse.Count;

            if (useThreshold)
            {
                var keep = Sse.CompareGreaterThan(Sse.And(delta, absMask), limit);
                delta = Sse.And(delta, keep);
            }
            delta = Sse.Multiply(delta, gain);

            var pixel = Sse.LoadVector128(inPtr);
            inPtr += VectorSse.Count;

            if (gamma)
            {
                pixel = Sse.Max(pixel, floor);
                pixel = Sse.Multiply(pixel, Sse.ReciprocalSqrt(pixel));
                pixel = Sse.Add(pixel, delta);
                pixel = Sse.Max(pixel, floor);
                pixel = Sse.Multiply(pixel, pixel);
            }
            else
            {
                pixel = Sse.Add(pixel, delta);
            }

            Sse.Store(outPtr, pixel);
            outPtr += VectorSse.Count;
        }
        inEnd += VectorSse.Count;
    }

    // Scalar loop for whatever the vector paths left over (or for the whole line).
    float scalarFloor = VectorSse.Zero.ToScalar();
    while (inPtr < inEnd)
    {
        float delta = *yPtr - *bPtr;
        yPtr++;
        bPtr++;

        float sample = *inPtr;
        inPtr++;

        if (!useThreshold || Math.Abs(delta) > thresh)
        {
            delta *= amt;
            if (gamma)
            {
                sample = MathUtil.MaxF(sample, scalarFloor).Sqrt();
                sample = MathUtil.MaxF(sample + delta, scalarFloor);
                sample *= sample;
            }
            else
            {
                sample += delta;
            }
        }

        *outPtr = sample;
        outPtr++;
    }
}
/// <summary>
/// Renders the Mandelbrot iteration counts for a grid of points, eight points at a time,
/// using 256-bit AVX float vectors.
/// </summary>
/// <remarks>
/// The grid resolution is derived from <c>TOTALBYTES</c> and <c>ratioy_x</c> and rounded
/// down to multiples of 8. Per-vector iteration counts are written to <c>results2</c> and
/// the final squared magnitudes to <c>testValue2</c>. No <c>Avx.IsSupported</c> check is
/// made here.
/// </remarks>
public unsafe void Vector256Mandel()
{
    int floatL3Size = TOTALBYTES / sizeof(float);

    // Both resolutions are rounded down to a multiple of 8 (one vector of floats).
    resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x));
    if (resolutionX % 8 != 0)
    {
        resolutionX -= resolutionX % 8;
    }

    resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
    if (resolutionY % 8 != 0)
    {
        resolutionY -= resolutionY % 8;
    }

    STEP_X = (RIGHT_X - LEFT_X) / resolutionX;
    STEP_Y = STEP_X; // same step on both axes (previously ratioy_x * STEP_X, reported as a bug)
    numberOfPoints = resolutionX * resolutionY;

    results2 = new float[numberOfPoints];
    xPoints = new float[resolutionX];
    yPoints = new float[resolutionY];

    for (int column = 0; column < resolutionX; column++)
    {
        xPoints.Span[column] = LEFT_X + column * STEP_X;
    }
    for (int row = 0; row < resolutionY; row++)
    {
        yPoints.Span[row] = TOP_Y - row * STEP_Y;
    }

    const int maxInter = 256;

    ReadOnlySpan<float> ySpan = yPoints.Span;
    ReadOnlySpan<Vector256<float>> xSpan = MemoryMarshal.Cast<float, Vector256<float>>(xPoints.Span);
    Span<Vector256<float>> res = MemoryMarshal.Cast<float, Vector256<float>>(results2.Span);
    Span<Vector256<float>> testSpan = MemoryMarshal.Cast<float, Vector256<float>>(testValue2.Span);

    Vector256<float> oneVec = Vector256.Create(1.0f);
    Vector256<float> fourVec = Vector256.Create(4.0f);
    int resVectorNumber = 0;

    for (int row = 0; row < ySpan.Length; row++)
    {
        Vector256<float> cIm = Vector256.Create(ySpan[row]);

        for (int column = 0; column < xSpan.Length; column++)
        {
            Vector256<float> cRe = xSpan[column];

            Vector256<float> xSq = Vector256<float>.Zero;
            Vector256<float> ySq = Vector256<float>.Zero;
            Vector256<float> sumSq = Vector256<float>.Zero;     // (x + y)^2
            Vector256<float> counts = Vector256<float>.Zero;
            Vector256<float> alive = oneVec;                    // 1 in lanes still inside the radius
            int inter = 0;
            bool goOn;

            do
            {
                // x' = x^2 - y^2 + cRe ; y' = ((x + y)^2 - y^2 - x^2) + cIm = 2xy + cIm
                Vector256<float> xVec = Avx.Add(Avx.Subtract(xSq, ySq), cRe);
                Vector256<float> yVec = Avx.Add(Avx.Subtract(Avx.Subtract(sumSq, ySq), xSq), cIm);

                xSq = Avx.Multiply(xVec, xVec);
                ySq = Avx.Multiply(yVec, yVec);
                Vector256<float> xPlusY = Avx.Add(xVec, yVec);
                sumSq = Avx.Multiply(xPlusY, xPlusY);

                // Lanes with |z|^2 <= 4 are still alive.
                Vector256<float> inside = Avx.Compare(Avx.Add(xSq, ySq), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                // Keeps the lane from 'alive' where the test passed, zero otherwise.
                alive = Avx.BlendVariable(Vector256<float>.Zero, alive, inside);

                // Continue while any lane is alive and the iteration cap is not reached.
                goOn = Avx.MoveMask(inside) > 0 && inter < maxInter;
                if (goOn)
                {
                    counts = Avx.Add(counts, alive);
                    inter++;
                }
            }
            while (goOn);

            testSpan[resVectorNumber] = Avx.Add(xSq, ySq);
            res[resVectorNumber] = counts;
            resVectorNumber++;
        }
    }
}
/// <summary>
/// Implements Algorithm 1 of https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf:
/// computes the field-aware factorization machine output as the sum of a linear part
/// (inner product of <paramref name="linearWeights"/> and <paramref name="featureValues"/>)
/// and a latent part (field-pair interactions), filling <paramref name="latentSum"/> on the way.
/// </summary>
/// <param name="fieldIndices">Field index of each of the <paramref name="count"/> entries.</param>
/// <param name="featureIndices">Feature index of each entry.</param>
/// <param name="featureValues">Feature value of each entry.</param>
/// <param name="linearWeights">Linear weights, indexed by feature index.</param>
/// <param name="latentWeights">
/// Latent vectors; the vector of feature j in field f starts at
/// <c>j * fieldCount * latentDim + f * latentDim</c>.
/// </param>
/// <param name="latentSum">
/// Output buffer of <c>fieldCount * fieldCount * latentDim</c> floats; zeroed, then
/// accumulated (q in the paper).
/// </param>
/// <param name="response">Receives linear response + latent response.</param>
/// <param name="fieldCount">Number of fields.</param>
/// <param name="latentDim">Latent dimension; only whole groups of 8 are processed.</param>
/// <param name="count">Number of (field, feature, value) entries.</param>
public static unsafe void CalculateIntermediateVariables(int* fieldIndices, int* featureIndices, float* featureValues,
    float* linearWeights, float* latentWeights, float* latentSum, float* response, int fieldCount, int latentDim, int count)
{
    Contracts.Assert(Avx.IsSupported);

    float linearResponse = 0;
    float latentResponse = 0;

    Unsafe.InitBlock(latentSum, 0, (uint)(fieldCount * fieldCount * latentDim * sizeof(float)));

    Vector256<float> interField = Vector256<float>.Zero; // accumulates <q_f,f', q_f',f> for f != f'
    Vector256<float> intraField = Vector256<float>.Zero; // accumulates <q_f,f, q_f,f> - <v_j,f, v_j,f> * x * x

    for (int entry = 0; entry < count; entry++)
    {
        int field = fieldIndices[entry];
        int feature = featureIndices[entry];
        linearResponse += linearWeights[feature] * featureValues[entry];

        Vector256<float> value = Avx.BroadcastScalarToVector256(featureValues + entry);
        Vector256<float> valueSquared = Avx.Multiply(value, value);

        // intraField -= <v_j,f, v_j,f> * x * x, with v_j,f the feature's latent vector in its own field.
        float* ownLatent = latentWeights + (feature * fieldCount * latentDim + field * latentDim);
        for (int k = 0; k + 8 <= latentDim; k += 8)
        {
            Vector256<float> chunk = Avx.LoadVector256(ownLatent + k);
            intraField = MultiplyAddNegated(Avx.Multiply(chunk, chunk), valueSquared, intraField);
        }

        // q_f,f' += v_j,f' * x for every field f'.
        for (int otherField = 0; otherField < fieldCount; otherField++)
        {
            float* latentInOther = latentWeights + (feature * fieldCount * latentDim + otherField * latentDim);
            float* sumSlot = latentSum + (field * fieldCount * latentDim + otherField * latentDim);

            for (int k = 0; k + 8 <= latentDim; k += 8)
            {
                Vector256<float> weights = Avx.LoadVector256(latentInOther + k);
                Vector256<float> running = Avx.LoadVector256(sumSlot + k);
                Avx.Store(sumSlot + k, MultiplyAdd(weights, value, running));
            }
        }
    }

    for (int field = 0; field < fieldCount; field++)
    {
        // Intra-field interactions: intraField += <q_f,f, q_f,f>.
        float* diagonal = latentSum + field * fieldCount * latentDim + field * latentDim;
        for (int k = 0; k + 8 <= latentDim; k += 8)
        {
            Vector256<float> chunk = Avx.LoadVector256(diagonal + k);
            intraField = MultiplyAdd(chunk, chunk, intraField);
        }

        // Inter-field interactions: interField += <q_f,f', q_f',f> for f' > f.
        for (int otherField = field + 1; otherField < fieldCount; otherField++)
        {
            float* upper = latentSum + field * fieldCount * latentDim + otherField * latentDim;
            float* lower = latentSum + otherField * fieldCount * latentDim + field * latentDim;

            for (int k = 0; k + 8 <= latentDim; k += 8)
            {
                Vector256<float> upperChunk = Avx.LoadVector256(upper + k);
                Vector256<float> lowerChunk = Avx.LoadVector256(lower + k);
                interField = MultiplyAdd(upperChunk, lowerChunk, interField);
            }
        }
    }

    // Combine both accumulators via MultiplyAdd(_point5, intraField, interField),
    // then reduce the eight lanes to a single value.
    interField = MultiplyAdd(_point5, intraField, interField);
    intraField = Avx.Add(interField, Avx.Permute2x128(interField, interField, 1));
    intraField = Avx.HorizontalAdd(intraField, intraField);
    interField = Avx.HorizontalAdd(intraField, intraField);

    // The lowest lane now holds the latent response.
    Sse.StoreScalar(&latentResponse, interField.GetLower());
    *response = linearResponse + latentResponse;
}
/// <summary>
/// Computes the dot product of two equally long vectors using AVX loads and FMA.
/// </summary>
/// <param name="v0">First operand; its <c>lng</c> and <c>ptr</c> members are read.</param>
/// <param name="v1">Second operand; must have the same <c>lng</c> as <paramref name="v0"/>.</param>
/// <returns>The dot product as a float.</returns>
/// <exception cref="ArgumentException">The two vectors have different lengths.</exception>
/// <remarks>
/// Three paths are used depending on the length: a scalar loop below 8 elements, a single
/// vector accumulator below 64 elements, and eight independent accumulators (64 elements
/// per iteration) otherwise. In the last path the accumulators are flushed into a double
/// every 1024 iterations. No <c>Fma.IsSupported</c> check is made here.
/// </remarks>
public static float Dot(Vector v0, Vector v1)
{
    if (v0.lng != v1.lng)
    {
        // ArgumentException derives from Exception, so callers catching Exception still work.
        throw new ArgumentException("Vectors must have the same length to compute a dot product.", nameof(v1));
    }

    int lng = v0.lng;
    float* p0 = v0.ptr;
    float* p1 = v1.ptr;
    // Scratch buffer used to spill a vector for the horizontal sum.
    float* tmp = stackalloc float[8];

    if (lng < 8)
    {
        float sum = 0;
        for (int i = 0; i < lng; i++)
        {
            sum += p0[i] * p1[i];
        }
        return sum;
    }

    if (lng < 64)
    {
        var sum0 = Vector256<float>.Zero;
        for (int i = 0; i <= lng - 8; i += 8)
        {
            sum0 = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum0);
        }

        Avx.Store(tmp, sum0);
        float sum = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

        // Elements past the last full vector.
        for (int i = lng / 8 * 8; i < lng; i++)
        {
            sum += p0[i] * p1[i];
        }
        return sum;
    }
    else
    {
        var sum = Vector256<float>.Zero;
        var sum0 = Vector256<float>.Zero;
        var sum1 = Vector256<float>.Zero;
        var sum2 = Vector256<float>.Zero;
        var sum3 = Vector256<float>.Zero;
        var sum4 = Vector256<float>.Zero;
        var sum5 = Vector256<float>.Zero;
        var sum6 = Vector256<float>.Zero;
        var sum7 = Vector256<float>.Zero;
        float* pp1 = p0;
        float* pp2 = p1;
        double dsum = 0;

        for (int i = 0; i < lng / 64; i++)
        {
            sum0 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 00), Avx.LoadVector256(pp2 + 00), sum0);
            sum1 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 08), Avx.LoadVector256(pp2 + 08), sum1);
            sum2 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 16), Avx.LoadVector256(pp2 + 16), sum2);
            sum3 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 24), Avx.LoadVector256(pp2 + 24), sum3);
            sum4 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 32), Avx.LoadVector256(pp2 + 32), sum4);
            sum5 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 40), Avx.LoadVector256(pp2 + 40), sum5);
            sum6 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 48), Avx.LoadVector256(pp2 + 48), sum6);
            sum7 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 56), Avx.LoadVector256(pp2 + 56), sum7);
            pp1 += 64;
            pp2 += 64;

            // Periodically move the partial result into a double to improve precision.
            if (i % 1024 == 1023)
            {
                sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));
                Avx.Store(tmp, sum);
                dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
                sum0 = Vector256<float>.Zero;
                sum1 = Vector256<float>.Zero;
                sum2 = Vector256<float>.Zero;
                sum3 = Vector256<float>.Zero;
                sum4 = Vector256<float>.Zero;
                sum5 = Vector256<float>.Zero;
                sum6 = Vector256<float>.Zero;
                sum7 = Vector256<float>.Zero;
            }
        }

        sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));

        // Remaining full vectors after the last 64-element block.
        for (int i = lng / 64 * 64; i <= lng - 8; i += 8)
        {
            sum = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum);
        }

        Avx.Store(tmp, sum);
        dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

        // Elements past the last full vector.
        for (int i = lng / 8 * 8; i < lng; i++)
        {
            dsum += p0[i] * p1[i];
        }
        return (float)dsum;
    }
}
/// <summary>
/// Creates a new <c>VectorArg256</c> from <c>_rgb</c> with <paramref name="f"/> added to every lane.
/// </summary>
/// <param name="f">Value broadcast to all lanes before the addition.</param>
/// <returns>A new <c>VectorArg256</c> wrapping the sum.</returns>
public VectorArg256 Change(float f) => new VectorArg256(Avx.Add(Vector256.Create(f), _rgb));
/// <summary>Adds two 256-bit float vectors lane by lane via <c>Avx.Add</c>.</summary>
/// <param name="left">Left operand.</param>
/// <param name="right">Right operand.</param>
/// <returns>The lane-wise sum.</returns>
public static Vector256<float> op_Addition(Vector256<float> left, Vector256<float> right)
{
    return Avx.Add(left, right);
}
/// <summary>
/// Linearly interpolates two uniformly sampled tables, <paramref name="A"/> and <paramref name="B"/>,
/// at every point of <paramref name="x"/> and blends the two results as
/// <c>(1 - weightB) * interpA + weightB * interpB</c>.
/// </summary>
/// <remarks>
/// Four points are processed per iteration with AVX/AVX2 (gather) and SSE4.1 intrinsics. No
/// <c>IsSupported</c> check is made here; presumably the caller verifies AVX2 support first.
/// Query points outside a table's range are clamped to that range. The returned array has
/// <c>outputVectorSize</c> elements, of which only the first <c>min(x.Length, outputVectorSize)</c>
/// are written. A final group of fewer than four points goes through a small stack buffer, so
/// nothing is read past the end of <paramref name="x"/> or written past the end of the result.
/// </remarks>
/// <param name="x">Points at which to evaluate the tables.</param>
/// <param name="A">Samples of the first table, taken at equal steps from <paramref name="minXA"/> to <paramref name="maxXA"/>.</param>
/// <param name="minXA">Input value that corresponds to <c>A[0]</c>.</param>
/// <param name="maxXA">Input value that corresponds to the last element of <paramref name="A"/>.</param>
/// <param name="B">Samples of the second table, taken at equal steps from <paramref name="minXB"/> to <paramref name="maxXB"/>.</param>
/// <param name="minXB">Input value that corresponds to <c>B[0]</c>.</param>
/// <param name="maxXB">Input value that corresponds to the last element of <paramref name="B"/>.</param>
/// <param name="weightB">Weight of the B interpolation; A is weighted with <c>1 - weightB</c>.</param>
/// <returns>An array of <c>outputVectorSize</c> elements holding the blended values.</returns>
private static unsafe double[] BilinearInterpol_AVX(
    double[] x,
    double[] A, double minXA, double maxXA,
    double[] B, double minXB, double maxXB,
    double weightB)
{
    double[] z = new double[outputVectorSize];

    // Never read beyond x nor write beyond z, whichever is shorter.
    int count = Math.Min(x.Length, z.Length);
    if (count == 0)
    {
        return z;
    }

    int lanes = Vector256<double>.Count;

    // Staging buffer for a final group that has fewer than `lanes` points.
    double* scratch = stackalloc double[lanes];

    fixed (double* pX = &x[0], pA = &A[0], pB = &B[0], pZ = &z[0])
    {
        Vector256<double> vWeightB = Vector256.Create(weightB);
        Vector256<double> vWeightA = Vector256.Create(1 - weightB);

        Vector256<double> vMinXA = Vector256.Create(minXA);
        Vector256<double> vMaxXA = Vector256.Create(maxXA);
        Vector256<double> vMinXB = Vector256.Create(minXB);
        Vector256<double> vMaxXB = Vector256.Create(maxXB);

        // Spacing between consecutive samples of each table, and its reciprocal.
        double deltaA = (maxXA - minXA) / (double)(A.Length - 1);
        double deltaB = (maxXB - minXB) / (double)(B.Length - 1);
        Vector256<double> vDeltaA = Vector256.Create(deltaA);
        Vector256<double> vDeltaB = Vector256.Create(deltaB);

        double invDeltaA = 1.0 / deltaA;
        double invDeltaB = 1.0 / deltaB;
        Vector256<double> vInvDeltaA = Vector256.Create(invDeltaA);
        Vector256<double> vInvDeltaB = Vector256.Create(invDeltaB);

        Vector128<int> ALengthMinusOne = Vector128.Create(A.Length - 1);
        Vector128<int> BLengthMinusOne = Vector128.Create(B.Length - 1);
        Vector128<int> One = Vector128.Create(1);

        for (int i = 0; i < count; i += lanes)
        {
            int remaining = count - i;
            bool fullGroup = remaining >= lanes;

            Vector256<double> currentX;
            if (fullGroup)
            {
                currentX = Avx.LoadVector256(pX + i);
            }
            else
            {
                // Pad the partial group with zeros; the padded lanes are discarded below and
                // the clamped indices keep their gathers inside the tables.
                for (int k = 0; k < lanes; k++)
                {
                    scratch[k] = k < remaining ? pX[i + k] : 0.0;
                }

                currentX = Avx.LoadVector256(scratch);
            }

            // Index of the A sample at or just below x: trunc((x - minXA) / deltaA),
            // clamped to [0, A.Length - 1]. The double -> int cast is done as a vector conversion.
            Vector256<double> aDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXA), vInvDeltaA);
            Vector128<int> a = Avx.ConvertToVector128Int32WithTruncation(aDouble);
            a = Sse41.Min(Sse41.Max(a, Vector128<int>.Zero), ALengthMinusOne);
            Vector128<int> aPlusOne = Sse41.Min(Sse2.Add(a, One), ALengthMinusOne);

            // Input value xA that belongs to index a (int -> double conversion).
            Vector256<double> xA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(a), vDeltaA), vMinXA);

            // Fractional position of the (range-clamped) x between sample a and sample a + 1.
            Vector256<double> currentXNormA = Avx.Max(vMinXA, Avx.Min(currentX, vMaxXA));
            Vector256<double> lambdaA = Avx.Multiply(Avx.Subtract(currentXNormA, xA), vInvDeltaA);

            // Fetch the two neighbouring samples with gather operations (scale 8 = sizeof(double)).
            Vector256<double> AVector = Avx2.GatherVector256(pA, a, 8);
            Vector256<double> AVectorPlusOne = Avx2.GatherVector256(pA, aPlusOne, 8);

            // Same steps for table B.
            Vector256<double> bDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXB), vInvDeltaB);
            Vector128<int> b = Avx.ConvertToVector128Int32WithTruncation(bDouble);
            b = Sse41.Min(Sse41.Max(b, Vector128<int>.Zero), BLengthMinusOne);
            Vector128<int> bPlusOne = Sse41.Min(Sse2.Add(b, One), BLengthMinusOne);

            Vector256<double> xB = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(b), vDeltaB), vMinXB);
            Vector256<double> currentXNormB = Avx.Max(vMinXB, Avx.Min(currentX, vMaxXB));
            Vector256<double> lambdaB = Avx.Multiply(Avx.Subtract(currentXNormB, xB), vInvDeltaB);

            Vector256<double> BVector = Avx2.GatherVector256(pB, b, 8);
            Vector256<double> BVectorPlusOne = Avx2.GatherVector256(pB, bPlusOne, 8);

            // weightA * lerp(A[a], A[a + 1], lambdaA) + weightB * lerp(B[b], B[b + 1], lambdaB)
            Vector256<double> newZ = Avx.Add(
                Avx.Multiply(vWeightA, Avx.Add(AVector, Avx.Multiply(lambdaA, Avx.Subtract(AVectorPlusOne, AVector)))),
                Avx.Multiply(vWeightB, Avx.Add(BVector, Avx.Multiply(lambdaB, Avx.Subtract(BVectorPlusOne, BVector)))));

            if (fullGroup)
            {
                Avx.Store(pZ + i, newZ);
            }
            else
            {
                // Copy out only the lanes that correspond to real input points.
                Avx.Store(scratch, newZ);
                for (int k = 0; k < remaining; k++)
                {
                    pZ[i + k] = scratch[k];
                }
            }
        }
    }

    return z;
}
/// <summary>
/// Walk-through of the vector creation helpers and of a selection of SSE / AVX / FMA intrinsics.
/// Every result is stored in a local only, so that it can be inspected in a debugger.
/// </summary>
public Intro()
{
    var vec128 = Vector128.Create(1.0f);       // every lane is 1
    vec128 = Vector128.CreateScalar(-1.0f);    // <-1, 0, 0, 0>

    // Raw little-endian bytes of 1.0f and -1.0f: <0, 0, 128, 63, 0, 0, 128, 191>
    var rawBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f));

    if (Avx.IsSupported)
    {
        var lhs = Vector256.Create(-2.5f);
        var rhs = Vector256.Create(5.0f);

        // Even lanes hold lhs - rhs, odd lanes lhs + rhs: <-7.5, 2.5, -7.5, 2.5, ...>
        Vector256<float> outcome = Avx.AddSubtract(lhs, rhs);

        lhs = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, -70.0f, -80.0f);
        rhs = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);

        // Interleaves the upper two elements of each 128-bit half: <-3, 3, -4, 4, -70, 70, -80, 80>
        outcome = Avx.UnpackHigh(lhs, rhs);

        // Interleaves the lower two elements of each 128-bit half: <-1, 0, -2, 2, -50, 50, -60, 60>
        outcome = Avx.UnpackLow(lhs, rhs);

        // Dot product per 128-bit half, written to the first element of that half:
        // <-29, 0, 0, 0, -17400, 0, 0, 0>
        outcome = Avx.DotProduct(lhs, rhs, 0b1111_0001);

        // TestC(a, b) is true when no lane has its sign bit set in b but clear in a.
        bool carry = Avx.TestC(lhs, rhs);      // true: every lane of lhs is negative
        carry = Avx.TestC(rhs, lhs);           // false

        Vector256<float> quotient = Avx.Divide(lhs, rhs);   // lane 0 is -1 / 0 = -Infinity
        var ones = Vector256.Create(1.0f);

        // Comparisons yield an all-ones bit pattern (shown as NaN) in lanes where they hold, 0 elsewhere.
        outcome = Avx.Compare(rhs, quotient, FloatComparisonMode.OrderedGreaterThanNonSignaling);
        outcome = Avx.Compare(rhs, quotient, FloatComparisonMode.UnorderedNotLessThanNonSignaling);

        lhs = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
        rhs = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
        Vector256<float> nanInLaneZero = Avx.Divide(lhs, rhs);    // lane 0 is 0 / 0 = NaN

        lhs = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
        Vector256<float> infInLaneZero = Avx.Divide(lhs, rhs);    // lane 0 is 1.1 / 0 = +Infinity

        lhs = Vector256.Create(-1.1f, 3.0f, 1.0f / 3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
        rhs = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);

        // lhs > rhs holds in the odd lanes only: <0, NaN, 0, NaN, 0, NaN, 0, NaN>
        Vector256<float> mask = Avx.Compare(lhs, rhs, FloatComparisonMode.OrderedGreaterThanNonSignaling);

        // Takes rhs where the mask lane has its sign bit set, lhs elsewhere:
        // <-1.1, 2, 1/3, 2, -50, -60, -70, -80>
        Vector256<float> blended = Avx.BlendVariable(lhs, rhs, mask);

        rhs = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
        Vector256<float> addend = rhs;

        // TestZ / TestC look at the sign bit of each lane only.
        bool noCommonSign = Avx.TestZ(ones, mask);   // true: ones has no sign bit set
        bool maskCovered = Avx.TestC(ones, mask);    // false: mask has sign bits that ones lacks
        bool anyLaneSet = !Avx.TestZ(mask, mask);    // true as soon as one lane of mask is set

        // Ordered comparisons are false for a NaN operand, unordered ones are true.
        mask = Avx.Compare(nanInLaneZero, rhs, FloatComparisonMode.OrderedEqualNonSignaling);
        mask = Avx.Compare(nanInLaneZero, rhs, FloatComparisonMode.UnorderedEqualNonSignaling);
        mask = Avx.Compare(infInLaneZero, rhs, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling);
        mask = Avx.Compare(infInLaneZero, rhs, FloatComparisonMode.OrderedGreaterThanNonSignaling);

        var lhs128 = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
        var rhs128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
        Vector128<float> mask128 = Sse.CompareGreaterThan(lhs128, rhs128);   // <0, 0, 0, 0>

        // Packs the sign bit of every lane into the low bits of an int.
        int signBits = Avx.MoveMask(mask);

        if (Fma.IsSupported)
        {
            Vector256<float> fused = Fma.MultiplyAdd(lhs, rhs, addend);    // lhs * rhs + addend
            fused = Fma.MultiplyAddNegated(lhs, rhs, addend);              // -(lhs * rhs) + addend
            fused = Fma.MultiplySubtract(lhs, rhs, addend);                // lhs * rhs - addend

            // Even lanes: lhs * rhs - addend, odd lanes: lhs * rhs + addend. The result is discarded.
            Fma.MultiplyAddSubtract(lhs, rhs, addend);
        }

        // Only elements 1 and 3 of each 128-bit half enter the sum; the upper half yields -10000.
        outcome = Avx.DotProduct(lhs, rhs, 0b1010_0001);
        outcome = Avx.Floor(lhs);                      // rounds every lane towards negative infinity
        outcome = Avx.Add(lhs, rhs);                   // lane-wise sum
        outcome = Avx.Ceiling(lhs);                    // rounds every lane towards positive infinity
        outcome = Avx.Multiply(lhs, rhs);              // lane-wise product
        outcome = Avx.HorizontalAdd(lhs, rhs);         // sums of adjacent pairs, per 128-bit half
        outcome = Avx.HorizontalSubtract(lhs, rhs);    // differences of adjacent pairs, per 128-bit half

        double[] doubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
        double[] unitDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
        double[] doubleSink = new double[doubles.Length];
        float[] floats = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
        float[] unitFloats = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };

        unsafe
        {
            fixed (double* src = &doubles[1], dst = &doubleSink[0])
            {
                // Loads four doubles starting at index 1: <3, -2.5, 7.5, 10.8>
                Vector256<double> loaded = Avx.LoadVector256(src);
                Avx.Store(dst, loaded);
            }

            fixed (float* first = &floats[0], second = &unitFloats[0])
            {
                // Only element 0 of each 128-bit half is multiplied: <1, 0, 0, 0, 10, 0, 0, 0>
                Vector256<float> dot = Avx.DotProduct(
                    Avx.LoadVector256(first), Avx.LoadVector256(second), 0b0001_0001);
            }
        }
    }
}