public static unsafe void ReverseBits(this Span<int> span)
{
    var intsReversed = 0;
    if (Avx2.IsSupported)
    {
        fixed (int* ptr = span)
        {
            var vectorCount = span.Length / 8;
            for (int i = 0; i < vectorCount; i++)
            {
                var vector = Avx.LoadVector256(ptr + intsReversed);
                // Byte-swap each 32-bit lane: bytes 0/2 rotate right into positions 3/1,
                // bytes 1/3 rotate left into positions 2/0. The two masks select disjoint
                // bytes, so the final Add recombines them like an Or.
                var vector2 = Avx2.And(vector, Vector256.Create(unchecked((int)0xFF00FF00)));
                vector = Avx2.And(vector, Vector256.Create(0x00FF00FF));
                vector = Avx2.Add(
                    Avx2.Or(
                        Avx2.ShiftRightLogical(vector, 8),
                        Avx2.ShiftLeftLogical(vector, 24)),
                    Avx2.Or(
                        Avx2.ShiftLeftLogical(vector2, 8),
                        Avx2.ShiftRightLogical(vector2, 24)));
                Avx.Store(ptr + intsReversed, vector);
                intsReversed += 8;
            }
        }
    }

    for (int i = intsReversed; i < span.Length; i++)
    {
        span[i] = BinaryPrimitives.ReverseEndianness(span[i]);
    }

    // With the byte order of every int reversed, reversing the bits inside each
    // byte (the Span<byte> overload of this extension) completes the job.
    fixed (void* ptr = span)
    {
        new Span<byte>(ptr, span.Length * 4).ReverseBits();
    }
}
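// Editor's companion sketch (not part of the library above): a scalar oracle for
// 32-bit bit reversal, handy for testing the vectorized path against. This is the
// classic swap-halves bit twiddle; ReverseBitsScalar is a hypothetical name.
static uint ReverseBitsScalar(uint v)
{
    v = ((v >> 1) & 0x55555555u) | ((v & 0x55555555u) << 1); // swap adjacent bits
    v = ((v >> 2) & 0x33333333u) | ((v & 0x33333333u) << 2); // swap bit pairs
    v = ((v >> 4) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4); // swap nibbles
    v = ((v >> 8) & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8); // swap bytes
    return (v >> 16) | (v << 16);                            // swap 16-bit halves
}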
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new SimpleTernaryOpTest__MultiplySubtractNegatedSingle();

    fixed (Vector256<Single>* pFld1 = &test._fld1)
    fixed (Vector256<Single>* pFld2 = &test._fld2)
    fixed (Vector256<Single>* pFld3 = &test._fld3)
    {
        var result = Fma.MultiplySubtractNegated(
            Avx.LoadVector256((Single*)pFld1),
            Avx.LoadVector256((Single*)pFld2),
            Avx.LoadVector256((Single*)pFld3));

        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
    }
}
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new SimpleTernaryOpTest__BlendVariableInt32();

    fixed (Vector256<Int32>* pFld1 = &test._fld1)
    fixed (Vector256<Int32>* pFld2 = &test._fld2)
    fixed (Vector256<Int32>* pFld3 = &test._fld3)
    {
        var result = Avx2.BlendVariable(
            Avx.LoadVector256((Int32*)pFld1),
            Avx.LoadVector256((Int32*)pFld2),
            Avx.LoadVector256((Int32*)pFld3));

        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
    }
}
public static unsafe float Sum_AVX_stackalloc(float[] array)
{
    // Assumes array.Length is a non-zero multiple of 8; otherwise the last
    // LoadVector256 reads past the end of the array.
    Vector256<float> sum = Vector256<float>.Zero;
    fixed (float* ptr = &array[0])
    {
        for (int i = 0; i < array.Length; i += 8)
        {
            var current = Avx.LoadVector256(ptr + i);
            sum = Avx.Add(current, sum);
        }
    }

    // store __m256 into float[8] and sum all values via code - will it be slower than Sum_AVX()?
    var result = stackalloc float[8];
    Avx.Store(result, sum);
    return *result + *(result + 1) + *(result + 2) + *(result + 3)
         + *(result + 4) + *(result + 5) + *(result + 6) + *(result + 7);
}
public static unsafe float Sum_AVX(float[] array)
{
    // Assumes array.Length is a non-zero multiple of 8 (see above).
    Vector256<float> sum = Vector256<float>.Zero;
    fixed (float* ptr = &array[0])
    {
        for (int i = 0; i < array.Length; i += 8)
        {
            var current = Avx.LoadVector256(ptr + i);
            sum = Avx.Add(current, sum);
        }
    }

    // sum all values in __m256 (horizontal sum): two HorizontalAdds leave each
    // 128-bit lane's total broadcast within the lane; add the upper lane to the lower.
    var ha = Avx.HorizontalAdd(sum, sum);
    var ha2 = Avx.HorizontalAdd(ha, ha);
    var upper = Avx.ExtractVector128(ha2, 1);
    var resultV = Sse.Add(ha2.GetLower(), upper);
    return resultV.ToScalar();
}
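// Editor's note: both Sum_AVX variants above assume array.Length is a non-zero
// multiple of 8. A minimal sketch of the same reduction with a scalar tail for
// arbitrary lengths (Sum_AVX_AnyLength is a hypothetical name; released
// System.Runtime.Intrinsics APIs only):
public static unsafe float Sum_AVX_AnyLength(float[] array)
{
    var sum = Vector256<float>.Zero;
    int i = 0;
    fixed (float* ptr = array)
    {
        for (; i + 8 <= array.Length; i += 8)
        {
            sum = Avx.Add(Avx.LoadVector256(ptr + i), sum);
        }
    }
    var ha = Avx.HorizontalAdd(sum, sum);
    ha = Avx.HorizontalAdd(ha, ha);
    float total = Sse.Add(ha.GetLower(), Avx.ExtractVector128(ha, 1)).ToScalar();
    for (; i < array.Length; i++)
    {
        total += array[i]; // scalar remainder
    }
    return total;
}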
public unsafe float[] ProcessDataUnsafe()
{
    // Assumes inputData.Length is a multiple of 8 (one Vector256<float> per step).
    float[] results = new float[inputData.Length];
    fixed (float* inputPtr = &inputData[0])
    {
        float* inCurrent = inputPtr;
        fixed (float* resultPtr = &results[0])
        {
            float* resEnd = resultPtr + results.Length;
            float* resCurrent = resultPtr;
            while (resCurrent < resEnd)
            {
                Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent)));
                resCurrent += 8;
                inCurrent += 8;
            }
        }
    }
    return results;
}
public static unsafe bool SequenceEqual_Avx(float[] array1, float[] array2)
{
    if (array1.Length != array2.Length)
    {
        return false;
    }
    if (array1.Length == 0)
    {
        return true; //SequenceEqual_Soft(array1, array2, 0);
    }

    int i = 0;
    fixed (float* ptr1 = &array1[0])
    fixed (float* ptr2 = &array2[0])
    {
        if (array1.Length < 8)
        {
            return SequenceEqual_Soft(ptr1, ptr2, 0, array1.Length);
        }
        for (; i <= array1.Length - 8; i += 8) // 16 for AVX512
        {
            var vec1 = Avx.LoadVector256(ptr1 + i);
            var vec2 = Avx.LoadVector256(ptr2 + i);
            var ce = Avx.Compare(vec1, vec2, FloatComparisonMode.NotEqualOrderedNonSignaling);
            if (!Avx.TestZ(ce, ce))
            {
                return false;
            }
        }
        return SequenceEqual_Soft(ptr1, ptr2, i, array1.Length);
    }
}
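// SequenceEqual_Soft is referenced above but not shown; a minimal sketch with
// the signature the call sites imply (a hypothetical reconstruction). Note that
// the AVX path uses NotEqualOrderedNonSignaling, an ordered predicate that
// reports "not unequal" for NaN lanes, so NaN handling may differ from the
// plain scalar '!=' used below.
private static unsafe bool SequenceEqual_Soft(float* ptr1, float* ptr2, int from, int length)
{
    for (int i = from; i < length; i++)
    {
        if (ptr1[i] != ptr2[i])
        {
            return false;
        }
    }
    return true;
}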
public unsafe void Serialize(ref MessagePackWriter writer, double[]? value, MessagePackSerializerOptions options)
{
    if (value == null)
    {
        writer.WriteNil();
        return;
    }

    var inputLength = value.Length;
    writer.WriteArrayHeader(inputLength);
    if (inputLength == 0)
    {
        return;
    }

    var outputLength = inputLength * 9;
    var destination = writer.GetSpan(outputLength);
    fixed (byte* pDestination = &destination[0])
    {
        var outputIterator = pDestination;
        fixed (double* pSource = &value[0])
        {
            var inputEnd = pSource + inputLength;
            var inputIterator = (ulong*)pSource;

            if (Avx2.IsSupported)
            {
                const int ShiftCount = 2;
                const int Stride = 1 << ShiftCount;
                if (inputLength < Stride << 1)
                {
                    goto ProcessEach;
                }

                var vectorShuffle = Vector256.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                {
                    // Fetch 4 doubles.
                    var current = Avx.LoadVector256((byte*)inputIterator);

                    // Reorder Little Endian bytes to Big Endian.
                    var answer = Avx2.Shuffle(current, vectorShuffle).AsUInt64();

                    // Write 4 Big-Endian doubles.
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong*)outputIterator = answer.GetElement(0);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong*)outputIterator = answer.GetElement(1);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong*)outputIterator = answer.GetElement(2);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong*)outputIterator = answer.GetElement(3);
                    outputIterator += 8;
                }
            }
            else if (Ssse3.IsSupported)
            {
                const int ShiftCount = 1;
                const int Stride = 1 << ShiftCount;
                if (inputLength < Stride << 1)
                {
                    goto ProcessEach;
                }

                var vectorShuffle = Vector128.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                {
                    var current = Sse2.LoadVector128((byte*)inputIterator);
                    var answer = Ssse3.Shuffle(current, vectorShuffle).AsUInt64();
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong*)outputIterator = answer.GetElement(0);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong*)outputIterator = answer.GetElement(1);
                    outputIterator += 8;
                }
            }

ProcessEach:
            // Scalar fallback: emit the marker, then the 8 bytes in big-endian order.
            while (inputIterator != inputEnd)
            {
                *outputIterator++ = MessagePackCode.Float64;
                var current = *inputIterator++;
                *outputIterator++ = (byte)(current >> 56);
                *outputIterator++ = (byte)(current >> 48);
                *outputIterator++ = (byte)(current >> 40);
                *outputIterator++ = (byte)(current >> 32);
                *outputIterator++ = (byte)(current >> 24);
                *outputIterator++ = (byte)(current >> 16);
                *outputIterator++ = (byte)(current >> 8);
                *outputIterator++ = (byte)current;
            }
        }
    }

    writer.Advance(outputLength);
}
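// Wire format produced above: each element is MessagePackCode.Float64 (0xCB)
// followed by the 8 bytes of the IEEE-754 double in big-endian order, 9 bytes
// per element - hence outputLength = inputLength * 9. For example, 1.0d
// (bits 0x3FF0000000000000) serializes to: CB 3F F0 00 00 00 00 00 00.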
private static unsafe double[] BilinearInterpol_AVX(
    double[] x,
    double[] A,
    double minXA,
    double maxXA,
    double[] B,
    double minXB,
    double maxXB,
    double weightB)
{
    double[] z = new double[outputVectorSize];

    fixed (double* pX = &x[0], pA = &A[0], pB = &B[0], pZ = &z[0])
    {
        Vector256<double> vWeightB = Vector256.Create(weightB);
        Vector256<double> vWeightA = Vector256.Create(1 - weightB);

        Vector256<double> vMinXA = Vector256.Create(minXA);
        Vector256<double> vMaxXA = Vector256.Create(maxXA);
        Vector256<double> vMinXB = Vector256.Create(minXB);
        Vector256<double> vMaxXB = Vector256.Create(maxXB);

        double deltaA = (maxXA - minXA) / (double)(A.Length - 1);
        double deltaB = (maxXB - minXB) / (double)(B.Length - 1);
        Vector256<double> vDeltaA = Vector256.Create(deltaA);
        Vector256<double> vDeltaB = Vector256.Create(deltaB);

        double invDeltaA = 1.0 / deltaA;
        double invDeltaB = 1.0 / deltaB;
        Vector256<double> vInvDeltaA = Vector256.Create(invDeltaA);
        Vector256<double> vInvDeltaB = Vector256.Create(invDeltaB);

        Vector128<int> ALengthMinusOne = Vector128.Create(A.Length - 1);
        Vector128<int> BLengthMinusOne = Vector128.Create(B.Length - 1);
        Vector128<int> One = Vector128.Create(1);

        for (var i = 0; i < x.Length; i += Vector256<double>.Count)
        {
            Vector256<double> currentX = Avx.LoadVector256(pX + i);

            // Determine the largest a, such that A[i] = f(xA) and xA <= x[i].
            // This involves casting from double to int; here we use a Vector conversion.
            Vector256<double> aDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXA), vInvDeltaA);
            Vector128<int> a = Avx.ConvertToVector128Int32WithTruncation(aDouble);
            a = Sse41.Min(Sse41.Max(a, Vector128<int>.Zero), ALengthMinusOne);
            Vector128<int> aPlusOne = Sse41.Min(Sse2.Add(a, One), ALengthMinusOne);

            // Now, get the reference input, xA, for our index a.
            // This involves casting from int to double.
            Vector256<double> xA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(a), vDeltaA), vMinXA);

            // Now, compute the lambda for our A reference point.
            Vector256<double> currentXNormA = Avx.Max(vMinXA, Avx.Min(currentX, vMaxXA));
            Vector256<double> lambdaA = Avx.Multiply(Avx.Subtract(currentXNormA, xA), vInvDeltaA);

            // Now, we need to load up our reference points using Vector Gather operations.
            Vector256<double> AVector = Avx2.GatherVector256(pA, a, 8);
            Vector256<double> AVectorPlusOne = Avx2.GatherVector256(pA, aPlusOne, 8);

            // Now, do all of the above for our B reference point.
            Vector256<double> bDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXB), vInvDeltaB);
            Vector128<int> b = Avx.ConvertToVector128Int32WithTruncation(bDouble);
            b = Sse41.Min(Sse41.Max(b, Vector128<int>.Zero), BLengthMinusOne);
            Vector128<int> bPlusOne = Sse41.Min(Sse2.Add(b, One), BLengthMinusOne);

            Vector256<double> xB = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(b), vDeltaB), vMinXB);
            Vector256<double> currentXNormB = Avx.Max(vMinXB, Avx.Min(currentX, vMaxXB));
            Vector256<double> lambdaB = Avx.Multiply(Avx.Subtract(currentXNormB, xB), vInvDeltaB);

            Vector256<double> BVector = Avx2.GatherVector256(pB, b, 8);
            Vector256<double> BVectorPlusOne = Avx2.GatherVector256(pB, bPlusOne, 8);

            Vector256<double> newZ = Avx.Add(
                Avx.Multiply(vWeightA, Avx.Add(AVector, Avx.Multiply(lambdaA, Avx.Subtract(AVectorPlusOne, AVector)))),
                Avx.Multiply(vWeightB, Avx.Add(BVector, Avx.Multiply(lambdaB, Avx.Subtract(BVectorPlusOne, BVector)))));

            Avx.Store(pZ + i, newZ);
        }
    }

    return z;
}
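// Per lane, the kernel above evaluates (with wA = 1 - weightB, wB = weightB):
//   z[i] = wA * (A[a] + lambdaA * (A[a+1] - A[a]))
//        + wB * (B[b] + lambdaB * (B[b+1] - B[b]))
// where a = clamp((int)((x[i] - minXA) / deltaA), 0, A.Length - 1) and lambdaA
// is the fractional offset of the clamped x[i] past the grid point xA; b and
// lambdaB are the analogous quantities for the B curve.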
public unsafe override double[] Applay(double[] values, int halfWindow)
{
    var windowSize = 2 * halfWindow + 1;
    var resultSize = values.Length - windowSize + 1;
    if (resultSize == 0)
    {
        return null;
    }

    var a = new double[resultSize];
    var sum = 0d;

    fixed (double* valueStart = values, aStart = a)
    {
        var valueCurrent = valueStart;
        var valueEndwindowSize = valueCurrent + windowSize;
        while (valueCurrent < valueEndwindowSize)
        {
            sum += *valueCurrent;
            valueCurrent++;
        }

        var aCurrent = aStart + 1;
        var aEnd = aStart + resultSize;
        var aUnrolledEnd = aStart + (((resultSize - 1) >> 4) << 4);
        valueCurrent = valueStart;
        var valueWindowSize = valueStart + windowSize;
        var vWindowSize = Vector256.Create((double)windowSize);

        // NB: pointers are carried in Vector256<ulong> lanes and advanced with a
        // double add. x64 user-mode addresses stay below 2^52, so their bit
        // patterns are subnormal doubles, and adding the small subnormal step is
        // exact integer addition; Avx alone has no 256-bit 64-bit integer add.
        var vCurrent = Vector256.Create(
            (ulong)aCurrent,
            (ulong)aCurrent + 4 * sizeof(double),
            (ulong)aCurrent + 8 * sizeof(double),
            (ulong)aCurrent + 12 * sizeof(double));
        var vValueCurrent = Vector256.Create(
            (ulong)valueCurrent,
            (ulong)valueCurrent + 4 * sizeof(double),
            (ulong)valueCurrent + 8 * sizeof(double),
            (ulong)valueCurrent + 12 * sizeof(double));
        var vValueWindowSize = Vector256.Create(
            (ulong)valueWindowSize,
            (ulong)valueWindowSize + 4 * sizeof(double),
            (ulong)valueWindowSize + 8 * sizeof(double),
            (ulong)valueWindowSize + 12 * sizeof(double));
        var vShiftIndex1 = Vector256.Create(16ul * sizeof(double));

        while (aCurrent < aUnrolledEnd)
        {
            #region 1
            Avx.Store(
                aCurrent,
                Avx.Divide(
                    Avx.Subtract(
                        Avx.LoadVector256((double*)vValueWindowSize.GetElement(0)),
                        Avx.LoadVector256((double*)vValueCurrent.GetElement(0))),
                    vWindowSize));
            #endregion
            #region 2
            Avx.Store(
                (double*)vCurrent.GetElement(1),
                Avx.Divide(
                    Avx.Subtract(
                        Avx.LoadVector256((double*)vValueWindowSize.GetElement(1)),
                        Avx.LoadVector256((double*)vValueCurrent.GetElement(1))),
                    vWindowSize));
            #endregion
            #region 3
            Avx.Store(
                (double*)vCurrent.GetElement(2),
                Avx.Divide(
                    Avx.Subtract(
                        Avx.LoadVector256((double*)vValueWindowSize.GetElement(2)),
                        Avx.LoadVector256((double*)vValueCurrent.GetElement(2))),
                    vWindowSize));
            #endregion
            #region 4
            Avx.Store(
                (double*)vCurrent.GetElement(3),
                Avx.Divide(
                    Avx.Subtract(
                        Avx.LoadVector256((double*)vValueWindowSize.GetElement(3)),
                        Avx.LoadVector256((double*)vValueCurrent.GetElement(3))),
                    vWindowSize));
            #endregion

            vCurrent = Avx.Add(vCurrent.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
            vValueCurrent = Avx.Add(vValueCurrent.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
            vValueWindowSize = Avx.Add(vValueWindowSize.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
            aCurrent = (double*)vCurrent.GetElement(0);
        }

        valueWindowSize = (double*)vValueWindowSize.GetElement(0);
        valueCurrent = (double*)vValueCurrent.GetElement(0);
        while (aCurrent < aEnd)
        {
            *aCurrent = (*valueWindowSize - *valueCurrent) / windowSize;
            aCurrent++;
            valueCurrent++;
            valueWindowSize++;
        }

        var aPrev = aStart;
        aCurrent = aStart + 1;
        aEnd = aStart + resultSize;
        *aPrev = sum / windowSize;

        aUnrolledEnd = aStart + (((resultSize - 1) >> 2) << 2);
        vCurrent = Vector256.Create(
            (ulong)aCurrent,
            (ulong)aCurrent + sizeof(double),
            (ulong)aCurrent + 2 * sizeof(double),
            (ulong)aCurrent + 3 * sizeof(double));
        var vPrev = Vector256.Create(
            (ulong)aPrev,
            (ulong)aPrev + sizeof(double),
            (ulong)aPrev + 2 * sizeof(double),
            (ulong)aPrev + 3 * sizeof(double));
        var vShiftIndex = Vector256.Create(4ul * sizeof(double));

        while (aCurrent < aUnrolledEnd)
        {
            #region 1
            *aCurrent += *(double*)vPrev.GetElement(0);
            #endregion
            #region 2
            *(double*)vCurrent.GetElement(1) += *(double*)vPrev.GetElement(1);
            #endregion
            #region 3
            *(double*)vCurrent.GetElement(2) += *(double*)vPrev.GetElement(2);
            #endregion
            #region 4
            *(double*)vCurrent.GetElement(3) += *(double*)vPrev.GetElement(3);
            #endregion

            vCurrent = Avx.Add(vCurrent.AsDouble(), vShiftIndex.AsDouble()).AsUInt64();
            vPrev = Avx.Add(vPrev.AsDouble(), vShiftIndex.AsDouble()).AsUInt64();
            aCurrent = (double*)vCurrent.GetElement(0);
        }

        aPrev = (double*)vPrev.GetElement(0);
        while (aCurrent < aEnd)
        {
            *aCurrent += *aPrev;
            aCurrent++;
            aPrev++;
        }
    }

    return a;
}
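// Editor's companion sketch (hypothetical name, not from the class above): the
// plain scalar equivalent of Applay - a simple moving average over a window of
// 2 * halfWindow + 1 samples - useful as a correctness oracle for the two-pass
// vectorized version (windowed differences, then a running prefix sum).
public static double[] MovingAverageScalar(double[] values, int halfWindow)
{
    int windowSize = 2 * halfWindow + 1;
    int resultSize = values.Length - windowSize + 1;
    if (resultSize <= 0)
    {
        return null;
    }
    var result = new double[resultSize];
    double sum = 0;
    for (int i = 0; i < windowSize; i++)
    {
        sum += values[i];
    }
    result[0] = sum / windowSize;
    for (int i = 1; i < resultSize; i++)
    {
        sum += values[i + windowSize - 1] - values[i - 1]; // slide the window
        result[i] = sum / windowSize;
    }
    return result;
}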
public static unsafe String From(string value)
{
    List<int> surrogates = null;
    var i = 0;
    var codepointIndex = 0;

    if (Avx2.IsSupported)
    {
        var mask = (ushort)0xF800;
        var surrogateBits = (ushort)0xD800;
        var maskVector = Avx2.BroadcastScalarToVector256(&mask);
        var surrogateBitsVector = Avx2.BroadcastScalarToVector256(&surrogateBits);
        fixed (char* str = value)
        {
            var step = Vector256<ushort>.Count;
            while (i + step <= value.Length)
            {
                var chars = Avx.LoadVector256((ushort*)(str + i));
                var masked = Avx2.And(chars, maskVector);
                var equality = Avx2.CompareEqual(masked, surrogateBitsVector);
                var equalityBits = Avx2.MoveMask(equality.As<ushort, byte>());
                var surrogate = equalityBits != 0;
                if (!surrogate)
                {
                    i += step;
                    codepointIndex += step;
                }
                else
                {
                    var border = i + step;
                    while (i < border)
                    {
                        Parse();
                    }
                }
            }
        }
    }

    while (i < value.Length)
    {
        Parse();
    }

    return surrogates is null
        ? new SurrogateFreeRegular(value)
        : new Regular(value, surrogates);

    void Parse()
    {
        if (char.IsHighSurrogate(value[i]))
        {
            if (i + 1 >= value.Length)
            {
                throw new ArgumentException(
                    "Value is malformed - it ends with a high surrogate.",
                    nameof(value));
            }
            if (!char.IsLowSurrogate(value[i + 1]))
            {
                throw new ArgumentException(
                    $"Value is malformed - a high surrogate at [{i}] not followed by a low surrogate.",
                    nameof(value));
            }
            surrogates ??= new List<int>();
            surrogates.Add(codepointIndex);
            i += 2;
        }
        else if (char.IsLowSurrogate(value, i))
        {
            throw new ArgumentException(
                $"Value is malformed - a low surrogate at [{i}] not following a high surrogate.",
                nameof(value));
        }
        else
        {
            i++;
        }
        codepointIndex++;
    }
}
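// Example: for value = "a" + char.ConvertFromUtf32(0x1F600) + "b" (4 UTF-16
// code units, 3 codepoints), the vector scan above falls back to Parse() around
// the surrogate pair, which records codepoint index 1 in 'surrogates' and
// advances i by 2, so the high/low pair counts as a single codepoint.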
public static unsafe void SalsaCore128(byte rounds, uint* state, byte* source, byte* destination)
{
    var t8 = *(state + 8);
    var t9 = *(state + 9);
    var s1 = Avx.LoadVector256(state + 8); // 8 9 10 11 12 13 14 15
    if (++*(state + 8) == 0)
    {
        ++*(state + 9);
    }

    // 4 9 14 3
    var x0 = Vector256.Create(
        *(state + 4), t9, *(state + 14), *(state + 3),
        *(state + 4), *(state + 9), *(state + 14), *(state + 3));
    // 0 5 10 15
    var x1 = Vector256.Create(
        *(state + 0), *(state + 5), *(state + 10), *(state + 15),
        *(state + 0), *(state + 5), *(state + 10), *(state + 15));
    // 12 1 6 11
    var x2 = Vector256.Create(
        *(state + 12), *(state + 1), *(state + 6), *(state + 11),
        *(state + 12), *(state + 1), *(state + 6), *(state + 11));
    // 8 13 2 7
    var x3 = Vector256.Create(
        t8, *(state + 13), *(state + 2), *(state + 7),
        *(state + 8), *(state + 13), *(state + 2), *(state + 7));

    for (var i = 0; i < rounds; i += 2)
    {
        QuarterRound(ref x0, ref x1, ref x2, ref x3);
        Shuffle(ref x0, ref x2, ref x3);

        QuarterRound(ref x0, ref x1, ref x2, ref x3);
        Shuffle(ref x0, ref x2, ref x3);
    }

    Shuffle(ref x0, ref x1, ref x2, ref x3);

    var s0 = Avx.LoadVector256(state); // 0 1 2 3 4 5 6 7
    x0 = Avx2.Add(x0, s0);
    x1 = Avx2.Add(x1, s1);
    x2 = Avx2.Add(x2, s0);
    x3 = Avx2.Add(x3, Avx.LoadVector256(state + 8));

    var v0 = Avx2.Xor(x0.AsByte(), Avx.LoadVector256(source));
    var v1 = Avx2.Xor(x1.AsByte(), Avx.LoadVector256(source + 32));
    var v2 = Avx2.Xor(x2.AsByte(), Avx.LoadVector256(source + 64));
    var v3 = Avx2.Xor(x3.AsByte(), Avx.LoadVector256(source + 96));

    Avx.Store(destination, v0);
    Avx.Store(destination + 32, v1);
    Avx.Store(destination + 64, v2);
    Avx.Store(destination + 96, v3);

    if (++*(state + 8) == 0)
    {
        ++*(state + 9);
    }
}
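// Layout note for the routine above: each Vector256<uint> carries the same four
// state words for two consecutive Salsa20 blocks - the low 128-bit lane uses the
// pre-increment counter (t8/t9), the high lane the incremented one - so a single
// call XORs 128 bytes of key stream over 'source' into 'destination'.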
public unsafe int[,] IntegrateUnsafeVectorBranched()
{
    int w = _data.Width();
    int h = _data.Height();
    int[,] res = new int[h, w];
    Vector256<int> shiftRight = RotateRight;

    fixed (byte* pSource = &_data[0, 0])
    fixed (int* pTarget = &res[0, 0])
    {
        var pSrc = pSource;
        var pTrg = pTarget;
        for (var i = 0; i < h; i++)
        {
            var j = 0;
            var p = Vector256.CreateScalar(0);
            var pr = Vector256.CreateScalar(0);

            // handle vector part
            for (; j + Vector256<int>.Count <= w; j += Vector256<int>.Count)
            {
                var t = Avx2.ConvertToVector256Int32(pSrc); // (int)*(pSrc)
                var s = Aggregate(p, t);                    // this code block has to be
                p = t;                                      // added to handle the in-line
                t = Avx2.Add(t, s);                         // recursion: S[i]=a[i]+S[i-1]
                if (j > 0)
                {
                    t = Avx2.Add(t, pr); // t += *(pTrg - 1);
                }
                if (i > 0)
                {
                    t = Avx2.Add(t, Avx.LoadVector256(pTrg - w));
                    if (j > 0)
                    {
                        t = Avx2.Subtract(t, Avx.LoadVector256(pTrg - w - 8));
                    }
                }
                Avx.Store(pTrg, t);
                pr = t;
                pSrc += Vector256<int>.Count;
                pTrg += Vector256<int>.Count;
            }

            // handle the tail
            var pr2 = (j == 0 ? 0 : pr.GetElement(Vector256<int>.Count - 1)); // Vector256.CreateScalar(0);
            for (; j < w; j++)
            {
                var t = (int)*(pSrc); // Avx2.ConvertToVector256Int32(pSrc);
                if (j > 0)
                {
                    t += pr2; // t = Avx2.Add(t, pr);
                }
                if (i > 0)
                {
                    t += *(pTrg - w); // Avx2.Add(t, Avx.LoadVector256(pTrg - w));
                    if (j > 0)
                    {
                        t -= *(pTrg - w - 1); // Avx2.Subtract(t, Avx.LoadVector256(pTrg - w - 8));
                    }
                }
                *pTrg = t; // Avx2.Store(pTrg, t);
                pr2 = t;   // pr = t
                pSrc++;
                pTrg++;
            }
        }
    }

    return res;
}
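// Recurrence computed above (summed-area table): with a = _data and S = res,
//   S[i,j] = a[i,j] + S[i,j-1] + S[i-1,j] - S[i-1,j-1]
// The helper Aggregate(p, t) (defined elsewhere in this class) is assumed to
// supply the in-lane prefix sums that let the row recurrence advance 8 columns
// per iteration; the branched scalar tail handles widths not divisible by 8.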
public static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson* pj)
{
    if (len > pj->bytecapacity)
    {
        Console.WriteLine("Your ParsedJson object only supports documents up to " + pj->bytecapacity +
                          " bytes but you are trying to process " + len + " bytes\n");
        return false;
    }

    uint32_t* base_ptr = pj->structural_indexes;
    uint32_t @base = 0;

    const uint64_t even_bits = 0x5555555555555555UL;
    const uint64_t odd_bits = ~even_bits;

    // for now, just work in 64-byte chunks
    // we have padded the input out to 64 byte multiple with the remainder being zeros

    // persistent state across loop
    uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
    uint64_t prev_iter_inside_quote = 0UL;       // either all zeros or all ones

    // effectively the very first char is considered to follow "whitespace" for the
    // purposes of pseudo-structural character detection
    uint64_t prev_iter_ends_pseudo_pred = 1UL;

    size_t lenminus64 = len < 64 ? 0 : len - 64;
    size_t idx = 0;
    uint64_t structurals = 0;

    // C#: assign static readonly fields to locals before the loop
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;
    var structural_shufti_mask = Vector256.Create((byte)0x7);
    var whitespace_shufti_mask = Vector256.Create((byte)0x18);
    var slashVec = Vector256.Create((bytechar)'\\').AsByte();
    var ffVec = Vector128.Create((byte)0xFF).AsUInt64();
    var doubleQuoteVec = Vector256.Create((byte)'"');
    var zeroBVec = Vector256.Create((byte)0);
    var vec7f = Vector256.Create((byte)0x7f);

    for (; idx < lenminus64; idx += 64)
    {
        var input_lo = Avx.LoadVector256(buf + idx + 0);
        var input_hi = Avx.LoadVector256(buf + idx + 32);

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // flip lowest if we have an odd-length run at the end of the prior iteration
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // must record the carry-out of our odd-carries out of bit 63; this
        // indicates whether the sense of any edge going to the next iteration
        // should be flipped
        bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);

        odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                                     // if we had an odd-numbered run at the
                                                     // end of the previous iteration
        prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

        // flush the structural bits found in the previous 64-byte block (hence idx - 64)
        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        quote_mask ^= prev_iter_inside_quote;
        // right shift of a signed value expected to be well-defined and standard
        // compliant as of C++20; John Regher from Utah U. says this is fine code
        prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);

        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));
        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        var tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // mask off anything inside quotes
        structurals &= ~quote_mask;

        // add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering
        structurals |= quote_bits;

        // Now, establish "pseudo-structural characters". These are non-whitespace
        // characters that are (a) outside quotes and (b) have a predecessor that's
        // either whitespace or a structural character. This means that subsequent
        // passes will get a chance to encounter the first character of every string
        // of non-whitespace and, if we're parsing an atom like true/false/null or a
        // number we can stop at the first whitespace or structural character
        // following it.

        // a qualified predecessor is something that can happen 1 position before a
        // pseudo-structural character
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // now, we've used our close quotes all we need to. So let's switch them off
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);

        //Console.WriteLine($"Iter: {idx}, satur: {structurals}");
        //*(uint64_t *)(pj->structurals + idx / 8) = structurals;
    }

    ////////////////
    // we use a giant copy-paste which is ugly.
    // but otherwise the string needs to be properly padded or else we
    // risk invalidating the UTF-8 checks.
    ////////////
    if (idx < len)
    {
        uint8_t* tmpbuf = stackalloc uint8_t[64];
        memset(tmpbuf, 0x20, 64);
        memcpy(tmpbuf, buf + idx, len - idx);
        Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
        Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // flip lowest if we have an odd-length run at the end of the prior iteration
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // the carry-out flag is not needed after the final block, but the
        // overflowing add still has to produce odd_carries
        add_overflow(bs_bits, odd_starts, &odd_carries);

        odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                                     // if we had an odd-numbered run at the
                                                     // end of the previous iteration
        //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////////////////////////
        // Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
        quote_mask ^= prev_iter_inside_quote;

        //BUG? https://github.com/dotnet/coreclr/issues/22813
        //quote_mask = 60;
        //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

        // flush the structural bits found in the last full 64-byte block
        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        // How do we build up a user traversable data structure
        // first, do a 'shufti' to detect structural JSON characters
        // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
        // these go into the first 3 buckets of the comparison (1/2/4)
        // we are also interested in the four whitespace characters
        // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
        // these go into the next 2 buckets of the comparison (8/16)
        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));
        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        // this additional mask and transfer is non-trivially expensive, unfortunately
        var tmp_ws_lo = Avx2.CompareEqual(Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // mask off anything inside quotes
        structurals &= ~quote_mask;

        // add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering
        structurals |= quote_bits;

        // Now, establish "pseudo-structural characters". These are non-whitespace
        // characters that are (a) outside quotes and (b) have a predecessor that's
        // either whitespace or a structural character. This means that subsequent
        // passes will get a chance to encounter the first character of every string
        // of non-whitespace and, if we're parsing an atom like true/false/null or a
        // number we can stop at the first whitespace or structural character
        // following it.

        // a qualified predecessor is something that can happen 1 position before a
        // pseudo-structural character
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // now, we've used our close quotes all we need to. So let's switch them off
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);

        //*(uint64_t *)(pj->structurals + idx / 8) = structurals;
        idx += 64;
    }

    uint32_t cnt2 = (uint32_t)hamming(structurals);
    uint32_t next_base2 = @base + cnt2;
    while (structurals != 0)
    {
        base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        @base += 8;
    }
    @base = next_base2;

    pj->n_structural_indexes = @base;
    if (base_ptr[pj->n_structural_indexes - 1] > len)
    {
        throw new InvalidOperationException("Internal bug");
    }
    if (len != base_ptr[pj->n_structural_indexes - 1])
    {
        // the string might not be NULL terminated, but we add a virtual NULL ending character.
        base_ptr[pj->n_structural_indexes++] = (uint32_t)len;
    }
    base_ptr[pj->n_structural_indexes] = 0; // make it safe to dereference one beyond this array
    return true;
}
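// Worked example for Step 1 above, on the 8 input bytes  a \ \ " x \ " y
// (bit i = byte i, prev_iter_ends_odd_backslash = 0):
//   bs_bits     = 0b0010_0110   backslashes at positions 1, 2, 5
//   start_edges = 0b0010_0010   runs start at 1 and 5, both odd positions
//   odd_starts  = 0b0010_0010   so odd_carries = bs_bits + odd_starts = 0b0100_1000
//   odd_ends    = 0b0100_0000   only the quote at position 6 is escaped
// The even-length run at 1-2 leaves the quote at position 3 as a real string
// delimiter, while the single backslash at 5 escapes the quote at position 6.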
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    if (Avx2.IsSupported)
    {
        Four = 4;
        Eight = 8;
        invalid = 15;

        for (int i = 0; i < N; i++)
        {
            floatSourceTable[i] = (float)i * 10.0f;
            doubleSourceTable[i] = (double)i * 10.0;
            intSourceTable[i] = i * 10;
            longSourceTable[i] = i * 10;
        }

        Vector256<int> indexi;
        Vector256<long> indexl;
        Vector128<int> indexi128;
        fixed (int* iptr = intIndexTable)
        fixed (long* lptr = longIndexTable)
        fixed (int* i128ptr = vector128intIndexTable)
        {
            indexi = Avx.LoadVector256(iptr);
            indexl = Avx.LoadVector256(lptr);
            indexi128 = Sse2.LoadVector128(i128ptr);
        }

        Vector256<int> maski;
        Vector256<uint> maskui;
        Vector256<long> maskl;
        Vector256<ulong> maskul;
        Vector256<float> maskf;
        Vector256<double> maskd;
        fixed (int* iptr = intMaskTable)
        fixed (long* lptr = longMaskTable)
        {
            maski = Avx.LoadVector256(iptr);
            maskl = Avx.LoadVector256(lptr);
            maskui = maski.AsUInt32();
            maskul = maskl.AsUInt64();
            maskf = maski.AsSingle();
            maskd = maskl.AsDouble();
        }

        Vector256<int> sourcei = Vector256<int>.Zero;
        Vector256<uint> sourceui = Vector256<uint>.Zero;
        Vector256<long> sourcel = Vector256<long>.Zero;
        Vector256<ulong> sourceul = Vector256<ulong>.Zero;
        Vector256<float> sourcef = Vector256<float>.Zero;
        Vector256<double> sourced = Vector256<double>.Zero;

        // public static unsafe Vector256<float> GatherMaskVector256(Vector256<float> source, float* baseAddress, Vector256<int> index, Vector256<float> mask, byte scale)
        using (TestTable<float, int> floatTable = new TestTable<float, int>(floatSourceTable, new float[8]))
        {
            var vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, 4);
            Unsafe.Write(floatTable.outArrayPtr, vf);
            if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on float:");
                foreach (var item in floatTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<float>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<float>), typeof(float*), typeof(Vector256<int>), typeof(Vector256<float>), typeof(byte) })
                .Invoke(null, new object[] { sourcef, Pointer.Box(floatTable.inArrayPtr, typeof(float*)), indexi, maskf, (byte)4 });
            Unsafe.Write(floatTable.outArrayPtr, vf);
            if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on float:");
                foreach (var item in floatTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on float with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, Four);
            Unsafe.Write(floatTable.outArrayPtr, vf);
            if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on float with non-const scale (IMM):");
                foreach (var item in floatTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on float with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<double> GatherMaskVector256(Vector256<double> source, double* baseAddress, Vector128<int> index, Vector256<double> mask, byte scale)
        using (TestTable<double, int> doubletTable = new TestTable<double, int>(doubleSourceTable, new double[4]))
        {
            var vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, 8);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double:");
                foreach (var item in doubletTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vd = (Vector256<double>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<double>), typeof(double*), typeof(Vector128<int>), typeof(Vector256<double>), typeof(byte) })
                .Invoke(null, new object[] { sourced, Pointer.Box(doubletTable.inArrayPtr, typeof(double*)), indexi128, maskd, (byte)8 });
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on double:");
                foreach (var item in doubletTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, Eight);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with non-const scale (IMM):");
                foreach (var item in doubletTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<int> GatherMaskVector256(Vector256<int> source, int* baseAddress, Vector256<int> index, Vector256<int> mask, byte scale)
        using (TestTable<int, int> intTable = new TestTable<int, int>(intSourceTable, new int[8]))
        {
            var vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, 4);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on int:");
                foreach (var item in intTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<int>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<int>), typeof(int*), typeof(Vector256<int>), typeof(Vector256<int>), typeof(byte) })
                .Invoke(null, new object[] { sourcei, Pointer.Box(intTable.inArrayPtr, typeof(int*)), indexi, maski, (byte)4 });
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on int:");
                foreach (var item in intTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on int with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, Four);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on int with non-const scale (IMM):");
                foreach (var item in intTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on int with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<uint> GatherMaskVector256(Vector256<uint> source, uint* baseAddress, Vector256<int> index, Vector256<uint> mask, byte scale)
        using (TestTable<int, int> intTable = new TestTable<int, int>(intSourceTable, new int[8]))
        {
            var vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, 4);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on uint:");
                foreach (var item in intTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<uint>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<uint>), typeof(uint*), typeof(Vector256<int>), typeof(Vector256<uint>), typeof(byte) })
                .Invoke(null, new object[] { sourceui, Pointer.Box(intTable.inArrayPtr, typeof(uint*)), indexi, maskui, (byte)4 });
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on uint:");
                foreach (var item in intTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, Four);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with non-const scale (IMM):");
                foreach (var item in intTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<long> GatherMaskVector256(Vector256<long> source, long* baseAddress, Vector128<int> index, Vector256<long> mask, byte scale)
        using (TestTable<long, int> longTable = new TestTable<long, int>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<long>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<long>), typeof(long*), typeof(Vector128<int>), typeof(Vector256<long>), typeof(byte) })
                .Invoke(null, new object[] { sourcel, Pointer.Box(longTable.inArrayPtr, typeof(long*)), indexi128, maskl, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on long:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with non-const scale (IMM):");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<ulong> GatherMaskVector256(Vector256<ulong> source, ulong* baseAddress, Vector128<int> index, Vector256<ulong> mask, byte scale)
        using (TestTable<long, int> longTable = new TestTable<long, int>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<ulong>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<ulong>), typeof(ulong*), typeof(Vector128<int>), typeof(Vector256<ulong>), typeof(byte) })
                .Invoke(null, new object[] { sourceul, Pointer.Box(longTable.inArrayPtr, typeof(ulong*)), indexi128, maskul, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on ulong:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with non-const scale (IMM):");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<long> GatherMaskVector256(Vector256<long> source, long* baseAddress, Vector256<long> index, Vector256<long> mask, byte scale)
        using (TestTable<long, long> longTable = new TestTable<long, long>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with Vector256 long index:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<long>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<long>), typeof(long*), typeof(Vector256<long>), typeof(Vector256<long>), typeof(byte) })
                .Invoke(null, new object[] { sourcel, Pointer.Box(longTable.inArrayPtr, typeof(long*)), indexl, maskl, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on long with Vector256 long index:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with non-const scale (IMM) and Vector256 long index:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<ulong> GatherMaskVector256(Vector256<ulong> source, ulong* baseAddress, Vector256<long> index, Vector256<ulong> mask, byte scale)
        using (TestTable<long, long> longTable = new TestTable<long, long>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with Vector256 long index:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vf = (Vector256<ulong>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<ulong>), typeof(ulong*), typeof(Vector256<long>), typeof(Vector256<ulong>), typeof(byte) })
                .Invoke(null, new object[] { sourceul, Pointer.Box(longTable.inArrayPtr, typeof(ulong*)), indexl, maskul, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on ulong with Vector256 long index:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with non-const scale (IMM) and Vector256 long index:");
                foreach (var item in longTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid non-const scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<double> GatherMaskVector256(Vector256<double> source, double* baseAddress, Vector256<long> index, Vector256<double> mask, byte scale)
        using (TestTable<double, long> doubletTable = new TestTable<double, long>(doubleSourceTable, new double[4]))
        {
            var vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, 8);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with Vector256 long index:");
                foreach (var item in doubletTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            vd = (Vector256<double>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256),
                    new Type[] { typeof(Vector256<double>), typeof(double*), typeof(Vector256<long>), typeof(Vector256<double>), typeof(byte) })
                .Invoke(null, new object[] { sourced, Pointer.Box(doubletTable.inArrayPtr, typeof(double*)), indexl, maskd, (byte)8 });
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on double with Vector256 long index:");
                foreach (var item in doubletTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, Eight);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
            {
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with non-const scale (IMM) and Vector256 long index:");
                foreach (var item in doubletTable.outArray) Console.Write(item + ", ");
                Console.WriteLine();
                testResult = Fail;
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid non-const scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }
    }

    return testResult;
}
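// Semantics exercised by all of the cases above (the vpgatherdd/vpgatherqq
// family): per element, the result is *(baseAddress + index[i] * scale) where
// the mask element's sign bit is set, and source[i] where it is clear; scale
// must be 1, 2, 4 or 8, which is why the constant 3 and the non-constant 15
// are both expected to throw ArgumentOutOfRangeException.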
// This function implements Algorithm 2 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf // Calculate the stochastic gradient and update the model. public static unsafe void CalculateGradientAndUpdate(int *fieldIndices, int *featureIndices, float *featureValues, float *latentSum, float *linearWeights, float *latentWeights, float *linearAccumulatedSquaredGrads, float *latentAccumulatedSquaredGrads, float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight, int count, float slope) { Contracts.Assert(Avx.IsSupported); int m = fieldCount; int d = latentDim; int c = count; int * pf = fieldIndices; int * pi = featureIndices; float *px = featureValues; float *pq = latentSum; float *pw = linearWeights; float *pv = latentWeights; float *phw = linearAccumulatedSquaredGrads; float *phv = latentAccumulatedSquaredGrads; Vector256 <float> wei = Vector256.Create(weight); Vector256 <float> s = Vector256.Create(slope); Vector256 <float> lr = Vector256.Create(learningRate); Vector256 <float> lambdav = Vector256.Create(lambdaLatent); for (int i = 0; i < count; i++) { int f = pf[i]; int j = pi[i]; // Calculate gradient of linear term w_j. float g = weight * (lambdaLinear * pw[j] + slope * px[i]); // Accumulate the gradient of the linear term. phw[j] += g * g; // Perform ADAGRAD update rule to adjust linear term. pw[j] -= learningRate / MathF.Sqrt(phw[j]) * g; // Update latent term, v_j,f', f'=1,...,m. Vector256 <float> x = Avx.BroadcastScalarToVector256(px + i); for (int fprime = 0; fprime < m; fprime++) { float * vjfprime = pv + j * m * d + fprime * d; float * hvjfprime = phv + j * m * d + fprime * d; float * qfprimef = pq + fprime * m * d + f * d; Vector256 <float> sx = Avx.Multiply(s, x); for (int k = 0; k + 8 <= d; k += 8) { Vector256 <float> v = Avx.LoadVector256(vjfprime + k); Vector256 <float> q = Avx.LoadVector256(qfprimef + k); // Calculate L2-norm regularization's gradient. Vector256 <float> gLatent = Avx.Multiply(lambdav, v); Vector256 <float> tmp = q; // Calculate loss function's gradient. if (fprime == f) { tmp = MultiplyAddNegated(v, x, q); } gLatent = MultiplyAdd(sx, tmp, gLatent); gLatent = Avx.Multiply(wei, gLatent); // Accumulate the gradient of latent vectors. Vector256 <float> h = MultiplyAdd(gLatent, gLatent, Avx.LoadVector256(hvjfprime + k)); // Perform ADAGRAD update rule to adjust latent vector. v = MultiplyAddNegated(lr, Avx.Multiply(Avx.ReciprocalSqrt(h), gLatent), v); Avx.Store(vjfprime + k, v); Avx.Store(hvjfprime + k, h); } } } }
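CalculateGradientAndUpdate (and CalculateIntermediateVariables below) call MultiplyAdd and MultiplyAddNegated helpers that this excerpt does not define. A plausible sketch, assuming the intent is a fused FMA when the hardware has it and a two-instruction AVX fallback otherwise:

using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class FmaHelpers
{
    // a * b + c
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<float> MultiplyAdd(Vector256<float> a, Vector256<float> b, Vector256<float> c)
        => Fma.IsSupported ? Fma.MultiplyAdd(a, b, c) : Avx.Add(Avx.Multiply(a, b), c);

    // c - a * b (the vfnmadd form: negate the product, then add)
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<float> MultiplyAddNegated(Vector256<float> a, Vector256<float> b, Vector256<float> c)
        => Fma.IsSupported ? Fma.MultiplyAddNegated(a, b, c) : Avx.Subtract(c, Avx.Multiply(a, b));
}

Note the fallback is not bit-identical to FMA (the fused form skips the intermediate rounding), which is usually acceptable for gradient work but worth knowing when comparing results across machines.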
// This function implements Algorithm 1 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf. // Compute the output value of the field-aware factorization, as the sum of the linear part and the latent part. // The linear part is the inner product of linearWeights and featureValues. // The latent part is the sum of all intra-field interactions in each field f, over all possible fields. public static unsafe void CalculateIntermediateVariables(int *fieldIndices, int *featureIndices, float *featureValues, float *linearWeights, float *latentWeights, float *latentSum, float *response, int fieldCount, int latentDim, int count) { Contracts.Assert(Avx.IsSupported); // The number of all possible fields. int m = fieldCount; int d = latentDim; int c = count; int * pf = fieldIndices; int * pi = featureIndices; float *px = featureValues; float *pw = linearWeights; float *pv = latentWeights; float *pq = latentSum; float linearResponse = 0; float latentResponse = 0; Unsafe.InitBlock(pq, 0, (uint)(m * m * d * sizeof(float))); Vector256 <float> y = Vector256 <float> .Zero; Vector256 <float> tmp = Vector256 <float> .Zero; for (int i = 0; i < c; i++) { int f = pf[i]; int j = pi[i]; linearResponse += pw[j] * px[i]; Vector256 <float> x = Avx.BroadcastScalarToVector256(px + i); Vector256 <float> xx = Avx.Multiply(x, x); // tmp -= <v_j,f, v_j,f> * x * x int vBias = j * m * d + f * d; // j-th feature's latent vector in the f-th field hidden space. float *vjf = pv + vBias; for (int k = 0; k + 8 <= d; k += 8) { Vector256 <float> vjfBuffer = Avx.LoadVector256(vjf + k); tmp = MultiplyAddNegated(Avx.Multiply(vjfBuffer, vjfBuffer), xx, tmp); } for (int fprime = 0; fprime < m; fprime++) { vBias = j * m * d + fprime * d; int qBias = f * m * d + fprime * d; float *vjfprime = pv + vBias; float *qffprime = pq + qBias; // q_f,f' += v_j,f' * x for (int k = 0; k + 8 <= d; k += 8) { Vector256 <float> vjfprimeBuffer = Avx.LoadVector256(vjfprime + k); Vector256 <float> q = Avx.LoadVector256(qffprime + k); q = MultiplyAdd(vjfprimeBuffer, x, q); Avx.Store(qffprime + k, q); } } } for (int f = 0; f < m; f++) { // tmp += <q_f,f, q_f,f> float *qff = pq + f * m * d + f * d; for (int k = 0; k + 8 <= d; k += 8) { Vector256 <float> qffBuffer = Avx.LoadVector256(qff + k); // Intra-field interactions. tmp = MultiplyAdd(qffBuffer, qffBuffer, tmp); } // y += <q_f,f', q_f',f>, f != f' // This loop handles inter-field interactions because f != f'. for (int fprime = f + 1; fprime < m; fprime++) { float *qffprime = pq + f * m * d + fprime * d; float *qfprimef = pq + fprime * m * d + f * d; for (int k = 0; k + 8 <= d; k += 8) { // Inter-field interaction. Vector256 <float> qffprimeBuffer = Avx.LoadVector256(qffprime + k); Vector256 <float> qfprimefBuffer = Avx.LoadVector256(qfprimef + k); y = MultiplyAdd(qffprimeBuffer, qfprimefBuffer, y); } } } y = MultiplyAdd(_point5, tmp, y); tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1)); tmp = Avx.HorizontalAdd(tmp, tmp); y = Avx.HorizontalAdd(tmp, tmp); Sse.StoreScalar(&latentResponse, y.GetLower()); // The lowest slot is the response value. *response = linearResponse + latentResponse; }
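Written out, the quantity those loops accumulate (my reading of the code, with $q_{f,f'} = \sum_{i:\, f_i = f} v_{j_i,f'}\, x_i$) is

$$y_{\text{latent}} = \sum_{f < f'} \langle q_{f,f'},\, q_{f',f} \rangle + \frac{1}{2}\Big(\sum_{f} \lVert q_{f,f} \rVert^2 - \sum_{i} x_i^2\, \lVert v_{j_i,f_i} \rVert^2\Big),$$

where the halved term strips the self-interactions and the double counting inside a single field; _point5 is evidently a broadcast 0.5f constant. The closing Permute2x128/HorizontalAdd sequence is an 8-lane horizontal reduction down to the scalar stored into latentResponse.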
public unsafe void Vector256FloatMultipleOpsUnsafe() { fixed(float *d1Ptr = &data[0]) { fixed(float *d2Ptr = &data2[0]) { fixed(float *d3Ptr = &data3[0]) { fixed(float *resPtr = &result[0]) { float *currD1 = d1Ptr; float *currD2 = d2Ptr; float *currD3 = d3Ptr; float *currRes = resPtr; float *limitPtr = d1Ptr + numberOfFloatItems; while (currD1 < limitPtr) { Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currD1), Avx.LoadVector256(currD2), Avx.LoadVector256(currD3))); Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currRes), Avx.LoadVector256(currD1), Avx.LoadVector256(currD1))); Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currD1), Avx.LoadVector256(currD2), Avx.LoadVector256(currRes))); currD1 += 8; currD2 += 8; currD3 += 8; currRes += 8; } } } } } }
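This benchmark strides by 8 and assumes numberOfFloatItems is a multiple of the vector width; if that invariant were ever relaxed, a scalar tail replaying the same three chained FMAs would be needed. A sketch under that assumption (field names as in the method above; MathF.FusedMultiplyAdd is the scalar counterpart of Fma.MultiplyAdd):

// Hypothetical remainder handling if numberOfFloatItems stops being a multiple of 8.
for (int i = numberOfFloatItems - (numberOfFloatItems % 8); i < numberOfFloatItems; i++)
{
    float r = MathF.FusedMultiplyAdd(data[i], data2[i], data3[i]);
    r = MathF.FusedMultiplyAdd(r, data[i], data[i]);
    result[i] = MathF.FusedMultiplyAdd(data[i], data2[i], r);
}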
public static unsafe bool TryGetAsciiString(byte *input, char *output, int count) { Debug.Assert(input != null); Debug.Assert(output != null); var end = input + count; Debug.Assert((long)end >= Vector256 <sbyte> .Count); if (Sse2.IsSupported) { if (Avx2.IsSupported && input <= end - Vector256 <sbyte> .Count) { Vector256 <sbyte> zero = Vector256 <sbyte> .Zero; do { var vector = Avx.LoadVector256(input).AsSByte(); if (!CheckBytesInAsciiRange(vector, zero)) { return(false); } var tmp0 = Avx2.UnpackLow(vector, zero); var tmp1 = Avx2.UnpackHigh(vector, zero); // Bring into the right order var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20); var out1 = Avx2.Permute2x128(tmp0, tmp1, 0x31); Avx.Store((ushort *)output, out0.AsUInt16()); Avx.Store((ushort *)output + Vector256 <ushort> .Count, out1.AsUInt16()); input += Vector256 <sbyte> .Count; output += Vector256 <sbyte> .Count; } while (input <= end - Vector256 <sbyte> .Count); if (input == end) { return(true); } } if (input <= end - Vector128 <sbyte> .Count) { Vector128 <sbyte> zero = Vector128 <sbyte> .Zero; do { var vector = Sse2.LoadVector128(input).AsSByte(); if (!CheckBytesInAsciiRange(vector, zero)) { return(false); } var c0 = Sse2.UnpackLow(vector, zero).AsUInt16(); var c1 = Sse2.UnpackHigh(vector, zero).AsUInt16(); Sse2.Store((ushort *)output, c0); Sse2.Store((ushort *)output + Vector128 <ushort> .Count, c1); input += Vector128 <sbyte> .Count; output += Vector128 <sbyte> .Count; } while (input <= end - Vector128 <sbyte> .Count); if (input == end) { return(true); } } } else if (Vector.IsHardwareAccelerated) { while (input <= end - Vector <sbyte> .Count) { var vector = Unsafe.AsRef <Vector <sbyte> >(input); if (!CheckBytesInAsciiRange(vector)) { return(false); } Vector.Widen( vector, out Unsafe.AsRef <Vector <short> >(output), out Unsafe.AsRef <Vector <short> >(output + Vector <short> .Count)); input += Vector <sbyte> .Count; output += Vector <sbyte> .Count; } if (input == end) { return(true); } } if (Environment.Is64BitProcess) // Use Intrinsic switch for branch elimination { // 64-bit: Loop longs by default while (input <= end - sizeof(long)) { var value = *(long *)input; if (!CheckBytesInAsciiRange(value)) { return(false); } if (Bmi2.X64.IsSupported) { // BMI2 will work regardless of the processor's endianness. ((ulong *)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul); ((ulong *)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul); } else { output[0] = (char)input[0]; output[1] = (char)input[1]; output[2] = (char)input[2]; output[3] = (char)input[3]; output[4] = (char)input[4]; output[5] = (char)input[5]; output[6] = (char)input[6]; output[7] = (char)input[7]; } input += sizeof(long); output += sizeof(long); } if (input <= end - sizeof(int)) { var value = *(int *)input; if (!CheckBytesInAsciiRange(value)) { return(false); } if (Bmi2.IsSupported) { // BMI2 will work regardless of the processor's endianness. ((uint *)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu); ((uint *)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu); } else { output[0] = (char)input[0]; output[1] = (char)input[1]; output[2] = (char)input[2]; output[3] = (char)input[3]; } input += sizeof(int); output += sizeof(int); } } else { // 32-bit: Loop ints by default while (input <= end - sizeof(int)) { var value = *(int *)input; if (!CheckBytesInAsciiRange(value)) { return(false); } if (Bmi2.IsSupported) { // BMI2 will work regardless of the processor's endianness. 
((uint *)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu); ((uint *)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu); } else { output[0] = (char)input[0]; output[1] = (char)input[1]; output[2] = (char)input[2]; output[3] = (char)input[3]; } input += sizeof(int); output += sizeof(int); } } if (input <= end - sizeof(short)) { if (!CheckBytesInAsciiRange(((short *)input)[0])) { return(false); } output[0] = (char)input[0]; output[1] = (char)input[1]; input += sizeof(short); output += sizeof(short); } if (input < end) { if (!CheckBytesInAsciiRange(((sbyte *)input)[0])) { return(false); } output[0] = (char)input[0]; } return(true); }
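The ParallelBitDeposit half of TryGetAsciiString deserves a closer look: PDEP scatters the source bits into the set bits of the mask from low to high, so 0x00FF00FF drops input byte 0 into the low char and input byte 1 into the next one, widening two ASCII bytes per call with no shuffle constants. A scalar illustration (the byte values are made up):

if (Bmi2.IsSupported)
{
    uint packed = 0x44434241;                                      // "ABCD" as little-endian bytes
    uint lo = Bmi2.ParallelBitDeposit(packed, 0x00FF00FFu);        // 0x00420041 => chars 'A','B'
    uint hi = Bmi2.ParallelBitDeposit(packed >> 16, 0x00FF00FFu);  // 0x00440043 => chars 'C','D'
}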
unsafe private static void mixAvx2(Blake2bContext *s, ulong *m) { var row1 = Avx.LoadVector256(s->h); var row2 = Avx.LoadVector256(s->h + 4); var row3 = v256iv0; var row4 = v256iv1; row4 = Avx2.Xor(row4, Avx.LoadVector256(s->t)); // reads into f[] as well //ROUND 1 var m0 = Avx2.BroadcastVector128ToVector256(m); var m1 = Avx2.BroadcastVector128ToVector256(m + 2); var m2 = Avx2.BroadcastVector128ToVector256(m + 4); var m3 = Avx2.BroadcastVector128ToVector256(m + 6); var r24 = v256rm0; var r16 = v256rm1; var t0 = Avx2.UnpackLow(m0, m1); var t1 = Avx2.UnpackLow(m2, m3); var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m0, m1); t1 = Avx2.UnpackHigh(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); var m4 = Avx2.BroadcastVector128ToVector256(m + 8); var m5 = Avx2.BroadcastVector128ToVector256(m + 10); var m6 = Avx2.BroadcastVector128ToVector256(m + 12); var m7 = Avx2.BroadcastVector128ToVector256(m + 14); t0 = Avx2.UnpackLow(m4, m5); t1 = Avx2.UnpackLow(m6, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m5); t1 = Avx2.UnpackHigh(m6, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 2 t0 = Avx2.UnpackLow(m7, m2); t1 = Avx2.UnpackHigh(m4, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.AlignRight(m3, m7, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = 
Avx2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64(); t1 = Avx2.UnpackHigh(m5, m2); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m6, m1); t1 = Avx2.UnpackHigh(m3, m1); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 3 t0 = Avx2.AlignRight(m6, m5, 8); t1 = Avx2.UnpackHigh(m2, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m4, m0); t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.Blend(m5.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackHigh(m3, m4); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m7, m3); t1 = Avx2.AlignRight(m2, m0, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 4 t0 = Avx2.UnpackHigh(m3, m1); t1 = Avx2.UnpackHigh(m6, m5); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m0); t1 = Avx2.UnpackLow(m6, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = 
Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m3, m5); t1 = Avx2.UnpackLow(m0, m4); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 5 t0 = Avx2.UnpackHigh(m4, m2); t1 = Avx2.UnpackLow(m1, m5); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.Blend(m7.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.Blend(m3.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m6, m0, 8); t1 = Avx2.Blend(m4.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 6 t0 = Avx2.UnpackLow(m1, m3); t1 = Avx2.UnpackLow(m0, m4); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m6, m5); t1 = Avx2.UnpackHigh(m5, m1); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.Blend(m2.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackHigh(m7, m0); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m6, m2); t1 = Avx2.Blend(m7.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 7 t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackLow(m7, m2); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m2, m7); t1 = Avx2.AlignRight(m5, m6, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.UnpackLow(m0, m3); t1 = Avx2.Shuffle(m4.AsUInt32(), 0b_01_00_11_10).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m3, m1); t1 = Avx2.Blend(m1.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); 
row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 8 t0 = Avx2.UnpackHigh(m6, m3); t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m7, m5, 8); t1 = Avx2.UnpackHigh(m0, m4); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.UnpackHigh(m2, m7); t1 = Avx2.UnpackLow(m4, m1); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m0, m2); t1 = Avx2.UnpackLow(m3, m5); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 9 t0 = Avx2.UnpackLow(m3, m7); t1 = Avx2.AlignRight(m0, m5, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.AlignRight(m4, m1, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = m6; t1 = Avx2.AlignRight(m5, m0, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.Blend(m1.AsUInt32(), 
m3.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = m2; b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 10 t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.UnpackHigh(m3, m0); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m1, m2); t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.UnpackHigh(m1, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m7, m5, 8); t1 = Avx2.UnpackLow(m6, m0); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 11 t0 = Avx2.UnpackLow(m0, m1); t1 = Avx2.UnpackLow(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m0, m1); t1 = Avx2.UnpackHigh(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.UnpackLow(m4, m5); t1 = Avx2.UnpackLow(m6, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = 
Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m5); t1 = Avx2.UnpackHigh(m6, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); //ROUND 12 t0 = Avx2.UnpackLow(m7, m2); t1 = Avx2.UnpackHigh(m4, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.AlignRight(m3, m7, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01); t0 = Avx2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64(); t1 = Avx2.UnpackHigh(m5, m2); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m6, m1); t1 = Avx2.UnpackHigh(m3, m1); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01); row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10); row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11); row1 = Avx2.Xor(row1, row3); row2 = Avx2.Xor(row2, row4); row1 = Avx2.Xor(row1, Avx2.LoadVector256(s->h)); row2 = Avx2.Xor(row2, Avx2.LoadVector256(s->h + 4)); Avx2.Store(s->h, row1); Avx2.Store(s->h + 4, row2); }
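Every rotation in this kernel is synthesized, since AVX2 has no 64-bit rotate: rotr32 is an epi32 shuffle, rotr24 and rotr16 are byte shuffles through the r24/r16 masks, and rotr63 is the Xor(ShiftRightLogical(row2, 63), Add(row2, row2)) pattern that closes each G step. The last identity holds because x + x is x << 1 modulo 2^64 and the two shifted halves occupy disjoint bits, so XOR, OR, and ADD all agree. A scalar check of the equivalence:

static ulong RotR63(ulong x) => (x >> 63) | (x << 1);         // conventional rotate-right by 63
static ulong RotR63Avx2Style(ulong x) => (x >> 63) ^ (x + x); // the form used in mixAvx2 above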
public void Serialize(ref MessagePackWriter writer, bool[]? value, MessagePackSerializerOptions options) { if (value == null) { writer.WriteNil(); return; } var inputLength = value.Length; writer.WriteArrayHeader(inputLength); if (inputLength == 0) { return; } var outputLength = inputLength; fixed(bool *pSource = &value[0]) { var inputEnd = pSource + inputLength; var inputIterator = pSource; var destination = writer.GetSpan(inputLength); fixed(byte *pDestination = &destination[0]) { var outputIterator = pDestination; if (Avx2.IsSupported) { const int ShiftCount = 5; const int Stride = 1 << ShiftCount; if (inputLength < Stride << 1) { goto ProcessEach; } { // make output span align 32 var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign32(outputIterator); inputLength -= offset; var offsetEnd = inputIterator + offset; while (inputIterator != offsetEnd) { *outputIterator++ = *inputIterator++ ? MessagePackCode.True : MessagePackCode.False; } } var vectorTrue = Vector256.Create(MessagePackCode.True).AsSByte(); var vectorLoopLength = (inputLength >> ShiftCount) << ShiftCount; for (var vectorizedEnd = inputIterator + vectorLoopLength; inputIterator != vectorizedEnd; inputIterator += Stride, outputIterator += Stride) { // Load 32 bool values. var current = Avx.LoadVector256((sbyte *)inputIterator); // A value of false for the type bool is 0 for the sbyte representation, so comparing against zero flags the false lanes. var isFalse = Avx2.CompareEqual(current, Vector256 <sbyte> .Zero); // A comparison that holds is -1 for the sbyte representation. // True is 0xc3 as MessagePackCode and false is 0xc2. // Reinterpreted as sbyte values, they are -61 and -62, respectively. // Adding -1 in the false lanes turns the true code into the false code; the 32 true lanes keep 0xc3. var answer = Avx2.Add(vectorTrue, isFalse); Avx.Store((sbyte *)outputIterator, answer); } } else if (Sse2.IsSupported) { // for older x86 cpu const int ShiftCount = 4; const int Stride = 1 << ShiftCount; if (inputLength < Stride << 1) { goto ProcessEach; } { // make output span align 16 var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(outputIterator); inputLength -= offset; var offsetEnd = inputIterator + offset; while (inputIterator != offsetEnd) { *outputIterator++ = *inputIterator++ ? MessagePackCode.True : MessagePackCode.False; } } var vectorTrue = Vector128.Create(MessagePackCode.True).AsSByte(); var vectorLoopLength = (inputLength >> ShiftCount) << ShiftCount; for (var vectorizedEnd = inputIterator + vectorLoopLength; inputIterator != vectorizedEnd; inputIterator += Stride, outputIterator += Stride) { // Load 16 bool values. var current = Sse2.LoadVector128((sbyte *)inputIterator); // A value of false for the type bool is 0 for the sbyte representation, so comparing against zero flags the false lanes. var isFalse = Sse2.CompareEqual(current, Vector128 <sbyte> .Zero); // A comparison that holds is -1 for the sbyte representation. // True is 0xc3 as MessagePackCode and false is 0xc2. // Reinterpreted as sbyte values, they are -61 and -62, respectively. // Adding -1 in the false lanes turns the true code into the false code; the 16 true lanes keep 0xc3. var answer = Sse2.Add(vectorTrue, isFalse); Sse2.Store((sbyte *)outputIterator, answer); } } ProcessEach: while (inputIterator != inputEnd) { *outputIterator++ = *inputIterator++ ? MessagePackCode.True : MessagePackCode.False; } } writer.Advance(outputLength); } }
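The vector body above is the branchless form of a one-byte mapping: MessagePackCode.True is 0xc3 and False is 0xc2, and the compare-against-zero produces -1 exactly in the false lanes, so one add fixes up 32 (or 16) codes at a time. The scalar equivalent, as a sanity check (the method name is mine; the constants are from the snippet):

static byte BoolToMessagePackCode(bool b)
{
    // CompareEqual(x, 0) yields -1 (all bits set) precisely when the bool byte is 0 (false).
    int isFalse = b ? 0 : -1;
    return (byte)(0xc3 + isFalse); // 0xc3 = True, 0xc2 = False
}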
public static float DotMultiplyIntrinsicWFmaWSpanPtr(ref Memory <float> vector1, ref Memory <float> vector2) { var span1 = vector1.Span; var span2 = vector2.Span; var cnt = Math.Min(span1.Length, span2.Length); var v3 = Vector256 <float> .Zero; /* CreateScalarUnsafe(0f) would leave the upper seven lanes undefined, and every lane is summed below */ var vectLen = Vector256 <float> .Count; var vectCnt = cnt / vectLen; var total = 0f; #if TEST var file = Path.GetTempFileName(); using var writer = new StreamWriter(file); Console.WriteLine($"Intrinsic with FmaWPtr Mult. results will be written into {file}"); #endif unsafe { int i; var ptr1 = (float *)Unsafe.AsPointer(ref span1[0]); var ptr2 = (float *)Unsafe.AsPointer(ref span2[0]); for (i = 0; i < vectCnt; i++) { var v1 = Avx.LoadVector256(ptr1); var v2 = Avx.LoadVector256(ptr2); v3 = Fma.MultiplyAdd(v1, v2, v3); ptr1 += vectLen; ptr2 += vectLen; #if TEST writer.WriteLine($"{v1.ToString()}\t{v2.ToString()}\t{v3.ToString()}"); #endif } for (i = 0; i < vectLen; i++) { total += v3.GetElement(i); } i = vectCnt * vectLen; if (cnt % vectLen > 0) { ptr1 = (float *)Unsafe.AsPointer(ref span1[i]); ptr2 = (float *)Unsafe.AsPointer(ref span2[i]); for (; i < cnt; i++) { total += *ptr1++ * *ptr2++; } } } if (vector1.Length != vector2.Length) { var h = vector1.Length > vector2.Length ? span1 : span2; for (var j = cnt; j < h.Length; j++) { total += h[j]; } } return(total); }
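A usage sketch for the dot product above (the array contents are arbitrary). The overlap of the two inputs is a true dot product; note, though, that when the lengths differ the tail of the longer vector is added in unweighted, so callers relying on strict dot-product semantics should pass equal-length inputs:

var m1 = new Memory<float>(new float[] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f });
var m2 = new Memory<float>(new float[] { 9f, 8f, 7f, 6f, 5f, 4f, 3f, 2f, 1f });
float dot = DotMultiplyIntrinsicWFmaWSpanPtr(ref m1, ref m2); // 165: eight lanes via FMA, one scalar remainder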
private static void mixAvx2(ulong *sh, ulong *m) { // Rotate shuffle masks. We can safely convert the ref to a pointer because the compiler guarantees the // data is in a fixed location, and the ref itself is converted from a pointer. Same for the IV below. byte *prm = (byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(rormask)); var r24 = Avx2.BroadcastVector128ToVector256(prm); var r16 = Avx2.BroadcastVector128ToVector256(prm + Vector128 <byte> .Count); var row1 = Avx.LoadVector256(sh); var row2 = Avx.LoadVector256(sh + Vector256 <ulong> .Count); ulong *piv = (ulong *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(ivle)); var row3 = Avx.LoadVector256(piv); var row4 = Avx.LoadVector256(piv + Vector256 <ulong> .Count); row4 = Avx2.Xor(row4, Avx.LoadVector256(sh + Vector256 <ulong> .Count * 2)); // t[] and f[] //ROUND 1 var m0 = Avx2.BroadcastVector128ToVector256(m); var m1 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count); var m2 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 2); var m3 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 3); var t0 = Avx2.UnpackLow(m0, m1); var t1 = Avx2.UnpackLow(m2, m3); var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m0, m1); t1 = Avx2.UnpackHigh(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); var m4 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 4); var m5 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 5); var m6 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 6); var m7 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 7); t0 = Avx2.UnpackLow(m7, m4); t1 = Avx2.UnpackLow(m5, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.UnpackHigh(m5, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 2 t0 = Avx2.UnpackLow(m7, m2); t1 = Avx2.UnpackHigh(m4, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = 
Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.AlignRight(m3, m7, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.UnpackHigh(m2, m0); t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m6, m1, 8); t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 3 t0 = Avx2.AlignRight(m6, m5, 8); t1 = Avx2.UnpackHigh(m2, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m4, m0); t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.AlignRight(m5, m4, 8); t1 = Avx2.UnpackHigh(m1, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m2, m7); t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); 
//UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 4 t0 = Avx2.UnpackHigh(m3, m1); t1 = Avx2.UnpackHigh(m6, m5); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m0); t1 = Avx2.UnpackLow(m6, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.AlignRight(m1, m7, 8); t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m4, m3); t1 = Avx2.UnpackLow(m5, m0); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 5 t0 = Avx2.UnpackHigh(m4, m2); t1 = Avx2.UnpackLow(m1, m5); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.AlignRight(m7, m1, 8); t1 = Avx2.AlignRight(m3, m5, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m6, m0); t1 = Avx2.UnpackLow(m6, m4); b0 = 
Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 6 t0 = Avx2.UnpackLow(m1, m3); t1 = Avx2.UnpackLow(m0, m4); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m6, m5); t1 = Avx2.UnpackHigh(m5, m1); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.AlignRight(m2, m0, 8); t1 = Avx2.UnpackHigh(m3, m7); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m6); t1 = Avx2.AlignRight(m7, m2, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 7 t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackLow(m7, m2); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m2, m7); t1 = Avx2.AlignRight(m5, m6, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.UnpackLow(m4, m0); t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, 
b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m5, m3); t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 8 t0 = Avx2.UnpackHigh(m6, m3); t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m7, m5, 8); t1 = Avx2.UnpackHigh(m0, m4); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.AlignRight(m4, m7, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m5, m0); t1 = Avx2.UnpackLow(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 9 t0 = Avx2.UnpackLow(m3, m7); t1 = Avx2.AlignRight(m0, m5, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.AlignRight(m4, m1, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); 
//DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.UnpackLow(m5, m6); t1 = Avx2.UnpackHigh(m6, m0); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m1, m2, 8); t1 = Avx2.AlignRight(m2, m3, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 10 t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.UnpackHigh(m3, m0); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m1, m2); t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.UnpackHigh(m6, m7); t1 = Avx2.UnpackHigh(m4, m1); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackLow(m7, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 11 t0 = Avx2.UnpackLow(m0, m1); t1 = Avx2.UnpackLow(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m0, m1); t1 = Avx2.UnpackHigh(m2, m3); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.UnpackLow(m7, m4); t1 = Avx2.UnpackLow(m5, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.UnpackHigh(m5, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); //ROUND 12 t0 = Avx2.UnpackLow(m7, m2); t1 = Avx2.UnpackHigh(m4, m6); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.AlignRight(m3, m7, 8); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //DIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); t0 = Avx2.UnpackHigh(m2, m0); t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G1 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); t0 = Avx2.AlignRight(m6, m1, 8); t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); //G2 row1 = Avx2.Add(Avx2.Add(row1, b0), row2); row4 = Avx2.Xor(row4, row1); row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); row3 = Avx2.Add(row3, row4); row2 = Avx2.Xor(row2, row3); row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); //UNDIAGONALIZE row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); row1 = Avx2.Xor(row1, row3); row2 = Avx2.Xor(row2, row4); row1 = Avx2.Xor(row1, Avx.LoadVector256(sh)); row2 = Avx2.Xor(row2, Avx.LoadVector256(sh + Vector256 <ulong> .Count)); Avx.Store(sh, row1); Avx.Store(sh + 
Vector256 <ulong> .Count, row2); }
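// For reference, a hedged scalar sketch (not part of the source above) of the BLAKE2b G mixing
// function that the vectorized rounds compute for four columns, then four diagonals, at a time.
// The rotation amounts map onto the vector steps: rotr 32 is the AsUInt32 shuffle, rotr 24 the
// r24 byte shuffle, rotr 16 the r16 byte shuffle, and rotr 63 the shift-right-63/add-to-self xor
// pair. x and y stand for the message words the UnpackLow/UnpackHigh/AlignRight/Blend selections
// gather into b0. Requires System.Numerics.BitOperations.
private static void G(ref ulong a, ref ulong b, ref ulong c, ref ulong d, ulong x, ulong y)
{
    // G1 half
    a = a + b + x;
    d = BitOperations.RotateRight(d ^ a, 32);
    c = c + d;
    b = BitOperations.RotateRight(b ^ c, 24);
    // G2 half
    a = a + b + y;
    d = BitOperations.RotateRight(d ^ a, 16);
    c = c + d;
    b = BitOperations.RotateRight(b ^ c, 63);
}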
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ private static unsafe void unshuffle16_tiled_avx2(byte *dest, byte *src, int vectorizable_elements, int total_elements, int bytesoftype) { int i; int j; var ymm0 = new Vector256 <byte> [16]; var ymm1 = new Vector256 <byte> [16]; var remainder = bytesoftype % sizeof(Vector128 <byte>); int vecs_rem = remainder; /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2) * to optimize cache utilization. */ int offset_into_type; for (offset_into_type = 0; offset_into_type < bytesoftype; offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int)sizeof(Vector128 <byte>))) { for (i = 0; i < vectorizable_elements; i += sizeof(Vector256 <byte>)) { /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */ byte *src_for_ith_element = src + i; for (j = 0; j < 16; j++) { ymm0[j] = Avx.LoadVector256((src_for_ith_element + (total_elements * (offset_into_type + j)))); } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = Avx2.UnpackLow(ymm0[j * 2], ymm0[j * 2 + 1]); /* Compute the hi 32 bytes */ ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2], ymm0[j * 2 + 1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte(); /* Compute the hi 32 bytes */ ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte(); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = Avx2.UnpackLow(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte(); /* Compute the hi 32 bytes */ ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte(); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte(); /* Compute the hi 32 bytes */ ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte(); } for (j = 0; j < 8; j++) { ymm1[j] = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x20); ymm1[j + 8] = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x31); } /* Store the result vectors in proper order */ byte *dest_with_offset = dest + offset_into_type; _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x01) * bytesoftype), (byte *)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x03) * bytesoftype), (byte *)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x05) * bytesoftype), (byte *)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x07) * bytesoftype), (byte *)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x09) * bytesoftype), (byte *)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x0b) * bytesoftype), (byte *)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x0d) * bytesoftype), (byte *)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x0f) * bytesoftype), (byte *)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x11) 
* bytesoftype), (byte *)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x13) * bytesoftype), (byte *)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x15) * bytesoftype), (byte *)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x17) * bytesoftype), (byte *)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x19) * bytesoftype), (byte *)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x1b) * bytesoftype), (byte *)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x1d) * bytesoftype), (byte *)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]); _mm256_storeu2_m128i( (byte *)(dest_with_offset + (i + 0x1f) * bytesoftype), (byte *)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]); } } }
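// The _mm256_storeu2_m128i helper called above has no direct counterpart in the .NET intrinsics
// surface; a minimal sketch of the usual emulation (hedged, not the source's own definition),
// matching Intel's argument order (high address, low address, value): store the low 128-bit half
// to one possibly-unaligned address and the high half to another.
private static unsafe void _mm256_storeu2_m128i(byte* hiaddr, byte* loaddr, Vector256<byte> value)
{
    Sse2.Store(loaddr, value.GetLower());               // low 16 bytes
    Sse2.Store(hiaddr, Avx.ExtractVector128(value, 1)); // high 16 bytes
}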
public static unsafe bool TryGetAsciiString(byte *input, char *output, int count) { Debug.Assert(input != null); Debug.Assert(output != null); var end = input + count; Debug.Assert((long)end >= Vector256 <sbyte> .Count); // PERF: so the JIT can reuse the zero from a register Vector128 <sbyte> zero = Vector128 <sbyte> .Zero; if (Sse2.IsSupported) { if (Avx2.IsSupported && input <= end - Vector256 <sbyte> .Count) { Vector256 <sbyte> avxZero = Vector256 <sbyte> .Zero; do { var vector = Avx.LoadVector256(input).AsSByte(); if (!CheckBytesInAsciiRange(vector, avxZero)) { return(false); } var tmp0 = Avx2.UnpackLow(vector, avxZero); var tmp1 = Avx2.UnpackHigh(vector, avxZero); // Bring into the right order var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20); var out1 = Avx2.Permute2x128(tmp0, tmp1, 0x31); Avx.Store((ushort *)output, out0.AsUInt16()); Avx.Store((ushort *)output + Vector256 <ushort> .Count, out1.AsUInt16()); input += Vector256 <sbyte> .Count; output += Vector256 <sbyte> .Count; } while (input <= end - Vector256 <sbyte> .Count); if (input == end) { return(true); } } if (input <= end - Vector128 <sbyte> .Count) { do { var vector = Sse2.LoadVector128(input).AsSByte(); if (!CheckBytesInAsciiRange(vector, zero)) { return(false); } var c0 = Sse2.UnpackLow(vector, zero).AsUInt16(); var c1 = Sse2.UnpackHigh(vector, zero).AsUInt16(); Sse2.Store((ushort *)output, c0); Sse2.Store((ushort *)output + Vector128 <ushort> .Count, c1); input += Vector128 <sbyte> .Count; output += Vector128 <sbyte> .Count; } while (input <= end - Vector128 <sbyte> .Count); if (input == end) { return(true); } } } else if (Vector.IsHardwareAccelerated) { while (input <= end - Vector <sbyte> .Count) { var vector = Unsafe.AsRef <Vector <sbyte> >(input); if (!CheckBytesInAsciiRange(vector)) { return(false); } Vector.Widen( vector, out Unsafe.AsRef <Vector <short> >(output), out Unsafe.AsRef <Vector <short> >(output + Vector <short> .Count)); input += Vector <sbyte> .Count; output += Vector <sbyte> .Count; } if (input == end) { return(true); } } if (Environment.Is64BitProcess) // Use Intrinsic switch for branch elimination { // 64-bit: Loop longs by default while (input <= end - sizeof(long)) { var value = *(long *)input; if (!CheckBytesInAsciiRange(value)) { return(false); } // BMI2 could be used, but this variant is faster on both Intel and AMD. 
if (Sse2.X64.IsSupported) { Vector128 <sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte(); Vector128 <ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); Sse2.Store((ulong *)output, vecWide); } else { output[0] = (char)input[0]; output[1] = (char)input[1]; output[2] = (char)input[2]; output[3] = (char)input[3]; output[4] = (char)input[4]; output[5] = (char)input[5]; output[6] = (char)input[6]; output[7] = (char)input[7]; } input += sizeof(long); output += sizeof(long); } if (input <= end - sizeof(int)) { var value = *(int *)input; if (!CheckBytesInAsciiRange(value)) { return(false); } WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero); input += sizeof(int); output += sizeof(int); } } else { // 32-bit: Loop ints by default while (input <= end - sizeof(int)) { var value = *(int *)input; if (!CheckBytesInAsciiRange(value)) { return(false); } WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero); input += sizeof(int); output += sizeof(int); } } if (input <= end - sizeof(short)) { if (!CheckBytesInAsciiRange(((short *)input)[0])) { return(false); } output[0] = (char)input[0]; output[1] = (char)input[1]; input += sizeof(short); output += sizeof(short); } if (input < end) { if (!CheckBytesInAsciiRange(((sbyte *)input)[0])) { return(false); } output[0] = (char)input[0]; } return(true); }
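// Hedged sketches of the CheckBytesInAsciiRange helpers that TryGetAsciiString above relies on
// (assumed shapes, not shown in this collection). Here "ASCII" means 0x01..0x7F: interpreted as
// signed bytes those are exactly the positive values, so a signed greater-than-zero compare must
// produce all-ones lanes; the scalar variant uses the classic borrow/high-bit trick.
private static bool CheckBytesInAsciiRange(Vector256<sbyte> check, Vector256<sbyte> zero)
{
    // Each compare lane is 0xFF only for bytes in 0x01..0x7F.
    return (uint)Avx2.MoveMask(Avx2.CompareGreaterThan(check, zero)) == 0xFFFF_FFFF;
}
private static bool CheckBytesInAsciiRange(long check)
{
    const long HighBits = unchecked((long)0x8080808080808080L);
    // A byte borrows in (check - 0x01 per byte) only if it was zero; OR-ing with the
    // original value keeps bytes whose high bit was already set.
    return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
}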
unsafe public int[,] IntegrateUnsafeVector() { if (!Avx2.IsSupported) { throw new InvalidOperationException("Avx2 is not supported - cannot perform vector operation"); } int w = _data.Width(); int h = _data.Height(); int[,] res = new int[h, w]; fixed(byte *pSource = &_data[0, 0]) fixed(int *pTarget = &res[0, 0]) { var pSrc = pSource; var pTrg = pTarget; {// handle the first line var j = 0; var c = 0; //handle vector part for (; j + Vector256 <int> .Count <= w; j += Vector256 <int> .Count) { var t = Avx2.ConvertToVector256Int32(pSrc); t = Aggregate(t, c); Avx.Store(pTrg, t); c = t.GetElement(Vector256 <int> .Count - 1); pSrc += Vector256 <int> .Count; pTrg += Vector256 <int> .Count; } // handle the tail for (; j < w; j++) { c += *pSrc++; *pTrg++ = c; } } //handle the other lines for (var i = 1; i < h; i++) { var j = 0; var c = 0; //handle vector part for (; j + Vector256 <int> .Count <= w; j += Vector256 <int> .Count) { var t = Avx2.ConvertToVector256Int32(pSrc); t = Aggregate(t, c); c = t.GetElement(Vector256 <int> .Count - 1); var p = Avx.LoadVector256(pTrg - w); // prev line vector t = Avx2.Add(t, p); Avx.Store(pTrg, t); pSrc += Vector256 <int> .Count; pTrg += Vector256 <int> .Count; } // handle the tail for (; j < w; j++) { c += *pSrc++; var q = *(pTrg - w); *pTrg++ = c + q; } } } return(res); }
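// The Aggregate helper used above is defined elsewhere; a hedged sketch of one way it could be
// built: an in-lane inclusive prefix sum via two logarithmic shift-add steps, the low lane's
// total broadcast into the high lane, and the running carry from the previous chunk added to
// every element. The caller then reads the last element back as the new carry.
private static Vector256<int> Aggregate(Vector256<int> v, int carry)
{
    v = Avx2.Add(v, Avx2.ShiftLeftLogical128BitLane(v, 4)); // + neighbor at distance 1
    v = Avx2.Add(v, Avx2.ShiftLeftLogical128BitLane(v, 8)); // + neighbors at distance 2
    // Broadcast element 3 (the low lane's total) to all lanes, then keep it only in the high lane.
    var cross = Avx2.PermuteVar8x32(v, Vector256.Create(3, 3, 3, 3, 3, 3, 3, 3));
    cross = Avx2.Blend(Vector256<int>.Zero, cross, 0b_1111_0000);
    v = Avx2.Add(v, cross);
    return Avx2.Add(v, Vector256.Create(carry));
}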
private void Word_32Bytes_WithPrefetch_Internal <TPrefetchConfiguration>(void *originalBuffer, void *modifiedBuffer, int size) where TPrefetchConfiguration : struct, IPrefetchConfiguration { Debug.Assert(size % 4096 == 0); Debug.Assert(size % sizeof(long) == 0); TPrefetchConfiguration config = default; byte *writePtr = destination; // 'destination' is the diff output buffer, declared elsewhere on the containing type. long writePtrOffset = 16; // This stops the JIT from accessing originalBuffer directly; since we know // it is not mutable, this lowers the number of generated instructions byte *ptr = (byte *)originalBuffer; long offset = (byte *)modifiedBuffer - (byte *)originalBuffer; bool started = false; for (byte *end = ptr + size; ptr < end; ptr += 32) { Vector256 <byte> o = Avx.LoadVector256(ptr); Vector256 <byte> m = Avx.LoadVector256(ptr + offset); o = Avx2.Xor(o, m); if (!Avx.TestZ(o, o)) { if (!started) { // Write the start index of the run, relative to the start of the page we are diffing. *(long *)(writePtr + 0) = ptr - (byte *)originalBuffer; started = true; } Avx.Store((writePtr + writePtrOffset), m); writePtrOffset += 32; } else if (started) // our bytes are untouched here. { // We write the actual size of the stored data. *(long *)(writePtr + 8) = writePtrOffset - 16; // We advance the write pointer to the start of the next run. writePtr += writePtrOffset; // We reset the write offset for the next run. writePtrOffset = 16; started = false; } config.Prefetch(ptr + 2048); config.Prefetch(ptr + offset + 2048); } // If the block hasn't been touched, nothing to do here unless we have an open run. if (started) { // We write the actual size of the stored data. *(long *)(writePtr + 8) = writePtrOffset - 16; } }
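// IPrefetchConfiguration is referenced above but not shown; a hedged sketch of the
// struct-constrained pattern it implies (Prefetch2048 and NoPrefetch are hypothetical names).
// Because the generic parameter is constrained to struct, the JIT specializes the method per
// implementation and can eliminate the no-op call entirely.
private interface IPrefetchConfiguration { unsafe void Prefetch(byte* ptr); }
private struct Prefetch2048 : IPrefetchConfiguration
{
    public unsafe void Prefetch(byte* ptr) => Sse.Prefetch0(ptr); // fetch into all cache levels
}
private struct NoPrefetch : IPrefetchConfiguration
{
    public unsafe void Prefetch(byte* ptr) { } // compiled away in the specialized body
}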
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ private static unsafe void shuffle16_avx2(byte *dest, byte *src, int vectorizable_elements, int total_elements) { int bytesoftype = 16; int j; int k, l; var ymm0 = new Vector256 <byte> [16]; var ymm1 = new Vector256 <byte> [16]; /* Create the shuffle mask. * NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from * most to least significant (i.e., their order is reversed when compared to * loading the mask from an array). */ var shmask = Vector256.Create((byte) 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(Vector256 <byte>)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ for (k = 0; k < 16; k++) { ymm0[k] = Avx.LoadVector256((src + (j * bytesoftype) + (k * sizeof(Vector256 <byte>)))); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l += 2) { ymm1[k * 2] = Avx2.UnpackLow(ymm0[l], ymm0[l + 1]); ymm1[k * 2 + 1] = Avx2.UnpackHigh(ymm0[l], ymm0[l + 1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k % 2) == 0) { l += 2; } ymm0[k * 2] = Avx2.UnpackLow(ymm1[l].AsInt16(), ymm1[l + 2].AsInt16()).AsByte(); ymm0[k * 2 + 1] = Avx2.UnpackHigh(ymm1[l].AsInt16(), ymm1[l + 2].AsInt16()).AsByte(); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k % 4) == 0) { l += 4; } ymm1[k * 2] = Avx2.UnpackLow(ymm0[l].AsInt32(), ymm0[l + 4].AsInt32()).AsByte(); ymm1[k * 2 + 1] = Avx2.UnpackHigh(ymm0[l].AsInt32(), ymm0[l + 4].AsInt32()).AsByte(); } /* Transpose quad words */ for (k = 0; k < 8; k++) { ymm0[k * 2] = Avx2.UnpackLow(ymm1[k].AsInt64(), ymm1[k + 8].AsInt64()).AsByte(); ymm0[k * 2 + 1] = Avx2.UnpackHigh(ymm1[k].AsInt64(), ymm1[k + 8].AsInt64()).AsByte(); } for (k = 0; k < 16; k++) { ymm0[k] = Avx2.Permute4x64(ymm0[k].AsInt64(), 0xd8).AsByte(); ymm0[k] = Avx2.Shuffle(ymm0[k], shmask); } /* Store the result vectors */ byte *dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { Avx2.Store((dest_for_jth_element + (k * total_elements)), ymm0[k]); } } }
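// A hedged scalar reference (hypothetical helper, not in the source) of the byte transposition
// shuffle16_avx2 performs, handy for validating the vector path: byte b of element e moves from
// src[e * 16 + b] to dest[b * total_elements + e].
private static unsafe void shuffle16_scalar(byte* dest, byte* src, int total_elements)
{
    const int bytesoftype = 16;
    for (int e = 0; e < total_elements; e++)
        for (int b = 0; b < bytesoftype; b++)
            dest[b * total_elements + e] = src[e * bytesoftype + b];
}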
public void RunLclVarScenario_Load() { var firstOp = Avx.LoadVector256((Byte *)(_dataTable.inArrayPtr)); Avx2.ExtractVector128((Byte *)_dataTable.outArrayPtr, firstOp, 1); }
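// The memory-destination Avx2.ExtractVector128 overload above comes from early .NET Core 3.0
// previews; a hedged equivalent of the same scenario using the value-returning overload plus an
// explicit store (hypothetical method name, same _dataTable fields assumed):
public unsafe void RunLclVarScenario_Load_ValueReturning()
{
    var firstOp = Avx.LoadVector256((Byte*)(_dataTable.inArrayPtr));
    var upper = Avx2.ExtractVector128(firstOp, 1); // vextracti128, imm8 = 1
    Sse2.Store((Byte*)_dataTable.outArrayPtr, upper);
}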
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ private static unsafe void unshuffle16_avx2(byte *dest, byte *src, int vectorizable_elements, int total_elements) { int bytesoftype = 16; int i; int j; var ymm0 = new Vector256 <byte> [16]; var ymm1 = new Vector256 <byte> [16]; for (i = 0; i < vectorizable_elements; i += sizeof(Vector256 <byte>)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ byte *src_for_ith_element = src + i; for (j = 0; j < 16; j++) { ymm0[j] = Avx.LoadVector256((src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = Avx2.UnpackLow(ymm0[j * 2], ymm0[j * 2 + 1]); /* Compute the hi 32 bytes */ ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2], ymm0[j * 2 + 1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte(); /* Compute the hi 32 bytes */ ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte(); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = Avx2.UnpackLow(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte(); /* Compute the hi 32 bytes */ ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte(); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte(); /* Compute the hi 32 bytes */ ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte(); } for (j = 0; j < 8; j++) { ymm1[j] = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x20); ymm1[j + 8] = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x31); } /* Store the result vectors in proper order */ Avx2.Store((dest + (i * bytesoftype) + (0 * sizeof(Vector256 <byte>))), ymm1[0]); Avx2.Store((dest + (i * bytesoftype) + (1 * sizeof(Vector256 <byte>))), ymm1[4]); Avx2.Store((dest + (i * bytesoftype) + (2 * sizeof(Vector256 <byte>))), ymm1[2]); Avx2.Store((dest + (i * bytesoftype) + (3 * sizeof(Vector256 <byte>))), ymm1[6]); Avx2.Store((dest + (i * bytesoftype) + (4 * sizeof(Vector256 <byte>))), ymm1[1]); Avx2.Store((dest + (i * bytesoftype) + (5 * sizeof(Vector256 <byte>))), ymm1[5]); Avx2.Store((dest + (i * bytesoftype) + (6 * sizeof(Vector256 <byte>))), ymm1[3]); Avx2.Store((dest + (i * bytesoftype) + (7 * sizeof(Vector256 <byte>))), ymm1[7]); Avx2.Store((dest + (i * bytesoftype) + (8 * sizeof(Vector256 <byte>))), ymm1[8]); Avx2.Store((dest + (i * bytesoftype) + (9 * sizeof(Vector256 <byte>))), ymm1[12]); Avx2.Store((dest + (i * bytesoftype) + (10 * sizeof(Vector256 <byte>))), ymm1[10]); Avx2.Store((dest + (i * bytesoftype) + (11 * sizeof(Vector256 <byte>))), ymm1[14]); Avx2.Store((dest + (i * bytesoftype) + (12 * sizeof(Vector256 <byte>))), ymm1[9]); Avx2.Store((dest + (i * bytesoftype) + (13 * sizeof(Vector256 <byte>))), ymm1[13]); Avx2.Store((dest + (i * bytesoftype) + (14 * sizeof(Vector256 <byte>))), ymm1[11]); Avx2.Store((dest + (i * bytesoftype) + (15 * sizeof(Vector256 <byte>))), ymm1[15]); } }
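// A hedged round-trip check (hypothetical test helper) for the shuffle16_avx2/unshuffle16_avx2
// pair above: over the vectorizable prefix (a multiple of 32 elements, since each pass consumes
// 32 x 16-byte elements), unshuffling a shuffled buffer must restore it byte-for-byte.
private static unsafe bool ShuffleRoundTrips16(byte[] data)
{
    int total_elements = data.Length / 16;
    int vectorizable_elements = total_elements - (total_elements % 32);
    var shuffled = new byte[data.Length];
    var restored = new byte[data.Length];
    fixed (byte* s = data) fixed (byte* d = shuffled) fixed (byte* r = restored)
    {
        shuffle16_avx2(d, s, vectorizable_elements, total_elements);
        unshuffle16_avx2(r, d, vectorizable_elements, total_elements);
    }
    for (int k = 0; k < vectorizable_elements * 16; k++)
    {
        if (restored[k] != data[k]) { return(false); }
    }
    return(true);
}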