/// <summary>
/// Scenario: adds two vectors loaded through pointers to the fields of a locally created
/// <c>TestStruct</c>, writes the sum to the output table and validates it.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    // The struct is held in a local, so its fields are addressed directly (no 'fixed' block).
    var localStruct = TestStruct.Create();

    var sum = AdvSimd.Add(
        AdvSimd.LoadVector64((UInt32*)(&localStruct._fld1)),
        AdvSimd.LoadVector64((UInt32*)(&localStruct._fld2))
    );

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Scenario: adds the vectors stored in this instance's <c>_fld1</c> and <c>_fld2</c>, loading them
/// through pinned pointers, then stores and validates the sum via <paramref name="testClass"/>.
/// </summary>
/// <param name="testClass">Provides the output table and the validation routine.</param>
public void RunStructFldScenario_Load(SimpleBinaryOpTest__Add_Vector64_Int16 testClass)
{
    fixed (Vector64<Int16>* pLeft = &_fld1)
    fixed (Vector64<Int16>* pRight = &_fld2)
    {
        var sum = AdvSimd.Add(
            AdvSimd.LoadVector64((Int16*)(pLeft)),
            AdvSimd.LoadVector64((Int16*)(pRight))
        );

        Unsafe.Write(testClass._dataTable.outArrayPtr, sum);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: adds the vectors stored in this instance's <c>_fld1</c> and <c>_fld2</c>, loading them
/// through pinned pointers, then writes the sum to the output table and validates it.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector64<Int16>* pLeft = &_fld1)
    fixed (Vector64<Int16>* pRight = &_fld2)
    {
        var sum = AdvSimd.Add(
            AdvSimd.LoadVector64((Int16*)(pLeft)),
            AdvSimd.LoadVector64((Int16*)(pRight))
        );

        Unsafe.Write(_dataTable.outArrayPtr, sum);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: adds the vectors stored in <c>_clsVar1</c> and <c>_clsVar2</c>, loading them through
/// pinned pointers, then writes the sum to the output table and validates it.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector128<Single>* pLeft = &_clsVar1)
    fixed (Vector128<Single>* pRight = &_clsVar2)
    {
        var sum = AdvSimd.Add(
            AdvSimd.LoadVector128((Single*)(pLeft)),
            AdvSimd.LoadVector128((Single*)(pRight))
        );

        Unsafe.Write(_dataTable.outArrayPtr, sum);
        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: creates a fresh test-class instance, adds the vectors stored in its <c>_fld1</c> and
/// <c>_fld2</c> (loaded through pinned pointers), then writes the sum to this instance's output
/// table and validates it.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var instance = new SimpleBinaryOpTest__Add_Vector64_Int16();

    fixed (Vector64<Int16>* pLeft = &instance._fld1)
    fixed (Vector64<Int16>* pRight = &instance._fld2)
    {
        var sum = AdvSimd.Add(
            AdvSimd.LoadVector64((Int16*)(pLeft)),
            AdvSimd.LoadVector64((Int16*)(pRight))
        );

        // The result goes to this object's table; only the inputs come from the new instance.
        Unsafe.Write(_dataTable.outArrayPtr, sum);
        ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Accumulates <paramref name="inputMix"/>, scaled by <c>Volume</c>, into <paramref name="outputMix"/>.
/// Whole groups of four samples are processed with AdvSimd; any remaining samples are processed
/// one at a time.
/// </summary>
/// <param name="outputMix">Destination samples; each is incremented in place.</param>
/// <param name="inputMix">Source samples to scale and add.</param>
private void ProcessMixAdvSimd(Span<float> outputMix, ReadOnlySpan<float> inputMix)
{
    Vector128<float> gain = Vector128.Create(Volume);

    // Reinterpret both buffers as sequences of 4-float vectors.
    ReadOnlySpan<Vector128<float>> sourceBlocks = MemoryMarshal.Cast<float, Vector128<float>>(inputMix);
    Span<Vector128<float>> targetBlocks = MemoryMarshal.Cast<float, Vector128<float>>(outputMix);

    // Index of the first sample that is not covered by a whole vector.
    int firstScalarSample = sourceBlocks.Length * 4;

    int block = 0;
    while (block < sourceBlocks.Length)
    {
        Vector128<float> scaled = AdvSimd.Multiply(sourceBlocks[block], gain);
        Vector128<float> roundedUp = AdvSimd.Ceiling(scaled);

        targetBlocks[block] = AdvSimd.Add(targetBlocks[block], roundedUp);
        block++;
    }

    // Scalar tail. NOTE(review): MultiplyRoundUp presumably mirrors multiply + ceiling above — confirm.
    for (int sample = firstScalarSample; sample < inputMix.Length; sample++)
    {
        outputMix[sample] += FloatingPointHelper.MultiplyRoundUp(inputMix[sample], Volume);
    }
}
/// <summary>
/// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
/// </summary>
/// <param name="pInputBuffer">
/// Pointer to the first UTF-16 code unit to validate. May be null only when
/// <paramref name="inputLength"/> is zero.
/// </param>
/// <param name="inputLength">Number of UTF-16 code units to validate; must not be negative.</param>
/// <param name="utf8CodeUnitCountAdjustment">
/// On return, the value to add to the number of UTF-16 code units consumed in order to obtain the
/// number of UTF-8 code units required to encode the consumed data.
/// </param>
/// <param name="scalarCountAdjustment">
/// On return, the value to add to the number of UTF-16 code units consumed in order to obtain the
/// number of scalars present in the consumed data.
/// </param>
/// <returns>
/// A pointer to the first invalid code unit, or to the end of the buffer
/// (<c>&amp;pInputBuffer[inputLength]</c>) if the buffer is valid.
/// </returns>
/// <remarks>
/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
/// </remarks>
public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    // First, we'll handle the common case of all-ASCII. If this is able to
    // consume the entire buffer, we'll skip the remainder of this method's logic.

    int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
    Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

    pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
    inputLength -= numAsciiCharsConsumedJustNow;

    if (inputLength == 0)
    {
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return pInputBuffer;
    }

    // If we got here, it means we saw some non-ASCII data, so within our
    // vectorized code paths below we'll handle all non-surrogate UTF-16
    // code points branchlessly. We'll only branch if we see surrogates.
    //
    // We still optimistically assume the data is mostly ASCII. This means that the
    // number of UTF-8 code units and the number of scalars almost matches the number
    // of UTF-16 code units. As we go through the input and find non-ASCII
    // characters, we'll keep track of these "adjustment" fixups. To get the
    // total number of UTF-8 code units required to encode the input data, add
    // the UTF-8 code unit count adjustment to the number of UTF-16 code units
    // seen. To get the total number of scalars present in the input data,
    // add the scalar count adjustment to the number of UTF-16 code units seen.

    long tempUtf8CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
    {
        if (inputLength >= Vector128<ushort>.Count)
        {
            Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
            Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
            Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
            Vector128<ushort> vectorZero = Vector128<ushort>.Zero;

            do
            {
                Vector128<ushort> utf16Data;
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.LoadVector128((ushort*)pInputBuffer); // unaligned
                }
                else
                {
                    utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
                }

                Vector128<ushort> charIsNonAscii;

                if (AdvSimd.Arm64.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)

                    charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                }
                else if (Sse41.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)

                    charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                }
                else
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                    // be handled in a few lines.

                    charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                }

#if DEBUG
                // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                uint debugMask;
                if (AdvSimd.Arm64.IsSupported)
                {
                    debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                }
                else
                {
                    debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                }
                Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding
                // input was 0x0800 <= [value]. This also handles the missing range a few lines above.

                Vector128<ushort> charIsThreeByteUtf8Encoded;
                uint mask;

                // Use the same AdvSimd.Arm64 guard as every other ARM branch in this loop: every
                // other call to GetNonAsciiBytes in this method is made under that guard, and the
                // enclosing 'if' only admits ARM when AdvSimd.Arm64 is supported.
                if (AdvSimd.Arm64.IsSupported)
                {
                    charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                    mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }
                else
                {
                    charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                    mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }

                // Each even bit of mask will be 1 only if the char was >= 0x0080,
                // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                //
                // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                //
                //            ,-- set if char[1] is >= 0x0800
                //            |   ,-- set if char[0] is >= 0x0800
                //            v   v
                // mask = ... 1 1 0 1
                //              ^   ^-- set if char[0] is non-ASCII
                //              `-- set if char[1] is non-ASCII
                //
                // This means we can popcnt the number of set bits, and the result is the
                // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                // it expands. This results in the wrong count for UTF-16 surrogate code
                // units (we just counted that each individual code unit expands to 3 bytes,
                // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                // We'll handle this in just a moment.
                //
                // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                // cumulative UTF-8 adjustment factor once we determine that there are no
                // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                // our computed result and we'd have to throw it away.)

                uint popcnt = (uint)BitOperations.PopCount(mask);

                // Surrogates need to be special-cased for two reasons: (a) we need
                // to account for the fact that we over-counted in the addition above;
                // and (b) they require separate validation.

                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                    mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }
                else
                {
                    utf16Data = Sse2.Add(utf16Data, vectorA800);
                    mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }

                if (mask != 0)
                {
                    // There's at least one UTF-16 surrogate code unit present.
                    // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                    // the resulting bits of 'mask' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                    // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                    //
                    // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                    // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                    // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                    // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                    // If we logical right-shift each word by 3, we'll end up with the bit pattern
                    // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                    // determine whether a given char was a high or a low surrogate.
                    //
                    // Therefore the resulting bits of 'mask2' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                    // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                    // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                    //   Since 'mask' already has 00 in these positions (since the corresponding char
                    //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                    uint mask2;
                    if (AdvSimd.Arm64.IsSupported)
                    {
                        mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                    }
                    else
                    {
                        mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                    }

                    // 'lowSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a low surrogate char,
                    // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                    uint lowSurrogatesMask = mask2 & mask;

                    // 'highSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a high surrogate char,
                    // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                    uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                    Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
                        "A char cannot simultaneously be both a high and a low surrogate char.");

                    Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
                        "Only even bits (no odd bits) of the masks should be set.");

                    // Now check that each high surrogate is followed by a low surrogate and that each
                    // low surrogate follows a high surrogate. We make an exception for the case where
                    // the final char of the vector is a high surrogate, since we can't perform validation
                    // on it until the next iteration of the loop when we hope to consume the matching
                    // low surrogate.

                    highSurrogatesMask <<= 2;
                    if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                    {
                        goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                    }

                    if (highSurrogatesMask > ushort.MaxValue)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                        popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                        pInputBuffer--;
                        inputLength++;
                    }

                    // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                    // free right now, saving the extension step a few lines below. If we're 32-bit, the
                    // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                    // 64-bit extension a few lines below.
                    nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                    // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                    // pair is in reality only encoded as 4 UTF-8 code units, we need to
                    // perform this adjustment now.

                    if (IntPtr.Size == 8)
                    {
                        // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                        // sub + sub. It's more efficient than shl + sub.
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                    }
                    else
                    {
                        // Take the hit of the 64-bit extension now.
                        tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                    }
                }

                tempUtf8CodeUnitCountAdjustment += popcnt;
                pInputBuffer += Vector128<ushort>.Count;
                inputLength -= Vector128<ushort>.Count;
            } while (inputLength >= Vector128<ushort>.Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector<ushort>.Count)
        {
            Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
            Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
            Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
            Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);

            do
            {
                // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                // vectors, each element of the sum will contain one of three values:
                //
                // 0x0000 ( 0) = original char was 0000..007F
                // 0xFFFF (-1) = original char was 0080..07FF
                // 0xFFFE (-2) = original char was 0800..FFFF
                //
                // We'll negate them to produce a value 0..2 for each element, then sum all the
                // elements together to produce the number of *additional* UTF-8 code units
                // required to represent this UTF-16 data. This is similar to the popcnt step
                // performed by the SSE2 code path. This will overcount surrogates, but we'll
                // handle that shortly.

                Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                Vector<nuint_t> sumVector = (Vector<nuint_t>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                // which should halve the number of operations we must perform.

                nuint popcnt = 0;
                for (int i = 0; i < Vector<nuint_t>.Count; i++)
                {
                    popcnt += (nuint)sumVector[i];
                }

                uint popcnt32 = (uint)popcnt;
                if (IntPtr.Size == 8)
                {
                    popcnt32 += (uint)(popcnt >> 32);
                }

                // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                // know there aren't any unpaired surrogates in the input data.

                popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                // Now check for surrogates.

                utf16Data -= vectorD800;
                Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                if (surrogateChars != Vector<ushort>.Zero)
                {
                    // There's at least one surrogate (high or low) UTF-16 code unit in
                    // the vector. We'll build up additional vectors: 'highSurrogateChars'
                    // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                    // UTF-16 code unit was a high or low surrogate, respectively.

                    Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                    Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                    // We want to make sure that each high surrogate code unit is followed by
                    // a low surrogate code unit and each low surrogate code unit follows a
                    // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                    // or palignr available to us, we'll do this as a loop. We won't look at
                    // the very last high surrogate char element since we don't yet know if
                    // the next vector read will have a low surrogate char element.

                    if (lowSurrogateChars[0] != 0)
                    {
                        goto Error; // error: start of buffer contains standalone low surrogate char
                    }

                    ushort surrogatePairsCount = 0;
                    for (int i = 0; i < Vector<ushort>.Count - 1; i++)
                    {
                        surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                        if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                        {
                            goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                        }
                    }

                    if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        pInputBuffer--;
                        inputLength++;
                        popcnt32 -= 2;
                    }

                    nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                    // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                    // so we'll adjust this now.

                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                }

                tempUtf8CodeUnitCountAdjustment += popcnt32;
                pInputBuffer += Vector<ushort>.Count;
                inputLength -= Vector<ushort>.Count;
            } while (inputLength >= Vector<ushort>.Count);
        }
    }

NonVectorizedLoop:

    // Vectorization isn't supported on our current platform, or the input was too small to benefit
    // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
    // drain remaining valid chars before we report failure.

    for (; inputLength > 0; pInputBuffer++, inputLength--)
    {
        uint thisChar = pInputBuffer[0];
        if (thisChar <= 0x7F)
        {
            continue;
        }

        // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
        // This optimistically assumes no surrogates, which we'll handle shortly.

        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

        if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
        {
            continue;
        }

        // Found a surrogate char. Back out the adjustment we made above, then
        // try to consume the entire surrogate pair all at once. We won't bother
        // trying to interpret the surrogate pair as a scalar value; we'll only
        // validate that its bit pattern matches what's expected for a surrogate pair.

        tempUtf8CodeUnitCountAdjustment -= 2;

        if (inputLength == 1)
        {
            goto Error; // input buffer too small to read a surrogate pair
        }

        thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
        if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
        {
            goto Error; // not a well-formed surrogate pair
        }

        tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
        tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

        pInputBuffer++; // consumed one extra char
        inputLength--;
    }

Error:

    // Also used for normal return.

    utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return pInputBuffer;
}
/// <summary>
/// Advances a running Adler-32 checksum, held as its two component sums, over the first
/// <paramref name="len"/> bytes of <paramref name="buf"/>. Whole 32-byte blocks are processed
/// with AdvSimd instructions; the remaining bytes are processed one at a time.
/// </summary>
/// <param name="preSum1">First component sum; read on entry and replaced with the updated value.</param>
/// <param name="preSum2">Second component sum; read on entry and replaced with the updated value.</param>
/// <param name="buf">Input bytes, consumed starting at index 0.</param>
/// <param name="len">Number of bytes to process.</param>
internal static void Step(ref ushort preSum1, ref ushort preSum2, byte[] buf, uint len)
{
    // Bytes consumed by one vectorised iteration.
    const uint blockSize = 1 << 5;

    // Split Adler-32 into component sums.
    uint s1 = preSum1;
    uint s2 = preSum2;
    int bufPos = 0;

    // Process the data in blocks; whatever does not fill a block is left for the scalar tail.
    uint blocks = len / blockSize;
    uint tail = len - blocks * blockSize;

    while (blocks != 0)
    {
        // The NMAX constraint: at most NMAX data bytes can be processed before
        // s2 must be reduced modulo ADLER_MODULE.
        uint maxBlocksPerPass = Adler32Context.NMAX / blockSize;
        uint n = blocks < maxBlocksPerPass ? blocks : maxBlocksPerPass;
        blocks -= n;

        Vector128<uint> v_s2 = Vector128.Create(s1 * n, 0, 0, 0);
        Vector128<uint> v_s1 = Vector128<uint>.Zero;
        Vector128<ushort> columnSumA = Vector128<ushort>.Zero;
        Vector128<ushort> columnSumB = Vector128<ushort>.Zero;
        Vector128<ushort> columnSumC = Vector128<ushort>.Zero;
        Vector128<ushort> columnSumD = Vector128<ushort>.Zero;

        do
        {
            // Load 32 input bytes.
            Vector128<byte> first16 = Load16(buf, bufPos);
            Vector128<byte> second16 = Load16(buf, bufPos + 16);
            bufPos += 32;

            // Add previous block byte sum to v_s2.
            v_s2 = AdvSimd.Add(v_s2, v_s1);

            // Horizontally add the bytes for s1.
            Vector128<ushort> pairSums = AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(first16), second16);
            v_s1 = AdvSimd.AddPairwiseWideningAndAdd(v_s1, pairSums);

            // Vertically add the bytes for s2.
            columnSumA = AdvSimd.AddWideningLower(columnSumA, first16.GetLower());
            columnSumB = AdvSimd.AddWideningLower(columnSumB, first16.GetUpper());
            columnSumC = AdvSimd.AddWideningLower(columnSumC, second16.GetLower());
            columnSumD = AdvSimd.AddWideningLower(columnSumD, second16.GetUpper());

            n--;
        } while (n != 0);

        v_s2 = AdvSimd.ShiftLeftLogical(v_s2, 5);

        // Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumA.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumA.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumB.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumB.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumC.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumC.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumD.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
        v_s2 = AdvSimd.MultiplyWideningLowerAndAdd(v_s2, columnSumD.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));

        // Sum the 32-bit lanes of v_s1 and v_s2 and accumulate them into s1 and s2.
        Vector64<uint> s1Halves = AdvSimd.AddPairwise(v_s1.GetLower(), v_s1.GetUpper());
        Vector64<uint> s2Halves = AdvSimd.AddPairwise(v_s2.GetLower(), v_s2.GetUpper());
        Vector64<uint> totals = AdvSimd.AddPairwise(s1Halves, s2Halves);
        s1 += AdvSimd.Extract(totals, 0);
        s2 += AdvSimd.Extract(totals, 1);

        // Reduce.
        s1 %= Adler32Context.ADLER_MODULE;
        s2 %= Adler32Context.ADLER_MODULE;
    }

    // Handle leftover data.
    if (tail != 0)
    {
        for (uint remaining = tail; remaining != 0; remaining--)
        {
            s1 += buf[bufPos++];
            s2 += s1;
        }

        if (s1 >= Adler32Context.ADLER_MODULE)
        {
            s1 -= Adler32Context.ADLER_MODULE;
        }

        s2 %= Adler32Context.ADLER_MODULE;
    }

    // Return the recombined sums.
    preSum1 = (ushort)(s1 & 0xFFFF);
    preSum2 = (ushort)(s2 & 0xFFFF);

    // Builds a 128-bit vector from the 16 consecutive bytes of 'source' starting at index 'at'.
    static Vector128<byte> Load16(byte[] source, int at) =>
        Vector128.Create(source[at], source[at + 1], source[at + 2], source[at + 3],
                         source[at + 4], source[at + 5], source[at + 6], source[at + 7],
                         source[at + 8], source[at + 9], source[at + 10], source[at + 11],
                         source[at + 12], source[at + 13], source[at + 14], source[at + 15]);
}
/// <summary>
/// AdvSimd accumulate step. For every 128-bit lane it computes
/// <c>acc += swap64(data) + lo32(data ^ secret) * hi32(data ^ secret)</c>, where
/// <c>swap64</c> exchanges the two 64-bit halves of the lane and the product is 32x32 -> 64 bits.
/// </summary>
/// <param name="accumulator">Accumulator lanes, updated in place.</param>
/// <param name="data">Input bytes; read in 16-byte vectors.</param>
/// <param name="secret">Secret bytes; read at the same offsets as <paramref name="data"/>.</param>
/// <remarks>
/// <c>UnrollCount</c> selects between a fully unrolled body covering four lanes (0x40 bytes),
/// a loop over pairs of lanes, and a loop over single lanes. The loops run up to
/// <c>StripeLength</c> (presumably 0x40, matching the unrolled body — confirm).
/// </remarks>
private static void Accumulate512Neon(ref Accumulator accumulator, byte* data, byte* secret)
{
    if (UnrollCount > 2u)
    {
        // Fully unrolled: four lanes, no loop.
        var acc0 = accumulator.Data128.Data0;
        var acc1 = accumulator.Data128.Data1;
        var acc2 = accumulator.Data128.Data2;
        var acc3 = accumulator.Data128.Data3;

        var input0 = AdvSimd.LoadVector128((ulong*)(data + 0x00u));
        var input1 = AdvSimd.LoadVector128((ulong*)(data + 0x10u));
        var input2 = AdvSimd.LoadVector128((ulong*)(data + 0x20u));
        var input3 = AdvSimd.LoadVector128((ulong*)(data + 0x30u));

        var key0 = AdvSimd.LoadVector128((ulong*)(secret + 0x00u));
        var key1 = AdvSimd.LoadVector128((ulong*)(secret + 0x10u));
        var key2 = AdvSimd.LoadVector128((ulong*)(secret + 0x20u));
        var key3 = AdvSimd.LoadVector128((ulong*)(secret + 0x30u));

        // Swap the two 64-bit halves of each input vector.
        var swapped0 = AdvSimd.ExtractVector128(input0, input0, 1);
        var swapped1 = AdvSimd.ExtractVector128(input1, input1, 1);
        var swapped2 = AdvSimd.ExtractVector128(input2, input2, 1);
        var swapped3 = AdvSimd.ExtractVector128(input3, input3, 1);

        var mixed0 = AdvSimd.Xor(input0, key0);
        var mixed1 = AdvSimd.Xor(input1, key1);
        var mixed2 = AdvSimd.Xor(input2, key2);
        var mixed3 = AdvSimd.Xor(input3, key3);

        // Low 32 bits of every 64-bit element.
        var low0 = AdvSimd.ExtractNarrowingLower(mixed0);
        var low1 = AdvSimd.ExtractNarrowingLower(mixed1);
        var low2 = AdvSimd.ExtractNarrowingLower(mixed2);
        var low3 = AdvSimd.ExtractNarrowingLower(mixed3);

        // High 32 bits of every 64-bit element.
        var high0 = AdvSimd.ShiftRightLogicalNarrowingLower(mixed0, 32);
        var high1 = AdvSimd.ShiftRightLogicalNarrowingLower(mixed1, 32);
        var high2 = AdvSimd.ShiftRightLogicalNarrowingLower(mixed2, 32);
        var high3 = AdvSimd.ShiftRightLogicalNarrowingLower(mixed3, 32);

        // swapped + low * high (widening multiply-accumulate).
        var folded0 = AdvSimd.MultiplyWideningLowerAndAdd(swapped0, low0, high0);
        var folded1 = AdvSimd.MultiplyWideningLowerAndAdd(swapped1, low1, high1);
        var folded2 = AdvSimd.MultiplyWideningLowerAndAdd(swapped2, low2, high2);
        var folded3 = AdvSimd.MultiplyWideningLowerAndAdd(swapped3, low3, high3);

        var sum0 = AdvSimd.Add(acc0, folded0);
        var sum1 = AdvSimd.Add(acc1, folded1);
        var sum2 = AdvSimd.Add(acc2, folded2);
        var sum3 = AdvSimd.Add(acc3, folded3);

        accumulator.Data128.Data0 = sum0;
        accumulator.Data128.Data1 = sum1;
        accumulator.Data128.Data2 = sum2;
        accumulator.Data128.Data3 = sum3;
        return;
    }

    if (UnrollCount == 2u)
    {
        // Two lanes (0x20 bytes) per iteration.
        uint offset = 0u;
        while (offset < StripeLength)
        {
            var acc0 = accumulator.Data128.AtOffset(offset + 0x00u);
            var acc1 = accumulator.Data128.AtOffset(offset + 0x10u);

            var input0 = AdvSimd.LoadVector128((ulong*)(data + offset + 0x00u));
            var input1 = AdvSimd.LoadVector128((ulong*)(data + offset + 0x10u));

            var key0 = AdvSimd.LoadVector128((ulong*)(secret + offset + 0x00u));
            var key1 = AdvSimd.LoadVector128((ulong*)(secret + offset + 0x10u));

            var swapped0 = AdvSimd.ExtractVector128(input0, input0, 1);
            var swapped1 = AdvSimd.ExtractVector128(input1, input1, 1);

            var mixed0 = AdvSimd.Xor(input0, key0);
            var mixed1 = AdvSimd.Xor(input1, key1);

            var low0 = AdvSimd.ExtractNarrowingLower(mixed0);
            var low1 = AdvSimd.ExtractNarrowingLower(mixed1);

            var high0 = AdvSimd.ShiftRightLogicalNarrowingLower(mixed0, 32);
            var high1 = AdvSimd.ShiftRightLogicalNarrowingLower(mixed1, 32);

            var folded0 = AdvSimd.MultiplyWideningLowerAndAdd(swapped0, low0, high0);
            var folded1 = AdvSimd.MultiplyWideningLowerAndAdd(swapped1, low1, high1);

            var sum0 = AdvSimd.Add(acc0, folded0);
            var sum1 = AdvSimd.Add(acc1, folded1);

            accumulator.Data128.AtOffset(offset + 0x00u) = sum0;
            accumulator.Data128.AtOffset(offset + 0x10u) = sum1;

            offset += 0x20u;
        }

        return;
    }

    // One lane (0x10 bytes) per iteration.
    uint position = 0u;
    while (position < StripeLength)
    {
        var acc = accumulator.Data128.AtOffset(position);
        var input = AdvSimd.LoadVector128((ulong*)(data + position));
        var key = AdvSimd.LoadVector128((ulong*)(secret + position));

        var swapped = AdvSimd.ExtractVector128(input, input, 1);
        var mixed = AdvSimd.Xor(input, key);
        var low = AdvSimd.ExtractNarrowingLower(mixed);
        var high = AdvSimd.ShiftRightLogicalNarrowingLower(mixed, 32);

        var folded = AdvSimd.MultiplyWideningLowerAndAdd(swapped, low, high);
        var sum = AdvSimd.Add(acc, folded);

        accumulator.Data128.AtOffset(position) = sum;

        position += 0x10u;
    }
}