// Diagonalization step of a BLAKE2b-style compression round: rotates the
// state-matrix rows so the G mixing function can operate on diagonals.
// Each 256-bit logical row is held as two 128-bit halves (lo/hi), each
// containing two 64-bit lanes. Row 2 rotates by one lane, row 3 by two
// lanes (a plain half-swap), and row 4 by three lanes.
private static void diagonalize(ref Vector128<ulong> row1l, ref Vector128<ulong> row2l, ref Vector128<ulong> row3l, ref Vector128<ulong> row4l, ref Vector128<ulong> row1h, ref Vector128<ulong> row2h, ref Vector128<ulong> row3h, ref Vector128<ulong> row4h, ref Vector128<ulong> b0)
{
    // Rotate row 2 by one 64-bit lane: palignr at an 8-byte offset stitches
    // the two halves together in both directions.
    var t0 = Ssse3.AlignRight(row2h.AsSByte(), row2l.AsSByte(), 8);
    var t1 = Ssse3.AlignRight(row2l.AsSByte(), row2h.AsSByte(), 8);
    row2l = t0.AsUInt64();
    row2h = t1.AsUInt64();

    // Rotate row 3 by two lanes: equivalent to swapping its lo/hi halves.
    // b0 is reused as scratch space here (it is ref, so the caller's copy
    // is clobbered by design).
    b0 = row3l;
    row3l = row3h;
    row3h = b0;

    // Rotate row 4 by three lanes: same palignr trick as row 2, but the
    // results are assigned to the opposite halves.
    t0 = Ssse3.AlignRight(row4h.AsSByte(), row4l.AsSByte(), 8);
    t1 = Ssse3.AlignRight(row4l.AsSByte(), row4h.AsSByte(), 8);
    row4l = t1.AsUInt64();
    row4h = t0.AsUInt64();
}
// Reads 16 UTF-16 chars starting at 'src' and narrows them to 16 bytes
// using unsigned saturation (chars > 0xFF clamp to 0xFF, and chars are
// never negative as shorts below 0x8000). Caller must guarantee at least
// 16 readable chars at 'src'.
public static Vector128<sbyte> ReadVector128(this ref char src)
{
    // Two unaligned 8-char (128-bit) loads via reinterpretation.
    Vector128<short> c0 = Unsafe.As<char, Vector128<short>>(ref src);
    Vector128<short> c1 = Unsafe.As<char, Vector128<short>>(ref Unsafe.Add(ref src, 8));

    // packuswb: clamps each 16-bit element to [0, 255] and packs to bytes.
    Vector128<byte> tmp = Sse2.PackUnsignedSaturate(c0, c1);

#if NETCOREAPP2_1
    // netcoreapp2.1 predates the AsSByte() reinterpret helpers.
    return(Sse.StaticCast<byte, sbyte>(tmp));
#else
    return(tmp.AsSByte());
#endif
}
// Produces a 64-bit mask of which bytes in 'value' have their high
// (non-ASCII) bit set, with the per-byte flag placement chosen by the
// caller-supplied bitMask128. ARM64 little-endian only.
private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
{
    bool platformOk = AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian;
    if (!platformOk)
    {
        throw new PlatformNotSupportedException();
    }

    // Arithmetic shift by 7 smears each byte's sign bit across the whole
    // byte: 0xFF where the high bit was set, 0x00 otherwise.
    Vector128<byte> signSmeared = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();

    // Keep only the caller-selected flag bit(s) per lane, then fold
    // adjacent lanes so the flags collapse into the low 64 bits.
    Vector128<byte> flags = AdvSimd.And(signSmeared, bitMask128);
    Vector128<byte> folded = AdvSimd.Arm64.AddPairwise(flags, flags);
    return folded.AsUInt64().ToScalar();
}
// Collapses 'value' into a compact flag word indicating which bytes have
// their most significant (non-ASCII) bit set, using the class's shared
// s_bitMask128 to place one flag per source byte. ARM64 AdvSimd only.
private static uint GetNonAsciiBytes(Vector128<byte> value)
{
    Debug.Assert(AdvSimd.Arm64.IsSupported);

    // 0xFF in every lane whose source byte had the high bit set, 0x00 elsewhere.
    Vector128<byte> smeared = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
    Vector128<byte> extractedBits = AdvSimd.And(smeared, s_bitMask128);

    // Self-pairwise add three times (16 -> 8 -> 4 -> 2 meaningful bytes)
    // until every flag has migrated into the first two bytes of the vector.
    extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
    extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
    extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
    return extractedBits.AsUInt16().ToScalar();
}
/// <summary>
/// Element-wise addition of two 128-bit vectors for any primitive numeric
/// element type, via SSE/SSE2. Integer lanes wrap on overflow (hardware
/// semantics). Throws <see cref="NotSupportedException"/> for other T.
/// </summary>
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    // Each typeof(T) comparison is a JIT-time constant, so a given generic
    // instantiation compiles down to the single matching intrinsic.
    if (typeof(T) == typeof(byte))
    {
        return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    if (typeof(T) == typeof(sbyte))
    {
        return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
    }
    if (typeof(T) == typeof(short))
    {
        return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
    }
    if (typeof(T) == typeof(ushort))
    {
        return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    }
    if (typeof(T) == typeof(int))
    {
        return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
    }
    if (typeof(T) == typeof(uint))
    {
        return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    }
    if (typeof(T) == typeof(long))
    {
        return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
    }
    if (typeof(T) == typeof(ulong))
    {
        return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    }
    if (typeof(T) == typeof(float))
    {
        // Single-precision add is an SSE (not SSE2) instruction.
        return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
    }
    if (typeof(T) == typeof(double))
    {
        return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
    }
    throw new NotSupportedException();
}
// Returns the index (0..15) of the first byte in 'value' whose high
// (non-ASCII) bit is set, or a value >= 16 when every byte is ASCII.
// ARM64 little-endian only; throws PlatformNotSupportedException otherwise.
public static int GetIndexOfFirstNonAsciiByte(Vector128<byte> value)
{
    bool platformOk = AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian;
    if (!platformOk)
    {
        throw new PlatformNotSupportedException();
    }

    // extractedBits[i] = (value[i] >> 7) & (1 << (12 * (i % 2)));
    Vector128<byte> highBitSmeared = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
    Vector128<byte> extractedBits = AdvSimd.And(highBitSmeared, s_bitmask);

    // One pairwise fold moves the flags into the low 64 bits — one nibble
    // per original source byte.
    extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
    ulong mask = extractedBits.AsUInt64().ToScalar();

    // Each source byte owns one nibble of 'mask', hence the divide-by-4.
    int index = BitOperations.TrailingZeroCount(mask) >> 2;
    Debug.Assert(mask != 0 ? index < 16 : index >= 16);
    return index;
}
// Returns the index of the first char in pData[0..lengthInChars) that the
// encoder must escape, or lengthInChars if none require escaping.
private unsafe nuint GetIndexOfFirstCharToEncodeAdvSimd64(char *pData, nuint lengthInChars)
{
    // See GetIndexOfFirstByteToEncodeAdvSimd64 for the central logic behind this method.
    // The main difference here is that we need to pack WORDs to BYTEs before performing
    // the main vectorized logic. It doesn't matter if we use signed or unsigned saturation
    // while packing, as saturation will convert out-of-range (non-ASCII char) WORDs to
    // 0x00 or 0x7F..0xFF, all of which are forbidden by the encoder.

    Debug.Assert(AdvSimd.Arm64.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);

    // Lookup machinery: the low nibble of each packed byte indexes the
    // 'allowedCodePoints' table; the high nibble (arithmetic-shifted so
    // non-ASCII smears out of range) selects a bit via 'vecPowersOfTwo'.
    Vector128<byte> vec0xF = Vector128.Create((byte)0xF);
    Vector128<byte> vecPowersOfTwo = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
    Vector128<byte> vecPairwiseAddNibbleBitmask = Vector128.Create((ushort)0xF00F).AsByte(); // little endian only
    Vector128<byte> allowedCodePoints = _allowedAsciiCodePoints.AsVector;
    ulong resultScalar;

    nuint i = 0;
    if (lengthInChars >= 16)
    {
        nuint lastLegalIterationFor16CharRead = lengthInChars & unchecked((nuint)(nint) ~0xF);
        do
        {
            // Read 16 chars at a time into 2x 128-bit vectors, then pack into a
            // single 128-bit vector. We turn 16 chars (256 bits) into 16 nibbles
            // (64 bits) during this process.
            Vector128<byte> packed = AdvSimd.ExtractNarrowingSaturateUnsignedUpper(
                AdvSimd.ExtractNarrowingSaturateUnsignedLower(AdvSimd.LoadVector128((/* unaligned */ short *)(pData + i))),
                AdvSimd.LoadVector128((/* unaligned */ short *)(pData + 8 + i)));

            // tbl lookups: a lane is allowed iff the selected bit is set in its
            // allowed-code-points table entry; CompareTest yields 0xFF for
            // allowed lanes and 0x00 for lanes that need escaping.
            var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
            var vecPowersOfTwoShuffled = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
            var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);

            // Collapse the 16 lanes into 16 nibbles of a 64-bit scalar; all-ones
            // means every char in this chunk may pass through unescaped.
            var maskedResult = AdvSimd.And(result, vecPairwiseAddNibbleBitmask);
            resultScalar = AdvSimd.Arm64.AddPairwise(maskedResult, maskedResult).AsUInt64().ToScalar();
            if (resultScalar != ulong.MaxValue)
            {
                goto PairwiseAddMaskContainsDataWhichRequiresEscaping;
            }
        } while ((i += 16) < lastLegalIterationFor16CharRead);
    }

    if ((lengthInChars & 8) != 0)
    {
        // Read 8 chars at a time into a single 128-bit vector, then pack into a 64-bit
        // vector, then extend to 128 bits. We turn 8 chars (128 bits) into 8 bytes (64 bits)
        // during this process. Only the low 64 bits of the 'result' vector have meaningful
        // data.
        Vector128<byte> packed = AdvSimd.ExtractNarrowingSaturateUnsignedLower(AdvSimd.LoadVector128((/* unaligned */ short *)(pData + i))).AsByte().ToVector128Unsafe();
        var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
        var vecPowersOfTwoShuffled = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
        var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
        resultScalar = result.AsUInt64().ToScalar();
        if (resultScalar != ulong.MaxValue)
        {
            goto MaskContainsDataWhichRequiresEscaping;
        }
        i += 8;
    }

    if ((lengthInChars & 4) != 0)
    {
        // Read 4 chars at a time into a single 64-bit vector, then pack into the low
        // 32 bits of a 128-bit vector. We turn 4 chars (64 bits) into 4 bytes (32 bits)
        // during this process. Only the low 32 bits of the 'result' vector have
        // meaningful data.
        Vector128<byte> packed = AdvSimd.ExtractNarrowingSaturateUnsignedLower(AdvSimd.LoadVector64((/* unaligned */ short *)(pData + i)).ToVector128Unsafe()).ToVector128Unsafe();
        var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
        var vecPowersOfTwoShuffled = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
        var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
        resultScalar = result.AsUInt32().ToScalar(); // n.b. implicit conversion uint -> ulong; high 32 bits will be zeroed
        if (resultScalar != uint.MaxValue)
        {
            goto MaskContainsDataWhichRequiresEscaping;
        }
        i += 4;
    }

    // Beyond this point, vectorization isn't worthwhile. Just do a normal loop.
    if ((lengthInChars & 3) != 0)
    {
        Debug.Assert(lengthInChars - i <= 3);
        do
        {
            if (!_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
            {
                break;
            }
        } while (++i != lengthInChars);
    }

Return:
    return(i);

PairwiseAddMaskContainsDataWhichRequiresEscaping:
    Debug.Assert(resultScalar != ulong.MaxValue);
    // Each nibble is 4 (1 << 2) bits, so we shr by 2 to account for per-nibble stride.
    i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 2; // location of lowest set bit is where we must begin escaping
    goto Return;

MaskContainsDataWhichRequiresEscaping:
    Debug.Assert(resultScalar != ulong.MaxValue);
    // Each byte is 8 (1 << 3) bits, so we shr by 3 to account for per-byte stride.
    i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 3; // location of lowest set bit is where we must begin escaping
    goto Return;
}
// Returns the index of the first byte in pData[0..lengthInBytes) that the
// encoder must escape, or lengthInBytes if none require escaping.
private unsafe nuint GetIndexOfFirstByteToEncodeAdvSimd64(byte *pData, nuint lengthInBytes)
{
    Debug.Assert(AdvSimd.Arm64.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);

    Vector128<byte> vec0xF = Vector128.Create((byte)0xF);
    Vector128<byte> vecPowersOfTwo = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
    Vector128<byte> vecPairwiseAddNibbleBitmask = Vector128.Create((ushort)0xF00F).AsByte(); // little endian only
    Vector128<byte> allowedCodePoints = _allowedAsciiCodePoints.AsVector;
    ulong resultScalar;

    nuint i = 0;
    if (lengthInBytes >= 16)
    {
        nuint lastLegalIterationFor16CharRead = lengthInBytes & unchecked((nuint)(nint) ~0xF);
        do
        {
            // Read 16 bytes at a time into a single 128-bit vector.
            Vector128<byte> packed = AdvSimd.LoadVector128(pData + i); // unaligned read

            // Each element of the packed vector corresponds to a byte of untrusted source data. It will
            // have the format [ ..., 0xYZ, ... ]. We use the low nibble of each byte to index into
            // the 'allowedCodePoints' vector, and we use the high nibble of each byte to select a bit
            // from the corresponding element in the 'allowedCodePoints' vector.
            //
            // Example: let packed := [ ..., 0x6D ('m'), ... ]
            // The final 'result' vector will contain a non-zero value in the corresponding space iff the
            // 0xD element in the 'allowedCodePoints' vector has its 1 << 0x6 bit set.
            //
            // We rely on the fact that when we perform an arithmetic shift of vector values to get the
            // high nibble into the low 4 bits, we'll smear the high (non-ASCII) bit, causing the vector
            // element value to be in the range [ 128..255 ]. This causes the tbl lookup to return 0x00
            // for that particular element in the 'vecPowersOfTwoShuffled' vector, meaning that escaping is required.
            var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
            var vecPowersOfTwoShuffled = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
            var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);

            // Now, each element of 'result' contains 0xFF if the corresponding element in 'packed' is allowed;
            // and it contains a zero value if the corresponding element in 'packed' is disallowed. We'll convert
            // this into a vector where if 0xFF occurs in an even-numbered index, it gets converted to 0x0F; and
            // if 0xFF occurs in an odd-numbered index, it gets converted to 0xF0. This allows us to collapse
            // the Vector128<byte> to a 64-bit unsigned integer, where each of the 16 nibbles in the 64-bit integer
            // corresponds to whether an element in the 'result' vector was originally 0xFF or 0x00.
            var maskedResult = AdvSimd.And(result, vecPairwiseAddNibbleBitmask);
            resultScalar = AdvSimd.Arm64.AddPairwise(maskedResult, maskedResult).AsUInt64().ToScalar();
            if (resultScalar != ulong.MaxValue)
            {
                goto PairwiseAddMaskContainsDataWhichRequiresEscaping;
            }
        } while ((i += 16) < lastLegalIterationFor16CharRead);
    }

    if ((lengthInBytes & 8) != 0)
    {
        // Read 8 bytes at a time into a single 64-bit vector, extended to 128 bits.
        // Same logic as the 16-byte case, but we don't need to worry about the pairwise add step.
        // We'll treat the low 64 bits of the 'result' vector as its own scalar element.
        Vector128<byte> packed = AdvSimd.LoadVector64(pData + i).ToVector128Unsafe(); // unaligned read
        var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
        var vecPowersOfTwoShuffled = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
        var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
        resultScalar = result.AsUInt64().ToScalar();
        if (resultScalar != ulong.MaxValue)
        {
            goto MaskContainsDataWhichRequiresEscaping;
        }
        i += 8;
    }

    if ((lengthInBytes & 4) != 0)
    {
        // Read 4 bytes at a time into a single element, extended to a 128-bit vector.
        // Same logic as the 16-byte case, but we don't need to worry about the pairwise add step.
        // We'll treat the low 32 bits of the 'result' vector as its own scalar element.
        Vector128<byte> packed = Vector128.CreateScalarUnsafe(Unsafe.ReadUnaligned<uint>(pData + i)).AsByte();
        var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
        var vecPowersOfTwoShuffled = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
        var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
        resultScalar = result.AsUInt32().ToScalar(); // n.b. implicit conversion uint -> ulong; high 32 bits will be zeroed
        if (resultScalar != uint.MaxValue)
        {
            goto MaskContainsDataWhichRequiresEscaping;
        }
        i += 4;
    }

    // Beyond this point, vectorization isn't worthwhile. Just do a normal loop.
    if ((lengthInBytes & 3) != 0)
    {
        Debug.Assert(lengthInBytes - i <= 3);
        do
        {
            if (!_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
            {
                break;
            }
        } while (++i != lengthInBytes);
    }

Return:
    return(i);

PairwiseAddMaskContainsDataWhichRequiresEscaping:
    Debug.Assert(resultScalar != ulong.MaxValue);
    // Each nibble is 4 (1 << 2) bits, so we shr by 2 to account for per-nibble stride.
    i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 2; // location of lowest set bit is where we must begin escaping
    goto Return;

MaskContainsDataWhichRequiresEscaping:
    Debug.Assert(resultScalar != ulong.MaxValue);
    // Each byte is 8 (1 << 3) bits, so we shr by 3 to account for per-byte stride.
    i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 3; // location of lowest set bit is where we must begin escaping
    goto Return;
}
// Unsigned-byte convenience wrapper: delegates to the sbyte overload of
// CompareBit8Equal (bitwise-identical operation, sign is irrelevant for
// equality) and reinterprets the result back to bytes.
public static Vector128<byte> CompareBit8Equal(Vector128<byte> left, Vector128<byte> right)
{
    Vector128<sbyte> comparison = CompareBit8Equal(left.AsSByte(), right.AsSByte());
    return comparison.AsByte();
}
// SSSE3 palignr on 64-bit lanes: concatenates x (high) with y (low) and
// returns the 16-byte window starting m bytes into y.
private static Vector128<ulong> alignr_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
{
    Vector128<sbyte> window = Ssse3.AlignRight(x.AsSByte(), y.AsSByte(), m);
    return window.AsUInt64();
}
// Applies the caller-supplied pshufb byte permutation y to x — the usual
// SSSE3 technique for byte-granular 64-bit lane rotates (per the name).
private static Vector128<ulong> ror64_shuffle(ref Vector128<ulong> x, ref Vector128<sbyte> y)
{
    Vector128<sbyte> permuted = Ssse3.Shuffle(x.AsSByte(), y);
    return permuted.AsUInt64();
}
// Computes the Luhn check digit over the digits in CardCode using SSE2 and
// returns a 16-byte array: the first 15 bytes copied from CardCode plus
// the ASCII check digit in slot 15.
// NOTE(review): assumes CardCode holds at least 15 ASCII digit bytes — confirm at call sites.
public byte[] GenerateLuhnCodeSimdSse2()
{
    if (!Sse2.IsSupported)
    {
        throw new NotSupportedException();
    }

    byte[] arr = new byte[16];
    // Placeholder '0' in the check-digit slot so the vectorized sum treats it as digit zero.
    arr[15] = (byte)'0';
    Vector128<byte> inputVector;
    unsafe
    {
        fixed(byte *arrRef = arr)
        {
            fixed(byte *cardCodeRef = CardCode)
            {
                // Copy only the 15 known digits; arr[15] keeps its placeholder.
                Buffer.MemoryCopy(cardCodeRef, arrRef, 16, 15);
            }
            inputVector = Sse2.LoadVector128(arrRef);
        }
    }
    // NOTE(review): redundant — arr[15] is still '0' from above and is
    // overwritten with the real check digit before returning anyway.
    arr[15] = (byte)'0';

    // ASCII digits -> numeric values 0..9.
    Vector128<byte> substractResult = Sse2.Subtract(
        inputVector,
        Vector128.Create((byte)'0')
        );

    Vector128<byte> zeroVector = Vector128<byte>.Zero;
    // Luhn doubling of alternating digits: 0 - 0xFF per sbyte lane yields 1,
    // producing a 1/0 multiplier mask for the even positions; adding
    // MultiplyBytes(mask, x) to x doubles exactly the masked lanes.
    Vector128<byte> multiplyResult = Sse2.Add(
        substractResult,
        MultiplyBytes(
            Sse2.Subtract(
                zeroVector.AsSByte(),
                Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0).AsSByte()
                ).AsByte(),
            substractResult
            )
        );

    Vector128<byte> nineVector = Vector128.Create((byte)9);
    // Digit-sum step: lanes that exceeded 9 after doubling (CompareGreaterThan
    // yields 0xFF there; 0 - 0xFF = 1 as the multiplier) have 9 subtracted,
    // then SAD against zero horizontally sums all 16 lanes.
    Vector128<ushort> vsum = Sse2.SumAbsoluteDifferences(
        Sse2.Subtract(
            multiplyResult,
            MultiplyBytes(
                Sse2.Subtract(
                    zeroVector.AsSByte(),
                    Sse2.CompareGreaterThan(
                        multiplyResult.AsSByte(),
                        nineVector.AsSByte()
                        )
                    ).AsByte(),
                nineVector
                )
            ),
        zeroVector
        );

    // ref: https://stackoverflow.com/a/36998778
    // psadbw leaves two partial sums, in ushort lanes 0 and 4.
    byte sum = (byte)(Sse2.Extract(vsum, 0) + Sse2.Extract(vsum, 4));
    // Check digit makes the total a multiple of 10.
    arr[15] = (byte)(sum % 10 == 0 ? '0' : 10 - sum % 10 + '0');
    return(arr);
}
// Decodes block.Count frame-of-reference-encoded integers from the reader
// into 'buffer': each output value is (Base + i * Slope) plus a packed
// BitsPerAdjustment-bit delta, unpacked eight values per iteration via
// SSSE3 shuffle + SSE4.1 multiply + logical shift.
// NOTE(review): assumes block.Count is a multiple of the per-iteration
// stride (4 or 8) and that reader.Buffer holds enough packed bytes — confirm at call sites.
public static unsafe void ReadVector(IntBlock block, BufferedReader reader, int[] buffer)
{
    // Build first unadjusted vector and per-vector increment
    Vector128<int> unadjusted = SetIncrement(block.Base, block.Slope);
    Vector128<int> increment = Set1(block.Slope * 4);

    if (block.BitsPerAdjustment == 0)
    {
        // If no adjustments, calculate in blocks and return
        fixed(int *resultPtr = buffer)
        {
            for (int i = 0; i < block.Count; i += 4)
            {
                Unsafe.WriteUnaligned(&resultPtr[i], unadjusted);
                unadjusted = Sse2.Add(unadjusted, increment);
            }
        }
        return;
    }

    fixed(byte *bufferPtr = reader.Buffer)
    fixed(int *resultPtr = buffer)
    fixed(sbyte *shuffleMaskPtr = ShuffleMasks)
    fixed(int *multiplyMaskPtr = MultiplyMasks)
    {
        byte bitsPerAdjustment = block.BitsPerAdjustment;
        int index = reader.Index;
        int count = block.Count;

        // Calculate bytes consumed for the first and second four ints decoded (different for odd bit lengths).
        // Eight values of N bits each occupy exactly N bytes (8 * N / 8).
        byte bytesPerEight = bitsPerAdjustment;
        byte bytes1 = (byte)(bytesPerEight / 2);

        // Calculate how much to shift values (from top of each int to bottom)
        byte shiftRightBits = (byte)(32 - bitsPerAdjustment);

        // Get shuffle mask (to get correct bits) and multiply value (to shift to top of each int) for halves.
        // Masks are looked up per bit-width from the class's precomputed tables.
        Vector128<sbyte> shuffle1 = Unsafe.ReadUnaligned<Vector128<sbyte>>(&shuffleMaskPtr[32 * bitsPerAdjustment]);
        Vector128<int> multiply1 = Unsafe.ReadUnaligned<Vector128<int>>(&multiplyMaskPtr[8 * bitsPerAdjustment]);
        Vector128<sbyte> shuffle2 = Unsafe.ReadUnaligned<Vector128<sbyte>>(&shuffleMaskPtr[32 * bitsPerAdjustment + 16]);
        Vector128<int> multiply2 = Unsafe.ReadUnaligned<Vector128<int>>(&multiplyMaskPtr[8 * bitsPerAdjustment + 4]);

        for (int i = 0; i < count; i += 8, index += bytesPerEight)
        {
            // Read source bytes
            Vector128<int> vector1 = Unsafe.ReadUnaligned<Vector128<int>>(&bufferPtr[index]);
            Vector128<int> vector2 = Unsafe.ReadUnaligned<Vector128<int>>(&bufferPtr[index + bytes1]);

            // Shuffle to get the right bytes in each integer
            vector1 = (Ssse3.Shuffle(vector1.AsSByte(), shuffle1).AsInt32());
            vector2 = (Ssse3.Shuffle(vector2.AsSByte(), shuffle2).AsInt32());

            // Multiply to shift each int so the desired bits are at the top
            vector1 = Sse41.MultiplyLow(vector1, multiply1);
            vector2 = Sse41.MultiplyLow(vector2, multiply2);

            // Shift the desired bits to the bottom and zero the top
            vector1 = Sse2.ShiftRightLogical(vector1, shiftRightBits);
            vector2 = Sse2.ShiftRightLogical(vector2, shiftRightBits);

            // Add the delta base value (advancing the running base after each group of four)
            vector1 = Sse2.Add(vector1, unadjusted);
            unadjusted = Sse2.Add(unadjusted, increment);
            vector2 = Sse2.Add(vector2, unadjusted);
            unadjusted = Sse2.Add(unadjusted, increment);

            // Write the decoded integers
            Unsafe.WriteUnaligned(&resultPtr[i], vector1);
            Unsafe.WriteUnaligned(&resultPtr[i + 4], vector2);
        }

        // Publish how many source bytes were consumed.
        reader.Index = index;
    }
}
// SSE2/SSSE3-accelerated Adler-32 update: folds 'len' bytes of 'buf' into
// the two 16-bit running sums (the structure mirrors zlib's vectorized
// adler32, 32-byte blocks with periodic modular reduction).
internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
{
    uint s1 = sum1;
    uint s2 = sum2;
    int bufPos = 0;

    /*
     * Process the data in blocks.
     */
    uint BLOCK_SIZE = 1 << 5;
    uint blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks != 0)
    {
        uint n = Adler32Context.NMAX / BLOCK_SIZE; /* The NMAX constraint. */
        if (n > blocks)
        {
            n = blocks;
        }
        blocks -= n;

        // Per-byte weights for s2: byte k of a 32-byte block contributes
        // (32 - k) times its value.
        Vector128<byte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).AsByte();
        Vector128<byte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsByte();
        Vector128<byte> zero = Vector128.Create(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte();
        Vector128<short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
        Vector128<uint> v_ps = Vector128.Create(s1 * n, 0, 0, 0);
        Vector128<uint> v_s2 = Vector128.Create(s2, 0, 0, 0);
        Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

        do
        {
            /*
             * Load 32 input bytes.
             */
            Vector128<uint> bytes1 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos), BitConverter.ToUInt32(buf, bufPos + 4), BitConverter.ToUInt32(buf, bufPos + 8), BitConverter.ToUInt32(buf, bufPos + 12));
            bufPos += 16;
            Vector128<uint> bytes2 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos), BitConverter.ToUInt32(buf, bufPos + 4), BitConverter.ToUInt32(buf, bufPos + 8), BitConverter.ToUInt32(buf, bufPos + 12));
            bufPos += 16;

            /*
             * Add previous block byte sum to v_ps.
             */
            v_ps = Sse2.Add(v_ps, v_s1);

            /*
             * Horizontally add the bytes for s1, multiply-adds the
             * bytes by [ 32, 31, 30, ... ] for s2.
             */
            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1.AsByte(), zero).AsUInt32());
            Vector128<short> mad1 = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes1.AsByte(), tap1.AsSByte());
            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1.AsInt16(), ones.AsInt16()).AsUInt32());
            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2.AsByte(), zero).AsUInt32());
            Vector128<short> mad2 = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes2.AsByte(), tap2.AsSByte());
            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2.AsInt16(), ones.AsInt16()).AsUInt32());
        } while(--n != 0);

        // s2 also accumulates BLOCK_SIZE * (running s1 prefix sums); << 5 is * 32.
        v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

        /*
         * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
         * Shuffle immediate 177 (0xB1) swaps adjacent 32-bit lanes; 78 (0x4E)
         * swaps the 64-bit halves — two adds leave the horizontal sum in lane 0.
         */
        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 177));
        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 78));
        s1 += (uint)Sse2.ConvertToInt32(v_s1.AsInt32());
        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 177));
        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 78));
        // '=' (not '+=') is correct: the old s2 was already seeded into v_s2 above.
        s2 = (uint)Sse2.ConvertToInt32(v_s2.AsInt32());

        /*
         * Reduce.
         */
        s1 %= Adler32Context.ADLER_MODULE;
        s2 %= Adler32Context.ADLER_MODULE;
    }

    /*
     * Handle leftover data.
     */
    if (len != 0)
    {
        // len is now < 32; peel 16 bytes unrolled, then finish byte by byte.
        if (len >= 16)
        {
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            len -= 16;
        }

        while (len-- != 0)
        {
            s2 += s1 += buf[bufPos++];
        }

        if (s1 >= Adler32Context.ADLER_MODULE)
        {
            s1 -= Adler32Context.ADLER_MODULE;
        }
        s2 %= Adler32Context.ADLER_MODULE;
    }

    /*
     * Return the recombined sums.
     */
    sum1 = (ushort)(s1 & 0xFFFF);
    sum2 = (ushort)(s2 & 0xFFFF);
}