public void RunClassFldScenario()
{
    // Exercise the intrinsic on this instance's vector fields and validate the stored output.
    var product = Ssse3.MultiplyAddAdjacent(_fld1, _fld2);

    Unsafe.Write(_dataTable.outArrayPtr, product);
    ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
}
public void RunStructFldScenario(SimpleBinaryOpTest__MultiplyAddAdjacentInt16 testClass)
{
    // Operands come from this struct; output and validation go through the owning test class.
    var value = Ssse3.MultiplyAddAdjacent(_fld1, _fld2);

    Unsafe.Write(testClass._dataTable.outArrayPtr, value);
    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
}
public void RunClassLclFldScenario()
{
    // Use the fields of a freshly constructed test instance as operands.
    var instance = new SimpleBinaryOpTest__MultiplyAddAdjacentInt16();
    var outcome = Ssse3.MultiplyAddAdjacent(instance._fld1, instance._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
}
public void RunStructLclFldScenario()
{
    // Operands come from a locally created TestStruct value.
    var localStruct = TestStruct.Create();
    var computed = Ssse3.MultiplyAddAdjacent(localStruct._fld1, localStruct._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, computed);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    // Instance-field operands, result written to the shared data table.
    var computed = Ssse3.MultiplyAddAdjacent(_fld1, _fld2);

    Unsafe.Write(_dataTable.outArrayPtr, computed);
    ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
}
public void RunLclVarScenario_UnsafeRead()
{
    // Read both operands into locals via unaligned reads, then run the intrinsic.
    var lhs = Unsafe.Read<Vector128<Byte>>(_dataTable.inArray1Ptr);
    var rhs = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray2Ptr);
    var outcome = Ssse3.MultiplyAddAdjacent(lhs, rhs);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
}
public void RunLclVarScenario_LoadAligned()
{
    // Bring both operands into locals through aligned vector loads.
    var lhs = Sse2.LoadAlignedVector128((Byte*)(_dataTable.inArray1Ptr));
    var rhs = Sse2.LoadAlignedVector128((SByte*)(_dataTable.inArray2Ptr));
    var outcome = Ssse3.MultiplyAddAdjacent(lhs, rhs);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
}
public void RunBasicScenario_UnsafeRead()
{
    // Feed the intrinsic directly from unaligned memory reads (operands hoisted for clarity;
    // evaluation order is unchanged).
    var first = Unsafe.Read<Vector128<Byte>>(_dataTable.inArray1Ptr);
    var second = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray2Ptr);
    var outcome = Ssse3.MultiplyAddAdjacent(first, second);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
public void RunBasicScenario_LoadAligned()
{
    // Same as the unsafe-read scenario but through aligned vector loads.
    var first = Sse2.LoadAlignedVector128((Byte*)(_dataTable.inArray1Ptr));
    var second = Sse2.LoadAlignedVector128((SByte*)(_dataTable.inArray2Ptr));
    var outcome = Ssse3.MultiplyAddAdjacent(first, second);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
public void RunClsVarScenario()
{
    // Operands are the class-level (static) vector fields.
    var outcome = Ssse3.MultiplyAddAdjacent(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    // A locally created TestStruct supplies both operands.
    var localStruct = TestStruct.Create();
    var outcome = Ssse3.MultiplyAddAdjacent(localStruct._fld1, localStruct._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
public void RunLclVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

    // Unaligned vector loads into locals, then the intrinsic.
    var lhs = Sse2.LoadVector128((Byte*)(_dataTable.inArray1Ptr));
    var rhs = Sse2.LoadVector128((SByte*)(_dataTable.inArray2Ptr));
    var outcome = Ssse3.MultiplyAddAdjacent(lhs, rhs);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
}
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Unsafe unaligned reads into locals, then the intrinsic.
    var firstOp = Unsafe.Read<Vector128<Byte>>(_dataTable.inArray1Ptr);
    var secondOp = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray2Ptr);
    var outcome = Ssse3.MultiplyAddAdjacent(firstOp, secondOp);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
public void RunBasicScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

    // Aligned loads feed the intrinsic (operands hoisted; evaluation order unchanged).
    var vec1 = Sse2.LoadAlignedVector128((Byte*)(_dataTable.inArray1Ptr));
    var vec2 = Sse2.LoadAlignedVector128((SByte*)(_dataTable.inArray2Ptr));
    var outcome = Ssse3.MultiplyAddAdjacent(vec1, vec2);

    Unsafe.Write(_dataTable.outArrayPtr, outcome);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
// SIMD parse of 8 ASCII decimal digits. Loads 16 bytes (so 8 lanes are wasted),
// subtracts '0' from every lane, then folds neighbouring digits together with
// successive multiply-add steps (weights x10, x100, x10000) until the full
// 8-digit value sits in the low 32-bit lane.
private static uint32_t parse_eight_digits_unrolled(bytechar *chars)
{
    Vector128<sbyte> zeroChar = Vector128.Create((bytechar)'0');
    Vector128<sbyte> digits = Sse2.Subtract(Sse2.LoadVector128(chars), zeroChar);
    // Pairs of digits -> 16-bit values (d0*10 + d1, ...).
    Vector128<short> pairSums = Ssse3.MultiplyAddAdjacent(digits.AsByte(), mul_1_10);
    // Pairs of pairs -> 32-bit 4-digit values.
    Vector128<int> quadSums = Sse2.MultiplyAddAdjacent(pairSums, mul_1_100);
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(quadSums, quadSums);
    // Combine the two 4-digit halves into the final 8-digit value.
    Vector128<int> total = Sse2.MultiplyAddAdjacent(packed.AsInt16(), mul_1_10000);
    // Only the low lane holds the sum of the first 8 digits; the rest is dropped.
    return Sse2.ConvertToUInt32(total.AsUInt32());
}
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    // Load the operands from the addresses of a local struct's fields.
    var localStruct = TestStruct.Create();
    var loadedResult = Ssse3.MultiplyAddAdjacent(
        Sse2.LoadVector128((Byte*)(&localStruct._fld1)),
        Sse2.LoadVector128((SByte*)(&localStruct._fld2))
    );

    Unsafe.Write(_dataTable.outArrayPtr, loadedResult);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
public void RunStructFldScenario_Load(SimpleBinaryOpTest__MultiplyAddAdjacentInt16 testClass)
{
    // Pin this struct's fields so they can be loaded through raw pointers.
    fixed (Vector128<Byte>* ptr1 = &_fld1)
    fixed (Vector128<SByte>* ptr2 = &_fld2)
    {
        var loaded = Ssse3.MultiplyAddAdjacent(
            Sse2.LoadVector128((Byte*)(ptr1)),
            Sse2.LoadVector128((SByte*)(ptr2))
        );

        Unsafe.Write(testClass._dataTable.outArrayPtr, loaded);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    // Pin the instance fields and load the operands through raw pointers.
    fixed (Vector128<Byte>* ptr1 = &_fld1)
    fixed (Vector128<SByte>* ptr2 = &_fld2)
    {
        var loaded = Ssse3.MultiplyAddAdjacent(
            Sse2.LoadVector128((Byte*)(ptr1)),
            Sse2.LoadVector128((SByte*)(ptr2))
        );

        Unsafe.Write(_dataTable.outArrayPtr, loaded);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer) { uint s1 = adler & 0xFFFF; uint s2 = (adler >> 16) & 0xFFFF; // Process the data in blocks. const int BLOCK_SIZE = 1 << 5; uint length = (uint)buffer.Length; uint blocks = length / BLOCK_SIZE; length -= blocks * BLOCK_SIZE; int index = 0; fixed(byte *bufferPtr = buffer) fixed(byte *tapPtr = Tap1Tap2) { index += (int)blocks * BLOCK_SIZE; var localBufferPtr = bufferPtr; // _mm_setr_epi8 on x86 Vector128 <sbyte> tap1 = Sse2.LoadVector128((sbyte *)tapPtr); Vector128 <sbyte> tap2 = Sse2.LoadVector128((sbyte *)(tapPtr + 0x10)); Vector128 <byte> zero = Vector128 <byte> .Zero; var ones = Vector128.Create((short)1); while (blocks > 0) { uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ if (n > blocks) { n = blocks; } blocks -= n; // Process n blocks of data. At most NMAX data bytes can be // processed before s2 must be reduced modulo BASE. Vector128 <uint> v_ps = Vector128.CreateScalar(s1 * n); Vector128 <uint> v_s2 = Vector128.CreateScalar(s2); Vector128 <uint> v_s1 = Vector128 <uint> .Zero; do { // Load 32 input bytes. Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr); Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10); // Add previous block byte sum to v_ps. v_ps = Sse2.Add(v_ps, v_s1); // Horizontally add the bytes for s1, multiply-adds the // bytes by [ 32, 31, 30, ... ] for s2. 
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32()); Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32()); v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32()); Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32()); localBufferPtr += BLOCK_SIZE; }while (--n > 0); v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5)); // Sum epi32 ints v_s1(s2) and accumulate in s1(s2). const byte S2301 = 0b1011_0001; // A B C D -> B A D C const byte S1032 = 0b0100_1110; // A B C D -> C D A B v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032)); s1 += v_s1.ToScalar(); v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301)); v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032)); s2 = v_s2.ToScalar(); // Reduce. s1 %= BASE; s2 %= BASE; } if (length > 0) { if (length >= 16) { s2 += s1 += localBufferPtr[0]; s2 += s1 += localBufferPtr[1]; s2 += s1 += localBufferPtr[2]; s2 += s1 += localBufferPtr[3]; s2 += s1 += localBufferPtr[4]; s2 += s1 += localBufferPtr[5]; s2 += s1 += localBufferPtr[6]; s2 += s1 += localBufferPtr[7]; s2 += s1 += localBufferPtr[8]; s2 += s1 += localBufferPtr[9]; s2 += s1 += localBufferPtr[10]; s2 += s1 += localBufferPtr[11]; s2 += s1 += localBufferPtr[12]; s2 += s1 += localBufferPtr[13]; s2 += s1 += localBufferPtr[14]; s2 += s1 += localBufferPtr[15]; localBufferPtr += 16; length -= 16; } while (length-- > 0) { s2 += s1 += *localBufferPtr++; } if (s1 >= BASE) { s1 -= BASE; } s2 %= BASE; } return(s1 | (s2 << 16)); } }
// Adler-32 core loop, 32-bit SSE variant. `s1`/`s2` are the running Adler sums;
// returns them recombined as (s1 | s2 << 16).
internal static unsafe uint GetSse(ReadOnlySpan<byte> buffer, uint s1, uint s2)
{
    uint len = (uint)buffer.Length;
    uint blocks = len / BLOCK_SIZE;
    len = len - blocks * BLOCK_SIZE; // scalar tail length

    // Multiply-add weights for the 32 byte positions within a block.
    Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    Vector128<byte> zero = Vector128<byte>.Zero;
    Vector128<short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

    fixed (byte* bufPtr = &MemoryMarshal.GetReference(buffer))
    {
        var buf = bufPtr;
        while (blocks != 0)
        {
            uint n = NMAX32 / BLOCK_SIZE;
            if (n > blocks)
            {
                n = blocks;
            }
            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.Create(0, 0, 0, s1 * n);
            Vector128<uint> v_s2 = Vector128.Create(0, 0, 0, s2);
            Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);
            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                Vector128<byte> bytes2 = Sse2.LoadVector128(&buf[16]);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                Vector128<ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                v_s1 = Sse2.Add(v_s1, sad1.AsUInt32());
                Vector128<short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                Vector128<int> mad12 = Sse2.MultiplyAddAdjacent(mad11, ones);
                v_s2 = Sse2.Add(v_s2, mad12.AsUInt32());

                Vector128<ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                v_s1 = Sse2.Add(v_s1, sad2.AsUInt32());
                Vector128<short> mad21 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                Vector128<int> mad22 = Sse2.MultiplyAddAdjacent(mad21, ones);
                v_s2 = Sse2.Add(v_s2, mad22.AsUInt32());

                buf += BLOCK_SIZE;
                n--;
            } while (n != 0);

            // v_ps * 32 (BLOCK_SIZE) is the s2 contribution of the s1 prefixes.
            var shift = Sse2.ShiftLeftLogical(v_ps, 5);
            v_s2 = Sse2.Add(v_s2, shift);

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            // Two shuffles broadcast the total of all four lanes into every lane.
            // A B C D -> B A D C
            const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;
            // A B C D -> C D A B
            const int S1032 = 1 << 6 | 0 << 4 | 3 << 2 | 2;
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += Sse2.ConvertToUInt32(v_s1);
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = Sse2.ConvertToUInt32(v_s2);

            s1 %= MOD32;
            s2 %= MOD32;
        }

        // Scalar tail (< BLOCK_SIZE bytes): unrolled by 16, then byte-by-byte.
        if (len > 0)
        {
            if (len >= 16)
            {
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                len -= 16;
            }

            while (len-- > 0)
            {
                s2 += (s1 += *buf++);
            }

            if (s1 >= MOD32)
            {
                s1 -= MOD32;
            }

            s2 %= MOD32;
        }

        return (s1 | (s2 << 16));
    }
}
// Adler-32 core loop, 64-bit-accumulator SSE variant: sums are kept in two
// 64-bit lanes so s2 can grow past 32 bits before the modulo reduction.
// Returns (s1 | s2 << 32).
internal unsafe static ulong GetSse(ReadOnlySpan<byte> buffer, ulong s1, ulong s2)
{
    uint len = (uint)buffer.Length;
    uint blocks = len / BLOCK_SIZE;
    len = len - blocks * BLOCK_SIZE; // scalar tail length

    // Multiply-add weights for the 32 byte positions within a block.
    Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    Vector128<byte> zero = Vector128<byte>.Zero;
    Vector128<short> onesShort = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);
    // NOTE(review): onesInt, shuffleMask2301 and shuffleMask1032 are never read
    // in this method — candidates for removal in a follow-up.
    Vector128<int> onesInt = Vector128.Create(1, 1, 1, 1);
    Vector128<byte> shuffleMask2301 = Vector128.Create((byte)4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
    Vector128<byte> shuffleMask1032 = Vector128.Create((byte)8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
    // Keeps int lanes 0 and 2, zeroing lanes 1 and 3 (PSHUFB indices >= 0x80 write zero),
    // so the two surviving 32-bit sums widen cleanly to the two 64-bit lanes.
    Vector128<byte> shuffleMaskTrim = Vector128.Create(0, 1, 2, 3, 255, 255, 255, 255, 8, 9, 10, 11, 255, 255, 255, 255);
    // A B C D -> B A D C
    const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;

    fixed (byte* bufPtr = &MemoryMarshal.GetReference(buffer))
    {
        var buf = bufPtr;
        while (blocks != 0)
        {
            uint n = NMAX64 / BLOCK_SIZE;
            if (n > blocks)
            {
                n = blocks;
            }
            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<ulong> v_ps = Vector128.Create(0, s1 * n);
            Vector128<ulong> v_s2 = Vector128.Create(0, s2);
            Vector128<ulong> v_s1 = Vector128.Create(0ul, 0);
            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                Vector128<byte> bytes2 = Sse2.LoadVector128(&buf[16]);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                Vector128<ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                v_s1 = Sse2.Add(v_s1, sad1.AsUInt64());
                Vector128<short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                Vector128<int> mad12 = Sse2.MultiplyAddAdjacent(mad11, onesShort);
                // Pair up the four int lanes, then trim to two zero-extended 64-bit lanes.
                var mad121 = Sse2.Add(mad12, Sse2.Shuffle(mad12, S2301));
                var madTrimmed1 = Ssse3.Shuffle(mad121.AsByte(), shuffleMaskTrim);
                var madTimmed1ULong = madTrimmed1.AsUInt64();
                v_s2 = Sse2.Add(v_s2, madTimmed1ULong);

                Vector128<ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                v_s1 = Sse2.Add(v_s1, sad2.AsUInt64());
                Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                Vector128<int> mad22 = Sse2.MultiplyAddAdjacent(mad2, onesShort);
                var mad221 = Sse2.Add(mad22, Sse2.Shuffle(mad22, S2301));
                var madTrimmed2 = Ssse3.Shuffle(mad221.AsByte(), shuffleMaskTrim);
                var madTimmed2ULong = madTrimmed2.AsUInt64();
                v_s2 = Sse2.Add(v_s2, madTimmed2ULong);

                buf += BLOCK_SIZE;
                n--;
            } while (n != 0);

            // v_ps * 32 (BLOCK_SIZE) is the s2 contribution of the s1 prefixes.
            var shifted = Sse2.ShiftLeftLogical(v_ps, 5);
            v_s2 = Sse2.Add(v_s2, shifted);

            // Fold the two 64-bit lanes into the scalar sums.
            s1 += v_s1.GetElement(0);
            s1 += v_s1.GetElement(1);
            s2 = v_s2.GetElement(0);
            s2 += v_s2.GetElement(1);

            s1 %= MOD64;
            s2 %= MOD64;
        }

        // Scalar tail (< BLOCK_SIZE bytes): unrolled by 16, then byte-by-byte.
        if (len > 0)
        {
            if (len >= 16)
            {
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                len -= 16;
            }

            while (len-- > 0)
            {
                s2 += (s1 += *buf++);
            }

            if (s1 >= MOD64)
            {
                s1 -= MOD64;
            }

            s2 %= MOD64;
        }

        return (s1 | (s2 << 32));
    }
}
// Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
#if !NETSTANDARD2_0 && !NETSTANDARD2_1
// SSE Adler-32 (vector phase). NOTE(review): this excerpt is truncated — the
// scalar tail handling and return statement continue beyond this chunk.
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in blocks of 32 bytes.
    const int BLOCK_SIZE = 1 << 5;

    uint length = (uint)buffer.Length;
    uint blocks = length / BLOCK_SIZE;
    length -= blocks * BLOCK_SIZE;

    int index = 0;

    // NOTE(review): `&buffer[0]` throws IndexOutOfRangeException for an empty
    // span — the sibling implementations use MemoryMarshal.GetReference instead;
    // confirm callers never pass an empty buffer.
    fixed (byte* bufferPtr = &buffer[0])
    {
        index += (int)blocks * BLOCK_SIZE;
        var localBufferPtr = bufferPtr;

        // _mm_setr_epi8 on x86: weights [32..17] and [16..1] for the multiply-adds.
        var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
        var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        Vector128<byte> zero = Vector128<byte>.Zero;
        var ones = Vector128.Create((short)1);

        while (blocks > 0)
        {
            uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32();
            Vector128<int> v_s2 = Vector128.CreateScalar(s2).AsInt32();
            Vector128<int> v_s1 = Vector128<int>.Zero;

            do
            {
                // Load 32 input bytes (LDDQU tolerates unaligned addresses).
                Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32());
                Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones));
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32());
                Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones));

                localBufferPtr += BLOCK_SIZE;
            }
            while (--n > 0);

            // v_ps * 32 (BLOCK_SIZE) is the s2 contribution of the s1 prefixes.
            v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            const byte S2301 = 0b1011_0001; // A B C D -> B A D C
            const byte S1032 = 0b0100_1110; // A B C D -> C D A B

            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += (uint)v_s1.ToScalar();

            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = (uint)v_s2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }
    }

    // Scalar tail begins here; remainder of the method is outside this excerpt.
    ref byte bufferRef = ref MemoryMarshal.GetReference(buffer);
/// <summary>
/// SSSE3 Version of Adler32
/// https://chromium.googlesource.com/chromium/src/third_party/zlib/+/master/adler32_simd.c
/// </summary>
/// <param name="adler">Running checksum: low 16 bits hold s1, high 16 bits hold s2.</param>
/// <param name="buff">Input bytes to fold into the checksum.</param>
/// <returns>The updated packed Adler-32 value (s1 | s2 &lt;&lt; 16).</returns>
private static unsafe uint HashSsse3(uint adler, ReadOnlySpan<byte> buff)
{
    fixed (byte* buffAddr = buff)
    {
        uint s1 = adler & 0xffff;
        uint s2 = adler >> 16;
        int dof = 0; // data offset into buff
        int len = buff.Length;
        int blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE; // scalar tail length

        while (blocks > 0)
        {
            uint n = NMAX / BLOCK_SIZE;
            if (n > blocks)
            {
                n = (uint)blocks;
            }
            blocks -= (int)n;

            Vector128<byte> zero = Vector128<byte>.Zero;
            Vector128<short> ones = Vector128.Create((short)1);

            // Process n blocks of data. At most NMAX data bytes can be processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.Create(0, 0, 0, s1 * n);
            Vector128<uint> v_s2 = Vector128.Create(0, 0, 0, s2);
            Vector128<uint> v_s1 = Vector128<uint>.Zero;
            do
            {
                // Load 32 input bytes.
                var bytes1 = Sse2.LoadVector128((buffAddr + dof));
                var bytes2 = Sse2.LoadVector128((buffAddr + dof) + 16);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the bytes by[32, 31, 30, ... ] for s2
                // (tap1/tap2 are the weight vectors declared elsewhere in this class).
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                var mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                dof += BLOCK_SIZE;
            } while (--n > 0);

            // v_ps * 32 (BLOCK_SIZE) is the s2 contribution of the s1 prefixes.
            v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            // Shuffling 2301 then 1032 achieves the same thing as described here:
            // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S23O1));
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1O32));
            s1 += Sse2.ConvertToUInt32(v_s1);
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S23O1));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1O32));
            s2 = Sse2.ConvertToUInt32(v_s2);

            // Reduce
            s1 %= BASE;
            s2 %= BASE;
        }

        // Handle leftover data
        if (len > 0)
        {
            // 16-byte chunks go through the Do helper, remaining bytes one at a time.
            while (len >= 16)
            {
                len -= 16;
                Do(ref s1, ref s2, buff, dof, 16);
                dof += 16;
            }

            while (len-- > 0)
            {
                s1 += buffAddr[dof++];
                s2 += s1;
            }

            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        /*
         * Return the recombined sums.
         */
        return (s1 | (s2 << 16));
    }
}
/// <summary>
/// Managed counterpart of the x86 intrinsic <c>_mm_maddubs_epi16</c> (PMADDUBSW):
/// multiplies each unsigned byte of <paramref name="left"/> by the corresponding
/// signed byte of <paramref name="right"/> and adds adjacent pairs into saturated
/// 16-bit lanes.
/// </summary>
public static Vector128<short> _mm_maddubs_epi16(Vector128<byte> left, Vector128<sbyte> right)
    => Ssse3.MultiplyAddAdjacent(left, right);
// Adler-32 over `buffer`, SSE-accelerated. `adler` packs the running sums
// (low 16 bits = s1, high 16 bits = s2); returns the updated packed checksum.
// Tail bytes (< BlockSize) are delegated to HandleLeftOver.
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in blocks.
    uint length = (uint)buffer.Length;
    uint blocks = length / BlockSize;
    length -= blocks * BlockSize;

    fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
    {
        fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2))
        {
            byte* localBufferPtr = bufferPtr;

            // _mm_setr_epi8 on x86: weight vectors loaded from the Tap1Tap2 table.
            Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
            Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
            Vector128<byte> zero = Vector128<byte>.Zero;
            var ones = Vector128.Create((short)1);

            while (blocks > 0)
            {
                uint n = NMAX / BlockSize; /* The NMAX constraint. */
                if (n > blocks)
                {
                    n = blocks;
                }

                blocks -= n;

                // Process n blocks of data. At most NMAX data bytes can be
                // processed before s2 must be reduced modulo BASE.
                Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
                Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
                Vector128<uint> v_s1 = Vector128<uint>.Zero;

                do
                {
                    // Load 32 input bytes (LDDQU tolerates unaligned addresses).
                    Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                    Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                    // Add previous block byte sum to v_ps.
                    v_ps = Sse2.Add(v_ps, v_s1);

                    // Horizontally add the bytes for s1, multiply-adds the
                    // bytes by [ 32, 31, 30, ... ] for s2.
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                    Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                    Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                    localBufferPtr += BlockSize;
                }
                while (--n > 0);

                // v_ps * 32 (BlockSize) is the s2 contribution of the s1 prefixes.
                v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                const byte S2301 = 0b1011_0001; // A B C D -> B A D C
                const byte S1032 = 0b0100_1110; // A B C D -> C D A B

                // PSADBW leaves its two sums in 32-bit lanes 0 and 2, so a single
                // S1032 shuffle fully reduces v_s1.
                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
                s1 += v_s1.ToScalar();

                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
                s2 = v_s2.ToScalar();

                // Reduce.
                s1 %= BASE;
                s2 %= BASE;
            }

            if (length > 0)
            {
                HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
            }

            return (s1 | (s2 << 16));
        }
    }
}