/// <summary>
/// Computes one <see cref="Vector128{T}"/>-wide section of a cost diagonal and stores it
/// back into the first diagonal buffer.
/// </summary>
/// <typeparam name="T">
/// Element type of the diagonal buffers. Only <c>int</c> and <c>ushort</c> are handled;
/// for any other type the method does nothing.
/// </typeparam>
/// <param name="refDiag1Ptr">Diagonal buffer that supplies the substitution costs and receives the result.</param>
/// <param name="refDiag2Ptr">Diagonal buffer that supplies the insert and delete costs.</param>
/// <param name="sourcePtr">Source characters; read as 16-bit code units ending just before <paramref name="rowIndex"/>.</param>
/// <param name="targetPtr">Target characters; read as 16-bit code units starting at <c>columnIndex - 1</c> and then lane-reversed.</param>
/// <param name="rowIndex">Current row position. Passed by reference, but this method only reads it.</param>
/// <param name="columnIndex">Current column position.</param>
public static unsafe void CalculateDiagonalSection_Sse41<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex)
    where T : struct
{
    if (typeof(T) == typeof(int))
    {
        // Lane count is only queried inside the typed branches so unsupported T stays a no-op.
        int lanes = Vector128<T>.Count;
        int* diag1Row = (int*)refDiag1Ptr + rowIndex;
        int* diag2Row = (int*)refDiag2Ptr + rowIndex;

        // Widen four UTF-16 code units from each string to 32-bit lanes.
        Vector128<int> sourceChars = Sse41.ConvertToVector128Int32((ushort*)sourcePtr + rowIndex - lanes);
        Vector128<int> targetChars = Sse41.ConvertToVector128Int32((ushort*)targetPtr + columnIndex - 1);

        // 0x1b selects lanes 3,2,1,0: the target characters are compared in reverse order.
        targetChars = Sse2.Shuffle(targetChars, 0x1b);

        // The equality mask is added as-is to the previous diagonal value
        // (all bits set, i.e. -1, on matching lanes), which cancels the +1 applied below.
        Vector128<int> matchMask = Sse2.CompareEqual(sourceChars, targetChars);
        Vector128<int> substitution = Sse2.Add(Sse3.LoadDquVector128(diag1Row - lanes), matchMask);

        Vector128<int> deletion = Sse3.LoadDquVector128(diag2Row - (lanes - 1));
        Vector128<int> insertion = Sse3.LoadDquVector128(diag2Row - lanes);

        Vector128<int> best = Sse41.Min(Sse41.Min(insertion, deletion), substitution);
        best = Sse2.Add(best, Vector128.Create(1));

        Sse2.Store(diag1Row - (lanes - 1), best);
    }
    else if (typeof(T) == typeof(ushort))
    {
        int lanes = Vector128<T>.Count;
        ushort* diag1Row = (ushort*)refDiag1Ptr + rowIndex;
        ushort* diag2Row = (ushort*)refDiag2Ptr + rowIndex;

        // Eight UTF-16 code units are used directly as 16-bit lanes.
        Vector128<ushort> sourceChars = Sse3.LoadDquVector128((ushort*)sourcePtr + rowIndex - lanes);
        Vector128<ushort> targetChars = Sse3.LoadDquVector128((ushort*)targetPtr + columnIndex - 1);

        // Byte shuffle with the shared mask; going by its name it reverses the 16-bit lanes.
        targetChars = Ssse3.Shuffle(targetChars.AsByte(), REVERSE_USHORT_AS_BYTE_128).AsUInt16();

        Vector128<ushort> matchMask = Sse2.CompareEqual(sourceChars, targetChars);
        Vector128<ushort> substitution = Sse2.Add(Sse3.LoadDquVector128(diag1Row - lanes), matchMask);

        Vector128<ushort> deletion = Sse3.LoadDquVector128(diag2Row - (lanes - 1));
        Vector128<ushort> insertion = Sse3.LoadDquVector128(diag2Row - lanes);

        Vector128<ushort> best = Sse41.Min(Sse41.Min(insertion, deletion), substitution);
        best = Sse2.Add(best, Vector128.Create((ushort)1));

        Sse2.Store(diag1Row - (lanes - 1), best);
    }
}
/// <summary>
/// Folds <paramref name="buffer"/> into a running Adler-32 style checksum using SSE intrinsics.
/// </summary>
/// <param name="adler">Running checksum: s1 is taken from the low 16 bits, s2 from the high 16 bits.</param>
/// <param name="buffer">Bytes to add to the checksum. May be empty, in which case the two halves are returned re-packed.</param>
/// <returns>The updated checksum packed as s1 in the low half and s2 shifted into the high half.</returns>
/// <remarks>
/// Whole 32-byte blocks are processed with vector instructions; the remaining (fewer than 32)
/// bytes are handled by the scalar tail at the end. The caller is expected to have checked that
/// the required instruction sets are available; no check is made here.
/// </remarks>
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in blocks.
    const int BLOCK_SIZE = 1 << 5;

    uint length = (uint)buffer.Length;
    uint blocks = length / BLOCK_SIZE;

    // After this, length is the number of tail bytes left over once all whole blocks are consumed.
    length -= blocks * BLOCK_SIZE;

    fixed (byte* bufferPtr = buffer)
    fixed (byte* tapPtr = Tap1Tap2)
    {
        var localBufferPtr = bufferPtr;

        // _mm_setr_epi8 on x86
        Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
        Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
        Vector128<byte> zero = Vector128<byte>.Zero;
        var ones = Vector128.Create((short)1);

        while (blocks > 0)
        {
            uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
            Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
            Vector128<uint> v_s1 = Vector128<uint>.Zero;

            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                localBufferPtr += BLOCK_SIZE;
            }
            while (--n > 0);

            // Shift by 5 multiplies the accumulated prefix sums by the 32-byte block size.
            v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            const byte S2301 = 0b1011_0001; // A B C D -> B A D C
            const byte S1032 = 0b0100_1110; // A B C D -> C D A B

            // NOTE(review): only the half-swap is applied to v_s1, presumably because
            // SumAbsoluteDifferences leaves one sum per 64-bit half - confirm against the intrinsic docs.
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

            s1 += v_s1.ToScalar();

            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

            s2 = v_s2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (length > 0)
        {
            // Scalar tail: an unrolled run of 16 bytes first, then whatever is left.
            if (length >= 16)
            {
                s2 += s1 += localBufferPtr[0];
                s2 += s1 += localBufferPtr[1];
                s2 += s1 += localBufferPtr[2];
                s2 += s1 += localBufferPtr[3];
                s2 += s1 += localBufferPtr[4];
                s2 += s1 += localBufferPtr[5];
                s2 += s1 += localBufferPtr[6];
                s2 += s1 += localBufferPtr[7];
                s2 += s1 += localBufferPtr[8];
                s2 += s1 += localBufferPtr[9];
                s2 += s1 += localBufferPtr[10];
                s2 += s1 += localBufferPtr[11];
                s2 += s1 += localBufferPtr[12];
                s2 += s1 += localBufferPtr[13];
                s2 += s1 += localBufferPtr[14];
                s2 += s1 += localBufferPtr[15];

                localBufferPtr += 16;
                length -= 16;
            }

            while (length-- > 0)
            {
                s2 += s1 += *localBufferPtr++;
            }

            // s1 is reduced with a single conditional subtraction here; s2 gets a full modulo.
            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
}
/// <summary>
/// Checks <c>Sse3.LoadDquVector128</c> for each integer element type by loading a vector from
/// a test table's input, writing it to the table's output and asking the table to compare the two.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
/// <returns><c>Pass</c> when SSE3 is unavailable or every check succeeds; otherwise <c>Fail</c>.</returns>
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    // Nothing to exercise on hardware without SSE3.
    if (!Sse3.IsSupported)
    {
        return testResult;
    }

    using (TestTable<int> table = new TestTable<int>(new int[] { 1, -5, 100, 0 }, new int[4]))
    {
        var loaded = Sse3.LoadDquVector128((int*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on int:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<uint> table = new TestTable<uint>(new uint[] { 1, 5, 100, 0 }, new uint[4]))
    {
        var loaded = Sse3.LoadDquVector128((uint*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on uint:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<long> table = new TestTable<long>(new long[] { 1, -5 }, new long[2]))
    {
        var loaded = Sse3.LoadDquVector128((long*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on long:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<ulong> table = new TestTable<ulong>(new ulong[] { 1, 5 }, new ulong[2]))
    {
        var loaded = Sse3.LoadDquVector128((ulong*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on ulong:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<short> table = new TestTable<short>(new short[] { 1, -5, 100, 0, 1, 2, 3, 4 }, new short[8]))
    {
        var loaded = Sse3.LoadDquVector128((short*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on short:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<ushort> table = new TestTable<ushort>(new ushort[] { 1, 5, 100, 0, 1, 2, 3, 4 }, new ushort[8]))
    {
        var loaded = Sse3.LoadDquVector128((ushort*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on ushort:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<byte> table = new TestTable<byte>(new byte[] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new byte[16]))
    {
        var loaded = Sse3.LoadDquVector128((byte*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on byte:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    using (TestTable<sbyte> table = new TestTable<sbyte>(new sbyte[] { 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4 }, new sbyte[16]))
    {
        var loaded = Sse3.LoadDquVector128((sbyte*)table.inArrayPtr);
        Unsafe.Write(table.outArrayPtr, loaded);

        if (!table.CheckResult((a, b) => a == b))
        {
            Console.WriteLine("Sse3 LoadDquVector128 failed on sbyte:");
            foreach (var value in table.outArray)
            {
                Console.Write(value + ", ");
            }

            Console.WriteLine();
            testResult = Fail;
        }
    }

    return testResult;
}
/// <summary>
/// Loads 128 bits from <paramref name="address"/> by forwarding to <c>Sse3.LoadDquVector128</c>.
/// The method is named after the corresponding C intrinsic.
/// </summary>
/// <param name="address">Pointer to the first of the sixteen signed bytes to load.</param>
/// <returns>The loaded vector.</returns>
public static unsafe Vector128<sbyte> _mm_lddqu_si128(sbyte* address) => Sse3.LoadDquVector128(address);
// Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c #if !NETSTANDARD2_0 && !NETSTANDARD2_1 private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer) { uint s1 = adler & 0xFFFF; uint s2 = (adler >> 16) & 0xFFFF; // Process the data in blocks. const int BLOCK_SIZE = 1 << 5; uint length = (uint)buffer.Length; uint blocks = length / BLOCK_SIZE; length -= blocks * BLOCK_SIZE; int index = 0; fixed(byte *bufferPtr = &buffer[0]) { index += (int)blocks * BLOCK_SIZE; var localBufferPtr = bufferPtr; // _mm_setr_epi8 on x86 var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); Vector128 <byte> zero = Vector128 <byte> .Zero; var ones = Vector128.Create((short)1); while (blocks > 0) { uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ if (n > blocks) { n = blocks; } blocks -= n; // Process n blocks of data. At most NMAX data bytes can be // processed before s2 must be reduced modulo BASE. Vector128 <int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32(); Vector128 <int> v_s2 = Vector128.CreateScalar(s2).AsInt32(); Vector128 <int> v_s1 = Vector128 <int> .Zero; do { // Load 32 input bytes. Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr); Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16); // Add previous block byte sum to v_ps. v_ps = Sse2.Add(v_ps, v_s1); // Horizontally add the bytes for s1, multiply-adds the // bytes by [ 32, 31, 30, ... ] for s2. 
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32()); Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones)); v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32()); Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones)); localBufferPtr += BLOCK_SIZE; }while (--n > 0); v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5)); // Sum epi32 ints v_s1(s2) and accumulate in s1(s2). const byte S2301 = 0b1011_0001; // A B C D -> B A D C const byte S1032 = 0b0100_1110; // A B C D -> C D A B v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301)); v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032)); s1 += (uint)v_s1.ToScalar(); v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301)); v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032)); s2 = (uint)v_s2.ToScalar(); // Reduce. s1 %= BASE; s2 %= BASE; } } ref byte bufferRef = ref MemoryMarshal.GetReference(buffer);
/// <summary>
/// Folds <paramref name="buffer"/> into a running Adler-32 style checksum using SSE intrinsics.
/// </summary>
/// <param name="adler">Running checksum: s1 is taken from the low 16 bits, s2 from the high 16 bits.</param>
/// <param name="buffer">Bytes to add to the checksum.</param>
/// <returns>The updated checksum packed as s1 in the low half and s2 shifted into the high half.</returns>
/// <remarks>
/// Whole blocks are consumed by the vector loop; any remaining bytes are passed to
/// <c>HandleLeftOver</c>. No instruction-set availability check is made here.
/// </remarks>
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
{
    // Shuffle controls used for the horizontal reductions.
    const byte SwapAdjacent = 0b1011_0001; // A B C D -> B A D C
    const byte SwapHalves = 0b0100_1110;   // A B C D -> C D A B

    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Split the input into whole blocks plus a short tail.
    uint totalBytes = (uint)buffer.Length;
    uint blocksLeft = totalBytes / BlockSize;
    uint tailBytes = totalBytes - (blocksLeft * BlockSize);

    fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
    fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2))
    {
        byte* cursor = bufferPtr;

        // _mm_setr_epi8 on x86
        Vector128<sbyte> tapFirst = Sse2.LoadVector128((sbyte*)tapPtr);
        Vector128<sbyte> tapSecond = Sse2.LoadVector128((sbyte*)(tapPtr + 16));
        Vector128<byte> zero = Vector128<byte>.Zero;
        Vector128<short> ones = Vector128.Create((short)1);

        while (blocksLeft > 0)
        {
            // The NMAX constraint: at most NMAX data bytes can be processed
            // before s2 must be reduced modulo BASE.
            uint n = NMAX / BlockSize;
            if (n > blocksLeft)
            {
                n = blocksLeft;
            }

            blocksLeft -= n;

            // Accumulators for this run of n blocks.
            Vector128<uint> prefix = Vector128.CreateScalar(s1 * n);
            Vector128<uint> accS2 = Vector128.CreateScalar(s2);
            Vector128<uint> accS1 = Vector128<uint>.Zero;

            do
            {
                // Load 32 input bytes as two 16-byte halves.
                Vector128<byte> first = Sse3.LoadDquVector128(cursor);
                Vector128<byte> second = Sse3.LoadDquVector128(cursor + 16);

                // Add the byte sum of the previous blocks to the prefix accumulator.
                prefix = Sse2.Add(prefix, accS1);

                // Horizontally add the bytes for s1, multiply-add the
                // bytes by [ 32, 31, 30, ... ] for s2.
                Vector128<uint> firstSum = Sse2.SumAbsoluteDifferences(first, zero).AsUInt32();
                accS1 = Sse2.Add(accS1, firstSum);
                Vector128<short> firstMad = Ssse3.MultiplyAddAdjacent(first, tapFirst);
                accS2 = Sse2.Add(accS2, Sse2.MultiplyAddAdjacent(firstMad, ones).AsUInt32());

                Vector128<uint> secondSum = Sse2.SumAbsoluteDifferences(second, zero).AsUInt32();
                accS1 = Sse2.Add(accS1, secondSum);
                Vector128<short> secondMad = Ssse3.MultiplyAddAdjacent(second, tapSecond);
                accS2 = Sse2.Add(accS2, Sse2.MultiplyAddAdjacent(secondMad, ones).AsUInt32());

                cursor += BlockSize;
            }
            while (--n > 0);

            // Scale the prefix sums by 32 (shift by 5) and fold them into s2's accumulator.
            accS2 = Sse2.Add(accS2, Sse2.ShiftLeftLogical(prefix, 5));

            // Sum the 32-bit lanes of each accumulator and fold into s1 / s2.
            accS1 = Sse2.Add(accS1, Sse2.Shuffle(accS1, SwapHalves));
            s1 += accS1.ToScalar();

            accS2 = Sse2.Add(accS2, Sse2.Shuffle(accS2, SwapAdjacent));
            accS2 = Sse2.Add(accS2, Sse2.Shuffle(accS2, SwapHalves));
            s2 = accS2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (tailBytes > 0)
        {
            HandleLeftOver(cursor, tailBytes, ref s1, ref s2);
        }

        return s1 | (s2 << 16);
    }
}