示例#1
0
        /// <summary>
        /// Computes one vector-wide section of a cost diagonal with SSE4.1 and stores it into the first diagonal buffer.
        /// </summary>
        /// <remarks>
        /// For each lane the stored value is <c>min(insert, delete, substitution) + 1</c>, where the substitution
        /// cost is the first-diagonal value reduced by one when the source character equals the (reversed) target
        /// character. Only <c>int</c> and <c>ushort</c> are handled; for any other <typeparamref name="T"/> the
        /// method does nothing.
        /// </remarks>
        /// <typeparam name="T">Element type of both diagonal buffers.</typeparam>
        /// <param name="refDiag1Ptr">First diagonal buffer; read for the substitution cost and written with the result.</param>
        /// <param name="refDiag2Ptr">Second diagonal buffer; read for the insert and delete costs.</param>
        /// <param name="sourcePtr">Source characters, addressed relative to <paramref name="rowIndex"/>.</param>
        /// <param name="targetPtr">Target characters, addressed relative to <paramref name="columnIndex"/>.</param>
        /// <param name="rowIndex">Row position; this method only reads it.</param>
        /// <param name="columnIndex">Column position.</param>
        public static unsafe void CalculateDiagonalSection_Sse41<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex) where T : struct
        {
            if (typeof(T) == typeof(int))
            {
                // Shuffle control that reverses the order of the four 32-bit lanes.
                const byte ReverseInt32Lanes = 0x1b;

                // Every access below is relative to these "rowIndex - Count" window starts.
                int* diag1Window = (int*)refDiag1Ptr + rowIndex - Vector128<T>.Count;
                int* diag2Window = (int*)refDiag2Ptr + rowIndex - Vector128<T>.Count;
                ushort* sourceWindow = (ushort*)sourcePtr + rowIndex - Vector128<T>.Count;
                ushort* targetWindow = (ushort*)targetPtr + columnIndex - 1;

                // Widen the 16-bit characters to 32-bit lanes; the target lanes are reversed before comparing.
                Vector128<int> sourceChars = Sse41.ConvertToVector128Int32(sourceWindow);
                Vector128<int> targetChars = Sse2.Shuffle(Sse41.ConvertToVector128Int32(targetWindow), ReverseInt32Lanes);

                // CompareEqual sets matching lanes to all ones (-1), so adding the mask subtracts one on a match.
                Vector128<int> matchMask = Sse2.CompareEqual(sourceChars, targetChars);
                Vector128<int> substitution = Sse2.Add(Sse3.LoadDquVector128(diag1Window), matchMask);

                Vector128<int> deletion = Sse3.LoadDquVector128(diag2Window + 1);
                Vector128<int> insertion = Sse3.LoadDquVector128(diag2Window);

                Vector128<int> cheapest = Sse41.Min(Sse41.Min(insertion, deletion), substitution);

                // The result lands one element past the window start of the first diagonal.
                Sse2.Store(diag1Window + 1, Sse2.Add(cheapest, Vector128.Create(1)));
                return;
            }

            if (typeof(T) == typeof(ushort))
            {
                // Same computation as the int path, on eight 16-bit lanes instead of four 32-bit lanes.
                ushort* diag1Window = (ushort*)refDiag1Ptr + rowIndex - Vector128<T>.Count;
                ushort* diag2Window = (ushort*)refDiag2Ptr + rowIndex - Vector128<T>.Count;
                ushort* sourceWindow = (ushort*)sourcePtr + rowIndex - Vector128<T>.Count;
                ushort* targetWindow = (ushort*)targetPtr + columnIndex - 1;

                Vector128<ushort> sourceChars = Sse3.LoadDquVector128(sourceWindow);

                // Byte shuffle with the REVERSE_USHORT_AS_BYTE_128 mask (defined elsewhere) reorders the target lanes.
                Vector128<ushort> targetChars = Ssse3.Shuffle(Sse3.LoadDquVector128(targetWindow).AsByte(), REVERSE_USHORT_AS_BYTE_128).AsUInt16();

                // Matching lanes become 0xFFFF, which acts as -1 under wrapping 16-bit addition.
                Vector128<ushort> matchMask = Sse2.CompareEqual(sourceChars, targetChars);
                Vector128<ushort> substitution = Sse2.Add(Sse3.LoadDquVector128(diag1Window), matchMask);

                Vector128<ushort> deletion = Sse3.LoadDquVector128(diag2Window + 1);
                Vector128<ushort> insertion = Sse3.LoadDquVector128(diag2Window);

                Vector128<ushort> cheapest = Sse41.Min(Sse41.Min(insertion, deletion), substitution);

                Sse2.Store(diag1Window + 1, Sse2.Add(cheapest, Vector128.Create((ushort)1)));
            }
        }
示例#2
0
        /// <summary>
        /// Folds <paramref name="buffer"/> into a running Adler-style checksum, using SSE intrinsics for whole
        /// 32-byte blocks and scalar code for the remaining bytes.
        /// </summary>
        /// <param name="adler">Current checksum; the low 16 bits seed <c>s1</c> and the high 16 bits seed <c>s2</c>.</param>
        /// <param name="buffer">The bytes to add to the checksum.</param>
        /// <returns>The updated checksum packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
        /// <remarks>
        /// NOTE(review): no <c>IsSupported</c> check is made here; presumably the caller verifies that the
        /// SSE2/SSE3/SSSE3 instruction sets are available - confirm.
        /// </remarks>
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
        {
            uint s1 = adler & 0xFFFF;
            uint s2 = (adler >> 16) & 0xFFFF;

            // Process the data in blocks.
            const int BLOCK_SIZE = 1 << 5;

            uint length = (uint)buffer.Length;
            uint blocks = length / BLOCK_SIZE;

            // From here on, length is the number of bytes left over after the whole blocks.
            length -= blocks * BLOCK_SIZE;

            fixed (byte* bufferPtr = buffer)
            fixed (byte* tapPtr = Tap1Tap2)
            {
                var localBufferPtr = bufferPtr;

                // _mm_setr_epi8 on x86
                // Two 16-byte multiplier vectors loaded from the first 32 bytes of Tap1Tap2 (defined elsewhere).
                Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
                Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
                Vector128<byte> zero = Vector128<byte>.Zero;
                var ones = Vector128.Create((short)1);

                while (blocks > 0)
                {
                    uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
                    Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
                    Vector128<uint> v_s1 = Vector128<uint>.Zero;

                    do
                    {
                        // Load 32 input bytes.
                        Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                        Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);

                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                        Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                        Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                        localBufferPtr += BLOCK_SIZE;
                    }
                    while (--n > 0);

                    // Each accumulated block sum in v_ps contributes BLOCK_SIZE (1 << 5) times to s2.
                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                    const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
                    const byte S1032 = 0b0100_1110;  // A B C D -> C D A B

                    // SumAbsoluteDifferences only fills the low bits of each 64-bit half, so the odd
                    // 32-bit lanes of v_s1 are zero and a single fold of the two halves is enough.
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                    s1 += v_s1.ToScalar();

                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                    s2 = v_s2.ToScalar();

                    // Reduce.
                    s1 %= BASE;
                    s2 %= BASE;
                }

                // Scalar tail: fewer than BLOCK_SIZE bytes remain.
                if (length > 0)
                {
                    if (length >= 16)
                    {
                        // Manually unrolled 16-byte step.
                        s2 += s1 += localBufferPtr[0];
                        s2 += s1 += localBufferPtr[1];
                        s2 += s1 += localBufferPtr[2];
                        s2 += s1 += localBufferPtr[3];
                        s2 += s1 += localBufferPtr[4];
                        s2 += s1 += localBufferPtr[5];
                        s2 += s1 += localBufferPtr[6];
                        s2 += s1 += localBufferPtr[7];
                        s2 += s1 += localBufferPtr[8];
                        s2 += s1 += localBufferPtr[9];
                        s2 += s1 += localBufferPtr[10];
                        s2 += s1 += localBufferPtr[11];
                        s2 += s1 += localBufferPtr[12];
                        s2 += s1 += localBufferPtr[13];
                        s2 += s1 += localBufferPtr[14];
                        s2 += s1 += localBufferPtr[15];

                        localBufferPtr += 16;
                        length -= 16;
                    }

                    while (length-- > 0)
                    {
                        s2 += s1 += *localBufferPtr++;
                    }

                    // NOTE(review): a single conditional subtraction assumes s1 < 2 * BASE at this
                    // point - confirm against the value of BASE.
                    if (s1 >= BASE)
                    {
                        s1 -= BASE;
                    }

                    s2 %= BASE;
                }

                return s1 | (s2 << 16);
            }
        }
示例#3
0
        /// <summary>
        /// Checks that <c>Sse3.LoadDquVector128</c> round-trips data for every integer element type.
        /// </summary>
        /// <remarks>
        /// For each element type the input array of a <c>TestTable</c> is loaded as a vector, written to the
        /// table's output array, and the two arrays are compared element by element. When SSE3 is not
        /// supported no checks run and the initial result is returned.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <returns><c>Pass</c> when every comparison succeeds (or SSE3 is unsupported); otherwise <c>Fail</c>.</returns>
        static unsafe int Main(string[] args)
        {
            int testResult = Pass;

            if (!Sse3.IsSupported)
            {
                return testResult;
            }

            // 4 x Int32
            using (var table = new TestTable<int>(new int[] { 1, -5, 100, 0 }, new int[4]))
            {
                Vector128<int> loaded = Sse3.LoadDquVector128((int*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on int:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 4 x UInt32
            using (var table = new TestTable<uint>(new uint[] { 1, 5, 100, 0 }, new uint[4]))
            {
                Vector128<uint> loaded = Sse3.LoadDquVector128((uint*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on uint:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 2 x Int64
            using (var table = new TestTable<long>(new long[] { 1, -5 }, new long[2]))
            {
                Vector128<long> loaded = Sse3.LoadDquVector128((long*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on long:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 2 x UInt64
            using (var table = new TestTable<ulong>(new ulong[] { 1, 5 }, new ulong[2]))
            {
                Vector128<ulong> loaded = Sse3.LoadDquVector128((ulong*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on ulong:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 8 x Int16
            using (var table = new TestTable<short>(new short[] { 1, -5, 100, 0, 1, 2, 3, 4 }, new short[8]))
            {
                Vector128<short> loaded = Sse3.LoadDquVector128((short*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on short:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 8 x UInt16
            using (var table = new TestTable<ushort>(new ushort[] { 1, 5, 100, 0, 1, 2, 3, 4 }, new ushort[8]))
            {
                Vector128<ushort> loaded = Sse3.LoadDquVector128((ushort*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on ushort:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 16 x Byte
            using (var table = new TestTable<byte>(new byte[] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new byte[16]))
            {
                Vector128<byte> loaded = Sse3.LoadDquVector128((byte*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on byte:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            // 16 x SByte
            using (var table = new TestTable<sbyte>(new sbyte[] { 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4 }, new sbyte[16]))
            {
                Vector128<sbyte> loaded = Sse3.LoadDquVector128((sbyte*)table.inArrayPtr);
                Unsafe.Write(table.outArrayPtr, loaded);

                bool roundTripped = table.CheckResult((lhs, rhs) => lhs == rhs);
                if (!roundTripped)
                {
                    Console.WriteLine("Sse3 LoadDquVector128 failed on sbyte:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }
            }

            return testResult;
        }
示例#4
0
 /// <summary>
 /// Loads 128 bits (sixteen signed bytes) from <paramref name="address"/> by forwarding to
 /// <see cref="Sse3.LoadDquVector128(sbyte*)"/>; the method name mirrors the C intrinsic <c>_mm_lddqu_si128</c>.
 /// </summary>
 /// <param name="address">Pointer to the first of the sixteen bytes to load.</param>
 /// <returns>The loaded vector.</returns>
 public static unsafe Vector128<sbyte> _mm_lddqu_si128(sbyte* address)
     => Sse3.LoadDquVector128(address);
示例#5
0
        // Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
#if !NETSTANDARD2_0 && !NETSTANDARD2_1
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
        {
            uint s1 = adler & 0xFFFF;
            uint s2 = (adler >> 16) & 0xFFFF;

            // Process the data in blocks.
            const int BLOCK_SIZE = 1 << 5;

            uint length = (uint)buffer.Length;
            uint blocks = length / BLOCK_SIZE;

            length -= blocks * BLOCK_SIZE;

            int index = 0;

            fixed(byte *bufferPtr = &buffer[0])
            {
                index += (int)blocks * BLOCK_SIZE;
                var localBufferPtr = bufferPtr;

                // _mm_setr_epi8 on x86
                var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
                var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
                Vector128 <byte> zero = Vector128 <byte> .Zero;
                var ones = Vector128.Create((short)1);

                while (blocks > 0)
                {
                    uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    Vector128 <int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32();
                    Vector128 <int> v_s2 = Vector128.CreateScalar(s2).AsInt32();
                    Vector128 <int> v_s1 = Vector128 <int> .Zero;

                    do
                    {
                        // Load 32 input bytes.
                        Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                        Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16);

                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);

                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32());
                        Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones));

                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32());
                        Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones));

                        localBufferPtr += BLOCK_SIZE;
                    }while (--n > 0);

                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                    const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
                    const byte S1032 = 0b0100_1110;  // A B C D -> C D A B

                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                    s1 += (uint)v_s1.ToScalar();

                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                    s2 = (uint)v_s2.ToScalar();

                    // Reduce.
                    s1 %= BASE;
                    s2 %= BASE;
                }
            }

            ref byte bufferRef = ref MemoryMarshal.GetReference(buffer);
示例#6
0
        /// <summary>
        /// Folds <paramref name="buffer"/> into a running Adler-style checksum, vectorising whole blocks with
        /// SSE intrinsics and delegating any leftover bytes to <c>HandleLeftOver</c>.
        /// </summary>
        /// <param name="adler">Current checksum; the low 16 bits seed the first sum and the high 16 bits the second.</param>
        /// <param name="buffer">The bytes to add to the checksum.</param>
        /// <returns>The updated checksum, with the first sum in the low half and the second sum shifted into the high half.</returns>
        /// <remarks>
        /// NOTE(review): the two 16-byte loads per iteration and the shift by 5 assume <c>BlockSize</c> is 32 - confirm
        /// against its declaration.
        /// </remarks>
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
        {
            // Shuffle controls used to fold the four 32-bit lanes of an accumulator into lane 0.
            const byte SwapAdjacentLanes = 0b1011_0001;  // A B C D -> B A D C
            const byte SwapLaneHalves = 0b0100_1110;     // A B C D -> C D A B

            uint sumLow = adler & 0xFFFF;
            uint sumHigh = (adler >> 16) & 0xFFFF;

            // Split the input into whole blocks plus a tail shorter than one block.
            uint totalLength = (uint)buffer.Length;
            uint blocksLeft = totalLength / BlockSize;
            uint tailLength = totalLength - (blocksLeft * BlockSize);

            fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
            fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2))
            {
                byte* cursor = bufferPtr;

                // Per-byte multipliers for the second sum, loaded from Tap1Tap2 (defined elsewhere).
                Vector128<sbyte> weightsFirstHalf = Sse2.LoadVector128((sbyte*)tapPtr);
                Vector128<sbyte> weightsSecondHalf = Sse2.LoadVector128((sbyte*)(tapPtr + 16));
                Vector128<byte> allZero = Vector128<byte>.Zero;
                Vector128<short> allOnes = Vector128.Create((short)1);

                while (blocksLeft > 0)
                {
                    // Limit each batch so the sums can be reduced modulo BASE in time (the NMAX constraint).
                    uint batch = NMAX / BlockSize;
                    if (blocksLeft < batch)
                    {
                        batch = blocksLeft;
                    }

                    blocksLeft -= batch;

                    Vector128<uint> previousSums = Vector128.CreateScalar(sumLow * batch);
                    Vector128<uint> weightedSums = Vector128.CreateScalar(sumHigh);
                    Vector128<uint> byteSums = Vector128<uint>.Zero;

                    do
                    {
                        // Two unaligned 16-byte loads cover one block.
                        Vector128<byte> firstHalf = Sse3.LoadDquVector128(cursor);
                        Vector128<byte> secondHalf = Sse3.LoadDquVector128(cursor + 16);

                        // Carry the byte sum of all earlier blocks of this batch forward.
                        previousSums = Sse2.Add(previousSums, byteSums);

                        // First half: plain byte sum for the first checksum, weighted sum for the second.
                        Vector128<uint> firstHalfSum = Sse2.SumAbsoluteDifferences(firstHalf, allZero).AsUInt32();
                        byteSums = Sse2.Add(byteSums, firstHalfSum);
                        Vector128<short> firstHalfPairs = Ssse3.MultiplyAddAdjacent(firstHalf, weightsFirstHalf);
                        Vector128<uint> firstHalfWeighted = Sse2.MultiplyAddAdjacent(firstHalfPairs, allOnes).AsUInt32();
                        weightedSums = Sse2.Add(weightedSums, firstHalfWeighted);

                        // Second half: same two accumulations.
                        Vector128<uint> secondHalfSum = Sse2.SumAbsoluteDifferences(secondHalf, allZero).AsUInt32();
                        byteSums = Sse2.Add(byteSums, secondHalfSum);
                        Vector128<short> secondHalfPairs = Ssse3.MultiplyAddAdjacent(secondHalf, weightsSecondHalf);
                        Vector128<uint> secondHalfWeighted = Sse2.MultiplyAddAdjacent(secondHalfPairs, allOnes).AsUInt32();
                        weightedSums = Sse2.Add(weightedSums, secondHalfWeighted);

                        cursor += BlockSize;
                        batch--;
                    }
                    while (batch > 0);

                    // The carried sums count once per byte of a block, hence the multiply by 32 (shift by 5).
                    weightedSums = Sse2.Add(weightedSums, Sse2.ShiftLeftLogical(previousSums, 5));

                    // SumAbsoluteDifferences leaves the odd 32-bit lanes zero, so folding the two halves is enough.
                    byteSums = Sse2.Add(byteSums, Sse2.Shuffle(byteSums, SwapLaneHalves));
                    sumLow += byteSums.ToScalar();

                    // The weighted sums occupy all four lanes and need both folds.
                    weightedSums = Sse2.Add(weightedSums, Sse2.Shuffle(weightedSums, SwapAdjacentLanes));
                    weightedSums = Sse2.Add(weightedSums, Sse2.Shuffle(weightedSums, SwapLaneHalves));
                    sumHigh = weightedSums.ToScalar();

                    // Reduce both sums before the next batch.
                    sumLow %= BASE;
                    sumHigh %= BASE;
                }

                if (tailLength > 0)
                {
                    HandleLeftOver(cursor, tailLength, ref sumLow, ref sumHigh);
                }

                return sumLow | (sumHigh << 16);
            }
        }