コード例 #1
0
ファイル: Blake2bSse4.cs プロジェクト: pangfd/Spreads
        /// <summary>
        /// Rotates rows 2, 3 and 4 of the 4x4 state (each row split into a low and a high
        /// 128-bit half holding two 64-bit words) by one, two and three word positions.
        /// </summary>
        /// <remarks>
        /// <paramref name="b0"/> is used as scratch space for the row-3 swap; on return it holds
        /// the value <paramref name="row3l"/> had on entry. Rows 1 are not touched.
        /// </remarks>
        private static void diagonalize(ref Vector128<ulong> row1l, ref Vector128<ulong> row2l, ref Vector128<ulong> row3l, ref Vector128<ulong> row4l,
                                        ref Vector128<ulong> row1h, ref Vector128<ulong> row2h, ref Vector128<ulong> row3h, ref Vector128<ulong> row4h, ref Vector128<ulong> b0)
        {
            // Row 2: capture both halves first, then write the two 8-byte-shifted concatenations.
            Vector128<sbyte> low  = row2l.AsSByte();
            Vector128<sbyte> high = row2h.AsSByte();
            Vector128<ulong> highThenLow = Ssse3.AlignRight(high, low, 8).AsUInt64();
            Vector128<ulong> lowThenHigh = Ssse3.AlignRight(low, high, 8).AsUInt64();

            row2l = highThenLow;
            row2h = lowThenHigh;

            // Row 3: swap the halves, using the caller-provided scratch vector.
            b0    = row3l;
            row3l = row3h;
            row3h = b0;

            // Row 4: same two shifted concatenations as row 2, but stored the other way round.
            low  = row4l.AsSByte();
            high = row4h.AsSByte();
            highThenLow = Ssse3.AlignRight(high, low, 8).AsUInt64();
            lowThenHigh = Ssse3.AlignRight(low, high, 8).AsUInt64();

            row4l = lowThenHigh;
            row4h = highThenLow;
        }
コード例 #2
0
ファイル: Vector128Helper.cs プロジェクト: jinqi166/Base64
        /// <summary>
        /// Reads 16 consecutive chars starting at <paramref name="src"/> and narrows them to
        /// 16 bytes using SSE2 unsigned saturation.
        /// </summary>
        /// <param name="src">Reference to the first of 16 readable chars.</param>
        /// <returns>The packed bytes, reinterpreted as signed bytes.</returns>
        public static Vector128<sbyte> ReadVector128(this ref char src)
        {
            // The second 128-bit load starts 8 chars after the first one.
            ref char secondHalf = ref Unsafe.Add(ref src, 8);

            Vector128<short> lowerChars = Unsafe.As<char, Vector128<short>>(ref src);
            Vector128<short> upperChars = Unsafe.As<char, Vector128<short>>(ref secondHalf);
            Vector128<byte>  narrowed   = Sse2.PackUnsignedSaturate(lowerChars, upperChars);

#if NETCOREAPP2_1
            return Sse.StaticCast<byte, sbyte>(narrowed);
#else
            return narrowed.AsSByte();
#endif
        }
コード例 #3
0
        /// <summary>
        /// Builds a 64-bit mask describing which bytes of <paramref name="value"/> have their
        /// most significant bit set.
        /// </summary>
        /// <param name="value">The 16 bytes to inspect.</param>
        /// <param name="bitMask128">Per-byte mask ANDed with the smeared sign bits before adjacent bytes are summed.</param>
        /// <returns>The low 64 bits of the pairwise-added, masked vector.</returns>
        /// <exception cref="PlatformNotSupportedException">
        /// AdvSimd.Arm64 is unavailable or the platform is big-endian.
        /// </exception>
        private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
        {
            if (!(AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
            {
                throw new PlatformNotSupportedException();
            }

            // An arithmetic shift by 7 turns every byte into 0xFF (MSB set) or 0x00 (MSB clear).
            Vector128<sbyte> smearedSign = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7);
            Vector128<byte>  flags       = AdvSimd.And(smearedSign.AsByte(), bitMask128);

            // Adding each byte to its neighbour folds the 16 flags into the low 8 bytes.
            Vector128<byte> folded = AdvSimd.Arm64.AddPairwise(flags, flags);
            return folded.AsUInt64().ToScalar();
        }
コード例 #4
0
        /// <summary>
        /// Collapses the "most significant bit is set" flag of each of the 16 bytes in
        /// <paramref name="value"/> into the low 16 bits of the result.
        /// </summary>
        /// <param name="value">The 16 bytes to inspect.</param>
        /// <returns>The first 16-bit lane of the folded vector, widened to <see cref="uint"/>.</returns>
        private static uint GetNonAsciiBytes(Vector128<byte> value)
        {
            Debug.Assert(AdvSimd.Arm64.IsSupported);

            // 0xFF where the byte's MSB is set, 0x00 otherwise, then masked with s_bitMask128.
            Vector128<byte> flags = AdvSimd.And(
                AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(),
                s_bitMask128);

            // Each self-pairwise add halves the number of meaningful bytes: 16 -> 8 -> 4 -> 2.
            for (int pass = 0; pass < 3; pass++)
            {
                flags = AdvSimd.Arm64.AddPairwise(flags, flags);
            }

            return flags.AsUInt16().ToScalar();
        }
コード例 #5
0
 /// <summary>
 /// Adds two 128-bit vectors element-wise, dispatching on <typeparamref name="T"/> to the
 /// matching SSE/SSE2 <c>Add</c> overload.
 /// </summary>
 /// <typeparam name="T">Element type: one of the 8/16/32/64-bit integer types, float or double.</typeparam>
 /// <param name="left">First operand.</param>
 /// <param name="right">Second operand.</param>
 /// <returns>The element-wise sum, reinterpreted back to <c>Vector128&lt;T&gt;</c>.</returns>
 /// <exception cref="NotSupportedException"><typeparamref name="T"/> is none of the handled element types.</exception>
 public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
 {
     // 8-bit lanes
     if (typeof(T) == typeof(byte))
     {
         return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
     }

     if (typeof(T) == typeof(sbyte))
     {
         return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
     }

     // 16-bit lanes
     if (typeof(T) == typeof(short))
     {
         return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
     }

     if (typeof(T) == typeof(ushort))
     {
         return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
     }

     // 32-bit lanes
     if (typeof(T) == typeof(int))
     {
         return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
     }

     if (typeof(T) == typeof(uint))
     {
         return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
     }

     // 64-bit lanes
     if (typeof(T) == typeof(long))
     {
         return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
     }

     if (typeof(T) == typeof(ulong))
     {
         return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
     }

     // Floating point: single precision goes through SSE, double precision through SSE2.
     if (typeof(T) == typeof(float))
     {
         return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
     }

     if (typeof(T) == typeof(double))
     {
         return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
     }

     throw new NotSupportedException();
 }
コード例 #6
0
        /// <summary>
        /// Computes an index from the position of the lowest byte in <paramref name="value"/>
        /// whose most significant bit is set.
        /// </summary>
        /// <param name="value">The 16 bytes to inspect.</param>
        /// <returns>
        /// The trailing-zero count of the collapsed 64-bit mask divided by four; this is 16
        /// when the mask is zero.
        /// </returns>
        /// <exception cref="PlatformNotSupportedException">
        /// AdvSimd.Arm64 is unavailable or the platform is big-endian.
        /// </exception>
        public static int GetIndexOfFirstNonAsciiByte(Vector128<byte> value)
        {
            if (!(AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
            {
                throw new PlatformNotSupportedException();
            }

            // Smear each byte's MSB across the byte, then keep only the bits selected by s_bitmask.
            // NOTE(review): s_bitmask is defined elsewhere; the ">> 2" below implies each source
            // byte ends up owning one nibble of the collapsed mask - confirm against its definition.
            Vector128<sbyte> smearedSign = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7);
            Vector128<byte>  flags       = AdvSimd.And(smearedSign.AsByte(), s_bitmask);

            // Pairwise add folds the 16 flag bytes into the low 64 bits.
            ulong mask = AdvSimd.Arm64.AddPairwise(flags, flags).AsUInt64().ToScalar();

            // Four mask bits per source byte, so divide the bit position by four.
            int index = BitOperations.TrailingZeroCount(mask) >> 2;

            Debug.Assert((mask == 0) ? index >= 16 : index < 16);
            return index;
        }
コード例 #7
0
        /// <summary>
        /// Scans <paramref name="lengthInChars"/> chars starting at <paramref name="pData"/> and returns
        /// the index of the first char that fails the <c>_allowedAsciiCodePoints</c> check, or
        /// <paramref name="lengthInChars"/> when every char passes.
        /// </summary>
        /// <param name="pData">Pointer to the first char; read with unaligned loads.</param>
        /// <param name="lengthInChars">Number of chars available at <paramref name="pData"/>.</param>
        /// <returns>Index (in chars) at which escaping must begin.</returns>
        private unsafe nuint GetIndexOfFirstCharToEncodeAdvSimd64(char *pData, nuint lengthInChars)
        {
            // See GetIndexOfFirstByteToEncodeAdvSimd64 for the central logic behind this method.
            // The main difference here is that we need to pack WORDs to BYTEs before performing
            // the main vectorized logic. It doesn't matter if we use signed or unsigned saturation
            // while packing, as saturation will convert out-of-range (non-ASCII char) WORDs to
            // 0x00 or 0x7F..0xFF, all of which are forbidden by the encoder.

            Debug.Assert(AdvSimd.Arm64.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);

            Vector128 <byte> vec0xF         = Vector128.Create((byte)0xF);
            Vector128 <byte> vecPowersOfTwo = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
            Vector128 <byte> vecPairwiseAddNibbleBitmask = Vector128.Create((ushort)0xF00F).AsByte(); // little endian only
            Vector128 <byte> allowedCodePoints           = _allowedAsciiCodePoints.AsVector;
            ulong            resultScalar;

            nuint i = 0;

            if (lengthInChars >= 16)
            {
                // Largest multiple of 16 that does not exceed lengthInChars; the loop stops once i reaches it.
                nuint lastLegalIterationFor16CharRead = lengthInChars & unchecked ((nuint)(nint) ~0xF);

                do
                {
                    // Read 16 chars at a time into 2x 128-bit vectors, then pack into a single 128-bit vector.
                    // We turn 16 chars (256 bits) into 16 nibbles (64 bits) during this process.

                    Vector128 <byte> packed = AdvSimd.ExtractNarrowingSaturateUnsignedUpper(
                        AdvSimd.ExtractNarrowingSaturateUnsignedLower(AdvSimd.LoadVector128((/* unaligned */ short *)(pData + i))),
                        AdvSimd.LoadVector128((/* unaligned */ short *)(pData + 8 + i)));
                    var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
                    var vecPowersOfTwoShuffled    = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
                    var result       = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                    var maskedResult = AdvSimd.And(result, vecPairwiseAddNibbleBitmask);
                    resultScalar = AdvSimd.Arm64.AddPairwise(maskedResult, maskedResult).AsUInt64().ToScalar();

                    // All 16 nibbles set means all 16 chars passed; anything else needs escaping.
                    if (resultScalar != ulong.MaxValue)
                    {
                        goto PairwiseAddMaskContainsDataWhichRequiresEscaping;
                    }
                } while ((i += 16) < lastLegalIterationFor16CharRead);
            }

            // The remaining 0..15 chars are handled by testing the individual bits of the length.

            if ((lengthInChars & 8) != 0)
            {
                // Read 8 chars at a time into a single 128-bit vector, then pack into a 64-bit
                // vector, then extend to 128 bits. We turn 8 chars (128 bits) into 8 bytes (64 bits)
                // during this process. Only the low 64 bits of the 'result' vector have meaningful
                // data.

                Vector128 <byte> packed       = AdvSimd.ExtractNarrowingSaturateUnsignedLower(AdvSimd.LoadVector128((/* unaligned */ short *)(pData + i))).AsByte().ToVector128Unsafe();
                var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
                var vecPowersOfTwoShuffled    = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
                var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                resultScalar = result.AsUInt64().ToScalar();

                if (resultScalar != ulong.MaxValue)
                {
                    goto MaskContainsDataWhichRequiresEscaping;
                }

                i += 8;
            }

            if ((lengthInChars & 4) != 0)
            {
                // Read 4 chars at a time into a single 64-bit vector, then pack into the low 32 bits
                // of a 128-bit vector. We turn 4 chars (64 bits) into 4 bytes (32 bits) during this
                // process. Only the low 32 bits of the 'result' vector have meaningful data.

                Vector128 <byte> packed       = AdvSimd.ExtractNarrowingSaturateUnsignedLower(AdvSimd.LoadVector64((/* unaligned */ short *)(pData + i)).ToVector128Unsafe()).ToVector128Unsafe();
                var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
                var vecPowersOfTwoShuffled    = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
                var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                resultScalar = result.AsUInt32().ToScalar(); // n.b. implicit conversion uint -> ulong; high 32 bits will be zeroed

                if (resultScalar != uint.MaxValue)
                {
                    goto MaskContainsDataWhichRequiresEscaping;
                }

                i += 4;
            }

            // Beyond this point, vectorization isn't worthwhile. Just do a normal loop.

            if ((lengthInChars & 3) != 0)
            {
                Debug.Assert(lengthInChars - i <= 3);

                do
                {
                    if (!_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
                    {
                        break;
                    }
                } while (++i != lengthInChars);
            }

Return:

            return(i);

PairwiseAddMaskContainsDataWhichRequiresEscaping:

            // Reached only from the 16-char loop, where each char owns one nibble of resultScalar.
            Debug.Assert(resultScalar != ulong.MaxValue);
            // Each nibble is 4 (1 << 2) bits, so we shr by 2 to account for per-nibble stride.
            i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 2; // location of lowest set bit is where we must begin escaping
            goto Return;

MaskContainsDataWhichRequiresEscaping:

            // Reached from the 8-char and 4-char paths, where each char owns one byte of resultScalar.
            Debug.Assert(resultScalar != ulong.MaxValue);
            // Each byte is 8 (1 << 3) bits, so we shr by 3 to account for per-byte stride.
            i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 3; // location of lowest set bit is where we must begin escaping
            goto Return;
        }
コード例 #8
0
        /// <summary>
        /// Scans <paramref name="lengthInBytes"/> bytes starting at <paramref name="pData"/> and returns
        /// the index of the first byte that fails the <c>_allowedAsciiCodePoints</c> check, or
        /// <paramref name="lengthInBytes"/> when every byte passes.
        /// </summary>
        /// <param name="pData">Pointer to the first byte; read with unaligned loads.</param>
        /// <param name="lengthInBytes">Number of bytes available at <paramref name="pData"/>.</param>
        /// <returns>Index (in bytes) at which escaping must begin.</returns>
        private unsafe nuint GetIndexOfFirstByteToEncodeAdvSimd64(byte *pData, nuint lengthInBytes)
        {
            Debug.Assert(AdvSimd.Arm64.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);

            Vector128 <byte> vec0xF         = Vector128.Create((byte)0xF);
            Vector128 <byte> vecPowersOfTwo = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
            Vector128 <byte> vecPairwiseAddNibbleBitmask = Vector128.Create((ushort)0xF00F).AsByte(); // little endian only
            Vector128 <byte> allowedCodePoints           = _allowedAsciiCodePoints.AsVector;
            ulong            resultScalar;

            nuint i = 0;

            if (lengthInBytes >= 16)
            {
                // Largest multiple of 16 that does not exceed lengthInBytes; the loop stops once i reaches it.
                nuint lastLegalIterationFor16CharRead = lengthInBytes & unchecked ((nuint)(nint) ~0xF);

                do
                {
                    // Read 16 bytes at a time into a single 128-bit vector.

                    Vector128 <byte> packed = AdvSimd.LoadVector128(pData + i); // unaligned read

                    // Each element of the packed vector corresponds to a byte of untrusted source data. It will
                    // have the format [ ..., 0xYZ, ... ]. We use the low nibble of each byte to index into
                    // the 'allowedCodePoints' vector, and we use the high nibble of each byte to select a bit
                    // from the corresponding element in the 'allowedCodePoints' vector.
                    //
                    // Example: let packed := [ ..., 0x6D ('m'), ... ]
                    // The final 'result' vector will contain a non-zero value in the corresponding space iff the
                    // 0xD element in the 'allowedCodePoints' vector has its 1 << 0x6 bit set.
                    //
                    // We rely on the fact that when we perform an arithmetic shift of vector values to get the
                    // high nibble into the low 4 bits, we'll smear the high (non-ASCII) bit, causing the vector
                    // element value to be in the range [ 128..255 ]. This causes the tbl lookup to return 0x00
                    // for that particular element in the 'vecPowersOfTwoShuffled' vector, meaning that escaping is required.

                    var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
                    var vecPowersOfTwoShuffled    = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
                    var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);

                    // Now, each element of 'result' contains 0xFF if the corresponding element in 'packed' is allowed;
                    // and it contains a zero value if the corresponding element in 'packed' is disallowed. We'll convert
                    // this into a vector where if 0xFF occurs in an even-numbered index, it gets converted to 0x0F; and
                    // if 0xFF occurs in an odd-numbered index, it gets converted to 0xF0. This allows us to collapse
                    // the Vector128<byte> to a 64-bit unsigned integer, where each of the 16 nibbles in the 64-bit integer
                    // corresponds to whether an element in the 'result' vector was originally 0xFF or 0x00.

                    var maskedResult = AdvSimd.And(result, vecPairwiseAddNibbleBitmask);
                    resultScalar = AdvSimd.Arm64.AddPairwise(maskedResult, maskedResult).AsUInt64().ToScalar();

                    // All 16 nibbles set means all 16 bytes passed; anything else needs escaping.
                    if (resultScalar != ulong.MaxValue)
                    {
                        goto PairwiseAddMaskContainsDataWhichRequiresEscaping;
                    }
                } while ((i += 16) < lastLegalIterationFor16CharRead);
            }

            // The remaining 0..15 bytes are handled by testing the individual bits of the length.

            if ((lengthInBytes & 8) != 0)
            {
                // Read 8 bytes at a time into a single 64-bit vector, extended to 128 bits.
                // Same logic as the 16-byte case, but we don't need to worry about the pairwise add step.
                // We'll treat the low 64 bits of the 'result' vector as its own scalar element.

                Vector128 <byte> packed       = AdvSimd.LoadVector64(pData + i).ToVector128Unsafe(); // unaligned read
                var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
                var vecPowersOfTwoShuffled    = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
                var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                resultScalar = result.AsUInt64().ToScalar();

                if (resultScalar != ulong.MaxValue)
                {
                    goto MaskContainsDataWhichRequiresEscaping;
                }

                i += 8;
            }

            if ((lengthInBytes & 4) != 0)
            {
                // Read 4 bytes at a time into a single element, extended to a 128-bit vector.
                // Same logic as the 16-byte case, but we don't need to worry about the pairwise add step.
                // We'll treat the low 32 bits of the 'result' vector as its own scalar element.

                Vector128 <byte> packed       = Vector128.CreateScalarUnsafe(Unsafe.ReadUnaligned <uint>(pData + i)).AsByte();
                var allowedCodePointsShuffled = AdvSimd.Arm64.VectorTableLookup(allowedCodePoints, AdvSimd.And(packed, vec0xF));
                var vecPowersOfTwoShuffled    = AdvSimd.Arm64.VectorTableLookup(vecPowersOfTwo, AdvSimd.ShiftRightArithmetic(packed.AsSByte(), 4).AsByte());
                var result = AdvSimd.CompareTest(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                resultScalar = result.AsUInt32().ToScalar(); // n.b. implicit conversion uint -> ulong; high 32 bits will be zeroed

                if (resultScalar != uint.MaxValue)
                {
                    goto MaskContainsDataWhichRequiresEscaping;
                }

                i += 4;
            }

            // Beyond this point, vectorization isn't worthwhile. Just do a normal loop.

            if ((lengthInBytes & 3) != 0)
            {
                Debug.Assert(lengthInBytes - i <= 3);

                do
                {
                    if (!_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
                    {
                        break;
                    }
                } while (++i != lengthInBytes);
            }

Return:

            return(i);

PairwiseAddMaskContainsDataWhichRequiresEscaping:

            // Reached only from the 16-byte loop, where each byte owns one nibble of resultScalar.
            Debug.Assert(resultScalar != ulong.MaxValue);
            // Each nibble is 4 (1 << 2) bits, so we shr by 2 to account for per-nibble stride.
            i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 2; // location of lowest set bit is where we must begin escaping
            goto Return;

MaskContainsDataWhichRequiresEscaping:

            // Reached from the 8-byte and 4-byte paths, where each source byte owns one byte of resultScalar.
            Debug.Assert(resultScalar != ulong.MaxValue);
            // Each byte is 8 (1 << 3) bits, so we shr by 3 to account for per-byte stride.
            i += (uint)BitOperations.TrailingZeroCount(~resultScalar) >> 3; // location of lowest set bit is where we must begin escaping
            goto Return;
        }
コード例 #9
0
 /// <summary>
 /// Unsigned-byte convenience overload: reinterprets both operands as signed bytes, forwards
 /// to the signed-byte <c>CompareBit8Equal</c> overload and reinterprets the result as bytes.
 /// </summary>
 /// <param name="left">First operand.</param>
 /// <param name="right">Second operand.</param>
 /// <returns>The signed overload's result, viewed as <c>Vector128&lt;byte&gt;</c>.</returns>
 public static Vector128<byte> CompareBit8Equal(Vector128<byte> left, Vector128<byte> right)
 {
     var signedResult = CompareBit8Equal(left.AsSByte(), right.AsSByte());
     return signedResult.AsByte();
 }
コード例 #10
0
ファイル: Blake2bSse4.cs プロジェクト: pangfd/Spreads
 /// <summary>
 /// 64-bit-lane wrapper around <c>Ssse3.AlignRight</c>: concatenates <paramref name="x"/> (upper)
 /// with <paramref name="y"/> (lower), shifts right by <paramref name="m"/> bytes and returns
 /// the low 128 bits.
 /// </summary>
 /// <param name="x">Upper half of the concatenation.</param>
 /// <param name="y">Lower half of the concatenation.</param>
 /// <param name="m">Shift amount in bytes.</param>
 private static Vector128<ulong> alignr_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
 {
     Vector128<sbyte> upper = x.AsSByte();
     Vector128<sbyte> lower = y.AsSByte();
     return Ssse3.AlignRight(upper, lower, m).AsUInt64();
 }
コード例 #11
0
ファイル: Blake2bSse4.cs プロジェクト: pangfd/Spreads
 /// <summary>
 /// Applies the byte shuffle <paramref name="y"/> to <paramref name="x"/> via
 /// <c>Ssse3.Shuffle</c> and returns the result as two 64-bit lanes.
 /// </summary>
 /// <param name="x">Vector whose bytes are permuted.</param>
 /// <param name="y">Shuffle control mask, one selector per output byte.</param>
 private static Vector128<ulong> ror64_shuffle(ref Vector128<ulong> x, ref Vector128<sbyte> y)
 {
     Vector128<sbyte> permuted = Ssse3.Shuffle(x.AsSByte(), y);
     return permuted.AsUInt64();
 }
コード例 #12
0
        /// <summary>
        /// Builds a 16-byte ASCII digit array from the first 15 bytes of <c>CardCode</c> and a
        /// check digit computed with SSE2.
        /// </summary>
        /// <returns>A new 16-byte array: 15 bytes copied from <c>CardCode</c> followed by the check digit.</returns>
        /// <exception cref="NotSupportedException">SSE2 is not available.</exception>
        /// <remarks>
        /// NOTE(review): exactly 15 bytes are copied from <c>CardCode</c> without a length check;
        /// callers are presumably expected to guarantee at least 15 bytes - confirm.
        /// </remarks>
        public byte[] GenerateLuhnCodeSimdSse2()
        {
            if (!Sse2.IsSupported)
            {
                throw new NotSupportedException();
            }

            const byte AsciiZero = (byte)'0';

            // The check-digit slot starts as '0' so that it contributes nothing to the sum.
            byte[] arr = new byte[16];
            arr[15] = AsciiZero;

            Vector128<byte> asciiDigits;

            unsafe
            {
                fixed (byte* destination = arr)
                {
                    fixed (byte* source = CardCode)
                    {
                        Buffer.MemoryCopy(source, destination, 16, 15);
                    }

                    asciiDigits = Sse2.LoadVector128(destination);
                }
            }

            Vector128<byte> zero   = Vector128<byte>.Zero;
            Vector128<byte> nine   = Vector128.Create((byte)9);
            Vector128<byte> digits = Sse2.Subtract(asciiDigits, Vector128.Create(AsciiZero));

            // 0 - 0xFF wraps to 1, giving a 1/0 selector that is 1 on every even index.
            Vector128<byte> evenSelector = Sse2.Subtract(
                zero.AsSByte(),
                Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0).AsSByte()
                ).AsByte();

            // digit + digit * selector: even-indexed digits are doubled, odd-indexed ones kept.
            Vector128<byte> doubled = Sse2.Add(digits, MultiplyBytes(evenSelector, digits));

            // The compare yields 0xFF where the value exceeds 9; 0 - 0xFF turns that into 1.
            Vector128<byte> exceedsNine = Sse2.Subtract(
                zero.AsSByte(),
                Sse2.CompareGreaterThan(doubled.AsSByte(), nine.AsSByte())
                ).AsByte();

            // Subtract 9 from every value above 9, then sum each 8-byte half against zero.
            Vector128<byte>   reduced = Sse2.Subtract(doubled, MultiplyBytes(exceedsNine, nine));
            Vector128<ushort> vsum    = Sse2.SumAbsoluteDifferences(reduced, zero);

            // ref: https://stackoverflow.com/a/36998778
            byte sum       = (byte)(Sse2.Extract(vsum, 0) + Sse2.Extract(vsum, 4));
            int  remainder = sum % 10;

            arr[15] = (byte)(remainder == 0 ? '0' : 10 - remainder + '0');

            return arr;
        }
コード例 #13
0
        /// <summary>
        /// Decodes <paramref name="block"/> into <paramref name="buffer"/>: each output is a linear
        /// base value (built from <c>block.Base</c> and <c>block.Slope</c>) plus, when
        /// <c>block.BitsPerAdjustment</c> is non-zero, a bit-packed adjustment read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="block">Block descriptor (Base, Slope, Count, BitsPerAdjustment).</param>
        /// <param name="reader">Source of the packed adjustments; its <c>Index</c> is advanced past the bytes consumed.</param>
        /// <param name="buffer">Destination for the decoded integers.</param>
        /// <remarks>
        /// Output is written four (no adjustments) or eight (with adjustments) ints at a time,
        /// and each adjustment load reads a full 16 bytes.
        /// NOTE(review): this presumably relies on <paramref name="buffer"/> and the reader's
        /// buffer being padded past <c>block.Count</c> / the last packed byte - confirm with callers.
        /// </remarks>
        public static unsafe void ReadVector(IntBlock block, BufferedReader reader, int[] buffer)
        {
            // Running base values for the next four outputs, and the per-vector step.
            Vector128<int> baseline = SetIncrement(block.Base, block.Slope);
            Vector128<int> step     = Set1(block.Slope * 4);

            if (block.BitsPerAdjustment == 0)
            {
                // No adjustments stored: the output is just the linear sequence.
                fixed (int* output = buffer)
                {
                    for (int i = 0; i < block.Count; i += 4)
                    {
                        Unsafe.WriteUnaligned(output + i, baseline);
                        baseline = Sse2.Add(baseline, step);
                    }
                }

                return;
            }

            fixed (byte* source = reader.Buffer)
            fixed (int* output = buffer)
            fixed (sbyte* shuffleMasks = ShuffleMasks)
            fixed (int* multiplyMasks = MultiplyMasks)
            {
                byte bits     = block.BitsPerAdjustment;
                int  position = reader.Index;
                int  total    = block.Count;

                // Eight values of 'bits' bits each occupy exactly 'bits' bytes; the second
                // group of four starts half of that (rounded down) into the group.
                byte bytesPerEight    = bits;
                byte secondHalfOffset = (byte)(bytesPerEight / 2);

                // After the multiply the wanted bits sit at the top of each int.
                byte shiftRightBits = (byte)(32 - bits);

                // Per-bit-width tables: 32 shuffle bytes and 8 multipliers per width,
                // the first half for values 0..3 and the second half for values 4..7.
                sbyte* shuffleRow  = shuffleMasks + 32 * bits;
                int*   multiplyRow = multiplyMasks + 8 * bits;

                Vector128<sbyte> shuffleFirst  = Unsafe.ReadUnaligned<Vector128<sbyte>>(shuffleRow);
                Vector128<int>   multiplyFirst = Unsafe.ReadUnaligned<Vector128<int>>(multiplyRow);

                Vector128<sbyte> shuffleSecond  = Unsafe.ReadUnaligned<Vector128<sbyte>>(shuffleRow + 16);
                Vector128<int>   multiplySecond = Unsafe.ReadUnaligned<Vector128<int>>(multiplyRow + 4);

                for (int i = 0; i < total; i += 8, position += bytesPerEight)
                {
                    // Load the packed bytes for both groups of four values.
                    Vector128<int> first  = Unsafe.ReadUnaligned<Vector128<int>>(source + position);
                    Vector128<int> second = Unsafe.ReadUnaligned<Vector128<int>>(source + position + secondHalfOffset);

                    // Route the bytes holding each value into that value's 32-bit lane.
                    first  = Ssse3.Shuffle(first.AsSByte(), shuffleFirst).AsInt32();
                    second = Ssse3.Shuffle(second.AsSByte(), shuffleSecond).AsInt32();

                    // Multiply (a per-lane left shift) to move the wanted bits to the top,
                    // then shift them down to the bottom, zero-filling the rest.
                    first  = Sse2.ShiftRightLogical(Sse41.MultiplyLow(first, multiplyFirst), shiftRightBits);
                    second = Sse2.ShiftRightLogical(Sse41.MultiplyLow(second, multiplySecond), shiftRightBits);

                    // Add the linear base values, advancing the baseline after each group.
                    first    = Sse2.Add(first, baseline);
                    baseline = Sse2.Add(baseline, step);

                    second   = Sse2.Add(second, baseline);
                    baseline = Sse2.Add(baseline, step);

                    // Store the eight decoded integers.
                    Unsafe.WriteUnaligned(output + i, first);
                    Unsafe.WriteUnaligned(output + i + 4, second);
                }

                reader.Index = position;
            }
        }
コード例 #14
0
        /// <summary>
        /// Advances the two running sums of an Adler-style checksum over the first
        /// <paramref name="len"/> bytes of <paramref name="buf"/>, using SSE2/SSSE3 for whole
        /// 32-byte blocks and scalar code for the tail.
        /// </summary>
        /// <param name="sum1">Running byte sum; updated in place.</param>
        /// <param name="sum2">Running sum of the intermediate <paramref name="sum1"/> values; updated in place.</param>
        /// <param name="buf">Input data, consumed from index 0.</param>
        /// <param name="len">Number of bytes of <paramref name="buf"/> to process.</param>
        internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
        {
            const uint BlockSize   = 1 << 5;
            const byte SwapPairs   = 177; // lane order 1,0,3,2
            const byte SwapHalves  = 78;  // lane order 2,3,0,1

            uint s1     = sum1;
            uint s2     = sum2;
            int  bufPos = 0;

            // Loads 16 bytes starting at 'offset' as four 32-bit lanes.
            Vector128<uint> Load16(int offset) =>
                Vector128.Create(BitConverter.ToUInt32(buf, offset),
                                 BitConverter.ToUInt32(buf, offset + 4),
                                 BitConverter.ToUInt32(buf, offset + 8),
                                 BitConverter.ToUInt32(buf, offset + 12));

            // Per-byte weights for the first and second 16 bytes of a 32-byte block.
            Vector128<sbyte> weightsFirst  = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
            Vector128<sbyte> weightsSecond = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
            Vector128<byte>  zero          = Vector128<byte>.Zero;
            Vector128<short> ones          = Vector128.Create((short)1);

            // Split the input into whole 32-byte blocks plus a scalar tail.
            uint blocks = len / BlockSize;
            len -= blocks * BlockSize;

            while (blocks != 0)
            {
                // The NMAX constraint: at most this many blocks before a modulo reduction.
                uint n = Adler32Context.NMAX / BlockSize;

                if (n > blocks)
                {
                    n = blocks;
                }

                blocks -= n;

                // v_ps accumulates the value s1 had before each block; it is scaled by the
                // block size (<< 5) after the inner loop and added into s2.
                Vector128<uint> v_ps = Vector128.Create(s1 * n, 0, 0, 0);
                Vector128<uint> v_s2 = Vector128.Create(s2, 0, 0, 0);
                Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

                do
                {
                    Vector128<byte> firstHalf = Load16(bufPos).AsByte();
                    bufPos += 16;

                    Vector128<byte> secondHalf = Load16(bufPos).AsByte();
                    bufPos += 16;

                    // Add the byte sum of all previous blocks to v_ps.
                    v_ps = Sse2.Add(v_ps, v_s1);

                    // First 16 bytes: plain byte sum for s1, weighted byte sum for s2.
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(firstHalf, zero).AsUInt32());

                    Vector128<short> weighted = Ssse3.MultiplyAddAdjacent(firstHalf, weightsFirst);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(weighted, ones).AsUInt32());

                    // Second 16 bytes, same treatment with the lower weights.
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(secondHalf, zero).AsUInt32());

                    weighted = Ssse3.MultiplyAddAdjacent(secondHalf, weightsSecond);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(weighted, ones).AsUInt32());
                } while (--n != 0);

                v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                // Horizontally sum the four lanes of each accumulator into lane 0.
                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, SwapPairs));
                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, SwapHalves));
                s1  += (uint)Sse2.ConvertToInt32(v_s1.AsInt32());

                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, SwapPairs));
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, SwapHalves));
                s2   = (uint)Sse2.ConvertToInt32(v_s2.AsInt32());

                // Reduce.
                s1 %= Adler32Context.ADLER_MODULE;
                s2 %= Adler32Context.ADLER_MODULE;
            }

            // Scalar tail: fewer than 32 bytes remain.
            if (len != 0)
            {
                for (uint remaining = len; remaining != 0; remaining--)
                {
                    s1 += buf[bufPos++];
                    s2 += s1;
                }

                if (s1 >= Adler32Context.ADLER_MODULE)
                {
                    s1 -= Adler32Context.ADLER_MODULE;
                }

                s2 %= Adler32Context.ADLER_MODULE;
            }

            // Hand the recombined sums back to the caller.
            sum1 = (ushort)(s1 & 0xFFFF);
            sum2 = (ushort)(s2 & 0xFFFF);
        }