Ejemplo n.º 1
0
        /// <summary>
        /// Scenario: creates a local <c>TestStruct</c>, loads its two vector fields through raw
        /// pointers, applies <c>Sse2.AndNot</c> to the loaded operands and validates the value
        /// written to the shared output buffer.
        /// </summary>
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            // The fields of the local are addressed directly with '&' (no 'fixed' block is used).
            var localStruct = TestStruct.Create();

            // Both loads stay inline as the operands of AndNot, matching the shape of the original call.
            var andNotResult = Sse2.AndNot(
                Sse2.LoadVector128((Int32*)&localStruct._fld1),
                Sse2.LoadVector128((Int32*)&localStruct._fld2));

            // Publish the result to the output buffer, then compare it against the inputs.
            Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }
Ejemplo n.º 2
0
            /// <summary>
            /// Scenario: pins this instance's <c>_fld1</c> and <c>_fld2</c>, loads both through
            /// pointers, applies <c>Sse2.AndNot</c> and lets <paramref name="testClass"/> validate
            /// the value written to its output buffer.
            /// </summary>
            /// <param name="testClass">Test instance that supplies the output buffer and the validation routine.</param>
            public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndNotUInt32 testClass)
            {
                // Pin both fields for the duration of the loads and the validation.
                fixed (Vector128<UInt32>* pLeft = &_fld1, pRight = &_fld2)
                {
                    var andNotResult = Sse2.AndNot(
                        Sse2.LoadVector128((UInt32*)pLeft),
                        Sse2.LoadVector128((UInt32*)pRight));

                    Unsafe.Write(testClass._dataTable.outArrayPtr, andNotResult);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }
Ejemplo n.º 3
0
        /// <summary>
        /// Scenario: pins this instance's <c>_fld1</c> and <c>_fld2</c>, loads both through
        /// pointers, applies <c>Sse2.AndNot</c> and validates the value written to the output buffer.
        /// </summary>
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            // Pin both fields for the duration of the loads and the validation.
            fixed (Vector128<UInt32>* pLeft = &_fld1, pRight = &_fld2)
            {
                var andNotResult = Sse2.AndNot(
                    Sse2.LoadVector128((UInt32*)pLeft),
                    Sse2.LoadVector128((UInt32*)pRight));

                Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
                ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Scenario: pins <c>_clsVar1</c> and <c>_clsVar2</c>, loads both through pointers,
        /// applies <c>Sse2.AndNot</c> and validates the value written to the output buffer.
        /// </summary>
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            // Pin both variables for the duration of the loads and the validation.
            fixed (Vector128<Double>* pLeft = &_clsVar1, pRight = &_clsVar2)
            {
                var andNotResult = Sse2.AndNot(
                    Sse2.LoadVector128((Double*)pLeft),
                    Sse2.LoadVector128((Double*)pRight));

                Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
                ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Scenario: creates a fresh <c>SimpleBinaryOpTest__AndNotDouble</c>, pins its
        /// <c>_fld1</c> and <c>_fld2</c>, loads both through pointers, applies <c>Sse2.AndNot</c>
        /// and validates the value written to this instance's output buffer.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var localInstance = new SimpleBinaryOpTest__AndNotDouble();

            // Pin the new object's fields for the duration of the loads and the validation.
            fixed (Vector128<Double>* pLeft = &localInstance._fld1, pRight = &localInstance._fld2)
            {
                var andNotResult = Sse2.AndNot(
                    Sse2.LoadVector128((Double*)pLeft),
                    Sse2.LoadVector128((Double*)pRight));

                // The result goes to *this* instance's data table; only the inputs come from the local object.
                Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
                ValidateResult(localInstance._fld1, localInstance._fld2, _dataTable.outArrayPtr);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Bitwise select: for every bit position, takes the bit from <paramref name="ifTrue"/> where the
        /// corresponding bit of <paramref name="selector"/> is set, and from <paramref name="ifFalse"/> otherwise.
        /// </summary>
        /// <param name="selector">Bit mask choosing between the two sources.</param>
        /// <param name="ifTrue">Source of the bits where <paramref name="selector"/> has a 1.</param>
        /// <param name="ifFalse">Source of the bits where <paramref name="selector"/> has a 0.</param>
        /// <returns>The blended vector.</returns>
        /// <exception cref="PlatformNotSupportedException">Neither AdvSimd nor SSE2 is available.</exception>
        public static Vector128<double> ConditionalSelectBitwise(Vector128<double> selector, Vector128<double> ifTrue, Vector128<double> ifFalse)
        {
            // Modeled on the DirectX Math Library:
            // https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl
            // NOTE(review): the previous comment cited XMVector4NotEqual; this select pattern
            // looks closer to XMVectorSelect - confirm which routine was intended.

            if (AdvSimd.IsSupported)
            {
                // Arm exposes the select as a single operation.
                return AdvSimd.BitwiseSelect(selector, ifTrue, ifFalse);
            }

            if (Sse2.IsSupported)
            {
                // (ifTrue & selector) | (~selector & ifFalse)
                Vector128<double> bitsFromTrue  = Sse2.And(ifTrue, selector);
                Vector128<double> bitsFromFalse = Sse2.AndNot(selector, ifFalse);
                return Sse2.Or(bitsFromTrue, bitsFromFalse);
            }

            // No supported instruction set on this platform.
            throw new PlatformNotSupportedException();
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <param name="pInputBuffer">Start of the UTF-16 data; may be null only when <paramref name="inputLength"/> is zero.</param>
        /// <param name="inputLength">Number of chars in the buffer; must not be negative.</param>
        /// <param name="utf8CodeUnitCountAdjustment">
        /// Receives the value to add to the number of UTF-16 code units consumed in order to obtain the
        /// number of UTF-8 code units needed to encode them.
        /// </param>
        /// <param name="scalarCountAdjustment">
        /// Receives the value to add to the number of UTF-16 code units consumed in order to obtain the
        /// number of scalars they represent (zero or negative: each surrogate pair subtracts one).
        /// </param>
        /// <returns>
        /// A pointer to the first invalid char, or <c>&amp;pInputBuffer[inputLength]</c> if the whole buffer is valid.
        /// </returns>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// The two adjustments only account for the chars that precede the returned pointer.
        /// </remarks>
        public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);

            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

            // Skip past the ASCII prefix; ASCII chars contribute nothing to either adjustment.
            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            inputLength  -= numAsciiCharsConsumedJustNow;

            if (inputLength == 0)
            {
                // Entire buffer was ASCII: 1 char == 1 UTF-8 code unit == 1 scalar.
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment       = 0;
                return(pInputBuffer);
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen.  To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int  tempScalarCountAdjustment       = 0;

            // 128-bit path: Arm64 AdvSimd (little-endian only) or x86 SSE2.
            if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
            {
                if (inputLength >= Vector128 <ushort> .Count)
                {
                    // vector0080: threshold for "non-ASCII".
                    // vectorA800: added to each char so that surrogates D800..DFFF land on 8000..87FF.
                    // vector8800: upper bound (as a signed word) used to detect that 8000..87FF range.
                    Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128 <short>  vector8800 = Vector128.Create(unchecked ((short)0x8800));
                    Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;
                    do
                    {
                        Vector128 <ushort> utf16Data;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }
                        else
                        {
                            utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }

                        Vector128 <ushort> charIsNonAscii;

                        if (AdvSimd.Arm64.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                        }
                        else if (Sse41.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                        }
                        else
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                            // be handled in a few lines. (The compare is signed, so values with the high
                            // bit set look negative here.)

                            charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                        }

#if DEBUG
                        // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                        uint debugMask;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            // NOTE(review): GetNonAsciiBytes is defined elsewhere; it is used throughout this
                            // method as the Arm64 counterpart of Sse2.MoveMask - confirm its exact bit layout.
                            debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                        }
                        else
                        {
                            debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                        }
                        Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                        // Sets the 0x8080 bits of each element in 'charIsThreeByteUtf8Encoded' if the
                        // corresponding input was 0x0800 <= [value] (0 - ([value] >> 11) is then 0xFFE1..0xFFFF).
                        // OR-ing it with 'charIsNonAscii' below also handles the range that was missing
                        // a few lines above.

                        Vector128 <ushort> charIsThreeByteUtf8Encoded;
                        uint mask;

                        if (AdvSimd.IsSupported)
                        {
                            charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                            mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }
                        else
                        {
                            charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                            mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }

                        // Each even bit of mask will be 1 only if the char was >= 0x0080,
                        // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is >= 0x0800
                        //            |   ,-- set if char[0] is >= 0x0800
                        //            v   v
                        // mask = ... 1 1 0 1
                        //              ^   ^-- set if char[0] is non-ASCII
                        //              `-- set if char[1] is non-ASCII
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.
                        //
                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                        // cumulative UTF-8 adjustment factor once we determine that there are no
                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                        // our computed result and we'd have to throw it away.)

                        uint popcnt = (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.
                        //
                        // After adding 0xA800, surrogates occupy 0x8000..0x87FF, which are exactly the
                        // values that compare (signed) less than 0x8800.
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                            mask      = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }
                        else
                        {
                            utf16Data = Sse2.Add(utf16Data, vectorA800);
                            mask      = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                            //   Since 'mask' already has 00 in these positions (since the corresponding char
                            //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                            uint mask2;
                            if (AdvSimd.Arm64.IsSupported)
                            {
                                mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                            }
                            else
                            {
                                mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                            }

                            // 'lowSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a low surrogate char,
                            // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                            uint lowSurrogatesMask = mask2 & mask;

                            // 'highSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a high surrogate char,
                            // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                            uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                            Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
                                         "A char cannot simultaneously be both a high and a low surrogate char.");

                            Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
                                         "Only even bits (no odd bits) of the masks should be set.");

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            // Shifting by one char (2 bits) lines each high surrogate up with the char that
                            // follows it; a high surrogate in the final slot ends up above bit 15.
                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                popcnt            -= 2;                          // the final char set both of its bits (0xC000) in the 16-bit mask that fed popcnt; back them out since it isn't consumed
                                pInputBuffer--;
                                inputLength++;
                            }

                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
                            // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                            // 64-bit extension a few lines below.
                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
                            // perform this adjustment now.

                            if (IntPtr.Size == 8)
                            {
                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                                // sub + sub. It's more efficient than shl + sub.
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                            }
                            else
                            {
                                // Take the hit of the 64-bit extension now.
                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                            }
                        }

                        // This vector validated cleanly: commit its byte-count contribution and advance.
                        tempUtf8CodeUnitCountAdjustment += popcnt;
                        pInputBuffer += Vector128 <ushort> .Count;
                        inputLength  -= Vector128 <ushort> .Count;
                    } while (inputLength >= Vector128 <ushort> .Count);
                }
            }
            // Fallback vectorized path using the portable Vector<T> type.
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector <ushort> .Count)
                {
                    // Thresholds: 0x0080 = first 2-byte char, 0x0800 = first 3-byte char (also the width of
                    // the surrogate range), 0x0400 = width of the high-surrogate range, 0xD800 = first surrogate.
                    Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
                    Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
                    Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
                    Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector <ushort>  utf16Data            = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                        Vector <ushort>  twoOrMoreUtf8Bytes   = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector <ushort>  threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector <nuint_t> sumVector            = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.
                        // (Each 16-bit lane holds 0..2, so the packed lanes are added side by side.)

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector <nuint_t> .Count; i++)
                        {
                            popcnt += (nuint)sumVector[i];
                        }

                        // Fold the upper 32 bits of the packed sum into the lower 32 bits (64-bit only).
                        uint popcnt32 = (uint)popcnt;
                        if (IntPtr.Size == 8)
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                        // know there aren't any unpaired surrogates in the input data.

                        // Fold the two remaining 16-bit lanes together to get the final scalar total.
                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                        // Now check for surrogates.

                        // After subtracting 0xD800 (with wraparound), surrogates are exactly the values < 0x0800.
                        utf16Data -= vectorD800;
                        Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector <ushort> .Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector <ushort> lowSurrogateChars  = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            if (lowSurrogateChars[0] != 0)
                            {
                                // Nothing from this vector has been committed yet, so pInputBuffer
                                // already points at the offending char.
                                goto Error; // error: start of buffer contains standalone low surrogate char
                            }

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.
                                // (It was counted as +2 in popcnt32 above, hence the -2.)

                                pInputBuffer--;
                                inputLength++;
                                popcnt32 -= 2;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        // This vector validated cleanly: commit its byte-count contribution and advance.
                        tempUtf8CodeUnitCountAdjustment += popcnt32;
                        pInputBuffer += Vector <ushort> .Count;
                        inputLength  -= Vector <ushort> .Count;
                    } while (inputLength >= Vector <ushort> .Count);
                }
            }

NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                // Read both chars of the would-be pair as one 32-bit value and check that the first
                // is in D800..DBFF and the second in DC00..DFFF (the constant depends on endianness).
                thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--;          // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++;                       // consumed one extra char
                inputLength--;
            }

Error:

            // Also used for normal return.
            // In both cases pInputBuffer points just past the last char that was accepted.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment       = tempScalarCountAdjustment;
            return(pInputBuffer);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Writes an <c>int[]</c> as a MessagePack array, encoding four elements per iteration
        /// with SSE4.1 when it is supported and falling back to one-by-one writes otherwise.
        /// </summary>
        /// <param name="writer">Writer that receives the encoded bytes.</param>
        /// <param name="value">Array to encode; <c>null</c> is written as nil.</param>
        /// <param name="options">Serializer options; not read by this method.</param>
        public unsafe void Serialize(ref MessagePackWriter writer, int[]?value, MessagePackSerializerOptions options)
        {
            if (value == null)
            {
                writer.WriteNil();
                return;
            }

            var inputLength = value.Length;

            writer.WriteArrayHeader(inputLength);
            if (inputLength == 0)
            {
                // Nothing follows the header; also avoids taking &value[0] on an empty array.
                return;
            }

            fixed(int *pSource = &value[0])
            {
                var inputEnd      = pSource + inputLength;
                var inputIterator = pSource;

                if (Sse41.IsSupported)
                {
                    const int ShiftCount = 2;
                    const int Stride     = 1 << ShiftCount;

                    // Fewer than two full strides (8 elements): go straight to the scalar loop.
                    if (inputLength < Stride << 1)
                    {
                        goto ProcessEach;
                    }

                    {
                        // Try to bring inputIterator onto a 16-byte boundary before the vector loop.
                        // NOTE(review): presumably the helper returns the byte distance to the next
                        // 16-byte boundary - confirm against UnsafeMemoryAlignmentUtility.
                        var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(inputIterator);
                        // Only a byte offset that is a multiple of 4 corresponds to a whole number of ints.
                        if ((offset & 3) == 0)
                        {
                            // Convert bytes to elements and write those leading elements one at a time.
                            offset     >>= 2;
                            inputLength -= offset;
                            var offsetEnd = inputIterator + offset;
                            while (inputIterator != offsetEnd)
                            {
                                writer.Write(*inputIterator++);
                            }
                        }
                    }

                    fixed(byte *tablePointer = &ShuffleAndMaskTable[0])
                    {
                        // The per-index byte counts live in the same table, CountTableOffset bytes in.
                        var countPointer = (int *)(tablePointer + CountTableOffset);

                        fixed(byte *maskTablePointer = &SingleInstructionMultipleDataPrimitiveArrayFormatterHelper.StoreMaskTable[0])
                        {
                            // Thresholds used to classify each element; all comparisons below are "current > threshold".
                            var vectorShortMinValueM1 = Vector128.Create(short.MinValue - 1);
                            var vectorSByteMinValueM1 = Vector128.Create(sbyte.MinValue - 1);
                            var vectorMinFixNegIntM1  = Vector128.Create(MessagePackRange.MinFixNegativeInt - 1);
                            var vectorSByteMaxValue   = Vector128.Create((int)sbyte.MaxValue);
                            var vectorByteMaxValue    = Vector128.Create((int)byte.MaxValue);
                            var vectorUShortMaxValue  = Vector128.Create((int)ushort.MaxValue);
                            // Per-lane multipliers that turn two adjacent class numbers into one combined index.
                            var vectorM1M7            = Vector128.Create(-1, -7, -1, -7);
                            // Shuffle control that gathers the lowest byte of each int into bytes 0..3 (0x80 zeroes a byte).
                            var vectorIn1Range        = Vector128.Create(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

                            // Process Stride (4) elements per iteration; the remainder is left for the scalar loop.
                            for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                            {
                                var current = Sse2.LoadVector128(inputIterator);
                                var isGreaterThanMinFixNegIntM1 = Sse2.CompareGreaterThan(current, vectorMinFixNegIntM1);
                                var isGreaterThanSByteMaxValue  = Sse2.CompareGreaterThan(current, vectorSByteMaxValue);

                                // Fast path: every element is >= MinFixNegativeInt and <= sbyte.MaxValue,
                                // so each one is emitted as its own low byte.
                                if (Sse2.MoveMask(Sse2.AndNot(isGreaterThanSByteMaxValue, isGreaterThanMinFixNegIntM1).AsByte()) == 0xFFFF)
                                {
                                    var answer = Ssse3.Shuffle(current.AsByte(), vectorIn1Range).AsUInt32();
                                    var span   = writer.GetSpan(Stride);
                                    Unsafe.As <byte, uint>(ref span[0]) = answer.GetElement(0);
                                    writer.Advance(Stride);
                                    continue;
                                }

                                // Each true comparison contributes -1, so a lane ends up holding minus the
                                // number of thresholds its element exceeds (0 .. -6).
                                var indexVector = Sse2.Add(isGreaterThanSByteMaxValue, isGreaterThanMinFixNegIntM1);
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorUShortMaxValue));
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorByteMaxValue));
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorShortMinValueM1));
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorSByteMinValueM1));
                                // Scale lanes by -1 / -7 and add neighbours: lane 0 = class0 + 7 * class1,
                                // lane 1 = class2 + 7 * class3.
                                indexVector = Sse41.MultiplyLow(indexVector, vectorM1M7);
                                indexVector = Ssse3.HorizontalAdd(indexVector, indexVector);

                                // One table index per pair of elements.
                                var index0 = indexVector.GetElement(0);
                                var index1 = indexVector.GetElement(1);

                                // Number of bytes each pair occupies (also selects the store mask below).
                                var count0     = countPointer[index0];
                                var count1     = countPointer[index1];
                                var countTotal = count0 + count1;

                                var destination = writer.GetSpan(countTotal);
                                fixed(byte *pDestination = &destination[0])
                                {
                                    var tmpDestination = pDestination;

                                    // Each table entry is 32 bytes: a 16-byte shuffle control followed by
                                    // 16 bytes that are OR-ed into the shuffled result.
                                    // NOTE(review): presumably the OR-ed bytes are the format codes - confirm against the table.
                                    var item0     = tablePointer + (index0 << 5);
                                    var shuffle0  = Sse2.LoadVector128(item0);
                                    var shuffled0 = Ssse3.Shuffle(current.AsByte(), shuffle0);
                                    var constant0 = Sse2.LoadVector128(item0 + 16);
                                    var answer0   = Sse2.Or(shuffled0, constant0);

                                    // Masked store: only the bytes selected by the mask at (count0 * 16) are written.
                                    Sse2.MaskMove(answer0, Sse2.LoadVector128(maskTablePointer + (count0 << 4)), pDestination);
                                    tmpDestination += count0;

                                    // Shift the upper two elements down by 8 bytes so the same table layout applies to them.
                                    var shift1    = Sse2.ShiftRightLogical128BitLane(current, 8).AsByte();
                                    var item1     = tablePointer + (index1 << 5);
                                    var shuffle1  = Sse2.LoadVector128(item1);
                                    var shuffled1 = Ssse3.Shuffle(shift1, shuffle1);
                                    var constant1 = Sse2.LoadVector128(item1 + 16);
                                    var answer1   = Sse2.Or(shuffled1, constant1);

                                    Sse2.MaskMove(answer1, Sse2.LoadVector128(maskTablePointer + (count1 << 4)), tmpDestination);
                                }

                                writer.Advance(countTotal);
                            }
                        }
                    }
                }

// Scalar path: handles short arrays, the non-SSE4.1 case, and whatever the vector loop left over.
ProcessEach:
                while (inputIterator != inputEnd)
                {
                    writer.Write(*inputIterator++);
                }
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Counts the UTF-16 code units that well-formed UTF-8 input transcodes to.
        /// </summary>
        /// <param name="utf8Data">UTF-8 bytes. The caller guarantees they are well-formed; nothing is validated here.</param>
        /// <returns>The number of UTF-16 chars required to represent <paramref name="utf8Data"/>.</returns>
        /// <exception cref="NotImplementedException">SSE2 or POPCNT is unavailable; no fallback path is implemented.</exception>
        public static unsafe int GetUtf16CharCountFromKnownWellFormedUtf8(ReadOnlySpan<byte> utf8Data)
        {
            // Remember: the number of resulting UTF-16 chars will never be greater than the number
            // of UTF-8 bytes given well-formed input, so we can get away with casting the final
            // result to an 'int'.

            fixed (byte* pPinnedUtf8Data = &MemoryMarshal.GetReference(utf8Data))
            {
                if (Sse2.IsSupported && Popcnt.IsSupported)
                {
                    // Optimizations via SSE2 & POPCNT are available - use them.

                    Debug.Assert(BitConverter.IsLittleEndian, "SSE2 only supported on little-endian platforms.");
                    Debug.Assert(sizeof(nint) == IntPtr.Size, "nint defined incorrectly.");
                    Debug.Assert(sizeof(nuint) == IntPtr.Size, "nuint defined incorrectly.");

                    byte* pBuffer = pPinnedUtf8Data;
                    nuint bufferLength = (uint)utf8Data.Length;

                    // Optimization: Can we stay in the all-ASCII code paths?

                    nuint utf16CharCount = GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength);

                    if (utf16CharCount != bufferLength)
                    {
                        // Found at least one non-ASCII byte, so fall down the slower (but still vectorized) code paths.
                        // Given well-formed UTF-8 input, we can compute the number of resulting UTF-16 code units
                        // using the following formula:
                        //
                        // utf16CharCount = utf8ByteCount - numUtf8ContinuationBytes + numUtf8FourByteHeaders

                        utf16CharCount = bufferLength;

                        // Signed-compare constants:
                        //   b < 0xC0 (as sbyte)         <=> b is a continuation byte (0x80..0xBF)
                        //   (b ^ 0x80) > 0x6F (as sbyte) <=> b >= 0xF0, i.e. a four-byte sequence header
                        // Zeroed (masked-out) lanes satisfy neither test.
                        Vector128<sbyte> vecAllC0 = Vector128.Create(unchecked((sbyte)0xC0));
                        Vector128<sbyte> vecAll80 = Vector128.Create(unchecked((sbyte)0x80));
                        Vector128<sbyte> vecAll6F = Vector128.Create(unchecked((sbyte)0x6F));

                        {
                            // Perform an aligned read of the first part of the buffer.
                            // We'll mask out any data at the start of the buffer we don't care about.
                            //
                            // For example, if (pBuffer MOD 16) = 2:
                            // [ AA BB CC DD ... ] <-- original vector
                            // [ 00 00 CC DD ... ] <-- after PANDN operation

                            // offset is zero or negative: the distance back to the enclosing 16-byte boundary.
                            nint offset = -((nint)pBuffer & (sizeof(Vector128<sbyte>) - 1));

                            // Lanes [0, -offset) lie before the start of the buffer and must be zeroed.
                            // The previous expression compared (offset + 15) against the lane indices, which
                            // zeroed lanes 0..12 in the example above instead of lanes 0..1.
                            // Assumes VectorOfElementIndices holds 0..15 in lane order, as the two other
                            // masks in this method also require.
                            Vector128<sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(Vector128.Create((byte)(-(int)offset)).AsSByte(), VectorOfElementIndices);
                            Vector128<sbyte> thisVector = Sse2.AndNot(shouldBeMaskedOut, Unsafe.Read<Vector128<sbyte>>(pBuffer + offset));

                            // If there's any data at the end of the buffer we don't care about, mask it out now.
                            // If this happens the 'bufferLength' value will be a lie, but it'll cause all of the
                            // branches later in the method to be skipped, so it's not a huge problem.

                            if (bufferLength < (nuint)offset + (uint)sizeof(Vector128<sbyte>))
                            {
                                // (bufferLength - offset) is the lane index one past the last valid byte.
                                Vector128<sbyte> shouldBeAllowed = Sse2.CompareLessThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - (int)offset)).AsSByte());
                                thisVector = Sse2.And(shouldBeAllowed, thisVector);
                                bufferLength = (nuint)offset + (uint)sizeof(Vector128<sbyte>);
                            }

                            uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                            uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                            utf16CharCount -= countOfContinuationBytes;

                            uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                            uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                            utf16CharCount += countOfFourByteHeaders;

                            // Consume (16 + offset) bytes; the unsigned arithmetic wraps to the intended value.
                            bufferLength -= (nuint)offset;
                            bufferLength -= (uint)sizeof(Vector128<sbyte>);

                            pBuffer += offset;
                            pBuffer += (uint)sizeof(Vector128<sbyte>);
                        }

                        // At this point, pBuffer is guaranteed aligned.

                        Debug.Assert((nuint)pBuffer % (uint)sizeof(Vector128<sbyte>) == 0, "pBuffer should have been aligned.");

                        while (bufferLength >= (uint)sizeof(Vector128<sbyte>))
                        {
                            Vector128<sbyte> thisVector = Sse2.LoadAlignedVector128((sbyte*)pBuffer);

                            uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                            uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                            utf16CharCount -= countOfContinuationBytes;

                            uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                            uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                            utf16CharCount += countOfFourByteHeaders;

                            pBuffer += sizeof(Vector128<sbyte>);
                            bufferLength -= (uint)sizeof(Vector128<sbyte>);
                        }

                        if ((uint)bufferLength > 0)
                        {
                            // There's still more data to be read.
                            // We need to mask out elements of the vector we don't care about.
                            // These elements will occur at the end of the vector.
                            //
                            // For example, if 14 bytes remain in the input stream:
                            // [ ... CC DD EE FF ] <-- original vector
                            // [ ... CC DD 00 00 ] <-- after PANDN operation

                            Vector128<sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - 1)).AsSByte());
                            Vector128<sbyte> thisVector = Sse2.AndNot(shouldBeMaskedOut, *(Vector128<sbyte>*)pBuffer);

                            uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                            uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                            utf16CharCount -= countOfContinuationBytes;

                            uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                            uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                            utf16CharCount += countOfFourByteHeaders;
                        }
                    }

                    return (int)utf16CharCount;
                }
                else
                {
                    // Cannot use SSE2 & POPCNT. Fall back to slower code paths.

                    throw new NotImplementedException();
                }
            }
        }
        /// <summary>
        /// Computes the bitwise AND-NOT of two byte spans: every output byte is <c>~l[i] &amp; r[i]</c>.
        /// </summary>
        /// <param name="l">Left operand, complemented before the AND; must be as long as <paramref name="o"/>.</param>
        /// <param name="r">Right operand; must be as long as <paramref name="o"/>.</param>
        /// <param name="o">Destination span that receives the result.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="l"/> or <paramref name="r"/> differs in length from <paramref name="o"/>.
        /// </exception>
        public static void AndNot(this ReadOnlySpan<byte> l, ReadOnlySpan<byte> r, Span<byte> o)
        {
            var s = o.Length;

            if (l.Length != s)
            {
                throw new ArgumentException("Left span size must be equal to output size.");
            }
            if (r.Length != s)
            {
                throw new ArgumentException("Right span size must be equal to output size.");
            }

#if NETCOREAPP3_0
            // 32 bytes at a time with AVX2; the spans are pinned once for the whole run.
            if (Avx2.IsSupported && o.Length >= 32)
            {
                var vectorized = o.Length & ~31;

                unsafe
                {
                    fixed (byte* lp = l)
                    fixed (byte* rp = r)
                    fixed (byte* op = o)
                    {
                        for (var i = 0; i < vectorized; i += 32)
                        {
                            Avx.Store(op + i, Avx2.AndNot(Avx.LoadVector256(lp + i), Avx.LoadVector256(rp + i)));
                        }
                    }
                }

                l = l.Slice(vectorized);
                r = r.Slice(vectorized);
                o = o.Slice(vectorized);
            }

            // 16 bytes at a time with SSE2 (the whole input when AVX2 is unavailable).
            if (Sse2.IsSupported && o.Length >= 16)
            {
                var vectorized = o.Length & ~15;

                unsafe
                {
                    fixed (byte* lp = l)
                    fixed (byte* rp = r)
                    fixed (byte* op = o)
                    {
                        for (var i = 0; i < vectorized; i += 16)
                        {
                            Sse2.Store(op + i, Sse2.AndNot(Sse2.LoadVector128(lp + i), Sse2.LoadVector128(rp + i)));
                        }
                    }
                }

                l = l.Slice(vectorized);
                r = r.Slice(vectorized);
                o = o.Slice(vectorized);
            }
#endif

            // 8 bytes at a time: reinterpret the spans once and walk them as ulongs.
            var l64 = MemoryMarshal.Cast<byte, ulong>(l);
            var r64 = MemoryMarshal.Cast<byte, ulong>(r);
            var o64 = MemoryMarshal.Cast<byte, ulong>(o);

            for (var i = 0; i < o64.Length; i++)
            {
                o64[i] = ~l64[i] & r64[i];
            }

            var consumed = o64.Length * sizeof(ulong);
            l = l.Slice(consumed);
            r = r.Slice(consumed);
            o = o.Slice(consumed);

            // At most 7 bytes remain, so at most one 4-byte word.
            if (o.Length >= sizeof(uint))
            {
                MemoryMarshal.Cast<byte, uint>(o)[0] =
                    ~MemoryMarshal.Cast<byte, uint>(l)[0] & MemoryMarshal.Cast<byte, uint>(r)[0];

                l = l.Slice(sizeof(uint));
                r = r.Slice(sizeof(uint));
                o = o.Slice(sizeof(uint));
            }

            // Finish the remaining bytes. The AND with r[i] (0..255) clears every bit above the low
            // byte, so the narrowing cast is safe even when the assembly is compiled with checked
            // arithmetic; casting the always-negative ~l[i] to uint first would throw there.
            for (var i = 0; i < o.Length; i++)
            {
                o[i] = (byte)(~l[i] & r[i]);
            }
        }
Ejemplo n.º 11
0
    /// <summary>
    /// Resamples this bitmap into <paramref name="rtnImage"/> using a separable cubic filter:
    /// a horizontal pass into a float buffer, then a vertical pass that writes 8-bit pixels.
    /// </summary>
    /// <param name="rtnImage">Destination bitmap; its width and height determine the scale factors.</param>
    /// <exception cref="Exception">
    /// Either scale factor (source size / destination size) is greater than 1.
    /// </exception>
    public void ResizeBicubic(FastBitmap rtnImage)
    {
        // Source-to-destination ratios; a value above 1 means the destination is smaller in that dimension.
        float scaleX = (float)this.width / rtnImage.width;
        float scaleY = (float)this.height / rtnImage.height;

        if (scaleX > 1 || scaleY > 1)
        {
            // Message text means "only enlargement is supported".
            throw new Exception("拡大のみ対応");
        }

        // Intermediate image: destination width x source height, four floats per pixel.
        float[] tmpa = new float[rtnImage.width * 4 * this.height];
        fixed(float *tmpp = tmpa)
        {
            float *tmp     = tmpp;
            // Byte-shuffle controls that expand the four bytes of pixel 0/1/2/3 into four 32-bit lanes (255 zeroes a byte).
            var    _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
            var    _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
            var    _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255);
            var    _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255);
            // Packs the low byte of each 32-bit lane into the low four bytes.
            var    _vmask  = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);

            // Offsets of the four filter taps relative to the truncated sample position.
            var _1012  = Vector128.Create(-1, 0, 1, 2);
            // Channel offsets within one pixel of the float buffer.
            var _0123i = Vector128.Create(0, 1, 2, 3);

            var _0000   = Vector128.Create(0, 0, 0, 0);
            var _0000f  = Vector128.Create(0f, 0, 0, 0);
            var _255f   = Vector128.Create(255f, 255, 255, 255);
            var _1111   = Vector128.Create(1, 1, 1, 1);
            var _1111f  = Vector128.Create(1f, 1, 1, 1);
            var _4444f  = Vector128.Create(4f, 4, 4, 4);
            var _4444   = Vector128.Create(4, 4, 4, 4);
            var _5555f  = Vector128.Create(5f, 5, 5, 5);
            var _2222f  = Vector128.Create(2f, 2, 2, 2);
            var _8888f  = Vector128.Create(8f, 8, 8, 8);
            // Clears the float sign bit (absolute value).
            var _7f     = Vector128.Create(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff).AsSingle();
            var _ff     = Vector128.Create(-1, -1, -1, -1);
            // Number of floats in one row of the intermediate buffer.
            var _stride = Vector128.Create(rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4);

            // Pass 1 (horizontal): one source row per iteration, writing destination-width float pixels into tmp.
            // NOTE(review): Avx2.GatherVector128 is used below with no IsSupported check visible in this method.
            Parallel.For(0, this.height, (y) =>
            {
                // py is computed but not used in this pass.
                float py      = (y * scaleY);
                float *tmpPos = tmp + y * rtnImage.width * 4;
                for (int x = 0; x < rtnImage.width; x++)
                {
                    // Source-space position of this destination column and its truncated integer part.
                    float px = (x * scaleX);
                    int sx   = (int)px;

                    // Broadcast px, sx and the source width to all four lanes.
                    var _px = Vector128.CreateScalar(px);
                    _px     = Sse.Shuffle(_px, _px, 0);

                    var _sx = Vector128.CreateScalar(sx);
                    _sx     = Sse2.Shuffle(_sx, 0);

                    var _width = Vector128.CreateScalar(this.width);
                    _width     = Sse2.Shuffle(_width, 0);

                    // Tap columns sx-1, sx, sx+1, sx+2.
                    var _x2 = Sse2.Add(_sx, _1012);

                    // d = |px - tap|, plus its square and cube.
                    var _d  = Sse.And(Sse.Subtract(_px, Sse2.ConvertToVector128Single(_x2)), _7f);
                    var _d2 = Sse.Multiply(_d, _d);
                    var _d3 = Sse.Multiply(_d2, _d);

                    // Cubic weights: w1 = 1 - 2d^2 + d^3, w2 = 4 - 8d + 5d^2 - d^3; w2 is chosen where d > 1.
                    var w1   = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                    var w2   = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                    var wb   = Sse2.CompareGreaterThan(_d, _1111f);
                    var _w   = Sse41.BlendVariable(w1, w2, wb);
                    // All-ones in lanes whose tap column is < 0 or >= width; those taps read column sx instead.
                    var _xpb = Sse2.Or(Sse2.CompareLessThan(_x2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_x2, _width), _1111).AsInt32(), _ff));
                    var _xpp = Sse2.And(_sx, _xpb);
                    var _xp  = Sse41.BlendVariable(_x2, _xpp, _xpb);

                    // Gather the four tap pixels (one 32-bit value each) from source row y.
                    var p = Avx2.GatherVector128((uint *)(this._ptr + this._stride * y), _xp, 4).AsByte();


                    // Expand each tap pixel's four bytes to four floats.
                    var _p0 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _00mask).AsInt32());
                    var _p1 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _01mask).AsInt32());
                    var _p2 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _10mask).AsInt32());
                    var _p3 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _11mask).AsInt32());

                    // Broadcast each tap's weight across the four channels.
                    var _w0 = Sse.Shuffle(_w, _w, 0);
                    var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                    var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                    var _w3 = Sse.Shuffle(_w, _w, 0b11111111);

                    // Weighted sum of the four taps, kept as unclamped floats.
                    var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));

                    Sse2.Store(tmpPos + x * 4, rgbaf);
                }
            });

            // Pass 2 (vertical): one destination row per iteration, reading tmp and writing rtnImage.
            Parallel.For(0, rtnImage.height, (y) =>
            {
                // Source-space row of this destination row and its truncated integer part.
                float py = (y * scaleY);
                int sy   = (int)py;

                // Scratch for one 16-byte vector store; only its first uint is copied out.
                uint *store = stackalloc uint[4];

                var _py = Vector128.CreateScalar(py);
                _py     = Sse.Shuffle(_py, _py, 0);

                var _sy = Vector128.CreateScalar(sy);
                _sy     = Sse2.Shuffle(_sy, 0);

                var _height = Vector128.CreateScalar(this.height);
                _height     = Sse2.Shuffle(_height, 0);

                // Tap rows sy-1, sy, sy+1, sy+2.
                var _y2 = Sse2.Add(_sy, _1012);

                // Same distance and weight computation as in the horizontal pass.
                var _d  = Sse.And(Sse.Subtract(_py, Sse2.ConvertToVector128Single(_y2)), _7f);
                var _d2 = Sse.Multiply(_d, _d);
                var _d3 = Sse.Multiply(_d2, _d);

                var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                var wb = Sse2.CompareGreaterThan(_d, _1111f);
                var _w = Sse41.BlendVariable(w1, w2, wb);


                // Tap rows outside [0, height) are replaced by row sy, then converted to float offsets into tmp.
                var _ypb = Sse2.Or(Sse2.CompareLessThan(_y2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_y2, _height), _1111).AsInt32(), _ff));
                var _ypp = Sse2.And(_sy, _ypb);
                var _yp  = Sse41.BlendVariable(_y2, _ypp, _ypb);
                var _yps = Sse41.MultiplyLow(_yp, _stride);

                // Per tap row: indices of the four channel floats of the pixel at x = 0.
                var _yp0  = Sse2.Add(Sse2.Shuffle(_yps, 0), _0123i);
                var _yp1  = Sse2.Add(Sse2.Shuffle(_yps, 0b01010101), _0123i);
                var _yp2  = Sse2.Add(Sse2.Shuffle(_yps, 0b10101010), _0123i);
                var _yp3  = Sse2.Add(Sse2.Shuffle(_yps, 0b11111111), _0123i);
                uint *rtn = (uint *)(rtnImage._ptr + rtnImage._stride * y);

                for (int x = 0; x < rtnImage.width; x++)
                {
                    // Fetch the current column's float pixel from each of the four tap rows.
                    var _p0 = Avx2.GatherVector128((float *)(tmp), _yp0, 4);
                    var _p1 = Avx2.GatherVector128((float *)(tmp), _yp1, 4);
                    var _p2 = Avx2.GatherVector128((float *)(tmp), _yp2, 4);
                    var _p3 = Avx2.GatherVector128((float *)(tmp), _yp3, 4);

                    var _w0 = Sse.Shuffle(_w, _w, 0);
                    var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                    var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                    var _w3 = Sse.Shuffle(_w, _w, 0b11111111);

                    var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));

                    // Clamp every channel to [0, 255].
                    var _b0 = Sse.CompareLessThan(rgbaf, _0000f);
                    rgbaf   = Sse41.BlendVariable(rgbaf, _0000f, _b0);
                    var _b1 = Sse.CompareGreaterThan(rgbaf, _255f);
                    rgbaf   = Sse41.BlendVariable(rgbaf, _255f, _b1);

                    // Convert to integers and pack the four low bytes into a single 32-bit pixel.
                    var rgbab = Sse2.ConvertToVector128Int32(rgbaf).AsByte();
                    var rgba  = Ssse3.Shuffle(rgbab, _vmask).AsUInt32();

                    Sse2.Store(store, rgba);

                    // Advance all four gather index vectors by one pixel (four floats).
                    _yp0 = Sse2.Add(_yp0, _4444);
                    _yp1 = Sse2.Add(_yp1, _4444);
                    _yp2 = Sse2.Add(_yp2, _4444);
                    _yp3 = Sse2.Add(_yp3, _4444);
                    // Only the first element of the scratch vector is the packed pixel.
                    *rtn = *store;
                    rtn++;
                }
            });
Ejemplo n.º 12
0
        /// <summary>
        /// Encrypts four 16-byte blocks in parallel with a 32-round cipher whose S-box is evaluated
        /// through the AES-NI <c>AESENCLAST</c> instruction wrapped in two affine byte transforms.
        /// NOTE(review): the round structure looks like SM4 - confirm against the enclosing type.
        /// </summary>
        /// <param name="rk">Round keys; elements 0..31 are read, one per round.</param>
        /// <param name="source">Input; the first 64 bytes (four blocks) are read.</param>
        /// <param name="destination">Output; receives 64 bytes (four blocks).</param>
        /// <exception cref="ArgumentException"><paramref name="destination"/> is shorter than 64 bytes.</exception>
        public static unsafe void Encrypt4(uint[] rk, ReadOnlySpan<byte> source, Span<byte> destination)
        {
            const int OutputBytes = 4 * 16;

            // The stores at the end go through a raw pointer and are not bounds-checked, so a short
            // destination would corrupt memory. Reject it before doing any work.
            if (destination.Length < OutputBytes)
            {
                throw new ArgumentException("Destination must have room for four 16-byte blocks (64 bytes).", nameof(destination));
            }

            // Transpose the input: tN holds 32-bit word N of each of the four blocks, byte-swapped.
            var p32 = MemoryMarshal.Cast<byte, uint>(source);
            var t3  = Vector128.Create(p32[3], p32[7], p32[11], p32[15]).ReverseEndianness32();
            var t2  = Vector128.Create(p32[2], p32[6], p32[10], p32[14]).ReverseEndianness32();
            var t1  = Vector128.Create(p32[1], p32[5], p32[9], p32[13]).ReverseEndianness32();
            var t0  = Vector128.Create(p32[0], p32[4], p32[8], p32[12]).ReverseEndianness32();

            for (var i = 0; i < 32; ++i)
            {
                // Round input: three state words XOR-ed with the broadcast round key.
                var x = t1.Xor(t2).Xor(t3).Xor(Vector128.Create(rk[i]).AsByte());

                // Inner affine: table lookups on the low and high nibble of every byte, XOR-ed together.
                var y = Sse2.And(x, c0f);
                y = Ssse3.Shuffle(m1l, y);
                x = Sse2.ShiftRightLogical(x.AsUInt64(), 4).AsByte();
                x = Sse2.And(x, c0f);
                x = Ssse3.Shuffle(m1h, x).Xor(y);

                // Byte permutation applied ahead of AESENCLAST (original note: inverse MixColumns).
                x = Ssse3.Shuffle(x, shr);
                x = Aes.EncryptLast(x, c0f);             // AES-NI

                // Outer affine: same nibble-lookup scheme; the low nibble comes from the complemented byte.
                y = Sse2.AndNot(x, c0f);
                y = Ssse3.Shuffle(m2l, y);
                x = Sse2.ShiftRightLogical(x.AsUInt64(), 4).AsByte();
                x = Sse2.And(x, c0f);
                x = Ssse3.Shuffle(m2h, x).Xor(y);

                // Four parallel linear transforms:
                // x ^ rotl(x ^ rotl(x, 8) ^ rotl(x, 16), 2) ^ rotl(x, 24) on each 32-bit word.
                y = x.Xor(x.RotateLeftUInt32_8()).Xor(x.RotateLeftUInt32_16());
                y = y.AsUInt32().RotateLeftUInt32(2).AsByte();
                x = x.Xor(y).Xor(x.RotateLeftUInt32_24());

                // Rotate the state registers: the new word enters at t3.
                x  = x.Xor(t0);
                t0 = t1;
                t1 = t2;
                t2 = t3;
                t3 = x;
            }

            // The output takes the state words in reverse order (t3, t2, t1, t0), byte-swapped back.
            var a = t3.ReverseEndianness32().AsUInt32();
            var b = t2.ReverseEndianness32().AsUInt32();
            var c = t1.ReverseEndianness32().AsUInt32();
            var d = t0.ReverseEndianness32().AsUInt32();

            // 4x4 transpose of 32-bit words, returning to one 16-byte block per vector.
            var x0 = Sse2.UnpackLow(a, b);
            var x1 = Sse2.UnpackLow(c, d);
            var x2 = Sse2.UnpackHigh(a, b);
            var x3 = Sse2.UnpackHigh(c, d);

            t0 = Sse2.UnpackLow(x0.AsUInt64(), x1.AsUInt64()).AsByte();
            t1 = Sse2.UnpackHigh(x0.AsUInt64(), x1.AsUInt64()).AsByte();
            t2 = Sse2.UnpackLow(x2.AsUInt64(), x3.AsUInt64()).AsByte();
            t3 = Sse2.UnpackHigh(x2.AsUInt64(), x3.AsUInt64()).AsByte();

            fixed (byte* p = destination)
            {
                Sse2.Store(p, t0);
                Sse2.Store(p + 16, t1);
                Sse2.Store(p + 32, t2);
                Sse2.Store(p + 48, t3);
            }
        }
Ejemplo n.º 13
0
        static unsafe int Main(string[] args)
        {
            int    testResult          = Pass;
            int    testsCount          = 21;
            string methodUnderTestName = nameof(Sse2.AndNot);

            if (Sse2.IsSupported)
            {
                using (var doubleTable = TestTableSse2 <double> .Create(testsCount))
                    using (var longTable = TestTableSse2 <long> .Create(testsCount))
                        using (var ulongTable = TestTableSse2 <ulong> .Create(testsCount))
                            using (var intTable = TestTableSse2 <int> .Create(testsCount))
                                using (var uintTable = TestTableSse2 <uint> .Create(testsCount))
                                    using (var shortTable = TestTableSse2 <short> .Create(testsCount))
                                        using (var ushortTable = TestTableSse2 <ushort> .Create(testsCount))
                                            using (var sbyteTable = TestTableSse2 <sbyte> .Create(testsCount))
                                                using (var byteTable = TestTableSse2 <byte> .Create(testsCount))
                                                {
                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <double>, Vector128 <double>, Vector128 <double>)value = doubleTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        doubleTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <long>, Vector128 <long>, Vector128 <long>)value = longTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        longTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <ulong>, Vector128 <ulong>, Vector128 <ulong>)value = ulongTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        ulongTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <int>, Vector128 <int>, Vector128 <int>)value = intTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        intTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <uint>, Vector128 <uint>, Vector128 <uint>)value = uintTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        uintTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <short>, Vector128 <short>, Vector128 <short>)value = shortTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        shortTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <ushort>, Vector128 <ushort>, Vector128 <ushort>)value = ushortTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        ushortTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <sbyte>, Vector128 <sbyte>, Vector128 <sbyte>)value = sbyteTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        sbyteTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <byte>, Vector128 <byte>, Vector128 <byte>)value = byteTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        byteTable.SetOutArray(result);
                                                    }

                                                    CheckMethod <double> checkDouble = (double x, double y, double z, ref double a) => (a = BinaryAndNot(x, y)) == z;

                                                    if (!doubleTable.CheckResult(checkDouble))
                                                    {
                                                        PrintError(doubleTable, methodUnderTestName, "(double x, double y, double z, ref double a) => (a = BinaryAndNot(x, y)) == z", checkDouble);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <long> checkLong = (long x, long y, long z, ref long a) => (a = (~x) & y) == z;

                                                    if (!longTable.CheckResult(checkLong))
                                                    {
                                                        PrintError(longTable, methodUnderTestName, "(long x, long y, long z, ref long a) => (a = (~x) & y) == z", checkLong);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <ulong> checkUlong = (ulong x, ulong y, ulong z, ref ulong a) => (a = (~x) & y) == z;

                                                    if (!longTable.CheckResult(checkLong))
                                                    {
                                                        PrintError(ulongTable, methodUnderTestName, "(ulong x, ulong y, ulong z, ref ulong a) => (a = (~x) & y) == z", checkUlong);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <int> checkInt32 = (int x, int y, int z, ref int a) => (a = (~x) & y) == z;

                                                    if (!intTable.CheckResult(checkInt32))
                                                    {
                                                        PrintError(intTable, methodUnderTestName, "(int x, int y, int z, ref int a) => (a = (~x) & y) == z", checkInt32);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <uint> checkUInt32 = (uint x, uint y, uint z, ref uint a) => (a = (~x) & y) == z;

                                                    if (!uintTable.CheckResult(checkUInt32))
                                                    {
                                                        PrintError(uintTable, methodUnderTestName, "(uint x, uint y, uint z, ref uint a) => (a = (~x) & y) == z", checkUInt32);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <short> checkInt16 = (short x, short y, short z, ref short a) => (a = (short)((~x) & y)) == z;

                                                    if (!shortTable.CheckResult(checkInt16))
                                                    {
                                                        PrintError(shortTable, methodUnderTestName, "(short x, short y, short z, ref short a) => (a = (short)((~x) & y)) == z", checkInt16);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <ushort> checkUInt16 = (ushort x, ushort y, ushort z, ref ushort a) => (a = (ushort)((~x) & y)) == z;

                                                    if (!ushortTable.CheckResult(checkUInt16))
                                                    {
                                                        PrintError(ushortTable, methodUnderTestName, "(ushort x, ushort y, ushort z, ref ushort a) => (a = (ushort)((~x) & y)) == z", checkUInt16);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <sbyte> checkSByte = (sbyte x, sbyte y, sbyte z, ref sbyte a) => (a = (sbyte)((~x) & y)) == z;

                                                    if (!sbyteTable.CheckResult(checkSByte))
                                                    {
                                                        PrintError(sbyteTable, methodUnderTestName, "(sbyte x, sbyte y, sbyte z, ref sbyte a) =>(a = (sbyte)((~x) & y)) == z", checkSByte);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <byte> checkByte = (byte x, byte y, byte z, ref byte a) => (a = (byte)((~x) & y)) == z;

                                                    if (!byteTable.CheckResult(checkByte))
                                                    {
                                                        PrintError(byteTable, methodUnderTestName, "(byte x, byte y, byte z, ref byte a) =>  (a = (byte)((~x) & y)) == z", checkByte);
                                                        testResult = Fail;
                                                    }
                                                }
            }
            else
            {
                Console.WriteLine($"Sse2.IsSupported: {Sse2.IsSupported}, skipped tests of {typeof(Sse2)}.{methodUnderTestName}");
            }

            return(testResult);
        }
 /// <summary>
 /// Masks <paramref name="a"/> with the complement of <paramref name="m"/>.
 /// Forwards to <c>Sse2.AndNot(m, a)</c>, which complements its first operand,
 /// so the result is <c>(~m) &amp; a</c>.
 /// </summary>
 /// <param name="a">Value whose bits are kept where <paramref name="m"/> is clear.</param>
 /// <param name="m">Mask; every bit set here is cleared in the result.</param>
 /// <returns>The result of <c>Sse2.AndNot(m, a)</c> as an <c>i32</c>.</returns>
 /// <remarks>
 /// NOTE(review): <c>i32</c> and <c>m32</c> are declared elsewhere; presumably
 /// 32-bit-lane vector and mask types convertible to Vector128 - confirm.
 /// </remarks>
 public static i32 NMask_i32(i32 a, m32 m) => Sse2.AndNot(m, a);
 /// <summary>
 /// Computes <c>a &amp; (~b)</c> for two mask values.
 /// The operands are passed to <c>Sse2.AndNot</c> in swapped order because that
 /// intrinsic complements its first argument rather than its second.
 /// </summary>
 /// <param name="a">Mask whose bits are kept where <paramref name="b"/> is clear.</param>
 /// <param name="b">Mask that is complemented before the AND.</param>
 /// <returns>The result of <c>Sse2.AndNot(b, a)</c> as an <c>m32</c>.</returns>
 /// <remarks>
 /// NOTE(review): <c>m32</c> is declared elsewhere; presumably a 32-bit-lane
 /// mask type convertible to Vector128 - confirm.
 /// </remarks>
 public static m32 BitwiseAndNot_m32(m32 a, m32 b) => Sse2.AndNot(b, a);
 /// <summary>
 /// Computes <c>a &amp; (~b)</c> for two integer vector values.
 /// The operands are passed to <c>Sse2.AndNot</c> in swapped order because that
 /// intrinsic complements its first argument rather than its second.
 /// </summary>
 /// <param name="a">Value whose bits are kept where <paramref name="b"/> is clear.</param>
 /// <param name="b">Value that is complemented before the AND.</param>
 /// <returns>The result of <c>Sse2.AndNot(b, a)</c> as an <c>i32</c>.</returns>
 /// <remarks>
 /// NOTE(review): <c>i32</c> is declared elsewhere; presumably a 32-bit-lane
 /// integer vector type convertible to Vector128 - confirm.
 /// </remarks>
 public static i32 BitwiseAndNot_i32(i32 a, i32 b) => Sse2.AndNot(b, a);