C# (CSharp) Sse2.AndNot 예제들

프로그래밍 언어: C# (CSharp)

클래스/타입: Sse2

메소드/함수: AndNot

hotexamples.com에서의 예제들: 16

C# (CSharp) Sse2.AndNot - 16개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 C# (CSharp)의 Sse2.AndNot에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Add(30)

CompareLessThan(30)

And(30)

CompareEqual(30)

CompareGreaterThan(30)

CompareLessThanOrEqual(20)

CompareGreaterThanOrEqual(19)

AddScalar(18)

CompareOrdered(18)

CompareNotGreaterThanOrEqual(18)

CompareNotGreaterThan(18)

CompareNotEqual(18)

CompareNotLessThanOrEqual(17)

CompareNotLessThan(17)

AndNot(16)

CompareScalarOrderedEqual(15)

CompareScalarUnorderedEqual(14)

CompareScalarOrderedLessThan(14)

CompareScalarOrderedNotEqual(13)

CompareScalarOrderedGreaterThan(13)

CompareScalarOrderedLessThanOrEqual(12)

CompareUnordered(12)

CompareScalarUnorderedNotEqual(12)

CompareScalarUnorderedGreaterThan(12)

CompareScalarUnorderedLessThan(12)

CompareScalarUnorderedLessThanOrEqual(11)

CompareScalarOrderedGreaterThanOrEqual(11)

CompareScalarNotGreaterThanOrEqual(11)

CompareScalarUnorderedGreaterThanOrEqual(11)

CompareScalarGreaterThan(10)

CompareScalarGreaterThanOrEqual(10)

CompareScalarNotGreaterThan(10)

CompareScalarNotEqual(9)

CompareScalarUnordered(9)

CompareGreaterThanScalar(9)

CompareGreaterThanUnorderedScalar(9)

CompareGreaterThanOrEqualScalar(9)

CompareUnorderedScalar(9)

CompareLessThanOrEqualOrderedScalar(9)

CompareLessThanOrderedScalar(9)

CompareScalarOrdered(9)

CompareScalarNotLessThanOrEqual(9)

CompareScalarNotLessThan(9)

CompareLessThanScalar(9)

CompareNotEqualUnorderedScalar(9)

CompareEqualScalar(9)

CompareScalarLessThan(9)

CompareGreaterThanOrderedScalar(9)

CompareScalarEqual(9)

CompareOrderedScalar(9)

예제 #1

파일 보기

        /// <summary>
        /// Scenario: creates a local <c>TestStruct</c>, loads both operands from its fields through
        /// pointer-based <c>Sse2.LoadVector128</c>, applies <c>Sse2.AndNot</c>, then writes the result
        /// to the output table and validates it against the original field values.
        /// </summary>
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var localStruct = TestStruct.Create();

            // Load each operand directly from the address of the corresponding field.
            var leftOperand  = Sse2.LoadVector128((Int32 *)(&localStruct._fld1));
            var rightOperand = Sse2.LoadVector128((Int32 *)(&localStruct._fld2));
            var andNotResult = Sse2.AndNot(leftOperand, rightOperand);

            Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }

예제 #2

파일 보기

파일: AndNot.UInt32.cs 프로젝트: Fredo-Q/dotnet-coreclr

            /// <summary>
            /// Scenario: pins this instance's <c>_fld1</c> and <c>_fld2</c>, loads them through
            /// <c>Sse2.LoadVector128</c>, applies <c>Sse2.AndNot</c>, and stores/validates the result
            /// using the data table and validator of <paramref name="testClass"/>.
            /// </summary>
            /// <param name="testClass">Test instance supplying the output buffer and <c>ValidateResult</c>.</param>
            public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndNotUInt32 testClass)
            {
                fixed(Vector128 <UInt32> *pLeft = &_fld1)
                fixed(Vector128 <UInt32> *pRight = &_fld2)
                {
                    // Operands are read through the pinned pointers, left first.
                    var leftOperand  = Sse2.LoadVector128((UInt32 *)(pLeft));
                    var rightOperand = Sse2.LoadVector128((UInt32 *)(pRight));
                    var andNotResult = Sse2.AndNot(leftOperand, rightOperand);

                    Unsafe.Write(testClass._dataTable.outArrayPtr, andNotResult);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }

예제 #3

파일 보기

파일: AndNot.UInt32.cs 프로젝트: Fredo-Q/dotnet-coreclr

        /// <summary>
        /// Scenario: pins this instance's <c>_fld1</c> and <c>_fld2</c>, loads them through
        /// <c>Sse2.LoadVector128</c>, applies <c>Sse2.AndNot</c>, then writes the result to the
        /// output table and validates it.
        /// </summary>
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed(Vector128 <UInt32> *pLeft = &_fld1)
            fixed(Vector128 <UInt32> *pRight = &_fld2)
            {
                // Operands are read through the pinned pointers, left first.
                var leftOperand  = Sse2.LoadVector128((UInt32 *)(pLeft));
                var rightOperand = Sse2.LoadVector128((UInt32 *)(pRight));
                var andNotResult = Sse2.AndNot(leftOperand, rightOperand);

                Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
                ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
            }
        }

예제 #4

파일 보기

        /// <summary>
        /// Scenario: pins <c>_clsVar1</c> and <c>_clsVar2</c>, loads them through
        /// <c>Sse2.LoadVector128</c>, applies <c>Sse2.AndNot</c>, then writes the result to the
        /// output table and validates it.
        /// </summary>
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed(Vector128 <Double> *pLeft = &_clsVar1)
            fixed(Vector128 <Double> *pRight = &_clsVar2)
            {
                // Operands are read through the pinned pointers, left first.
                var leftOperand  = Sse2.LoadVector128((Double *)(pLeft));
                var rightOperand = Sse2.LoadVector128((Double *)(pRight));
                var andNotResult = Sse2.AndNot(leftOperand, rightOperand);

                Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
                ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
            }
        }

예제 #5

파일 보기

        /// <summary>
        /// Scenario: creates a fresh <c>SimpleBinaryOpTest__AndNotDouble</c>, pins its <c>_fld1</c> and
        /// <c>_fld2</c>, loads them through <c>Sse2.LoadVector128</c>, applies <c>Sse2.AndNot</c>, then
        /// writes the result to this instance's output table and validates it.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var localInstance = new SimpleBinaryOpTest__AndNotDouble();

            fixed(Vector128 <Double> *pLeft = &localInstance._fld1)
            fixed(Vector128 <Double> *pRight = &localInstance._fld2)
            {
                // Operands are read through the pinned pointers, left first.
                var leftOperand  = Sse2.LoadVector128((Double *)(pLeft));
                var rightOperand = Sse2.LoadVector128((Double *)(pRight));
                var andNotResult = Sse2.AndNot(leftOperand, rightOperand);

                Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
                ValidateResult(localInstance._fld1, localInstance._fld2, _dataTable.outArrayPtr);
            }
        }

예제 #6

파일 보기

        /// <summary>
        /// Builds a vector bit by bit: where a bit of <paramref name="selector"/> is set the matching bit
        /// of <paramref name="ifTrue"/> is taken, otherwise the matching bit of <paramref name="ifFalse"/>.
        /// </summary>
        /// <param name="selector">Bit mask choosing between the two sources.</param>
        /// <param name="ifTrue">Source for bits whose selector bit is set.</param>
        /// <param name="ifFalse">Source for bits whose selector bit is clear.</param>
        /// <returns>The bitwise blend of <paramref name="ifTrue"/> and <paramref name="ifFalse"/>.</returns>
        /// <exception cref="PlatformNotSupportedException">Neither AdvSimd nor Sse2 is supported.</exception>
        public static Vector128 <double> ConditionalSelectBitwise(Vector128 <double> selector, Vector128 <double> ifTrue, Vector128 <double> ifFalse)
        {
            // This implementation is based on the DirectX Math Library:
            // https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl

            if (AdvSimd.IsSupported)
            {
                return AdvSimd.BitwiseSelect(selector, ifTrue, ifFalse);
            }

            if (!Sse2.IsSupported)
            {
                // No supported instruction set is available for this operation.
                throw new PlatformNotSupportedException();
            }

            // (ifTrue & selector) | (~selector & ifFalse)
            Vector128 <double> bitsFromTrue  = Sse2.And(ifTrue, selector);
            Vector128 <double> bitsFromFalse = Sse2.AndNot(selector, ifFalse);
            return Sse2.Or(bitsFromTrue, bitsFromFalse);
        }

예제 #7

파일 보기

파일: Utf16Utility.Validation.cs 프로젝트: humbatoa/runtime

        // Returns &inputBuffer[inputLength] if the input buffer is valid.
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <param name="pInputBuffer">Pointer to the first UTF-16 code unit to validate; may be null only when <paramref name="inputLength"/> is zero.</param>
        /// <param name="inputLength">Number of chars available at <paramref name="pInputBuffer"/>; must not be negative.</param>
        /// <param name="utf8CodeUnitCountAdjustment">
        /// On return, the value to add to the number of UTF-16 code units seen to obtain the number of
        /// UTF-8 code units required to encode that data.
        /// </param>
        /// <param name="scalarCountAdjustment">
        /// On return, the value to add to the number of UTF-16 code units seen to obtain the number of
        /// scalars present in that data.
        /// </param>
        /// <returns>
        /// A pointer to the first invalid char, or to the end of the buffer if no invalid data was found.
        /// </returns>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// </remarks>
        public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);

            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            inputLength  -= numAsciiCharsConsumedJustNow;

            if (inputLength == 0)
            {
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment       = 0;
                return(pInputBuffer);
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen.  To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int  tempScalarCountAdjustment       = 0;

            if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
            {
                if (inputLength >= Vector128 <ushort> .Count)
                {
                    Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128 <short>  vector8800 = Vector128.Create(unchecked ((short)0x8800));
                    Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;
                    do
                    {
                        Vector128 <ushort> utf16Data;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }
                        else
                        {
                            utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }

                        Vector128 <ushort> charIsNonAscii;

                        if (AdvSimd.Arm64.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                        }
                        else if (Sse41.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                        }
                        else
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                            // be handled in a few lines.

                            charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                        }

#if DEBUG
                        // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                        uint debugMask;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                        }
                        else
                        {
                            debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                        }
                        Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                        // Sets the 0x8080 bits of each element in 'charIsThreeByteUtf8Encoded' if the
                        // corresponding input was 0x0800 <= [value]. This also handles the missing range
                        // a few lines above.

                        Vector128 <ushort> charIsThreeByteUtf8Encoded;
                        uint mask;

                        if (AdvSimd.IsSupported)
                        {
                            charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                            mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }
                        else
                        {
                            charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                            mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }

                        // Each even bit of mask will be 1 only if the char was >= 0x0080,
                        // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is >= 0x0800
                        //            |   ,-- set if char[0] is >= 0x0800
                        //            v   v
                        // mask = ... 1 1 0 1
                        //              ^   ^-- set if char[0] is non-ASCII
                        //              `-- set if char[1] is non-ASCII
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.
                        //
                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                        // cumulative UTF-8 adjustment factor once we determine that there are no
                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                        // our computed result and we'd have to throw it away.)

                        uint popcnt = (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                            mask      = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }
                        else
                        {
                            utf16Data = Sse2.Add(utf16Data, vectorA800);
                            mask      = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                            //   Since 'mask' already has 00 in these positions (since the corresponding char
                            //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                            uint mask2;
                            if (AdvSimd.Arm64.IsSupported)
                            {
                                mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                            }
                            else
                            {
                                mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                            }

                            // 'lowSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a low surrogate char,
                            // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                            uint lowSurrogatesMask = mask2 & mask;

                            // 'highSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a high surrogate char,
                            // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                            uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                            Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
                                         "A char cannot simultaneously be both a high and a low surrogate char.");

                            Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
                                         "Only even bits (no odd bits) of the masks should be set.");

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            // Shifting by 2 bits moves each high-surrogate marker onto the position of the
                            // following char, so it can be compared directly against the low-surrogate markers.
                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                popcnt            -= 2;                          // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                                pInputBuffer--;
                                inputLength++;
                            }

                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
                            // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                            // 64-bit extension a few lines below.
                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
                            // perform this adjustment now.

                            if (IntPtr.Size == 8)
                            {
                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                                // sub + sub. It's more efficient than shl + sub.
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                            }
                            else
                            {
                                // Take the hit of the 64-bit extension now.
                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                            }
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt;
                        pInputBuffer += Vector128 <ushort> .Count;
                        inputLength  -= Vector128 <ushort> .Count;
                    } while (inputLength >= Vector128 <ushort> .Count);
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector <ushort> .Count)
                {
                    Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
                    Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
                    Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
                    Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector <ushort>  utf16Data            = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                        Vector <ushort>  twoOrMoreUtf8Bytes   = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector <ushort>  threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector <nuint_t> sumVector            = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector <nuint_t> .Count; i++)
                        {
                            popcnt += (nuint)sumVector[i];
                        }

                        // On 64-bit, fold the upper 32 bits of the native-word sum into the lower 32 bits.
                        uint popcnt32 = (uint)popcnt;
                        if (IntPtr.Size == 8)
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        // As in the Vector128 path above, compute popcnt but don't fold it in until we
                        // know there aren't any unpaired surrogates in the input data.

                        // Fold the upper 16 bits into the lower 16 bits to finish the horizontal sum.
                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                        // Now check for surrogates.

                        utf16Data -= vectorD800;
                        Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector <ushort> .Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector <ushort> lowSurrogateChars  = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            if (lowSurrogateChars[0] != 0)
                            {
                                goto Error; // error: start of buffer contains standalone low surrogate char
                            }

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                pInputBuffer--;
                                inputLength++;
                                popcnt32 -= 2;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt32;
                        pInputBuffer += Vector <ushort> .Count;
                        inputLength  -= Vector <ushort> .Count;
                    } while (inputLength >= Vector <ushort> .Count);
                }
            }

NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                // Read two consecutive chars as one 32-bit value and check both at once: after
                // subtracting the expected bases, the top six bits of each 16-bit half must be zero,
                // i.e. the first char is in D800..DBFF and the second is in DC00..DFFF.
                thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--;          // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++;                       // consumed one extra char
                inputLength--;
            }

Error:

            // Also used for normal return.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment       = tempScalarCountAdjustment;
            return(pInputBuffer);
        }

예제 #8

파일 보기

        /// <summary>
        /// Writes an <c>int[]</c> to <paramref name="writer"/> as a MessagePack array, using an
        /// SSE4.1 path that encodes four elements per iteration when available and a scalar
        /// <c>writer.Write</c> loop otherwise.
        /// </summary>
        /// <param name="writer">Destination writer; receives nil, or the array header followed by the elements.</param>
        /// <param name="value">Array to serialize; <c>null</c> is written as nil.</param>
        /// <param name="options">Serializer options (not read by this method).</param>
        public unsafe void Serialize(ref MessagePackWriter writer, int[]?value, MessagePackSerializerOptions options)
        {
            if (value == null)
            {
                writer.WriteNil();
                return;
            }

            var inputLength = value.Length;

            writer.WriteArrayHeader(inputLength);
            if (inputLength == 0)
            {
                return;
            }

            fixed(int *pSource = &value[0])
            {
                var inputEnd      = pSource + inputLength;
                var inputIterator = pSource;

                if (Sse41.IsSupported)
                {
                    // Four 32-bit elements fit in one 128-bit vector.
                    const int ShiftCount = 2;
                    const int Stride     = 1 << ShiftCount;

                    // Fewer than two full vectors: not worth the setup, use the scalar loop.
                    if (inputLength < Stride << 1)
                    {
                        goto ProcessEach;
                    }

                    {
                        // Try to advance inputIterator to a 16-byte boundary.
                        // NOTE(review): presumably CalculateDifferenceAlign16 returns the byte distance
                        // to the next 16-byte boundary — confirm against the helper.
                        var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(inputIterator);
                        // Only a byte offset that is a multiple of 4 corresponds to a whole number of ints;
                        // otherwise the pointer is left as-is (the loop below uses an unaligned load anyway).
                        if ((offset & 3) == 0)
                        {
                            offset     >>= 2; // bytes -> element count
                            inputLength -= offset;
                            var offsetEnd = inputIterator + offset;
                            while (inputIterator != offsetEnd)
                            {
                                writer.Write(*inputIterator++);
                            }
                        }
                    }

                    fixed(byte *tablePointer = &ShuffleAndMaskTable[0])
                    {
                        // Per-index output byte counts live inside the same table, CountTableOffset bytes in.
                        var countPointer = (int *)(tablePointer + CountTableOffset);

                        fixed(byte *maskTablePointer = &SingleInstructionMultipleDataPrimitiveArrayFormatterHelper.StoreMaskTable[0])
                        {
                            // Range boundaries, broadcast to all four lanes. The "M1" values are
                            // (minimum - 1) so that "greater than" acts as "greater than or equal to minimum".
                            var vectorShortMinValueM1 = Vector128.Create(short.MinValue - 1);
                            var vectorSByteMinValueM1 = Vector128.Create(sbyte.MinValue - 1);
                            var vectorMinFixNegIntM1  = Vector128.Create(MessagePackRange.MinFixNegativeInt - 1);
                            var vectorSByteMaxValue   = Vector128.Create((int)sbyte.MaxValue);
                            var vectorByteMaxValue    = Vector128.Create((int)byte.MaxValue);
                            var vectorUShortMaxValue  = Vector128.Create((int)ushort.MaxValue);
                            // Lane multipliers that turn two per-lane class counts into one table index (see below).
                            var vectorM1M7            = Vector128.Create(-1, -7, -1, -7);
                            // Shuffle mask that gathers the low byte of each int (bytes 0, 4, 8, 12); 0x80 zeroes a byte.
                            var vectorIn1Range        = Vector128.Create(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

                            // Process whole vectors only; the remainder is handled by the scalar loop at ProcessEach.
                            for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                            {
                                var current = Sse2.LoadVector128(inputIterator);
                                var isGreaterThanMinFixNegIntM1 = Sse2.CompareGreaterThan(current, vectorMinFixNegIntM1);
                                var isGreaterThanSByteMaxValue  = Sse2.CompareGreaterThan(current, vectorSByteMaxValue);

                                // Fast path: every lane is > MinFixNegativeInt - 1 and NOT > sbyte.MaxValue,
                                // so each element is emitted as its single low byte (four bytes in total).
                                if (Sse2.MoveMask(Sse2.AndNot(isGreaterThanSByteMaxValue, isGreaterThanMinFixNegIntM1).AsByte()) == 0xFFFF)
                                {
                                    var answer = Ssse3.Shuffle(current.AsByte(), vectorIn1Range).AsUInt32();
                                    var span   = writer.GetSpan(Stride);
                                    Unsafe.As <byte, uint>(ref span[0]) = answer.GetElement(0);
                                    writer.Advance(Stride);
                                    continue;
                                }

                                // Each comparison yields -1 (true) or 0 per lane, so the sum of the six
                                // comparisons is -(number of boundaries the element exceeds), i.e. 0..-6.
                                var indexVector = Sse2.Add(isGreaterThanSByteMaxValue, isGreaterThanMinFixNegIntM1);
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorUShortMaxValue));
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorByteMaxValue));
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorShortMinValueM1));
                                indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorSByteMinValueM1));
                                // Multiply by (-1, -7, -1, -7) and add adjacent lanes: each pair of elements
                                // becomes one index = class(even lane) + 7 * class(odd lane).
                                indexVector = Sse41.MultiplyLow(indexVector, vectorM1M7);
                                indexVector = Ssse3.HorizontalAdd(indexVector, indexVector);

                                var index0 = indexVector.GetElement(0); // elements 0 and 1
                                var index1 = indexVector.GetElement(1); // elements 2 and 3

                                // Number of output bytes produced by each pair.
                                var count0     = countPointer[index0];
                                var count1     = countPointer[index1];
                                var countTotal = count0 + count1;

                                var destination = writer.GetSpan(countTotal);
                                fixed(byte *pDestination = &destination[0])
                                {
                                    var tmpDestination = pDestination;

                                    // Each table entry is 32 bytes: a 16-byte shuffle mask followed by a
                                    // 16-byte constant that is OR-ed into the shuffled bytes.
                                    // NOTE(review): the constant presumably carries the MessagePack type-code
                                    // bytes — confirm against ShuffleAndMaskTable's definition.
                                    var item0     = tablePointer + (index0 << 5);
                                    var shuffle0  = Sse2.LoadVector128(item0);
                                    var shuffled0 = Ssse3.Shuffle(current.AsByte(), shuffle0);
                                    var constant0 = Sse2.LoadVector128(item0 + 16);
                                    var answer0   = Sse2.Or(shuffled0, constant0);

                                    // Masked store: the store mask is looked up by byte count (16 bytes per entry).
                                    Sse2.MaskMove(answer0, Sse2.LoadVector128(maskTablePointer + (count0 << 4)), pDestination);
                                    tmpDestination += count0;

                                    // Shift the vector right by 8 bytes so elements 2 and 3 sit in the low half,
                                    // then encode them the same way as the first pair.
                                    var shift1    = Sse2.ShiftRightLogical128BitLane(current, 8).AsByte();
                                    var item1     = tablePointer + (index1 << 5);
                                    var shuffle1  = Sse2.LoadVector128(item1);
                                    var shuffled1 = Ssse3.Shuffle(shift1, shuffle1);
                                    var constant1 = Sse2.LoadVector128(item1 + 16);
                                    var answer1   = Sse2.Or(shuffled1, constant1);

                                    Sse2.MaskMove(answer1, Sse2.LoadVector128(maskTablePointer + (count1 << 4)), tmpDestination);
                                }

                                writer.Advance(countTotal);
                            }
                        }
                    }
                }

                // Scalar path: also finishes whatever the vectorized loop left over.
ProcessEach:
                while (inputIterator != inputEnd)
                {
                    writer.Write(*inputIterator++);
                }
            }
        }

예제 #9

파일 보기

        /// <summary>
        /// Computes how many UTF-16 code units are needed to represent <paramref name="utf8Data"/>.
        /// The input is not validated; the result is only meaningful for well-formed UTF-8.
        /// </summary>
        /// <param name="utf8Data">Well-formed UTF-8 bytes.</param>
        /// <returns>The UTF-16 code unit count (never greater than <c>utf8Data.Length</c> for well-formed input).</returns>
        /// <exception cref="NotImplementedException">SSE2 and POPCNT are not both available; no fallback exists.</exception>
        public static unsafe int GetUtf16CharCountFromKnownWellFormedUtf8(ReadOnlySpan <byte> utf8Data)
        {
            // Remember: the number of resulting UTF-16 chars will never be greater than the number
            // of UTF-8 bytes given well-formed input, so we can get away with casting the final
            // result to an 'int'.

            fixed(byte *pPinnedUtf8Data = &MemoryMarshal.GetReference(utf8Data))
            {
                if (Sse2.IsSupported && Popcnt.IsSupported)
                {
                    // Optimizations via SSE2 & POPCNT are available - use them.

                    Debug.Assert(BitConverter.IsLittleEndian, "SSE2 only supported on little-endian platforms.");
                    Debug.Assert(sizeof(nint) == IntPtr.Size, "nint defined incorrectly.");
                    Debug.Assert(sizeof(nuint) == IntPtr.Size, "nuint defined incorrectly.");

                    byte *pBuffer      = pPinnedUtf8Data;
                    nuint bufferLength = (uint)utf8Data.Length;

                    // Optimization: Can we stay in the all-ASCII code paths?

                    nuint utf16CharCount = GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength);

                    if (utf16CharCount != bufferLength)
                    {
                        // Found at least one non-ASCII byte, so fall down the slower (but still vectorized) code paths.
                        // Given well-formed UTF-8 input, we can compute the number of resulting UTF-16 code units
                        // using the following formula:
                        //
                        // utf16CharCount = utf8ByteCount - numUtf8ContinuationBytes + numUtf8FourByteHeaders

                        utf16CharCount = bufferLength;

                        // Signed-compare constants:
                        //   x < 0xC0 (as sbyte)          <=> x is a continuation byte (0x80..0xBF)
                        //   (x ^ 0x80) > 0x6F (as sbyte) <=> x >= 0xF0, i.e. a four-byte sequence header
                        Vector128 <sbyte> vecAllC0 = Vector128.Create(unchecked ((sbyte)0xC0));
                        Vector128 <sbyte> vecAll80 = Vector128.Create(unchecked ((sbyte)0x80));
                        Vector128 <sbyte> vecAll6F = Vector128.Create(unchecked ((sbyte)0x6F));

                        {
                            // Perform an aligned read of the first part of the buffer.
                            // We'll mask out any data at the start of the buffer we don't care about.
                            //
                            // For example, if (pBuffer MOD 16) = 2:
                            // [ AA BB CC DD ... ] <-- original vector
                            // [ 00 00 CC DD ... ] <-- after PANDN operation

                            // offset is in [-15, 0]: pBuffer + offset is the enclosing 16-byte boundary, and the
                            // first (-offset) lanes of that vector precede the caller's data.
                            nint offset = -((nint)pBuffer & (sizeof(Vector128 <sbyte>) - 1));

                            // Mask out lane i when i < -offset. This assumes VectorOfElementIndices holds the lane
                            // indices 0..15 in ascending order, which is what the tail masking further down relies on.
                            // (The previous comparand, offset + 16 - 1, cleared lanes below 15 - |offset| instead,
                            // e.g. fifteen valid bytes of an already-aligned buffer.)
                            Vector128 <sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(Vector128.Create((byte)(-(int)offset)).AsSByte(), VectorOfElementIndices);
                            Vector128 <sbyte> thisVector        = Sse2.AndNot(shouldBeMaskedOut, Unsafe.Read <Vector128 <sbyte> >(pBuffer + offset));

                            // If there's any data at the end of the buffer we don't care about, mask it out now.
                            // If this happens the 'bufferLength' value will be a lie, but it'll cause all of the
                            // branches later in the method to be skipped, so it's not a huge problem.

                            // (nuint)offset wraps, so the right-hand side is 16 - |offset|: the number of
                            // caller-owned bytes that fit in this first vector.
                            if (bufferLength < (nuint)offset + (uint)sizeof(Vector128 <sbyte>))
                            {
                                Vector128 <sbyte> shouldBeAllowed = Sse2.CompareLessThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - (int)offset)).AsSByte());
                                thisVector   = Sse2.And(shouldBeAllowed, thisVector);
                                bufferLength = (nuint)offset + (uint)sizeof(Vector128 <sbyte>);
                            }

                            // Zeroed (masked) lanes are neither continuation bytes nor four-byte headers,
                            // so they do not affect either count.
                            uint maskOfContinuationBytes  = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                            uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                            utf16CharCount -= countOfContinuationBytes;

                            uint maskOfFourByteHeaders  = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                            uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                            utf16CharCount += countOfFourByteHeaders;

                            // Consume the (16 - |offset|) bytes just processed and step to the next boundary.
                            bufferLength -= (nuint)offset;
                            bufferLength -= (uint)sizeof(Vector128 <sbyte>);

                            pBuffer += offset;
                            pBuffer += (uint)sizeof(Vector128 <sbyte>);
                        }

                        // At this point, pBuffer is guaranteed aligned.

                        Debug.Assert((nuint)pBuffer % (uint)sizeof(Vector128 <sbyte>) == 0, "pBuffer should have been aligned.");

                        while (bufferLength >= (uint)sizeof(Vector128 <sbyte>))
                        {
                            Vector128 <sbyte> thisVector = Sse2.LoadAlignedVector128((sbyte *)pBuffer);

                            uint maskOfContinuationBytes  = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                            uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                            utf16CharCount -= countOfContinuationBytes;

                            uint maskOfFourByteHeaders  = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                            uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                            utf16CharCount += countOfFourByteHeaders;

                            pBuffer      += sizeof(Vector128 <sbyte>);
                            bufferLength -= (uint)sizeof(Vector128 <sbyte>);
                        }

                        if ((uint)bufferLength > 0)
                        {
                            // There's still more data to be read.
                            // We need to mask out elements of the vector we don't care about.
                            // These elements will occur at the end of the vector.
                            //
                            // For example, if 14 bytes remain in the input stream:
                            // [ ... CC DD EE FF ] <-- original vector
                            // [ ... CC DD 00 00 ] <-- after PANDN operation

                            Vector128 <sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - 1)).AsSByte());
                            Vector128 <sbyte> thisVector        = Sse2.AndNot(shouldBeMaskedOut, *(Vector128 <sbyte> *)pBuffer);

                            uint maskOfContinuationBytes  = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                            uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                            utf16CharCount -= countOfContinuationBytes;

                            uint maskOfFourByteHeaders  = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                            uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                            utf16CharCount += countOfFourByteHeaders;
                        }
                    }

                    return((int)utf16CharCount);
                }
                else
                {
                    // Cannot use SSE2 & POPCNT. Fall back to slower code paths.

                    throw new NotImplementedException();
                }
            }
        }

예제 #10

파일 보기

파일: ReadOnlySpanOfByteExtensions.cs 프로젝트: techtician/Cogito

        /// <summary>
        /// Computes the bitwise AND-NOT of two byte spans: every output byte is
        /// <c>~l[i] &amp; r[i]</c> (the left operand is inverted, then AND-ed with the right).
        /// </summary>
        /// <param name="l">Left operand; its bits are inverted before the AND.</param>
        /// <param name="r">Right operand.</param>
        /// <param name="o">Destination span; both inputs must have exactly this length.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="l"/> or <paramref name="r"/> differs in length from <paramref name="o"/>.
        /// </exception>
        public static void AndNot(this ReadOnlySpan <byte> l, ReadOnlySpan <byte> r, Span <byte> o)
        {
            var total = o.Length;

            if (l.Length != total)
            {
                throw new ArgumentException("Left span size must be equal to output size.");
            }
            if (r.Length != total)
            {
                throw new ArgumentException("Right span size must be equal to output size.");
            }

            // Number of bytes already written; every stage below picks up where the previous one stopped.
            var done = 0;

#if NETCOREAPP3_0
            unsafe
            {
                fixed(byte *lBase = l)
                fixed(byte *rBase = r)
                fixed(byte *oBase = o)
                {
                    // 32 bytes per step.
                    if (Avx2.IsSupported)
                    {
                        for (; total - done >= 32; done += 32)
                        {
                            var lhs = Avx.LoadVector256((ulong *)(lBase + done));
                            var rhs = Avx.LoadVector256((ulong *)(rBase + done));

                            Avx.Store((ulong *)(oBase + done), Avx2.AndNot(lhs, rhs));
                        }
                    }

                    // 16 bytes per step.
                    if (Sse2.IsSupported)
                    {
                        for (; total - done >= 16; done += 16)
                        {
                            var lhs = Sse2.LoadVector128((ulong *)(lBase + done));
                            var rhs = Sse2.LoadVector128((ulong *)(rBase + done));

                            Sse2.Store((ulong *)(oBase + done), Sse2.AndNot(lhs, rhs));
                        }
                    }
                }
            }
#endif

            // 8 bytes per step.
            for (; total - done >= sizeof(ulong); done += sizeof(ulong))
            {
                var lhs = MemoryMarshal.Cast <byte, ulong>(l.Slice(done, sizeof(ulong)))[0];
                var rhs = MemoryMarshal.Cast <byte, ulong>(r.Slice(done, sizeof(ulong)))[0];

                MemoryMarshal.Cast <byte, ulong>(o.Slice(done, sizeof(ulong)))[0] = ~lhs & rhs;
            }

            // 4 bytes per step (at most one step can remain here).
            for (; total - done >= sizeof(uint); done += sizeof(uint))
            {
                var lhs = MemoryMarshal.Cast <byte, uint>(l.Slice(done, sizeof(uint)))[0];
                var rhs = MemoryMarshal.Cast <byte, uint>(r.Slice(done, sizeof(uint)))[0];

                MemoryMarshal.Cast <byte, uint>(o.Slice(done, sizeof(uint)))[0] = ~lhs & rhs;
            }

            // Whatever is left, one byte at a time.
            for (; done < total; done++)
            {
                o[done] = (byte)((uint)~l[done] & r[done]);
            }
        }

예제 #11

파일 보기

    // Resamples this bitmap into rtnImage with a separable 4-tap cubic filter:
    // pass 1 filters each source row horizontally into a float buffer (4 floats per pixel),
    // pass 2 filters that buffer vertically and writes packed 32-bit pixels into rtnImage.
    // Throws when the target is smaller than the source on either axis.
    // NOTE(review): Avx2 gathers and SSE4.1/SSSE3 intrinsics are used without IsSupported checks — confirm callers guard this.
    public void ResizeBicubic(FastBitmap rtnImage)
    {
        // Source size / target size; a value above 1 means the image would shrink on that axis.
        float scaleX = (float)this.width / rtnImage.width;
        float scaleY = (float)this.height / rtnImage.height;

        if (scaleX > 1 || scaleY > 1)
        {
            // Message text is Japanese for "only enlargement is supported".
            throw new Exception("拡大のみ対応");
        }

        // Intermediate image: target width x source height, 4 floats per pixel.
        float[] tmpa = new float[rtnImage.width * 4 * this.height];
        fixed(float *tmpp = tmpa)
        {
            float *tmp     = tmpp;
            // Byte-shuffle masks: each one expands the 4 bytes of one gathered pixel into four
            // 32-bit lanes (255 selects zero), ready for int -> float conversion.
            var    _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
            var    _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
            var    _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255);
            var    _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255);
            // Packs the low byte of each 32-bit lane into the first four bytes.
            var    _vmask  = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);

            // Tap offsets relative to the integer sample position, and per-channel offsets 0..3.
            var _1012  = Vector128.Create(-1, 0, 1, 2);
            var _0123i = Vector128.Create(0, 1, 2, 3);

            var _0000   = Vector128.Create(0, 0, 0, 0);
            var _0000f  = Vector128.Create(0f, 0, 0, 0);
            var _255f   = Vector128.Create(255f, 255, 255, 255);
            var _1111   = Vector128.Create(1, 1, 1, 1);
            var _1111f  = Vector128.Create(1f, 1, 1, 1);
            var _4444f  = Vector128.Create(4f, 4, 4, 4);
            var _4444   = Vector128.Create(4, 4, 4, 4);
            var _5555f  = Vector128.Create(5f, 5, 5, 5);
            var _2222f  = Vector128.Create(2f, 2, 2, 2);
            var _8888f  = Vector128.Create(8f, 8, 8, 8);
            // Mask that clears the float sign bit (absolute value).
            var _7f     = Vector128.Create(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff).AsSingle();
            var _ff     = Vector128.Create(-1, -1, -1, -1);
            // Row stride of the intermediate buffer, in floats (distinct from the this._stride field).
            var _stride = Vector128.Create(rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4);

            // Pass 1: horizontal filtering, one source row per iteration.
            Parallel.For(0, this.height, (y) =>
            {
                float py      = (y * scaleY); // not used in this pass
                float *tmpPos = tmp + y * rtnImage.width * 4;
                for (int x = 0; x < rtnImage.width; x++)
                {
                    // Source-space x position and its integer part.
                    float px = (x * scaleX);
                    int sx   = (int)px;

                    // Broadcast px, sx and the source width to all four lanes.
                    var _px = Vector128.CreateScalar(px);
                    _px     = Sse.Shuffle(_px, _px, 0);

                    var _sx = Vector128.CreateScalar(sx);
                    _sx     = Sse2.Shuffle(_sx, 0);

                    var _width = Vector128.CreateScalar(this.width);
                    _width     = Sse2.Shuffle(_width, 0);

                    // The four tap columns: sx-1, sx, sx+1, sx+2.
                    var _x2 = Sse2.Add(_sx, _1012);

                    // d = |px - tap|, plus its square and cube.
                    var _d  = Sse.And(Sse.Subtract(_px, Sse2.ConvertToVector128Single(_x2)), _7f);
                    var _d2 = Sse.Multiply(_d, _d);
                    var _d3 = Sse.Multiply(_d2, _d);

                    // Cubic weights: w1 = 1 - 2d^2 + d^3 for d <= 1, w2 = 4 - 8d + 5d^2 - d^3 for d > 1.
                    var w1   = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                    var w2   = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                    var wb   = Sse2.CompareGreaterThan(_d, _1111f);
                    var _w   = Sse41.BlendVariable(w1, w2, wb);
                    // _xpb is all-ones in lanes whose tap is out of range (tap < 0 or tap >= width);
                    // those lanes fall back to column sx, the others keep their own tap column.
                    var _xpb = Sse2.Or(Sse2.CompareLessThan(_x2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_x2, _width), _1111).AsInt32(), _ff));
                    var _xpp = Sse2.And(_sx, _xpb);
                    var _xp  = Sse41.BlendVariable(_x2, _xpp, _xpb);

                    // Gather the four 32-bit source pixels of row y (scale 4 = bytes per pixel).
                    var p = Avx2.GatherVector128((uint *)(this._ptr + this._stride * y), _xp, 4).AsByte();


                    // Expand each gathered pixel's four bytes to four floats.
                    var _p0 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _00mask).AsInt32());
                    var _p1 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _01mask).AsInt32());
                    var _p2 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _10mask).AsInt32());
                    var _p3 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _11mask).AsInt32());

                    // Broadcast each tap weight across all four channels.
                    var _w0 = Sse.Shuffle(_w, _w, 0);
                    var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                    var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                    var _w3 = Sse.Shuffle(_w, _w, 0b11111111);

                    // Weighted sum of the four taps, stored unclamped as floats.
                    var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));

                    Sse2.Store(tmpPos + x * 4, rgbaf);
                }
            });

            // Pass 2: vertical filtering of the intermediate buffer, one target row per iteration.
            Parallel.For(0, rtnImage.height, (y) =>
            {
                // Source-space y position and its integer part.
                float py = (y * scaleY);
                int sy   = (int)py;

                // Scratch space for one packed result vector; only its first uint is used.
                uint *store = stackalloc uint[4];

                var _py = Vector128.CreateScalar(py);
                _py     = Sse.Shuffle(_py, _py, 0);

                var _sy = Vector128.CreateScalar(sy);
                _sy     = Sse2.Shuffle(_sy, 0);

                var _height = Vector128.CreateScalar(this.height);
                _height     = Sse2.Shuffle(_height, 0);

                // The four tap rows: sy-1, sy, sy+1, sy+2.
                var _y2 = Sse2.Add(_sy, _1012);

                // Same distance and weight computation as in pass 1; the weights are constant for the whole row.
                var _d  = Sse.And(Sse.Subtract(_py, Sse2.ConvertToVector128Single(_y2)), _7f);
                var _d2 = Sse.Multiply(_d, _d);
                var _d3 = Sse.Multiply(_d2, _d);

                var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                var wb = Sse2.CompareGreaterThan(_d, _1111f);
                var _w = Sse41.BlendVariable(w1, w2, wb);


                // Out-of-range tap rows (row < 0 or row >= height) fall back to row sy.
                var _ypb = Sse2.Or(Sse2.CompareLessThan(_y2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_y2, _height), _1111).AsInt32(), _ff));
                var _ypp = Sse2.And(_sy, _ypb);
                var _yp  = Sse41.BlendVariable(_y2, _ypp, _ypb);
                // Row index -> float offset of that row's start in the intermediate buffer.
                var _yps = Sse41.MultiplyLow(_yp, _stride);

                // Gather indices for the four channels of the current pixel, one vector per tap row.
                var _yp0  = Sse2.Add(Sse2.Shuffle(_yps, 0), _0123i);
                var _yp1  = Sse2.Add(Sse2.Shuffle(_yps, 0b01010101), _0123i);
                var _yp2  = Sse2.Add(Sse2.Shuffle(_yps, 0b10101010), _0123i);
                var _yp3  = Sse2.Add(Sse2.Shuffle(_yps, 0b11111111), _0123i);
                uint *rtn = (uint *)(rtnImage._ptr + rtnImage._stride * y);

                for (int x = 0; x < rtnImage.width; x++)
                {
                    // Load the four float channels of pixel x from each of the four tap rows.
                    var _p0 = Avx2.GatherVector128((float *)(tmp), _yp0, 4);
                    var _p1 = Avx2.GatherVector128((float *)(tmp), _yp1, 4);
                    var _p2 = Avx2.GatherVector128((float *)(tmp), _yp2, 4);
                    var _p3 = Avx2.GatherVector128((float *)(tmp), _yp3, 4);

                    var _w0 = Sse.Shuffle(_w, _w, 0);
                    var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                    var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                    var _w3 = Sse.Shuffle(_w, _w, 0b11111111);

                    var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));

                    // Clamp every channel to [0, 255].
                    var _b0 = Sse.CompareLessThan(rgbaf, _0000f);
                    rgbaf   = Sse41.BlendVariable(rgbaf, _0000f, _b0);
                    var _b1 = Sse.CompareGreaterThan(rgbaf, _255f);
                    rgbaf   = Sse41.BlendVariable(rgbaf, _255f, _b1);

                    // Convert to integers and pack the four channel bytes into one 32-bit pixel.
                    var rgbab = Sse2.ConvertToVector128Int32(rgbaf).AsByte();
                    var rgba  = Ssse3.Shuffle(rgbab, _vmask).AsUInt32();

                    Sse2.Store(store, rgba);

                    // Advance the gather indices by one pixel (4 floats) and emit the packed pixel.
                    _yp0 = Sse2.Add(_yp0, _4444);
                    _yp1 = Sse2.Add(_yp1, _4444);
                    _yp2 = Sse2.Add(_yp2, _4444);
                    _yp3 = Sse2.Add(_yp3, _4444);
                    *rtn = *store;
                    rtn++;
                }
            });

예제 #12

파일 보기

파일: SM4Utils.cs 프로젝트: Tratos/CryptoBase

        /// <summary>
        /// Runs the 32-round SM4 transform over four 16-byte blocks at once, reading 64 bytes from
        /// <paramref name="source"/> and writing 64 bytes to <paramref name="destination"/>.
        /// </summary>
        /// <param name="rk">Round keys; entries 0..31 are used, one per round.</param>
        /// <param name="source">Input; the first 64 bytes (four blocks) are read.</param>
        /// <param name="destination">Output; the first 64 bytes are overwritten.</param>
        /// <exception cref="ArgumentException"><paramref name="destination"/> is shorter than 64 bytes.</exception>
        public static unsafe void Encrypt4(uint[] rk, ReadOnlySpan <byte> source, Span <byte> destination)
        {
            const int OutputBytes = 4 * 16;

            // Transpose the input: tN holds 32-bit word N of each of the four blocks, byte-swapped.
            // The indexed reads are bounds-checked, so a short source fails here before any work is done.
            var p32 = MemoryMarshal.Cast <byte, uint>(source);
            var t3  = Vector128.Create(p32[3], p32[7], p32[11], p32[15]).ReverseEndianness32();
            var t2  = Vector128.Create(p32[2], p32[6], p32[10], p32[14]).ReverseEndianness32();
            var t1  = Vector128.Create(p32[1], p32[5], p32[9], p32[13]).ReverseEndianness32();
            var t0  = Vector128.Create(p32[0], p32[4], p32[8], p32[12]).ReverseEndianness32();

            // The stores at the end go through a raw pointer and are not bounds-checked, so a short
            // destination would be written past its end. Reject it up front.
            if (destination.Length < OutputBytes)
            {
                throw new ArgumentException("Destination must be at least 64 bytes long.", nameof(destination));
            }

            for (var i = 0; i < 32; ++i)
            {
                var x = t1.Xor(t2).Xor(t3).Xor(Vector128.Create(rk[i]).AsByte());

                // Inner affine transform: table lookups on the low and high nibble of every byte.
                var y = Sse2.And(x, c0f);
                y = Ssse3.Shuffle(m1l, y);
                x = Sse2.ShiftRightLogical(x.AsUInt64(), 4).AsByte();
                x = Sse2.And(x, c0f);
                x = Ssse3.Shuffle(m1h, x).Xor(y);

                // Byte permutation via the shr table, then one AES-NI "last round" step.
                // NOTE(review): shr presumably pre-undoes the row shift that AESENCLAST applies — confirm against the table.
                x = Ssse3.Shuffle(x, shr);
                x = Aes.EncryptLast(x, c0f);

                // Outer affine transform; AndNot yields (~x & c0f), i.e. the inverted low nibbles.
                y = Sse2.AndNot(x, c0f);
                y = Ssse3.Shuffle(m2l, y);
                x = Sse2.ShiftRightLogical(x.AsUInt64(), 4).AsByte();
                x = Sse2.And(x, c0f);
                x = Ssse3.Shuffle(m2h, x).Xor(y);

                // Linear transform, four blocks in parallel:
                // x ^ rotl(x ^ rotl(x, 8) ^ rotl(x, 16), 2) ^ rotl(x, 24)
                y = x.Xor(x.RotateLeftUInt32_8()).Xor(x.RotateLeftUInt32_16());
                y = y.AsUInt32().RotateLeftUInt32(2).AsByte();
                x = x.Xor(y).Xor(x.RotateLeftUInt32_24());

                // Rotate the word registers: the new word becomes t3.
                x  = x.Xor(t0);
                t0 = t1;
                t1 = t2;
                t2 = t3;
                t3 = x;
            }

            // Output words are taken in reverse order (t3, t2, t1, t0) and byte-swapped back.
            var a = t3.ReverseEndianness32().AsUInt32();
            var b = t2.ReverseEndianness32().AsUInt32();
            var c = t1.ReverseEndianness32().AsUInt32();
            var d = t0.ReverseEndianness32().AsUInt32();

            // 4x4 transpose of 32-bit words, back from word-per-register to block-per-register.
            var x0 = Sse2.UnpackLow(a, b);
            var x1 = Sse2.UnpackLow(c, d);
            var x2 = Sse2.UnpackHigh(a, b);
            var x3 = Sse2.UnpackHigh(c, d);

            t0 = Sse2.UnpackLow(x0.AsUInt64(), x1.AsUInt64()).AsByte();
            t1 = Sse2.UnpackHigh(x0.AsUInt64(), x1.AsUInt64()).AsByte();
            t2 = Sse2.UnpackLow(x2.AsUInt64(), x3.AsUInt64()).AsByte();
            t3 = Sse2.UnpackHigh(x2.AsUInt64(), x3.AsUInt64()).AsByte();

            fixed(byte *p = destination)
            {
                Sse2.Store(p, t0);
                Sse2.Store(p + 16, t1);
                Sse2.Store(p + 32, t2);
                Sse2.Store(p + 48, t3);
            }
        }

예제 #13

파일 보기

파일: AndNot.cs 프로젝트: vehar/coreclr

        static unsafe int Main(string[] args)
        {
            int    testResult          = Pass;
            int    testsCount          = 21;
            string methodUnderTestName = nameof(Sse2.AndNot);

            if (Sse2.IsSupported)
            {
                using (var doubleTable = TestTableSse2 <double> .Create(testsCount))
                    using (var longTable = TestTableSse2 <long> .Create(testsCount))
                        using (var ulongTable = TestTableSse2 <ulong> .Create(testsCount))
                            using (var intTable = TestTableSse2 <int> .Create(testsCount))
                                using (var uintTable = TestTableSse2 <uint> .Create(testsCount))
                                    using (var shortTable = TestTableSse2 <short> .Create(testsCount))
                                        using (var ushortTable = TestTableSse2 <ushort> .Create(testsCount))
                                            using (var sbyteTable = TestTableSse2 <sbyte> .Create(testsCount))
                                                using (var byteTable = TestTableSse2 <byte> .Create(testsCount))
                                                {
                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <double>, Vector128 <double>, Vector128 <double>)value = doubleTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        doubleTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <long>, Vector128 <long>, Vector128 <long>)value = longTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        longTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <ulong>, Vector128 <ulong>, Vector128 <ulong>)value = ulongTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        ulongTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <int>, Vector128 <int>, Vector128 <int>)value = intTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        intTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <uint>, Vector128 <uint>, Vector128 <uint>)value = uintTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        uintTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <short>, Vector128 <short>, Vector128 <short>)value = shortTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        shortTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <ushort>, Vector128 <ushort>, Vector128 <ushort>)value = ushortTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        ushortTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <sbyte>, Vector128 <sbyte>, Vector128 <sbyte>)value = sbyteTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        sbyteTable.SetOutArray(result);
                                                    }

                                                    for (int i = 0; i < testsCount; i++)
                                                    {
                                                        (Vector128 <byte>, Vector128 <byte>, Vector128 <byte>)value = byteTable[i];
                                                        var result = Sse2.AndNot(value.Item1, value.Item2);
                                                        byteTable.SetOutArray(result);
                                                    }

                                                    CheckMethod <double> checkDouble = (double x, double y, double z, ref double a) => (a = BinaryAndNot(x, y)) == z;

                                                    if (!doubleTable.CheckResult(checkDouble))
                                                    {
                                                        PrintError(doubleTable, methodUnderTestName, "(double x, double y, double z, ref double a) => (a = BinaryAndNot(x, y)) == z", checkDouble);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <long> checkLong = (long x, long y, long z, ref long a) => (a = (~x) & y) == z;

                                                    if (!longTable.CheckResult(checkLong))
                                                    {
                                                        PrintError(longTable, methodUnderTestName, "(long x, long y, long z, ref long a) => (a = (~x) & y) == z", checkLong);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <ulong> checkUlong = (ulong x, ulong y, ulong z, ref ulong a) => (a = (~x) & y) == z;

                                                    if (!longTable.CheckResult(checkLong))
                                                    {
                                                        PrintError(ulongTable, methodUnderTestName, "(ulong x, ulong y, ulong z, ref ulong a) => (a = (~x) & y) == z", checkUlong);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <int> checkInt32 = (int x, int y, int z, ref int a) => (a = (~x) & y) == z;

                                                    if (!intTable.CheckResult(checkInt32))
                                                    {
                                                        PrintError(intTable, methodUnderTestName, "(int x, int y, int z, ref int a) => (a = (~x) & y) == z", checkInt32);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <uint> checkUInt32 = (uint x, uint y, uint z, ref uint a) => (a = (~x) & y) == z;

                                                    if (!uintTable.CheckResult(checkUInt32))
                                                    {
                                                        PrintError(uintTable, methodUnderTestName, "(uint x, uint y, uint z, ref uint a) => (a = (~x) & y) == z", checkUInt32);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <short> checkInt16 = (short x, short y, short z, ref short a) => (a = (short)((~x) & y)) == z;

                                                    if (!shortTable.CheckResult(checkInt16))
                                                    {
                                                        PrintError(shortTable, methodUnderTestName, "(short x, short y, short z, ref short a) => (a = (short)((~x) & y)) == z", checkInt16);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <ushort> checkUInt16 = (ushort x, ushort y, ushort z, ref ushort a) => (a = (ushort)((~x) & y)) == z;

                                                    if (!ushortTable.CheckResult(checkUInt16))
                                                    {
                                                        PrintError(ushortTable, methodUnderTestName, "(ushort x, ushort y, ushort z, ref ushort a) => (a = (ushort)((~x) & y)) == z", checkUInt16);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <sbyte> checkSByte = (sbyte x, sbyte y, sbyte z, ref sbyte a) => (a = (sbyte)((~x) & y)) == z;

                                                    if (!sbyteTable.CheckResult(checkSByte))
                                                    {
                                                        PrintError(sbyteTable, methodUnderTestName, "(sbyte x, sbyte y, sbyte z, ref sbyte a) =>(a = (sbyte)((~x) & y)) == z", checkSByte);
                                                        testResult = Fail;
                                                    }

                                                    CheckMethod <byte> checkByte = (byte x, byte y, byte z, ref byte a) => (a = (byte)((~x) & y)) == z;

                                                    if (!byteTable.CheckResult(checkByte))
                                                    {
                                                        PrintError(byteTable, methodUnderTestName, "(byte x, byte y, byte z, ref byte a) =>  (a = (byte)((~x) & y)) == z", checkByte);
                                                        testResult = Fail;
                                                    }
                                                }
            }
            else
            {
                Console.WriteLine($"Sse2.IsSupported: {Sse2.IsSupported}, skipped tests of {typeof(Sse2)}.{methodUnderTestName}");
            }

            return(testResult);
        }

예제 #14

파일 보기

파일: Sse2Functions.cs 프로젝트: TechnologicalPizza/SharpFastNoise2

 /// <summary>
 /// Masks <paramref name="a"/> with the complement of <paramref name="m"/>.
 /// </summary>
 /// <param name="a">Value to be masked.</param>
 /// <param name="m">Mask whose set bits are removed from <paramref name="a"/>.</param>
 /// <returns>The result of <c>Sse2.AndNot(m, a)</c>, i.e. (NOT m) AND a.</returns>
 /// <remarks>
 /// The mask is deliberately passed first: <c>Sse2.AndNot</c> complements its
 /// first operand, not its second.
 /// </remarks>
 public static i32 NMask_i32(i32 a, m32 m) => Sse2.AndNot(m, a);

예제 #15

파일 보기

파일: Sse2Functions.cs 프로젝트: TechnologicalPizza/SharpFastNoise2

 /// <summary>
 /// Computes <paramref name="a"/> AND (NOT <paramref name="b"/>) for two masks.
 /// </summary>
 /// <param name="a">Mask whose bits are kept where <paramref name="b"/> is clear.</param>
 /// <param name="b">Mask that is complemented before the AND.</param>
 /// <returns>The result of <c>Sse2.AndNot(b, a)</c>.</returns>
 /// <remarks>
 /// Arguments are swapped in the intrinsic call because <c>Sse2.AndNot</c>
 /// complements its first operand.
 /// </remarks>
 public static m32 BitwiseAndNot_m32(m32 a, m32 b) => Sse2.AndNot(b, a);

예제 #16

파일 보기

파일: Sse2Functions.cs 프로젝트: TechnologicalPizza/SharpFastNoise2

 /// <summary>
 /// Computes <paramref name="a"/> AND (NOT <paramref name="b"/>) for two integer vectors.
 /// </summary>
 /// <param name="a">Value whose bits are kept where <paramref name="b"/> is clear.</param>
 /// <param name="b">Value that is complemented before the AND.</param>
 /// <returns>The result of <c>Sse2.AndNot(b, a)</c>.</returns>
 /// <remarks>
 /// Arguments are swapped in the intrinsic call because <c>Sse2.AndNot</c>
 /// complements its first operand.
 /// </remarks>
 public static i32 BitwiseAndNot_i32(i32 a, i32 b) => Sse2.AndNot(b, a);