C# (CSharp) UnicodeUtility.IsAsciiCodePoint примеры использования

Язык программирования: C# (CSharp)

Класс/Тип: UnicodeUtility

Метод/Функция: IsAsciiCodePoint

Примеров на hotexamples.com: 10

C# (CSharp) UnicodeUtility.IsAsciiCodePoint - 10 примеров найдено. Это лучшие примеры C# (CSharp) кода для UnicodeUtility.IsAsciiCodePoint, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

UnicodeToKoDau(30)

UrlRewriting(24)

IsAsciiCodePoint(10)

ToUtf32(4)

IsInRangeInclusive(4)

RemoveSpecialCharacter(3)

UnicodeToKoDauAndGach(3)

UnicodeToFriendlyUrl(3)

GetCharacterClass(3)

IsBmpCodePoint(3)

GetScalarFromUtf16SurrogatePair(3)

GetCharacterLength(3)

IsCJKCodePoint(2)

GetUtf16SurrogatesFromSupplementaryPlaneScalar(2)

GetCodePoint(2)

RemoveHtmlTags(2)

IsSurrogateCodePoint(2)

ToPlainText(1)

encode(1)

FromHexString(1)

ToHexString(1)

ToCodePoints(1)

IsValidUnicodeScalar(1)

IsValidCodePoint(1)

IsLowSurrogateCodePoint(1)

IsHighSurrogateCodePoint(1)

GetUtf16SequenceLength(1)

GetCharacterClassFromCategory(1)

utf8Clean(1)

Пример #1

Показать файл

Файл: Utf8Utility.WhiteSpace.CoreLib.cs Проект: layomia/dotnet_runtime

        /// <summary>
        /// Scans forward from <paramref name="utf8Data"/> and returns the byte offset of the first
        /// scalar that is not whitespace, or <paramref name="length"/> if every scalar is whitespace.
        /// </summary>
        /// <param name="utf8Data">Reference to the first byte of the UTF-8 buffer.</param>
        /// <param name="length">Number of bytes available starting at <paramref name="utf8Data"/>.</param>
        /// <returns>The offset, in bytes, at which scanning stopped.</returns>
        private static nuint GetIndexOfFirstNonWhiteSpaceChar(ref byte utf8Data, nuint length)
        {
            // Tuned for mostly-ASCII input where, at most, a handful of leading bytes get trimmed.

            nuint offset = 0;

            while (offset < length)
            {
                byte current = Unsafe.AddByteOffset(ref utf8Data, offset);

                // Cheap pre-filter: interpreted as a signed byte, anything in [ 21 .. 7F ] is
                // greater than 0x20 and cannot be whitespace, so we are done immediately.
                if ((sbyte)current > (sbyte)0x20)
                {
                    break;
                }

                uint possibleAsciiByte = current;
                Rune candidate;
                int candidateWidth;

                if (UnicodeUtility.IsAsciiCodePoint(possibleAsciiByte))
                {
                    // Single ASCII byte: wrap it directly, no decoding required.
                    candidate      = Rune.UnsafeCreate(possibleAsciiByte);
                    candidateWidth = 1;
                }
                else
                {
                    // Multi-byte (or ill-formed) data: decode the full scalar starting at
                    // the current offset so it can be checked against the Unicode tables.
                    ReadOnlySpan<byte> remaining = new ReadOnlySpan<byte>(ref utf8Data, (int)length).Slice((int)offset);
                    Rune.DecodeFromUtf8(remaining, out candidate, out candidateWidth);
                }

                if (!Rune.IsWhiteSpace(candidate))
                {
                    break; // first non-whitespace scalar found
                }

                offset += (uint)candidateWidth;
            }

            return offset;
        }

Пример #2

Показать файл

        /// <summary>
        /// Locates the start of the trailing whitespace run in <paramref name="utf8Data"/>.
        /// </summary>
        /// <param name="utf8Data">The UTF-8 bytes to inspect.</param>
        /// <returns>
        /// The index at which the trailing whitespace begins; 0 when the data consists solely of
        /// whitespace; the span length when the data does not end in whitespace.
        /// </returns>
        public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan <byte> utf8Data)
        {
            // Tuned for mostly-ASCII input where, at most, a handful of trailing bytes get trimmed.

            int end = utf8Data.Length;

            while (end > 0)
            {
                byte trailingByte = utf8Data[end - 1];

                // Cheap pre-filter: interpreted as a signed byte, anything in [ 21 .. 7F ] is
                // greater than 0x20 and cannot be whitespace, so we are done immediately.
                if ((sbyte)trailingByte > (sbyte)0x20)
                {
                    break;
                }

                uint possibleAsciiByte = trailingByte;
                Rune candidate;
                int candidateWidth;

                if (UnicodeUtility.IsAsciiCodePoint(possibleAsciiByte))
                {
                    // Single ASCII byte: build the Rune directly from its value.
                    candidate      = new Rune(possibleAsciiByte);
                    candidateWidth = 1;
                }
                else
                {
                    // Multi-byte (or ill-formed) data: decode the last full scalar of the
                    // not-yet-trimmed prefix so it can be checked against the Unicode tables.
                    Rune.DecodeLastFromUtf8(utf8Data.Slice(0, end), out candidate, out candidateWidth);
                }

                if (!Rune.IsWhiteSpace(candidate))
                {
                    break; // last non-whitespace scalar found
                }

                end -= candidateWidth;
            }

            return end;
        }

Пример #3

Показать файл

Файл: TextEncoder.cs Проект: spydacarnage/corefx

        /// <summary>
        /// Finds the index of the first byte in <paramref name="utf8Text"/> that either starts
        /// ill-formed UTF-8 or starts a scalar value that must be encoded.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 text to scan.</param>
        /// <returns>
        /// The index, relative to the original input, of the first such byte; or -1 when the whole
        /// text can be copied as-is without escaping.
        /// </returns>
        public virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            int totalLength = utf8Text.Length;
            int offset      = 0;

            while (offset < utf8Text.Length)
            {
                byte current = utf8Text[offset];

                if (!UnicodeUtility.IsAsciiCodePoint(current))
                {
                    // Re-base the span so that the non-ASCII sequence starts at index 0. From here on,
                    // (totalLength - utf8Text.Length) is the position of that sequence in the original input.
                    if (offset > 0)
                    {
                        utf8Text = utf8Text.Slice(offset);
                    }

                    OperationStatus decodeStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text, out uint scalarValue, out int scalarWidth);

                    if (decodeStatus != OperationStatus.Done || WillEncode((int)scalarValue))
                    {
                        return totalLength - utf8Text.Length;
                    }

                    // Resume scanning right after the scalar we just accepted.
                    offset = scalarWidth;
                    continue;
                }

                // ASCII byte: anything other than the "no escape" sentinel means it must be reported.
                if (!ReferenceEquals(GetAsciiEncoding(current), s_noEscape))
                {
                    return totalLength - utf8Text.Length + offset;
                }

                offset++;
            }

            return -1; // nothing in the input requires escaping
        }

Пример #4

Показать файл

Файл: TextEncoder.cs Проект: Vettvangur/DotnetRuntime

        /// <summary>
        /// Finds the index of the first byte in <paramref name="utf8Text"/> at which either ill-formed
        /// UTF-8 begins or a scalar value that must be encoded begins.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 text to scan.</param>
        /// <returns>
        /// The zero-based index of the first such byte, or -1 if the entire text was consumed without
        /// finding one.
        /// </returns>
        public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            // Make sure the ASCII lookup state is initialized before scanning.
            if (!_isAsciiCacheInitialized)
            {
                InitializeAsciiCache();
            }

            // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value
            // that must be encoded. If we see either of these things then we'll return its index in the original
            // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate
            // that the text can be copied as-is without escaping.

            fixed(byte *ptr = utf8Text)
            {
                // idx is the offset (from ptr) of the next byte to examine; it is also the value
                // returned through the Return label, and the base for the ReturnN labels.
                int idx = 0;

#if NETCOREAPP
                // Vectorized path: only entered when a supported SIMD instruction set is available
                // and at least 16 bytes remain, so a full 128-bit load stays inside the buffer.
                if ((Sse2.IsSupported || AdvSimd.Arm64.IsSupported) && utf8Text.Length - 16 >= idx)
                {
                    // Hoist these outside the loop, as the JIT won't do it.
                    Vector128 <sbyte> bitMaskLookupAsciiNeedsEscaping = _bitMaskLookupAsciiNeedsEscaping;
                    Vector128 <sbyte> bitPosLookup    = Ssse3Helper.s_bitPosLookup;
                    Vector128 <sbyte> nibbleMaskSByte = Ssse3Helper.s_nibbleMaskSByte;
                    Vector128 <sbyte> nullMaskSByte   = Ssse3Helper.s_nullMaskSByte;

                    sbyte *startingAddress = (sbyte *)ptr;
                    do
                    {
                        Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                        // Load the next 16 bytes.
                        Vector128 <sbyte> sourceValue;
                        bool containsNonAsciiBytes;

                        // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                        // casted to signed byte.
                        if (Sse2.IsSupported)
                        {
                            sourceValue           = Sse2.LoadVector128(startingAddress);
                            containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                        }
                        else if (AdvSimd.Arm64.IsSupported)
                        {
                            sourceValue           = AdvSimd.LoadVector128(startingAddress);
                            containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                        }
                        else
                        {
                            // Not reachable given the enclosing 'if' condition; kept so that both
                            // locals are definitely assigned on every path.
                            throw new PlatformNotSupportedException();
                        }

                        if (!containsNonAsciiBytes)
                        {
                            // All of the following 16 bytes is ASCII.
                            // TODO AdvSimd: optimization maybe achievable using VectorTableLookup and/or VectorTableLookupExtension

                            if (Ssse3.IsSupported)
                            {
                                Vector128 <sbyte> mask = Ssse3Helper.CreateEscapingMask(sourceValue, bitMaskLookupAsciiNeedsEscaping, bitPosLookup, nibbleMaskSByte, nullMaskSByte);
                                int index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());

                                // An index below 16 is treated as the offset, within this 16-byte
                                // chunk, of the first byte to report.
                                if (index < 16)
                                {
                                    idx += index;
                                    goto Return;
                                }
                            }
                            else
                            {
                                // No Ssse3: test each of the 16 ASCII bytes individually. idx still
                                // points at the start of the chunk here, so the ReturnN labels below
                                // add the byte's position within the chunk (ReturnN returns idx + N).
                                byte *p = (byte *)startingAddress;
                                if (DoesAsciiNeedEncoding(p[0]))
                                {
                                    goto Return;
                                }
                                if (DoesAsciiNeedEncoding(p[1]))
                                {
                                    goto Return1;
                                }
                                if (DoesAsciiNeedEncoding(p[2]))
                                {
                                    goto Return2;
                                }
                                if (DoesAsciiNeedEncoding(p[3]))
                                {
                                    goto Return3;
                                }
                                if (DoesAsciiNeedEncoding(p[4]))
                                {
                                    goto Return4;
                                }
                                if (DoesAsciiNeedEncoding(p[5]))
                                {
                                    goto Return5;
                                }
                                if (DoesAsciiNeedEncoding(p[6]))
                                {
                                    goto Return6;
                                }
                                if (DoesAsciiNeedEncoding(p[7]))
                                {
                                    goto Return7;
                                }
                                if (DoesAsciiNeedEncoding(p[8]))
                                {
                                    goto Return8;
                                }
                                if (DoesAsciiNeedEncoding(p[9]))
                                {
                                    goto Return9;
                                }
                                if (DoesAsciiNeedEncoding(p[10]))
                                {
                                    goto Return10;
                                }
                                if (DoesAsciiNeedEncoding(p[11]))
                                {
                                    goto Return11;
                                }
                                if (DoesAsciiNeedEncoding(p[12]))
                                {
                                    goto Return12;
                                }
                                if (DoesAsciiNeedEncoding(p[13]))
                                {
                                    goto Return13;
                                }
                                if (DoesAsciiNeedEncoding(p[14]))
                                {
                                    goto Return14;
                                }
                                if (DoesAsciiNeedEncoding(p[15]))
                                {
                                    goto Return15;
                                }
                            }

                            // None of the 16 ASCII bytes was reported; skip the whole chunk.
                            idx += 16;
                        }
                        else
                        {
                            // At least one of the following 16 bytes is non-ASCII.

                            int processNextSixteen = idx + 16;
                            Debug.Assert(processNextSixteen <= utf8Text.Length);

                            // Walk this chunk one scalar at a time. A multi-byte scalar that starts
                            // inside the chunk may advance idx past processNextSixteen; the next
                            // vector load is re-based on idx below, so that is accounted for.
                            while (idx < processNextSixteen)
                            {
                                Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                                if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                                {
                                    if (DoesAsciiNeedEncoding(ptr[idx]))
                                    {
                                        goto Return;
                                    }
                                    idx++;
                                }
                                else
                                {
                                    OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                                    Debug.Assert(nextScalarValue <= int.MaxValue);
                                    // Stop at idx both for input that did not decode cleanly and for
                                    // scalars that must be encoded.
                                    if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                                    {
                                        goto Return;
                                    }

                                    Debug.Assert(opStatus == OperationStatus.Done);
                                    idx += utf8BytesConsumedForScalar;
                                }
                            }
                        }
                        // Re-base the next 16-byte load on the current scan position.
                        startingAddress = (sbyte *)ptr + idx;
                    }while (utf8Text.Length - 16 >= idx);

                    // Process the remaining bytes.
                    Debug.Assert(utf8Text.Length - idx < 16);
                }
#endif

                // Scalar path: handles the tail left over by the vectorized loop, or the entire
                // input when the vectorized path was not taken.
                while (idx < utf8Text.Length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                    if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                    {
                        if (DoesAsciiNeedEncoding(ptr[idx]))
                        {
                            goto Return;
                        }
                        idx++;
                    }
                    else
                    {
                        OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                        Debug.Assert(nextScalarValue <= int.MaxValue);
                        if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                        {
                            goto Return;
                        }

                        Debug.Assert(opStatus == OperationStatus.Done);
                        idx += utf8BytesConsumedForScalar;
                    }
                }
                Debug.Assert(idx == utf8Text.Length);

                idx = -1; // All bytes are allowed.
                goto Return;

                // Exit labels for the unrolled per-byte checks above: ReturnN returns idx + N,
                // where idx is the start of the 16-byte chunk being examined.
#if NETCOREAPP
Return15:
                return(idx + 15);

Return14:
                return(idx + 14);

Return13:
                return(idx + 13);

Return12:
                return(idx + 12);

Return11:
                return(idx + 11);

Return10:
                return(idx + 10);

Return9:
                return(idx + 9);

Return8:
                return(idx + 8);

Return7:
                return(idx + 7);

Return6:
                return(idx + 6);

Return5:
                return(idx + 5);

Return4:
                return(idx + 4);

Return3:
                return(idx + 3);

Return2:
                return(idx + 2);

Return1:
                return(idx + 1);
#endif
Return:
                return(idx);
            }
        }

Пример #5

Показать файл

Файл: TextEncoder.cs Проект: Vettvangur/DotnetRuntime

        /// <summary>
        /// Encodes the supplied UTF-8 text.
        /// </summary>
        /// <param name="utf8Source">A source buffer containing the UTF-8 text to encode.</param>
        /// <param name="utf8Destination">The destination buffer to which the encoded form of <paramref name="utf8Source"/>
        /// will be written.</param>
        /// <param name="bytesConsumed">The number of bytes consumed from the <paramref name="utf8Source"/> buffer.</param>
        /// <param name="bytesWritten">The number of bytes written to the <paramref name="utf8Destination"/> buffer.</param>
        /// <param name="isFinalBlock"><see langword="true"/> if <paramref name="utf8Source"/> is the final block of input,
        /// so an incomplete trailing sequence is handled like any other invalid subsequence;
        /// <see langword="false"/> if further source data may follow, in which case an incomplete trailing sequence
        /// makes the method return <see cref="OperationStatus.NeedMoreData"/>.</param>
        /// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation.</returns>
        /// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
        public unsafe virtual OperationStatus EncodeUtf8(
            ReadOnlySpan <byte> utf8Source,
            Span <byte> utf8Destination,
            out int bytesConsumed,
            out int bytesWritten,
            bool isFinalBlock = true)
        {
            // Both spans are re-sliced as data is processed, so the original lengths are kept
            // in order to compute bytesConsumed / bytesWritten on exit.
            int originalUtf8SourceLength      = utf8Source.Length;
            int originalUtf8DestinationLength = utf8Destination.Length;

            const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
            char *    pTempCharBuffer           = stackalloc char[TempUtf16CharBufferLength];

            const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3 /* max UTF-8 output code units per UTF-16 input code unit */;
            byte *    pTempUtf8Buffer          = stackalloc byte[TempUtf8ByteBufferLength];

            uint            nextScalarValue;
            int             utf8BytesConsumedForScalar = 0;
            // Number of leading bytes of utf8Source found so far that can be copied verbatim.
            int             nonEscapedByteCount        = 0;
            OperationStatus opStatus = OperationStatus.Done;

            while (!utf8Source.IsEmpty)
            {
                // For performance, read until we require escaping.
                // Note: every 'continue' inside this do/while jumps to the loop condition at the bottom.
                do
                {
                    nextScalarValue = utf8Source[nonEscapedByteCount];
                    if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
                    {
                        // Check Ascii cache.
                        byte[]? encodedBytes = GetAsciiEncoding((byte)nextScalarValue);

                        if (ReferenceEquals(encodedBytes, s_noEscape))
                        {
                            if (++nonEscapedByteCount <= utf8Destination.Length)
                            {
                                // Source data can be copied as-is.
                                continue;
                            }

                            // The destination cannot hold this byte: undo the increment and report it.
                            --nonEscapedByteCount;
                            opStatus = OperationStatus.DestinationTooSmall;
                            break;
                        }

                        if (encodedBytes == null)
                        {
                            // We need to escape and update the cache, so break out of this loop.
                            opStatus = OperationStatus.Done;
                            utf8BytesConsumedForScalar = 1;
                            break;
                        }

                        // For performance, handle the non-escaped bytes and encoding here instead of breaking out of the loop.
                        if (nonEscapedByteCount > 0)
                        {
                            // We previously verified the destination size.
                            Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);

                            utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
                            utf8Source          = utf8Source.Slice(nonEscapedByteCount);
                            utf8Destination     = utf8Destination.Slice(nonEscapedByteCount);
                            nonEscapedByteCount = 0;
                        }

                        // Write the cached escaped form of this ASCII byte.
                        if (!((ReadOnlySpan <byte>)encodedBytes).TryCopyTo(utf8Destination))
                        {
                            opStatus = OperationStatus.DestinationTooSmall;
                            break;
                        }

                        utf8Destination = utf8Destination.Slice(encodedBytes.Length);
                        utf8Source      = utf8Source.Slice(1);
                        continue;
                    }

                    // Code path for non-Ascii.
                    opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Source.Slice(nonEscapedByteCount), out nextScalarValue, out utf8BytesConsumedForScalar);
                    if (opStatus == OperationStatus.Done)
                    {
                        if (!WillEncode((int)nextScalarValue))
                        {
                            nonEscapedByteCount += utf8BytesConsumedForScalar;
                            if (nonEscapedByteCount <= utf8Destination.Length)
                            {
                                // Source data can be copied as-is.
                                continue;
                            }

                            // The destination cannot hold this scalar: undo the advance and report it.
                            nonEscapedByteCount -= utf8BytesConsumedForScalar;
                            opStatus             = OperationStatus.DestinationTooSmall;
                        }
                    }

                    // We need to escape.
                    break;
                } while (nonEscapedByteCount < utf8Source.Length);

                // Flush any pending run of bytes that can be copied verbatim.
                if (nonEscapedByteCount > 0)
                {
                    // We previously verified the destination size.
                    Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);

                    utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
                    utf8Source          = utf8Source.Slice(nonEscapedByteCount);
                    utf8Destination     = utf8Destination.Slice(nonEscapedByteCount);
                    nonEscapedByteCount = 0;
                }

                if (utf8Source.IsEmpty)
                {
                    goto Done;
                }

                // This code path is hit for ill-formed input data (where decoding has replaced it with U+FFFD)
                // and for well-formed input data that must be escaped.

                if (opStatus != OperationStatus.Done) // Optimize happy path.
                {
                    if (opStatus == OperationStatus.NeedMoreData)
                    {
                        if (!isFinalBlock)
                        {
                            // More input may follow: stop before the incomplete sequence and let the caller resume.
                            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
                            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
                            return(OperationStatus.NeedMoreData);
                        }
                        // else treat this as a normal invalid subsequence.
                    }
                    else if (opStatus == OperationStatus.DestinationTooSmall)
                    {
                        goto ReturnDestinationTooSmall;
                    }
                }

                if (TryEncodeUnicodeScalar((int)nextScalarValue, pTempCharBuffer, TempUtf16CharBufferLength, out int charsWrittenJustNow))
                {
                    // Now that we have it as UTF-16, transcode it to UTF-8.
                    // Need to copy it to a temporary buffer first, otherwise GetBytes might throw an exception
                    // due to lack of output space.

                    int transcodedByteCountThisIteration = Encoding.UTF8.GetBytes(pTempCharBuffer, charsWrittenJustNow, pTempUtf8Buffer, TempUtf8ByteBufferLength);
                    ReadOnlySpan <byte> transcodedUtf8BytesThisIteration = new ReadOnlySpan <byte>(pTempUtf8Buffer, transcodedByteCountThisIteration);

                    // Update cache for Ascii
                    if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
                    {
                        // ToArray copies out of the stack buffer, so the cached entry outlives this call.
                        _asciiEscape[nextScalarValue] = transcodedUtf8BytesThisIteration.ToArray();
                    }

                    if (!transcodedUtf8BytesThisIteration.TryCopyTo(utf8Destination))
                    {
                        goto ReturnDestinationTooSmall;
                    }

                    utf8Destination = utf8Destination.Slice(transcodedByteCountThisIteration);
                }
                else
                {
                    // We really don't expect this to fail. If that happens we'll report an error to our caller.
                    bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
                    bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
                    return(OperationStatus.InvalidData);
                }

                // Skip past the source bytes of the scalar that was just escaped.
                utf8Source = utf8Source.Slice(utf8BytesConsumedForScalar);
            }

Done:
            // Input buffer has been fully processed!
            bytesConsumed = originalUtf8SourceLength;
            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
            return(OperationStatus.Done);

ReturnDestinationTooSmall:
            // Report progress up to, but not including, the data that did not fit.
            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
            return(OperationStatus.DestinationTooSmall);
        }

Пример #6

Показать файл

        /// <summary>
        /// A copy of the logic in Rune.DecodeFromUtf8.
        /// Decodes a single scalar value from the start of <paramref name="source"/>.
        /// </summary>
        /// <param name="source">The UTF-8 bytes to decode from.</param>
        /// <param name="result">On <see cref="OperationStatus.Done"/>, the decoded scalar value;
        /// otherwise U+FFFD (the replacement character).</param>
        /// <param name="bytesConsumed">On <see cref="OperationStatus.Done"/>, the length (1..4) of the decoded
        /// sequence; on <see cref="OperationStatus.InvalidData"/>, the length (1..3) of the invalid subsequence;
        /// on <see cref="OperationStatus.NeedMoreData"/>, the number of bytes (0..3) read before the input ended.</param>
        /// <returns><see cref="OperationStatus.Done"/> for a well-formed sequence,
        /// <see cref="OperationStatus.InvalidData"/> for an ill-formed one, or
        /// <see cref="OperationStatus.NeedMoreData"/> when the input ends before the sequence is complete
        /// (including empty input).</returns>
        public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan <byte> source, out uint result, out int bytesConsumed)
        {
            const char ReplacementChar = '\uFFFD';

            // This method follows the Unicode Standard's recommendation for detecting
            // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
            // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
            // it tries to consume as many code units as possible as long as those code
            // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

            // index is the position of the byte most recently read; the labels at the end of the
            // method derive bytesConsumed from it.
            int index = 0;

            // Try reading input[0].

            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // tempValue accumulates the scalar value as each byte is folded in.
            uint tempValue = source[index];

            if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
            {
                goto NotAscii;
            }

Finish:

            // Shared success exit: index points at the last byte of the sequence.
            bytesConsumed = index + 1;
            Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
            result = tempValue;
            return(OperationStatus.Done);

NotAscii:

            // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
            // the range [C2..F4]. If it's outside of that range, it's either a standalone
            // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
            // four-byte sequence.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
            {
                goto FirstByteInvalid;
            }

            // Rebase the lead byte against 0xC2 and make room for the 6 payload bits of the
            // next byte; the bias is corrected further below.
            tempValue = (tempValue - 0xC2) << 6;

            // Try reading input[1].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // Continuation bytes are of the form [10xxxxxx], which means that their two's
            // complement representation is in the range [-65..-128]. This allows us to
            // perform a single comparison to see if a byte is a continuation byte.

            int thisByteSignExtended = (sbyte)source[index];

            if (thisByteSignExtended >= -64)
            {
                goto Invalid;
            }

            tempValue += (uint)thisByteSignExtended;
            tempValue += 0x80;               // remove the continuation byte marker
            tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

            if (tempValue < 0x0800)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
                goto Finish; // this is a valid 2-byte sequence
            }

            // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
            // enough information (from just two code units) to detect overlong or surrogate
            // sequences, we need to perform these checks now.

            // At this point tempValue == ((lead - 0xC0) << 6) + (second - 0x80), which is the form
            // the range bounds below are written in.
            if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
                // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
            {
                // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // This is an overlong 4-byte sequence.
                goto Invalid;
            }

            // The first two bytes were just fine. We don't need to perform any other checks
            // on the remaining bytes other than to see that they're valid continuation bytes.

            // Try reading input[2].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

            if (tempValue <= 0xFFFF)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
                goto Finish; // this is a valid 3-byte sequence
            }

            // Try reading input[3].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
            goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:

            index = 1; // Invalid subsequences are always at least length 1.

            // Falls through into Invalid.
Invalid:

            // Here index is the number of bytes that form the invalid subsequence; the byte at
            // source[index] (if any) is not consumed.
            Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.InvalidData);

NeedsMoreData:

            // Here index is the number of bytes that were successfully read before the input ended.
            Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.NeedMoreData);
        }

Пример #7

Показать файл

        /// <summary>
        /// Scans UTF-8 input for the first position that cannot be copied through unchanged.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
        /// <returns>
        /// The index of the first ASCII byte for which <c>DoesAsciiNeedEncoding</c> returns 1, or of the
        /// first non-ASCII sequence that either fails to decode or is flagged by <c>WillEncode</c>;
        /// -1 when the scan reaches the end of the input without finding one.
        /// </returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            fixed(byte *ptr = utf8Text)
            {
                int offset = 0;

#if NETCOREAPP
                if (Sse2.IsSupported)
                {
                    // Walk the input one 16-byte block at a time while a full block remains.
                    sbyte *blockStart = (sbyte *)ptr;
                    while (offset <= utf8Text.Length - 16)
                    {
                        Debug.Assert(blockStart >= ptr && blockStart <= (ptr + utf8Text.Length - 16));

                        Vector128 <sbyte> block = Sse2.LoadVector128(blockStart);
                        int nonAsciiBits = Sse2.MoveMask(Sse2Helper.CreateAsciiMask(block));

                        if (nonAsciiBits == 0)
                        {
                            // Pure-ASCII block: test all 16 bytes in order. The short-circuit chain
                            // leaves 'offset' on the first byte that needs encoding.
                            if (DoesAsciiNeedEncoding(ptr[offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1)
                            {
                                return offset;
                            }

                            // Step past the sixteenth byte of the block.
                            offset++;
                        }
                        else
                        {
                            // The block holds at least one non-ASCII byte: go scalar by scalar.
                            int blockEnd = offset + 16;
                            Debug.Assert(blockEnd <= utf8Text.Length);

                            while (offset < blockEnd)
                            {
                                Debug.Assert((ptr + offset) <= (ptr + utf8Text.Length));

                                byte current = ptr[offset];
                                if (!UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int consumed);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return offset;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    offset += consumed;
                                }
                                else
                                {
                                    if (DoesAsciiNeedEncoding(current) == 1)
                                    {
                                        return offset;
                                    }
                                    offset++;
                                }
                            }
                        }

                        blockStart = (sbyte *)ptr + offset;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(utf8Text.Length - offset < 16);
                }
#endif

                while (offset < utf8Text.Length)
                {
                    Debug.Assert((ptr + offset) <= (ptr + utf8Text.Length));

                    byte current = ptr[offset];
                    if (!UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int consumed);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return offset;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        offset += consumed;
                    }
                    else
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return offset;
                        }
                        offset++;
                    }
                }

                // Nothing in the input needs encoding.
                return -1;
            }
        }

Пример #8

Показать файл

        /// <summary>
        /// Scans UTF-8 input for the first position that cannot be copied through unchanged.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
        /// <returns>
        /// The index of the first ASCII byte rejected by <c>_allowedCharacters.IsUnicodeScalarAllowed</c>
        /// (or flagged by the vectorized escaping mask), or of the first non-ASCII sequence that either
        /// fails to decode or is flagged by <c>WillEncode</c>; -1 when the scan reaches the end of the
        /// input without finding one.
        /// </returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            fixed(byte *ptr = utf8Text)
            {
                int idx = 0;

#if NETCOREAPP
                if (Sse2.IsSupported)
                {
                    // Vectorized path: consume 16-byte blocks while a full block remains.
                    sbyte *startingAddress = (sbyte *)ptr;
                    while (utf8Text.Length - 16 >= idx)
                    {
                        Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                        // Load the next 16 bytes.
                        Vector128 <sbyte> sourceValue = Sse2.LoadVector128(startingAddress);

                        // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                        // casted to signed byte.
                        int index = Sse2.MoveMask(sourceValue);

                        if (index != 0)
                        {
                            // At least one of the following 16 bytes is non-ASCII.
                            // Fall back to scalar-by-scalar processing for this block.

                            int processNextSixteen = idx + 16;
                            Debug.Assert(processNextSixteen <= utf8Text.Length);

                            while (idx < processNextSixteen)
                            {
                                Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                                if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                                {
                                    if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                                    {
                                        goto Return;
                                    }
                                    idx++;
                                }
                                else
                                {
                                    OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                                    Debug.Assert(nextScalarValue <= int.MaxValue);
                                    if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                                    {
                                        goto Return;
                                    }

                                    Debug.Assert(opStatus == OperationStatus.Done);
                                    idx += utf8BytesConsumedForScalar;
                                }
                            }

                            // A multi-byte scalar may have carried idx past the block boundary,
                            // so re-derive the load address from idx rather than adding 16.
                            startingAddress = (sbyte *)ptr + idx;
                        }
                        else
                        {
                            // Check if any of the 16 bytes need to be escaped.
                            Vector128 <sbyte> mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue);

                            index = Sse2.MoveMask(mask);
                            // If index == 0, that means none of the 16 bytes needed to be escaped.
                            // TrailingZeroCount is relatively expensive, avoid it if possible.
                            if (index != 0)
                            {
                                // Found at least one byte that needs to be escaped, figure out the index of
                                // the first one found that needed to be escaped within the 16 bytes.
                                Debug.Assert(index > 0 && index <= 65_535);
                                int tzc = BitOperations.TrailingZeroCount(index);
                                // index is a non-zero 16-bit mask, so its lowest set bit is at position 0..15.
                                Debug.Assert(tzc >= 0 && tzc < 16);
                                idx += tzc;
                                goto Return;
                            }
                            idx             += 16;
                            startingAddress += 16;
                        }
                    }

                    // Process the remaining bytes.
                    Debug.Assert(utf8Text.Length - idx < 16);
                }
#endif

                // Scalar path: handles the tail (or the whole input when the vectorized path is unavailable).
                while (idx < utf8Text.Length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                    if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                    {
                        if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                        {
                            goto Return;
                        }
                        idx++;
                    }
                    else
                    {
                        OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                        Debug.Assert(nextScalarValue <= int.MaxValue);
                        if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                        {
                            goto Return;
                        }

                        Debug.Assert(opStatus == OperationStatus.Done);
                        idx += utf8BytesConsumedForScalar;
                    }
                }
                Debug.Assert(idx == utf8Text.Length);

                idx = -1; // All bytes are allowed.

Return:
                return(idx);
            }
        }

Пример #9

Показать файл

        /// <summary>
        /// Scans UTF-8 input for the first position that cannot be copied through unchanged.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
        /// <returns>
        /// The index of the first ASCII byte for which <c>DoesAsciiNeedEncoding</c> returns 1, or of the
        /// first non-ASCII sequence that either fails to decode (ill-formed or truncated UTF-8) or is
        /// flagged by <c>WillEncode</c>; -1 when the whole input can be copied as-is without escaping.
        /// </returns>
        public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            fixed(byte *ptr = utf8Text)
            {
                int offset = 0;

#if NETCOREAPP
                if (Sse2.IsSupported)
                {
                    // Walk the input one 16-byte block at a time while a full block remains.
                    sbyte *blockStart = (sbyte *)ptr;
                    while (offset <= utf8Text.Length - 16)
                    {
                        Debug.Assert(blockStart >= ptr && blockStart <= (ptr + utf8Text.Length - 16));

                        Vector128 <sbyte> block = Sse2.LoadVector128(blockStart);

                        // A byte outside the ASCII range is negative as a signed byte, so its sign bit
                        // shows up directly in the move-mask.
                        int nonAsciiBits = Sse2.MoveMask(block);

                        if (nonAsciiBits == 0)
                        {
                            // Pure-ASCII block: test all 16 bytes in order. The short-circuit chain
                            // leaves 'offset' on the first byte that needs encoding.
                            if (DoesAsciiNeedEncoding(ptr[offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1)
                            {
                                return offset;
                            }

                            // Step past the sixteenth byte of the block.
                            offset++;
                        }
                        else
                        {
                            // The block holds at least one non-ASCII byte: go scalar by scalar.
                            int blockEnd = offset + 16;
                            Debug.Assert(blockEnd <= utf8Text.Length);

                            while (offset < blockEnd)
                            {
                                Debug.Assert((ptr + offset) <= (ptr + utf8Text.Length));

                                byte current = ptr[offset];
                                if (!UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int consumed);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return offset;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    offset += consumed;
                                }
                                else
                                {
                                    if (DoesAsciiNeedEncoding(current) == 1)
                                    {
                                        return offset;
                                    }
                                    offset++;
                                }
                            }
                        }

                        blockStart = (sbyte *)ptr + offset;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(utf8Text.Length - offset < 16);
                }
#endif

                while (offset < utf8Text.Length)
                {
                    Debug.Assert((ptr + offset) <= (ptr + utf8Text.Length));

                    byte current = ptr[offset];
                    if (!UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int consumed);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return offset;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        offset += consumed;
                    }
                    else
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return offset;
                        }
                        offset++;
                    }
                }
                Debug.Assert(offset == utf8Text.Length);

                // Nothing in the input needs encoding.
                return -1;
            }
        }

Пример #10

Показать файл

Файл: DefaultJavaScriptEncoder.cs Проект: Vettvangur/DotnetRuntime

        /// <summary>
        /// Scans UTF-8 input for the first position that cannot be copied through unchanged.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
        /// <returns>
        /// The index of the first ASCII byte for which <c>DoesAsciiNeedEncoding</c> returns 1, or of the
        /// first non-ASCII sequence that either fails to decode or is flagged by <c>WillEncode</c>;
        /// -1 when the scan reaches the end of the input without finding one.
        /// </returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            fixed(byte *ptr = utf8Text)
            {
                int offset = 0;

#if NETCOREAPP
                if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
                {
                    // Walk the input one 16-byte block at a time while a full block remains.
                    sbyte *blockStart = (sbyte *)ptr;
                    while (offset <= utf8Text.Length - 16)
                    {
                        Debug.Assert(blockStart >= ptr && blockStart <= (ptr + utf8Text.Length - 16));

                        // Load the block with whichever instruction set is available and ask the
                        // matching helper whether any byte falls outside ASCII.
                        bool hasNonAscii;
                        if (Sse2.IsSupported)
                        {
                            hasNonAscii = Sse2Helper.ContainsNonAsciiByte(Sse2.LoadVector128(blockStart));
                        }
                        else if (AdvSimd.Arm64.IsSupported)
                        {
                            hasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(AdvSimd.LoadVector128(blockStart));
                        }
                        else
                        {
                            throw new PlatformNotSupportedException();
                        }

                        if (!hasNonAscii)
                        {
                            // Pure-ASCII block: test all 16 bytes in order. The short-circuit chain
                            // leaves 'offset' on the first byte that needs encoding.
                            if (DoesAsciiNeedEncoding(ptr[offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1
                                || DoesAsciiNeedEncoding(ptr[++offset]) == 1)
                            {
                                return offset;
                            }

                            // Step past the sixteenth byte of the block.
                            offset++;
                        }
                        else
                        {
                            // The block holds at least one non-ASCII byte: go scalar by scalar.
                            int blockEnd = offset + 16;
                            Debug.Assert(blockEnd <= utf8Text.Length);

                            while (offset < blockEnd)
                            {
                                Debug.Assert((ptr + offset) <= (ptr + utf8Text.Length));

                                byte current = ptr[offset];
                                if (!UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int consumed);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return offset;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    offset += consumed;
                                }
                                else
                                {
                                    if (DoesAsciiNeedEncoding(current) == 1)
                                    {
                                        return offset;
                                    }
                                    offset++;
                                }
                            }
                        }

                        blockStart = (sbyte *)ptr + offset;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(utf8Text.Length - offset < 16);
                }
#endif

                while (offset < utf8Text.Length)
                {
                    Debug.Assert((ptr + offset) <= (ptr + utf8Text.Length));

                    byte current = ptr[offset];
                    if (!UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int consumed);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return offset;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        offset += consumed;
                    }
                    else
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return offset;
                        }
                        offset++;
                    }
                }
                Debug.Assert(offset == utf8Text.Length);

                // Nothing in the input needs encoding.
                return -1;
            }
        }