Пример #1
0
 /// <summary>
 /// Determines whether this <see cref="Utf8Span"/> contains the character
 /// <paramref name="value"/>, searching with the supplied <paramref name="comparison"/>.
 /// </summary>
 /// <remarks>
 /// Yields <see langword="false"/> when <see cref="Rune.TryCreate(char, out Rune)"/> rejects
 /// <paramref name="value"/>; otherwise the answer comes from the <see cref="Rune"/>-based overload.
 /// </remarks>
 public bool Contains(char value, StringComparison comparison)
 {
     if (!Rune.TryCreate(value, out Rune scalar))
     {
         return false;
     }

     return Contains(scalar, comparison);
 }
Пример #2
0
 /// <summary>
 /// Decodes the first rune of <paramref name="bytes"/> by forwarding all arguments unchanged to
 /// <c>Rune.DecodeFromUtf8</c> and returning its status.
 /// </summary>
 internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
     => Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);
Пример #3
0
        /// <summary>
        /// Determines whether this <see cref="Utf8Span"/> starts with the given <see cref="Rune"/>,
        /// using the supplied <paramref name="comparison"/>.
        /// </summary>
        /// <remarks>
        /// Both this instance and <paramref name="value"/> are converted to <see cref="string"/> and the
        /// check is carried out by the string <c>StartsWith</c> overload that takes a comparison.
        /// </remarks>
        public bool StartsWith(Rune value, StringComparison comparison)
        {
            // TODO_UTF8STRING: Optimize me to avoid allocations.

            string text = this.ToString();
            string prefix = value.ToString();

            return text.StartsWith(prefix, comparison);
        }
Пример #4
0
 /// <summary>
 /// Determines whether this <see cref="Utf8Span"/> contains the character
 /// <paramref name="value"/>. An ordinal comparison is used.
 /// </summary>
 /// <remarks>
 /// Yields <see langword="false"/> when <see cref="Rune.TryCreate(char, out Rune)"/> rejects
 /// <paramref name="value"/>; otherwise the answer comes from the <see cref="Rune"/>-based overload.
 /// </remarks>
 public bool Contains(char value)
     => Rune.TryCreate(value, out Rune scalar) ? Contains(scalar) : false;
Пример #5
0
 /// <summary>
 /// Determines whether this <see cref="Utf8Span"/> starts with the character
 /// <paramref name="value"/>. An ordinal comparison is used.
 /// </summary>
 /// <remarks>
 /// Yields <see langword="false"/> when <see cref="Rune.TryCreate(char, out Rune)"/> rejects
 /// <paramref name="value"/>; otherwise the answer comes from the <see cref="Rune"/>-based overload.
 /// </remarks>
 public bool StartsWith(char value)
 {
     bool isScalar = Rune.TryCreate(value, out Rune scalar);

     // Short-circuit: the Rune overload is only consulted when the conversion succeeded.
     return isScalar && StartsWith(scalar);
 }
Пример #6
0
 /// <summary>
 /// Determines whether this <see cref="Utf8Span"/> starts with the character
 /// <paramref name="value"/>, using the supplied <paramref name="comparison"/>.
 /// </summary>
 /// <remarks>
 /// Yields <see langword="false"/> when <see cref="Rune.TryCreate(char, out Rune)"/> rejects
 /// <paramref name="value"/>; otherwise the answer comes from the <see cref="Rune"/>-based overload.
 /// </remarks>
 public bool StartsWith(char value, StringComparison comparison)
 {
     if (!Rune.TryCreate(value, out Rune scalar))
     {
         return false;
     }

     return StartsWith(scalar, comparison);
 }
Пример #7
0
        /// <summary>
        /// Decodes the first byte of <paramref name="bytes"/>, accepting only values up to 0x7F.
        /// </summary>
        /// <param name="bytes">The input buffer; only its first element is inspected.</param>
        /// <param name="value">
        /// The rune built from the first byte when it is at most 0x7F; otherwise <see cref="Rune.ReplacementChar"/>.
        /// </param>
        /// <param name="bytesConsumed">0 when <paramref name="bytes"/> is empty; otherwise 1.</param>
        /// <returns>
        /// <see cref="OperationStatus.NeedMoreData"/> for an empty buffer, <see cref="OperationStatus.Done"/>
        /// when the first byte is at most 0x7F, and <see cref="OperationStatus.InvalidData"/> otherwise.
        /// </returns>
        internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
        {
            // No data to decode
            if (bytes.IsEmpty)
            {
                value = Rune.ReplacementChar;
                bytesConsumed = 0;
                return OperationStatus.NeedMoreData;
            }

            // From here on exactly one byte is consumed, whether or not it is valid.
            bytesConsumed = 1;
            byte first = bytes[0];

            // Non-ASCII byte
            if (first > 0x7F)
            {
                value = Rune.ReplacementChar;
                return OperationStatus.InvalidData;
            }

            // ASCII byte
            value = new Rune(first);
            return OperationStatus.Done;
        }
Пример #8
0
 /// <summary>
 /// Calls <c>Grow</c> with an argument of 2 and then appends <paramref name="rune"/> via <c>Append</c>.
 /// </summary>
 private void GrowAndAppend(Rune rune)
 {
     // NOTE(review): 2 is presumably the maximum number of UTF-16 chars a single Rune can occupy —
     // confirm against the contract of Grow.
     const int extraCapacity = 2;

     Grow(extraCapacity);
     Append(rune);
 }
Пример #9
0
 /// <summary>
 /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and produces <see cref="Utf8Span"/>
 /// instances for the data before and after it. When <paramref name="separator"/> does not occur in this
 /// instance, the result is the tuple "(this, Empty)".
 /// </summary>
 /// <remarks>
 /// The search honors the supplied <paramref name="comparisonType"/>.
 /// </remarks>
 public SplitOnResult SplitOn(Rune separator, StringComparison comparisonType)
 {
     if (TryFind(separator, comparisonType, out Range matchRange))
     {
         return new SplitOnResult(this, matchRange);
     }

     return new SplitOnResult(this);
 }
Пример #10
0
 /// <summary>
 /// Initializes the enumerator over <paramref name="span"/>: the remaining bytes start out as
 /// <c>span.Bytes</c> and the current rune is left at its default value.
 /// </summary>
 internal Enumerator(Utf8Span span)
 {
     _remainingUtf8Bytes = span.Bytes;
     _currentRune = default;
 }
Пример #11
0
 /// <summary>
 /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and produces <see cref="Utf8Span"/>
 /// instances for the data before and after it. When <paramref name="separator"/> does not occur in this
 /// instance, the result is the tuple "(this, Empty)".
 /// </summary>
 /// <remarks>
 /// The search is ordinal.
 /// </remarks>
 public SplitOnResult SplitOn(Rune separator)
 {
     if (!TryFind(separator, out Range matchRange))
     {
         return new SplitOnResult(this);
     }

     return new SplitOnResult(this, matchRange);
 }
Пример #12
0
        /// <summary>
        /// Creates a <see cref="SplitResult"/> over this instance for the given <paramref name="separator"/>
        /// and <paramref name="options"/>.
        /// </summary>
        /// <param name="separator">The rune passed to the <see cref="SplitResult"/> as the separator.</param>
        /// <param name="options">
        /// Split options; they are passed through <c>Utf8String.CheckSplitOptions</c> before the result is created.
        /// </param>
        public SplitResult Split(Rune separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None)
        {
            // Run the option check up front, before any result object is constructed.
            Utf8String.CheckSplitOptions(options);

            return new SplitResult(this, separator, options);
        }
Пример #13
0
 /// <summary>
 /// Reports whether this instance and <paramref name="other"/> compare equal under the <c>==</c> operator.
 /// </summary>
 public bool Equals(Rune other)
 {
     return this == other;
 }
Пример #14
0
        /// <summary>
        /// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-8 source buffer.
        /// </summary>
        /// <param name="source">The UTF-8 encoded input buffer; decoding always starts at its first element.</param>
        /// <param name="result">
        /// On success, the decoded <see cref="Rune"/>; on any other status, <see cref="ReplacementChar"/>.
        /// </param>
        /// <param name="bytesConsumed">
        /// The number of <see langword="byte"/>s of <paramref name="source"/> that make up the decoded, incomplete,
        /// or ill-formed sequence (see the return value description for details).
        /// </param>
        /// <returns>
        /// <para>
        /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
        /// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="bytesConsumed"/> the
        /// number of <see langword="byte"/>s used in the input buffer to encode the <see cref="Rune"/>.
        /// </para>
        /// <para>
        /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns <see cref="OperationStatus.NeedMoreData"/>,
        /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the length of the input buffer.
        /// </para>
        /// <para>
        /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
        /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the number of
        /// <see langword="byte"/>s used in the input buffer to encode the ill-formed sequence.
        /// </para>
        /// </returns>
        /// <remarks>
        /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
        /// <paramref name="bytesConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
        /// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
        /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
        /// invalid sequences while iterating through the loop.
        /// </remarks>
        public static OperationStatus DecodeFromUtf8(ReadOnlySpan <byte> source, out Rune result, out int bytesConsumed)
        {
            // This method follows the Unicode Standard's recommendation for detecting
            // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
            // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
            // it tries to consume as many code units as possible as long as those code
            // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

            // 'index' is the zero-based position of the byte currently being examined. Every exit
            // path below derives bytesConsumed from it.
            int index = 0;

            // Try reading input[0].

            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // 'tempValue' starts out as the raw first byte and is progressively turned into the
            // decoded scalar value as continuation bytes are folded in.
            uint tempValue = source[index];

            if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
            {
                goto NotAscii;
            }

            // Shared success exit: reached by fall-through for ASCII input and by 'goto' from the
            // 2-, 3- and 4-byte paths once 'tempValue' holds the fully decoded scalar value.
Finish:

            // 'index' points at the last byte of the sequence, so the sequence length is index + 1.
            bytesConsumed = index + 1;
            Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
            result = UnsafeCreate(tempValue);
            return(OperationStatus.Done);

NotAscii:

            // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
            // the range [C2..F4]. If it's outside of that range, it's either a standalone
            // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
            // four-byte sequence.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
            {
                goto FirstByteInvalid;
            }

            // Rebase the leading byte against 0xC2 and make room for six bits from the next byte.
            // The 0xC2 (rather than 0xC0) bias is compensated for below, after the second byte
            // has been added in.
            tempValue = (tempValue - 0xC2) << 6;

            // Try reading input[1].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // Continuation bytes are of the form [10xxxxxx], which means that their two's
            // complement representation is in the range [-65..-128]. This allows us to
            // perform a single comparison to see if a byte is a continuation byte.

            int thisByteSignExtended = (sbyte)source[index];

            if (thisByteSignExtended >= -64)
            {
                goto Invalid;
            }

            tempValue += (uint)thisByteSignExtended;
            tempValue += 0x80;               // remove the continuation byte marker
            tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

            if (tempValue < 0x0800)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
                goto Finish; // this is a valid 2-byte sequence
            }

            // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
            // enough information (from just two code units) to detect overlong or surrogate
            // sequences, we need to perform these checks now.
            //
            // At this point tempValue equals ((firstByte - 0xC0) << 6) + (secondByte - 0x80), which
            // is why the range bounds below are spelled out in exactly that form.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
                // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
            {
                // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // This is an overlong 4-byte sequence.
                goto Invalid;
            }

            // The first two bytes were just fine. We don't need to perform any other checks
            // on the remaining bytes other than to see that they're valid continuation bytes.

            // Try reading input[2].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

            if (tempValue <= 0xFFFF)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
                goto Finish; // this is a valid 3-byte sequence
            }

            // Try reading input[3].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
            goto Finish; // this is a valid 4-byte sequence

            // Failure exit for a bad leading byte. 'index' is still 0 here, so it is bumped to 1
            // before falling through to the common 'Invalid' exit.
FirstByteInvalid:

            index = 1; // Invalid subsequences are always at least length 1.

            // Common failure exit. When reached via 'goto' from the continuation-byte checks,
            // 'index' is the position of the offending byte, so exactly the bytes before it are
            // reported as consumed.
Invalid:

            Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.InvalidData);

            // Exit taken when the buffer ended before the sequence was complete. 'index' equals
            // source.Length whenever this label is reached.
NeedsMoreData:

            Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.NeedMoreData);
        }
Пример #15
0
 /// <summary>
 /// Initializes the enumerator over <paramref name="buffer"/>: the whole buffer is recorded as
 /// remaining and the current element is left at its default value.
 /// </summary>
 internal SpanRuneEnumerator(ReadOnlySpan<Char> buffer)
 {
     _current = default;
     _remaining = buffer;
 }