/// <summary>
/// Determines whether the current <see cref="Utf8Span"/> instance contains
/// <paramref name="value"/>, using the specified comparison.
/// </summary>
/// <param name="value">The UTF-16 code unit to search for.</param>
/// <param name="comparison">The comparison forwarded to the <see cref="Rune"/>-based overload.</param>
/// <returns>
/// <see langword="false"/> if <paramref name="value"/> cannot be converted to a <see cref="Rune"/>;
/// otherwise, the result of the <see cref="Rune"/>-based overload.
/// </returns>
public bool Contains(char value, StringComparison comparison)
{
    if (!Rune.TryCreate(value, out Rune scalar))
    {
        // The char has no Rune representation, so it is reported as "not contained".
        return false;
    }

    return Contains(scalar, comparison);
}
/// <summary>
/// Decodes the first <see cref="Rune"/> from <paramref name="bytes"/> by forwarding
/// directly to <see cref="Rune.DecodeFromUtf8(ReadOnlySpan{byte}, out Rune, out int)"/>.
/// </summary>
/// <param name="bytes">The buffer to decode from.</param>
/// <param name="value">Receives the rune produced by the decoder.</param>
/// <param name="bytesConsumed">Receives the byte count reported by the decoder.</param>
/// <returns>The status reported by the decoder.</returns>
internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
    => Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);
/// <summary>
/// Determines whether the current <see cref="Utf8Span"/> instance begins with
/// the specified <see cref="Rune"/>, using the specified comparison.
/// </summary>
/// <param name="value">The rune expected at the start of this instance.</param>
/// <param name="comparison">The comparison used for the prefix check.</param>
/// <returns>The result of the string-based prefix check.</returns>
public bool StartsWith(Rune value, StringComparison comparison)
{
    // TODO_UTF8STRING: Optimize me to avoid allocations.
    // Both operands are converted to strings so the string-based StartsWith can do the comparison.
    string thisAsString = this.ToString();
    string prefix = value.ToString();

    return thisAsString.StartsWith(prefix, comparison);
}
/// <summary>
/// Determines whether the current <see cref="Utf8Span"/> instance contains
/// <paramref name="value"/>. An ordinal comparison is used.
/// </summary>
/// <param name="value">The UTF-16 code unit to search for.</param>
/// <returns>
/// <see langword="false"/> if <paramref name="value"/> cannot be converted to a <see cref="Rune"/>;
/// otherwise, the result of the <see cref="Rune"/>-based overload.
/// </returns>
public bool Contains(char value)
{
    if (!Rune.TryCreate(value, out Rune scalar))
    {
        // The char has no Rune representation, so it is reported as "not contained".
        return false;
    }

    return Contains(scalar);
}
/// <summary>
/// Determines whether the current <see cref="Utf8Span"/> instance begins with
/// <paramref name="value"/>. An ordinal comparison is used.
/// </summary>
/// <param name="value">The UTF-16 code unit expected at the start of this instance.</param>
/// <returns>
/// <see langword="false"/> if <paramref name="value"/> cannot be converted to a <see cref="Rune"/>;
/// otherwise, the result of the <see cref="Rune"/>-based overload.
/// </returns>
public bool StartsWith(char value)
{
    if (!Rune.TryCreate(value, out Rune scalar))
    {
        // The char has no Rune representation, so it cannot be a prefix.
        return false;
    }

    return StartsWith(scalar);
}
/// <summary>
/// Determines whether the current <see cref="Utf8Span"/> instance begins with
/// <paramref name="value"/>, using the specified comparison.
/// </summary>
/// <param name="value">The UTF-16 code unit expected at the start of this instance.</param>
/// <param name="comparison">The comparison forwarded to the <see cref="Rune"/>-based overload.</param>
/// <returns>
/// <see langword="false"/> if <paramref name="value"/> cannot be converted to a <see cref="Rune"/>;
/// otherwise, the result of the <see cref="Rune"/>-based overload.
/// </returns>
public bool StartsWith(char value, StringComparison comparison)
{
    if (!Rune.TryCreate(value, out Rune scalar))
    {
        // The char has no Rune representation, so it cannot be a prefix.
        return false;
    }

    return StartsWith(scalar, comparison);
}
/// <summary>
/// Decodes the first byte of <paramref name="bytes"/>, accepting only ASCII values.
/// </summary>
/// <param name="bytes">The buffer to decode from.</param>
/// <param name="value">
/// Receives the decoded rune for an ASCII byte; otherwise <see cref="Rune.ReplacementChar"/>.
/// </param>
/// <param name="bytesConsumed">Receives 0 for an empty buffer, otherwise 1.</param>
/// <returns>
/// <see cref="OperationStatus.NeedMoreData"/> for an empty buffer,
/// <see cref="OperationStatus.Done"/> when the first byte is at most 0x7F, and
/// <see cref="OperationStatus.InvalidData"/> for any other first byte.
/// </returns>
internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
{
    // No data to decode.
    if (bytes.IsEmpty)
    {
        value = Rune.ReplacementChar;
        bytesConsumed = 0;
        return OperationStatus.NeedMoreData;
    }

    // Exactly one byte is consumed whether or not it is ASCII.
    byte first = bytes[0];
    bool isAscii = first <= 0x7F;

    value = isAscii ? new Rune(first) : Rune.ReplacementChar;
    bytesConsumed = 1;
    return isAscii ? OperationStatus.Done : OperationStatus.InvalidData;
}
/// <summary>
/// Calls <c>Grow</c> and then appends <paramref name="rune"/>.
/// </summary>
/// <param name="rune">The rune to append after growing.</param>
private void GrowAndAppend(Rune rune)
{
    // NOTE(review): 2 is presumably the largest number of units one rune can occupy in the
    // underlying buffer - confirm against Grow/Append.
    const int growBy = 2;

    Grow(growBy);
    Append(rune);
}
/// <summary>
/// Searches this <see cref="Utf8Span"/> instance for <paramref name="separator"/> and produces
/// <see cref="Utf8Span"/> instances for the data on either side of it. When <paramref name="separator"/>
/// does not occur in this instance, the result is the tuple "(this, Empty)".
/// </summary>
/// <param name="separator">The rune to split on.</param>
/// <param name="comparisonType">The comparison used to locate <paramref name="separator"/>.</param>
/// <remarks>
/// The search is performed using the specified <paramref name="comparisonType"/>.
/// </remarks>
public SplitOnResult SplitOn(Rune separator, StringComparison comparisonType)
{
    if (TryFind(separator, comparisonType, out Range matchRange))
    {
        // Separator located: split around the matched range.
        return new SplitOnResult(this, matchRange);
    }

    // Separator absent: the result is built from this instance alone.
    return new SplitOnResult(this);
}
/// <summary>
/// Creates an enumerator over the bytes of <paramref name="span"/>.
/// </summary>
/// <param name="span">The span whose <c>Bytes</c> become the remaining data to enumerate.</param>
internal Enumerator(Utf8Span span)
{
    _remainingUtf8Bytes = span.Bytes;

    // The current rune starts at its default value.
    _currentRune = default;
}
/// <summary>
/// Searches this <see cref="Utf8Span"/> instance for <paramref name="separator"/> and produces
/// <see cref="Utf8Span"/> instances for the data on either side of it. When <paramref name="separator"/>
/// does not occur in this instance, the result is the tuple "(this, Empty)".
/// </summary>
/// <param name="separator">The rune to split on.</param>
/// <remarks>
/// An ordinal search is performed.
/// </remarks>
public SplitOnResult SplitOn(Rune separator)
{
    if (TryFind(separator, out Range matchRange))
    {
        // Separator located: split around the matched range.
        return new SplitOnResult(this, matchRange);
    }

    // Separator absent: the result is built from this instance alone.
    return new SplitOnResult(this);
}
/// <summary>
/// Creates a <see cref="SplitResult"/> over this instance for the given
/// <paramref name="separator"/> and <paramref name="options"/>.
/// </summary>
/// <param name="separator">The rune passed to the <see cref="SplitResult"/>.</param>
/// <param name="options">
/// Split options; checked by <c>Utf8String.CheckSplitOptions</c> before the result is created.
/// </param>
/// <returns>A new <see cref="SplitResult"/> built from this instance, the separator and the options.</returns>
public SplitResult Split(Rune separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None)
{
    // Check the options first, before any result object is constructed.
    Utf8String.CheckSplitOptions(options);

    return new SplitResult(this, separator, options);
}
/// <summary>
/// Determines whether this instance equals <paramref name="other"/>, using the type's
/// <c>==</c> operator.
/// </summary>
/// <param name="other">The <see cref="Rune"/> to compare against.</param>
/// <returns>The result of <c>this == other</c>.</returns>
public bool Equals(Rune other)
{
    return this == other;
}
/// <summary>
/// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-8 source buffer.
/// </summary>
/// <param name="source">The UTF-8 bytes to decode from. May be empty.</param>
/// <param name="result">
/// Receives the decoded <see cref="Rune"/> on success, or <see cref="ReplacementChar"/> otherwise.
/// </param>
/// <param name="bytesConsumed">
/// Receives the number of <see langword="byte"/>s of <paramref name="source"/> accounted for by this call.
/// </param>
/// <returns>
/// <para>
/// If the source buffer begins with a valid UTF-8 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
/// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="bytesConsumed"/> the
/// number of <see langword="byte"/>s used in the input buffer to encode the <see cref="Rune"/>.
/// </para>
/// <para>
/// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns <see cref="OperationStatus.NeedMoreData"/>,
/// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the length of the input buffer.
/// </para>
/// <para>
/// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
/// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the number of
/// <see langword="byte"/>s used in the input buffer to encode the ill-formed sequence.
/// </para>
/// </returns>
/// <remarks>
/// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
/// <paramref name="bytesConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
/// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
/// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
/// invalid sequences while iterating through the loop.
/// </remarks>
public static OperationStatus DecodeFromUtf8(ReadOnlySpan <byte> source, out Rune result, out int bytesConsumed)
{
    // This method follows the Unicode Standard's recommendation for detecting
    // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
    // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
    // it tries to consume as many code units as possible as long as those code
    // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

    // 'index' is the offset of the byte currently being examined. Every exit label
    // below derives bytesConsumed from it.
    int index = 0;

    // Try reading input[0].

    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    uint tempValue = source[index];
    if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
    {
        goto NotAscii;
    }

Finish:

    // Shared success exit: 'index' is the offset of the last byte of the sequence,
    // so the sequence length is index + 1.
    bytesConsumed = index + 1;
    Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
    result = UnsafeCreate(tempValue);
    return(OperationStatus.Done);

NotAscii:

    // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
    // the range [C2..F4]. If it's outside of that range, it's either a standalone
    // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
    // four-byte sequence.

    if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
    {
        goto FirstByteInvalid;
    }

    tempValue = (tempValue - 0xC2) << 6;

    // Try reading input[1].

    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    // Continuation bytes are of the form [10xxxxxx], which means that their two's
    // complement representation is in the range [-65..-128]. This allows us to
    // perform a single comparison to see if a byte is a continuation byte.

    int thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid;
    }

    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    // Combined with the earlier "- 0xC2", this leaves the first byte contributing (b0 - 0xC0) << 6.
    tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

    if (tempValue < 0x0800)
    {
        Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
        goto Finish; // this is a valid 2-byte sequence
    }

    // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
    // enough information (from just two code units) to detect overlong or surrogate
    // sequences, we need to perform these checks now.
    // The bounds below are written in the same encoding as tempValue:
    // ((firstByte - 0xC0) << 6) + (secondByte - 0x80).

    if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
    {
        // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
        // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
        goto Invalid;
    }

    if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
    {
        // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
        goto Invalid;
    }

    if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
    {
        // This is an overlong 4-byte sequence.
        goto Invalid;
    }

    // The first two bytes were just fine. We don't need to perform any other checks
    // on the remaining bytes other than to see that they're valid continuation bytes.

    // Try reading input[2].

    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid; // this byte is not a UTF-8 continuation byte
    }

    tempValue <<= 6;
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

    if (tempValue <= 0xFFFF)
    {
        Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
        goto Finish; // this is a valid 3-byte sequence
    }

    // Try reading input[3].

    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid; // this byte is not a UTF-8 continuation byte
    }

    tempValue <<= 6;
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

    UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
    goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:

    index = 1; // Invalid subsequences are always at least length 1.

Invalid:

    // When reached via "goto Invalid", 'index' is the offset of the offending byte, so
    // bytesConsumed covers only the bytes before it and the offending byte is left unconsumed.
    Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
    bytesConsumed = index;
    result = ReplacementChar;
    return(OperationStatus.InvalidData);

NeedsMoreData:

    // 'index' was found to be at or beyond source.Length, having advanced one byte at a time.
    Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
    bytesConsumed = index;
    result = ReplacementChar;
    return(OperationStatus.NeedMoreData);
}
/// <summary>
/// Creates an enumerator over <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">The characters that become the remaining data to enumerate.</param>
internal SpanRuneEnumerator(ReadOnlySpan<Char> buffer)
{
    // The current element starts at its default value.
    _current = default;
    _remaining = buffer;
}