// Returns the left column in a range of the string.
//
// Yields 0 when the list is null and `start` when the display size of the range
// (as reported by TextModel.DisplaySize) is smaller than `width`. Otherwise the runes
// are walked backwards from the end of the range, summing their column widths, until
// the sum covers the amount by which the range exceeds `width`.
internal static int CalculateLeftColumn(List<Rune> t, int start, int end, int width, int currentColumn)
{
    if (t == null)
    {
        return 0;
    }

    var (displayWidth, _) = TextModel.DisplaySize(t, start, end);
    if (displayWidth < width)
    {
        return start;
    }

    // Last index to inspect, clamped to the final element of the list.
    int lastIndex = end <= t.Count - 1 ? end : t.Count - 1;
    int accumulated = 0;
    int index = lastIndex;

    while (index > start)
    {
        accumulated += Rune.ColumnWidth(t[index]);

        if (accumulated >= displayWidth - width)
        {
            int column = lastIndex - index + start;

            // The candidate column is moved one position further in these three situations.
            bool advance = start == 0
                || column == start
                || (currentColumn == t.Count && currentColumn - column > width);

            return advance ? column + 1 : column;
        }

        index--;
    }

    // The threshold was never reached while walking back to `start`.
    return 0;
}
// Maps the position `x` (relative to `start`) onto a rune offset relative to `start`.
//
// A negative `x` is returned unchanged. Otherwise the runes from `start` onwards are
// scanned, adding each rune's column width to a running total that begins at `start`;
// the offset of the first rune whose index equals `start + x`, or at which the running
// total exceeds `start + x`, is returned. If no rune qualifies, the number of runes
// from `start` to the end of the list is returned.
internal static int GetColFromX(List<Rune> t, int start, int x)
{
    if (x < 0)
    {
        return x;
    }

    int target = x + start;
    int consumed = start;
    int index = start;

    while (index < t.Count)
    {
        consumed += Rune.ColumnWidth(t[index]);

        if (consumed > target || index == target)
        {
            return index - start;
        }

        index++;
    }

    return t.Count - start;
}
// Writes `ustr` to the driver rune by rune, clipped to `width` columns, then pads the
// remainder of the `width` columns with spaces.
//
// driver: receives every rune through AddRune.
// ustr:   the text to draw; ustr.Length is used as the bound for the decode loop.
// col, line: not used by this method.
// width:  number of columns available for the text plus padding.
void RenderUstr(ConsoleDriver driver, ustring ustr, int col, int line, int width)
{
    int byteLen = ustr.Length;
    int used = 0;

    for (int i = 0; i < byteLen;)
    {
        // NOTE(review): the third argument is always negative here; presumably DecodeRune
        // treats a negative count as "up to the end of the string" - confirm against NStack.
        (var rune, var size) = Utf8.DecodeRune(ustr, i, i - byteLen);
        var count = Rune.ColumnWidth(rune);

        // A rune that ends exactly on the last column still fits. Stop only when drawing
        // it would overflow; the previous `>=` test dropped the final fitting rune and
        // padded its cell with a space instead.
        if (used + count > width)
        {
            break;
        }

        driver.AddRune(rune);
        used += count;
        i += size;
    }

    // Fill whatever is left of the row with blanks.
    for (; used < width; used++)
    {
        driver.AddRune(' ');
    }
}
// Returns the size and length in a range of the string.
//
// `size` is the sum of Rune.ColumnWidth and `length` the sum of Rune.RuneLen over the
// runes in [start, end). A `start` of -1 means "from index 0"; an `end` of -1, or one
// past the list, means "to the end of the list". When `checkNextRune` is set and a rune
// follows the range whose column width is greater than 1, that rune is counted as well.
// A null or empty list yields (0, 0).
internal static (int size, int length) DisplaySize(List<Rune> t, int start = -1, int end = -1, bool checkNextRune = true)
{
    if (t == null || t.Count == 0)
    {
        return (0, 0);
    }

    int limit;
    if (end == -1 || end > t.Count)
    {
        limit = t.Count;
    }
    else
    {
        limit = end;
    }

    int first = start == -1 ? 0 : start;

    int columns = 0;
    int runeLength = 0;
    for (int index = first; index < limit; index++)
    {
        var current = t[index];
        columns += Rune.ColumnWidth(current);
        runeLength += Rune.RuneLen(current);
    }

    // The look-ahead only applies when at least one rune of the range was visited
    // (the original performed it on the final loop iteration).
    bool visitedLastRune = first < limit;
    if (checkNextRune && visitedLastRune && t.Count > limit && Rune.ColumnWidth(t[limit]) > 1)
    {
        columns += Rune.ColumnWidth(t[limit]);
        runeLength += Rune.RuneLen(t[limit]);
    }

    return (columns, runeLength);
}
/// <summary>
/// Maps the rune to lower case.
/// </summary>
/// <param name="rune">The rune to convert.</param>
/// <returns>The lower cased rune if it can be; the result of <c>NStack.Unicode.ToLower</c> for the rune's value.</returns>
public static uint ToLower(Rune rune)
{
    return NStack.Unicode.ToLower(rune.value);
}
/// <summary>
/// Reports whether the rune is a title case letter.
/// </summary>
/// <param name="rune">The rune to test for.</param>
/// <returns><c>true</c>, if the rune is a title case letter, <c>false</c> otherwise.</returns>
public static bool IsTitle(Rune rune)
{
    return NStack.Unicode.IsTitle(rune.value);
}
/// <summary>
/// Reports whether the rune is a control character.
/// </summary>
/// <param name="rune">The rune to test for.</param>
/// <returns><c>true</c>, if the rune is a control character, <c>false</c> otherwise.</returns>
/// <remarks>
/// The C (Other) Unicode category includes more code points such as surrogates; use C.InRange (r) to test for them.
/// </remarks>
public static bool IsControl(Rune rune)
{
    return NStack.Unicode.IsControl(rune.value);
}
/// <summary>
/// Reports whether the rune is defined as a Graphic by Unicode.
/// </summary>
/// <param name="rune">The rune to test for.</param>
/// <returns><c>true</c>, if the rune is a graphic character, <c>false</c> otherwise.</returns>
/// <remarks>
/// Such characters include letters, marks, numbers, punctuation, symbols, and
/// spaces, from categories L, M, N, P, S, Zs.
/// </remarks>
public static bool IsGraphic(Rune rune)
{
    return NStack.Unicode.IsGraphic(rune.value);
}
/// <summary>
/// Locates the last occurrence of <paramref name="separator"/> within this <see cref="Utf8String"/> instance, creating <see cref="Utf8String"/>
/// instances which represent the data on either side of the separator. If <paramref name="separator"/> is not found
/// within this <see cref="Utf8String"/> instance, returns the tuple "(this, null)".
/// </summary>
/// <remarks>
/// An ordinal search is performed.
/// </remarks>
public SplitOnResult SplitOnLast(Rune separator)
{
    if (TryFindLast(separator, out Range range))
    {
        // Separator located: split around the range it occupies.
        return new SplitOnResult(this, range);
    }

    // Separator absent: the result is built from this instance alone.
    return new SplitOnResult(this);
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// Walks pInput[start..end) once and appends to a builder:
//   * "%XX" triplets that decode to a reserved/unsafe character are copied verbatim;
//   * "%XX" triplets that decode to an ASCII character are replaced by that character;
//   * runs of "%XX" triplets with the high bit set are decoded as UTF-8 and handed to
//     UriHelper.MatchUTF8Sequence, or copied verbatim when nothing decodes;
//   * raw characters above 0x7F are kept when CheckIriUnicodeRange accepts them and
//     percent-encoded as UTF-8 otherwise;
//   * everything else is copied unchanged.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    ValueStringBuilder dest = new ValueStringBuilder(end - start);
    Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];
    byte[]? utf8Buffer = null;
    bool isQuery = component == UriComponents.Query;

    for (int pos = start; pos < end; ++pos)
    {
        char current = pInput[pos];

        if (current == '%')
        {
            if (pos + 2 >= end)
            {
                // Too close to the end for a complete %XX triplet: copy the '%' through.
                dest.Append(pInput[pos]);
                continue;
            }

            char decoded = UriHelper.EscapedAscii(pInput[pos + 1], pInput[pos + 2]);

            // Do not unescape a reserved char
            if (decoded == Uri.c_DummyChar
                || decoded == '%'
                || CheckIsReserved(decoded, component)
                || UriHelper.IsNotSafeForUnescape(decoded))
            {
                // keep the three characters as they are
                dest.Append(pInput[pos]);
                dest.Append(pInput[pos + 1]);
                dest.Append(pInput[pos + 2]);
                pos += 2; // the loop increment steps over the third character
                continue;
            }

            if (decoded <= '\x7F')
            {
                // ASCII
                Debug.Assert(decoded < 0xFF, "Expecting ASCII character.");
                dest.Append(decoded);
                pos += 2;
                continue;
            }

            // possibly utf8 encoded sequence of unicode
            // check if safe to unescape according to Iri rules
            Debug.Assert(decoded < 0xFF, "Expecting ASCII character.");

            int sequenceStart = pos;
            int byteCount = 1;

            // lazy initialization of max size, will reuse the array for next sequences
            if (utf8Buffer is null)
            {
                utf8Buffer = new byte[end - pos];
            }

            utf8Buffer[0] = (byte)decoded;
            pos += 3;

            // Gather every directly following %XX triplet whose value has the high bit set.
            while (pos < end)
            {
                char candidate = pInput[pos];
                if (candidate != '%' || pos + 2 >= end)
                {
                    break;
                }

                // already made sure we have 3 characters in str
                candidate = UriHelper.EscapedAscii(pInput[pos + 1], pInput[pos + 2]);

                // invalid hex sequence, or a character that is not part of a UTF-8 sequence
                if (candidate == Uri.c_DummyChar || candidate < '\x80')
                {
                    break;
                }

                // a UTF-8 sequence byte
                utf8Buffer[byteCount++] = (byte)candidate;
                pos += 3;

                Debug.Assert(candidate < 0xFF, "Expecting ASCII character.");
            }

            pos--; // for loop will increment

            // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
            Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
                Encoding.UTF8.CodePage,
                new EncoderReplacementFallback(""),
                new DecoderReplacementFallback(""));

            char[] unescapedChars = new char[utf8Buffer.Length];
            int charCount = noFallbackCharUTF8.GetChars(utf8Buffer, 0, byteCount, unescapedChars, 0);

            if (charCount == 0)
            {
                // copy escaped sequence as is
                for (int i = sequenceStart; i <= pos; ++i)
                {
                    dest.Append(pInput[i]);
                }
            }
            else
            {
                // If invalid sequences were present in the original escaped string, we need to
                // copy the escaped versions of those sequences.
                // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                // rules.
                UriHelper.MatchUTF8Sequence(ref dest, unescapedChars, charCount, utf8Buffer, byteCount, isQuery, true);
            }

            continue;
        }

        if (current <= '\x7f')
        {
            // just copy the character
            dest.Append(pInput[pos]);
            continue;
        }

        // unicode
        bool surrogatePair = false;
        char lowSurrogate = '\0';
        bool isInIriUnicodeRange;

        if (char.IsHighSurrogate(current) && pos + 1 < end)
        {
            lowSurrogate = pInput[pos + 1];
            isInIriUnicodeRange = CheckIriUnicodeRange(current, lowSurrogate, out surrogatePair, isQuery);
        }
        else
        {
            isInIriUnicodeRange = CheckIriUnicodeRange(current, isQuery);
        }

        if (isInIriUnicodeRange)
        {
            dest.Append(current);
            if (surrogatePair)
            {
                dest.Append(lowSurrogate);
            }
        }
        else
        {
            Rune rune;
            if (surrogatePair)
            {
                rune = new Rune(current, lowSurrogate);
            }
            else if (!Rune.TryCreate(current, out rune))
            {
                rune = Rune.ReplacementChar;
            }

            // Percent-encode each UTF-8 byte of the rune.
            int bytesWritten = rune.EncodeToUtf8(maxUtf8EncodedSpan);
            for (int k = 0; k < bytesWritten; k++)
            {
                UriHelper.EscapeAsciiChar(maxUtf8EncodedSpan[k], ref dest);
            }
        }

        if (surrogatePair)
        {
            // The low surrogate was consumed together with the high one.
            pos++;
        }
    }

    return dest.ToString();
}
/// <summary>
/// Returns a value stating whether the current <see cref="Utf8String"/> instance begins with
/// the specified <see cref="Rune"/>. The specified comparison is used.
/// </summary>
public bool StartsWith(Rune value, StringComparison comparison)
{
    // TODO_UTF8STRING: Optimize me to avoid allocations.
    // Both operands are converted to System.String and compared there.
    string text = ToString();
    string prefix = value.ToString();
    return text.StartsWith(prefix, comparison);
}
/// <summary>
/// Returns a value stating whether the current <see cref="Utf8String"/> instance begins with
/// <paramref name="value"/>. The specified comparison is used.
/// </summary>
public bool StartsWith(char value, StringComparison comparison)
{
    // A char that Rune.TryCreate rejects can never match.
    if (!Rune.TryCreate(value, out Rune rune))
    {
        return false;
    }

    return StartsWith(rune, comparison);
}
/// <summary>
/// Returns a value stating whether the current <see cref="Utf8String"/> instance ends with
/// <paramref name="value"/>. An ordinal comparison is used.
/// </summary>
public bool EndsWith(char value)
{
    // A char that Rune.TryCreate rejects can never match.
    if (!Rune.TryCreate(value, out Rune rune))
    {
        return false;
    }

    return EndsWith(rune);
}
/// <summary>
/// Returns a value stating whether the current <see cref="Utf8String"/> instance contains
/// <paramref name="value"/>. The specified comparison is used.
/// </summary>
public bool Contains(char value, StringComparison comparison)
{
    // A char that Rune.TryCreate rejects can never match.
    if (!Rune.TryCreate(value, out Rune rune))
    {
        return false;
    }

    return Contains(rune, comparison);
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// Walks pInput[start..end) once and appends to a builder:
//   * "%XX" triplets that decode to a reserved/unsafe character are copied verbatim;
//   * "%XX" triplets that decode to an ASCII character are replaced by that character;
//   * other "%XX" triplets are delegated to
//     PercentEncodingHelper.UnescapePercentEncodedUTF8Sequence;
//   * raw characters above 0x7F are kept when CheckIriUnicodeRange accepts them and
//     percent-encoded as UTF-8 otherwise;
//   * everything else is copied unchanged.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    int size = end - start;

    // Inputs of up to 256 characters start out on a stack buffer.
    ValueStringBuilder dest = size <= 256
        ? new ValueStringBuilder(stackalloc char[256])
        : new ValueStringBuilder(size);

    Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];
    bool isQuery = component == UriComponents.Query;

    for (int i = start; i < end; ++i)
    {
        char current = pInput[i];

        if (current == '%')
        {
            if (end - i <= 2)
            {
                // Too close to the end for a complete %XX triplet: copy the '%' through.
                dest.Append(pInput[i]);
                continue;
            }

            char decoded = UriHelper.DecodeHexChars(pInput[i + 1], pInput[i + 2]);

            // Do not unescape a reserved char
            if (decoded == Uri.c_DummyChar
                || decoded == '%'
                || CheckIsReserved(decoded, component)
                || UriHelper.IsNotSafeForUnescape(decoded))
            {
                // keep the three characters as they are
                dest.Append(pInput[i]);
                dest.Append(pInput[i + 1]);
                dest.Append(pInput[i + 2]);
                i += 2; // the loop increment steps over the third character
                continue;
            }

            if (decoded <= '\x7F')
            {
                // ASCII
                Debug.Assert(decoded < 0xFF, "Expecting ASCII character.");
                dest.Append(decoded);
                i += 2;
                continue;
            }

            // possibly utf8 encoded sequence of unicode
            int charactersRead = PercentEncodingHelper.UnescapePercentEncodedUTF8Sequence(
                pInput + i,
                end - i,
                ref dest,
                isQuery,
                iriParsing: true);

            Debug.Assert(charactersRead > 0);
            i += charactersRead - 1; // -1 as i will be incremented in the loop
            continue;
        }

        if (current <= '\x7f')
        {
            // just copy the character
            dest.Append(pInput[i]);
            continue;
        }

        // unicode
        bool surrogatePair = false;
        char lowSurrogate = '\0';
        bool isInIriUnicodeRange;

        if (char.IsHighSurrogate(current) && i + 1 < end)
        {
            lowSurrogate = pInput[i + 1];
            isInIriUnicodeRange = CheckIriUnicodeRange(current, lowSurrogate, out surrogatePair, isQuery);
        }
        else
        {
            isInIriUnicodeRange = CheckIriUnicodeRange(current, isQuery);
        }

        if (isInIriUnicodeRange)
        {
            dest.Append(current);
            if (surrogatePair)
            {
                dest.Append(lowSurrogate);
            }
        }
        else
        {
            Rune rune;
            if (surrogatePair)
            {
                rune = new Rune(current, lowSurrogate);
            }
            else if (!Rune.TryCreate(current, out rune))
            {
                rune = Rune.ReplacementChar;
            }

            // Percent-encode each UTF-8 byte of the rune.
            int bytesWritten = rune.EncodeToUtf8(maxUtf8EncodedSpan);
            for (int k = 0; k < bytesWritten; k++)
            {
                UriHelper.EscapeAsciiChar(maxUtf8EncodedSpan[k], ref dest);
            }
        }

        if (surrogatePair)
        {
            // The low surrogate was consumed together with the high one.
            i++;
        }
    }

    return dest.ToString();
}
public static unsafe int UnescapePercentEncodedUTF8Sequence(char *input, int length, ref ValueStringBuilder dest, bool isQuery, bool iriParsing) { // The following assertions rely on the input not mutating mid-operation, as is the case currently since callers are working with strings // If we start accepting input such as spans, this method must be audited to ensure no buffer overruns/infinite loops could occur // As an optimization, this method should only be called after the first character is known to be a part of a non-ascii UTF8 sequence Debug.Assert(length >= 3); Debug.Assert(input[0] == '%'); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) != Uri.c_DummyChar); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) >= 128); uint fourByteBuffer = 0; int bytesLeftInBuffer = 0; int totalCharsConsumed = 0; int charsToCopy = 0; int bytesConsumed = 0; RefillBuffer: int i = totalCharsConsumed + (bytesLeftInBuffer * 3); ReadByteFromInput: if ((uint)(length - i) <= 2 || input[i] != '%') { goto NoMoreOrInvalidInput; } uint value = input[i + 1]; if ((uint)((value - 'A') & ~0x20) <= ('F' - 'A')) { value = (value | 0x20) - 'a' + 10; } else if ((value - '8') <= ('9' - '8')) { value -= '0'; } else { goto NoMoreOrInvalidInput; // First character wasn't hex or was <= 7F (Ascii) } uint second = (uint)input[i + 2] - '0'; if (second <= 9) { // second is already [0, 9] } else if ((uint)((second - ('A' - '0')) & ~0x20) <= ('F' - 'A')) { second = ((second + '0') | 0x20) - 'a' + 10; } else { goto NoMoreOrInvalidInput; // Second character wasn't Hex } value = (value << 4) | second; Debug.Assert(value >= 128); // Rotate the buffer and overwrite the last byte if (BitConverter.IsLittleEndian) { fourByteBuffer = (fourByteBuffer >> 8) | (value << 24); } else { fourByteBuffer = (fourByteBuffer << 8) | value; } if (++bytesLeftInBuffer != 4) { i += 3; goto ReadByteFromInput; } DecodeRune: Debug.Assert(totalCharsConsumed % 3 == 0); Debug.Assert(bytesLeftInBuffer == 2 || 
bytesLeftInBuffer == 3 || bytesLeftInBuffer == 4); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00000080 : 0x80000000)) != 0); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00008000 : 0x00800000)) != 0); Debug.Assert(bytesLeftInBuffer < 3 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00800000 : 0x00008000)) != 0); Debug.Assert(bytesLeftInBuffer < 4 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x80000000 : 0x00000080)) != 0); uint temp = fourByteBuffer; // make a copy so that the *copy* (not the original) is marked address-taken if (Rune.DecodeFromUtf8(new ReadOnlySpan <byte>(&temp, bytesLeftInBuffer), out Rune rune, out bytesConsumed) == OperationStatus.Done) { Debug.Assert(bytesConsumed >= 2, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes, likely indicating input was modified concurrently during UnescapePercentEncodedUTF8Sequence's execution"); if (!iriParsing || IriHelper.CheckIriUnicodeRange((uint)rune.Value, isQuery)) { if (charsToCopy != 0) { dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy); charsToCopy = 0; } dest.Append(rune); goto AfterDecodeRune; } }
// Appends `stringToEscape` to `vsb`, percent-encoding what must be escaped:
//   * non-ASCII runes are written as the %XX encoding of each of their UTF-8 bytes;
//   * ASCII values flagged in `noEscape` are appended unchanged;
//   * when `checkExistingEscaped` is set, a '%' followed by two hex digits is copied as-is;
//   * every other ASCII value is written as %XX (upper-case hex).
private static void EscapeStringToBuilder(
    ReadOnlySpan<char> stringToEscape, ref ValueStringBuilder vsb,
    ReadOnlySpan<bool> noEscape, bool checkExistingEscaped)
{
    // Scratch space large enough for the UTF-8 encoding of any single rune.
    Span<byte> encoded = stackalloc byte[4];

    SpanRuneEnumerator runes = stringToEscape.EnumerateRunes();
    while (runes.MoveNext())
    {
        Rune current = runes.Current;

        if (!current.IsAscii)
        {
            // Non-ASCII: escape every byte of the rune's UTF-8 form.
            current.TryEncodeToUtf8(encoded, out int encodedLength);
            for (int k = 0; k < encodedLength; k++)
            {
                vsb.Append('%');
                HexConverter.ToCharsBuffer(encoded[k], vsb.AppendSpan(2), 0, HexConverter.Casing.Upper);
            }

            continue;
        }

        byte value = (byte)current.Value;

        if (noEscape[value])
        {
            // Allowed as-is.
            vsb.Append((char)value);
            continue;
        }

        if (checkExistingEscaped && value == '%')
        {
            // The enumerator is a struct, so a copy can look ahead without disturbing the
            // original; the original is only advanced when a full "%XX" sequence is found.
            SpanRuneEnumerator lookahead = runes;
            if (lookahead.MoveNext())
            {
                Rune firstDigit = lookahead.Current;
                if (firstDigit.IsAscii && IsHexDigit((char)firstDigit.Value) && lookahead.MoveNext())
                {
                    Rune secondDigit = lookahead.Current;
                    if (secondDigit.IsAscii && IsHexDigit((char)secondDigit.Value))
                    {
                        vsb.Append('%');
                        vsb.Append((char)firstDigit.Value);
                        vsb.Append((char)secondDigit.Value);
                        runes = lookahead;
                        continue;
                    }
                }
            }
        }

        // Otherwise, append the escaped character.
        vsb.Append('%');
        HexConverter.ToCharsBuffer(value, vsb.AppendSpan(2), 0, HexConverter.Casing.Upper);
    }
}
/// <summary>
/// Maps the rune to title case.
/// </summary>
/// <param name="rune">The rune to convert.</param>
/// <returns>The title cased rune if it can be; the result of <c>NStack.Unicode.ToTitle</c> for the rune's value.</returns>
public static uint ToTitle(Rune rune)
{
    return NStack.Unicode.ToTitle(rune.value);
}
/// <summary>
/// Iterates over Unicode code points equivalent under
/// the Unicode-defined simple case folding.
/// </summary>
/// <param name="rune">The rune to fold.</param>
/// <returns>The simple-case folded rune.</returns>
/// <remarks>
/// Among the code points equivalent to the rune (including the rune itself), the
/// smallest rune greater than it is returned if one exists, or else the smallest
/// rune greater than or equal to 0. If the rune is not a valid Unicode code point,
/// it is returned unchanged.
///
/// For example:
/// <code>
///     SimpleFold('A') = 'a'
///     SimpleFold('a') = 'A'
///
///     SimpleFold('K') = 'k'
///     SimpleFold('k') = '\u212A' (Kelvin symbol, K)
///     SimpleFold('\u212A') = 'K'
///
///     SimpleFold('1') = '1'
///
///     SimpleFold(-2) = -2
/// </code>
/// </remarks>
public static uint SimpleFold(Rune rune)
{
    return NStack.Unicode.SimpleFold(rune.value);
}
// Ordinal search.
// Returns false when `value` cannot be turned into a Rune; otherwise defers to the
// Rune-based Contains overload.
public bool Contains(char value)
{
    if (!Rune.TryCreate(value, out Rune result))
    {
        return false;
    }

    return Contains(result);
}
// Validates `options` via CheckSplitOptions and then builds a SplitResult over this
// instance for the given separator and options.
public SplitResult Split(Rune separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None)
{
    CheckSplitOptions(options);

    SplitResult result = new SplitResult(this, separator, options);
    return result;
}
// Ordinal search.
// Returns -1 when `value` cannot be turned into a Rune; otherwise defers to the
// Rune-based IndexOf overload.
public int IndexOf(char value)
{
    if (Rune.TryCreate(value, out Rune result))
    {
        return IndexOf(result);
    }

    return -1;
}
/// <summary>
/// Locates the last occurrence of <paramref name="separator"/> within this <see cref="Utf8String"/> instance, creating <see cref="Utf8String"/>
/// instances which represent the data on either side of the separator. If <paramref name="separator"/> is not found
/// within this <see cref="Utf8String"/> instance, returns the tuple "(this, null)".
/// </summary>
/// <remarks>
/// The search is performed using the specified <paramref name="comparisonType"/>.
/// </remarks>
public SplitOnResult SplitOnLast(Rune separator, StringComparison comparisonType)
{
    if (TryFindLast(separator, comparisonType, out Range range))
    {
        // Separator located: split around the range it occupies.
        return new SplitOnResult(this, range);
    }

    // Separator absent: the result is built from this instance alone.
    return new SplitOnResult(this);
}
// Ordinal search.
// Returns false when `value` cannot be turned into a Rune; otherwise defers to the
// Rune-based StartsWith overload.
public bool StartsWith(char value)
{
    if (!Rune.TryCreate(value, out Rune result))
    {
        return false;
    }

    return StartsWith(result);
}
/// <summary>
/// Reports whether the rune is defined as printable.
/// </summary>
/// <param name="rune">The rune to test for.</param>
/// <returns><c>true</c>, if the rune is printable, <c>false</c> otherwise.</returns>
/// <remarks>
/// Such characters include letters, marks, numbers, punctuation, symbols, and the
/// ASCII space character, from categories L, M, N, P, S and the ASCII space
/// character. This categorization is the same as IsGraphic except that the
/// only spacing character is ASCII space, U+0020.
/// </remarks>
public static bool IsPrint(Rune rune)
{
    return NStack.Unicode.IsPrint(rune.value);
}
/// <summary>
/// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8String"/> instance.
/// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
/// the location where <paramref name="value"/> occurs within this <see cref="Utf8String"/> instance.
/// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
/// to <see langword="default"/>.
/// </summary>
/// <remarks>
/// An ordinal search is performed.
/// </remarks>
public bool TryFind(Rune value, out Range range)
{
    // The search itself is carried out by the span view of this instance.
    var span = this.AsSpanSkipNullCheck();
    return span.TryFind(value, out range);
}
/// <summary>
/// Reports whether the rune is a mark character (category M).
/// </summary>
/// <param name="rune">The rune to test for.</param>
/// <returns><c>true</c>, if the rune is a mark, <c>false</c> otherwise.</returns>
public static bool IsMark(Rune rune)
{
    return NStack.Unicode.IsMark(rune.value);
}
/// <summary>
/// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8String"/> instance.
/// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
/// the location where <paramref name="value"/> occurs within this <see cref="Utf8String"/> instance.
/// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
/// to <see langword="default"/>.
/// </summary>
/// <remarks>
/// The search is performed using the specified <paramref name="comparisonType"/>.
/// </remarks>
public bool TryFind(Rune value, StringComparison comparisonType, out Range range)
{
    // The search itself is carried out by the span view of this instance.
    var span = this.AsSpanSkipNullCheck();
    return span.TryFind(value, comparisonType, out range);
}
/// <summary>
/// Reports whether the rune is a lower case letter.
/// </summary>
/// <param name="rune">The rune to test for.</param>
/// <returns><c>true</c>, if the rune is a lower case letter, <c>false</c> otherwise.</returns>
public static bool IsLower(Rune rune)
{
    return NStack.Unicode.IsLower(rune.value);
}
/// <summary>
/// Returns true if the given symbol should be considered part of a word
/// and can be contained in matches. Base behaviour is to use <see cref="char.IsLetterOrDigit(char)"/>.
/// </summary>
/// <param name="rune">The symbol to classify.</param>
/// <returns><c>true</c> if the symbol, converted to a <see cref="char"/>, is a letter or a digit.</returns>
public virtual bool IsWordChar(Rune rune)
{
    // The rune is narrowed to a single UTF-16 code unit before classification.
    char symbol = (char)rune;
    return Char.IsLetterOrDigit(symbol);
}