// Scans the string and returns true as soon as it finds either
//   - a character above 0x7F, or
//   - a "%XX" escape whose decoded value is flagged in UriHelper.UnreservedTable
//     (RFC 3986 unreserved characters, which should be un-escaped) or lies beyond
//     the end of that table.
// Returns false when the whole string was scanned without hitting either case.
private static bool CheckForUnicodeOrEscapedUnreserved(string data)
{
    int length = data.Length;
    int index = 0;

    while (index < length)
    {
        char current = data[index];

        if (current > 0x7F)
        {
            // Non-ASCII character.
            return true;
        }

        // A '%' is only treated as an escape when two more characters follow it.
        if (current == '%' && length - index > 2)
        {
            char decoded = UriHelper.DecodeHexChars(data[index + 1], data[index + 2]);

            // Values past the table's range are reported too.
            // NOTE(review): presumably this also covers the sentinel returned for
            // malformed hex digits - confirm against UriHelper.DecodeHexChars.
            bool outsideTable = decoded >= UriHelper.UnreservedTable.Length;
            if (outsideTable || UriHelper.UnreservedTable[decoded])
            {
                return true;
            }

            // Step over the two hex digits; the increment below consumes the '%'.
            index += 2;
        }

        index++;
    }

    return false;
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// pInput    - pointer to the character buffer being normalized.
// start/end - the half-open range [start, end) of pInput that is processed.
// component - forwarded to CheckIsReserved; Query additionally switches the
//             Unicode range checks into their "query" mode.
// Returns the normalized text as a new string.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    bool isQuery = component == UriComponents.Query;
    int length = end - start;

    // Short inputs build into a stack buffer; longer ones start with a sized builder.
    ValueStringBuilder dest = length <= 256
        ? new ValueStringBuilder(stackalloc char[256])
        : new ValueStringBuilder(length);

    // Scratch space for the UTF-8 encoding of a single Rune (at most 4 bytes).
    Span<byte> utf8Scratch = stackalloc byte[4];

    int pos = start;
    while (pos < end)
    {
        char current = pInput[pos];

        // Plain ASCII other than '%': copy through unchanged.
        if (current <= '\x7f' && current != '%')
        {
            dest.Append(current);
            pos++;
            continue;
        }

        if (current == '%')
        {
            // Not enough room left for "%XX": the '%' is copied literally.
            if (end - pos <= 2)
            {
                dest.Append(current);
                pos++;
                continue;
            }

            char decoded = UriHelper.DecodeHexChars(pInput[pos + 1], pInput[pos + 2]);

            bool keepEscaped =
                decoded == Uri.c_DummyChar ||
                decoded == '%' ||
                CheckIsReserved(decoded, component) ||
                UriHelper.IsNotSafeForUnescape(decoded);

            if (keepEscaped)
            {
                // Do not unescape: emit the three original characters as they are.
                dest.Append(pInput[pos]);
                dest.Append(pInput[pos + 1]);
                dest.Append(pInput[pos + 2]);
                pos += 3;
            }
            else if (decoded <= '\x7F')
            {
                // ASCII that is safe to unescape: emit the decoded character.
                Debug.Assert(decoded < 0xFF, "Expecting ASCII character.");
                dest.Append(decoded);
                pos += 3;
            }
            else
            {
                // Possibly a UTF-8 encoded sequence; the helper appends to dest and
                // reports how many input characters it handled.
                int consumed = PercentEncodingHelper.UnescapePercentEncodedUTF8Sequence(
                    pInput + pos, end - pos, ref dest, isQuery, iriParsing: true);
                Debug.Assert(consumed > 0);
                pos += consumed;
            }

            continue;
        }

        // Non-ASCII character, optionally the first half of a surrogate pair.
        char low = '\0';
        bool isPair = false;
        bool allowed;

        if (char.IsHighSurrogate(current) && pos + 1 < end)
        {
            low = pInput[pos + 1];
            allowed = CheckIriUnicodeRange(current, low, out isPair, isQuery);
        }
        else
        {
            allowed = CheckIriUnicodeRange(current, isQuery);
        }

        if (allowed)
        {
            // Within the permitted range: keep the character(s) verbatim.
            dest.Append(current);
            if (isPair)
            {
                dest.Append(low);
            }
        }
        else
        {
            // Outside the permitted range: percent-escape every UTF-8 byte.
            // A char that cannot form a Rune by itself becomes the replacement character.
            Rune rune = isPair
                ? new Rune(current, low)
                : (Rune.TryCreate(current, out Rune single) ? single : Rune.ReplacementChar);

            int written = rune.EncodeToUtf8(utf8Scratch);
            for (int k = 0; k < written; k++)
            {
                UriHelper.EscapeAsciiChar(utf8Scratch[k], ref dest);
            }
        }

        // A surrogate pair consumes two input characters, anything else one.
        pos += isPair ? 2 : 1;
    }

    return dest.ToString();
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// pInput    - pointer to the character buffer being normalized.
// start/end - the half-open range [start, end) of pInput that is processed.
// component - forwarded to CheckIsReserved; Query additionally switches the
//             Unicode range checks and UTF-8 matching into their "query" mode.
// Returns the normalized text as a new string.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    int size = end - start;
    ValueStringBuilder dest = new ValueStringBuilder(size);

    // Both of these are created lazily on the first percent-encoded non-ASCII run
    // and then reused for every later run in the same call.
    byte[]? bytes = null;
    Encoding? noFallbackCharUTF8 = null;

    int next = start;
    char ch;

    // Scratch space for the UTF-8 encoding of a single Rune (at most 4 bytes).
    Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];

    for (; next < end; ++next)
    {
        if ((ch = pInput[next]) == '%')
        {
            if (next + 2 < end)
            {
                ch = UriHelper.DecodeHexChars(pInput[next + 1], pInput[next + 2]);

                // Do not unescape a reserved char
                if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
                {
                    // keep as is: copy "%XX" verbatim (the loop increment steps past the last digit)
                    dest.Append(pInput[next++]);
                    dest.Append(pInput[next++]);
                    dest.Append(pInput[next]);
                    continue;
                }
                else if (ch <= '\x7F')
                {
                    Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

                    // ASCII that is safe to unescape
                    dest.Append(ch);
                    next += 2;
                    continue;
                }
                else
                {
                    // possibly utf8 encoded sequence of unicode
                    // check if safe to unescape according to Iri rules
                    Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

                    int startSeq = next;
                    int byteCount = 1;

                    // lazy initialization of max size, will reuse the array for next sequences
                    bytes ??= new byte[end - next];
                    bytes[0] = (byte)ch;
                    next += 3;

                    // Collect every directly following "%XX" whose value is >= 0x80.
                    while (next < end)
                    {
                        // Check on exit criterion
                        if ((ch = pInput[next]) != '%' || next + 2 >= end)
                        {
                            break;
                        }

                        // already made sure we have 3 characters in str
                        ch = UriHelper.DecodeHexChars(pInput[next + 1], pInput[next + 2]);

                        if (ch == Uri.c_DummyChar)
                        {
                            // invalid hex sequence
                            break;
                        }
                        else if (ch < '\x80')
                        {
                            // character is not part of a UTF-8 sequence
                            break;
                        }
                        else
                        {
                            // a UTF-8 sequence byte
                            bytes[byteCount++] = (byte)ch;
                            next += 3;
                        }

                        Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                    }

                    next--; // for loop will increment

                    // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
                    // The encoding does not depend on the input, so it is built once per call
                    // instead of once per escaped run.
                    noFallbackCharUTF8 ??= Encoding.GetEncoding(
                        Encoding.UTF8.CodePage,
                        new EncoderReplacementFallback(""),
                        new DecoderReplacementFallback(""));

                    char[] unescapedChars = new char[bytes.Length];
                    int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);

                    if (charCount != 0)
                    {
                        // If invalid sequences were present in the original escaped string, we need to
                        // copy the escaped versions of those sequences.
                        // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                        // rules.
                        UriHelper.MatchUTF8Sequence(ref dest, unescapedChars, charCount, bytes,
                            byteCount, component == UriComponents.Query, true);
                    }
                    else
                    {
                        // Nothing decoded: copy escaped sequence as is
                        for (int i = startSeq; i <= next; ++i)
                        {
                            dest.Append(pInput[i]);
                        }
                    }
                }
            }
            else
            {
                // Not enough room left for "%XX": the '%' is copied literally.
                dest.Append(pInput[next]);
            }
        }
        else if (ch > '\x7f')
        {
            // unicode
            bool isInIriUnicodeRange;
            bool surrogatePair = false;
            char ch2 = '\0';

            if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
            {
                ch2 = pInput[next + 1];
                isInIriUnicodeRange = CheckIriUnicodeRange(ch, ch2, out surrogatePair, component == UriComponents.Query);
            }
            else
            {
                isInIriUnicodeRange = CheckIriUnicodeRange(ch, component == UriComponents.Query);
            }

            if (isInIriUnicodeRange)
            {
                // Within the permitted range: keep the character(s) verbatim.
                dest.Append(ch);
                if (surrogatePair)
                {
                    dest.Append(ch2);
                }
            }
            else
            {
                // Outside the permitted range: percent-escape every UTF-8 byte.
                // A char that cannot form a Rune by itself becomes the replacement character.
                Rune rune;
                if (surrogatePair)
                {
                    rune = new Rune(ch, ch2);
                }
                else if (!Rune.TryCreate(ch, out rune))
                {
                    rune = Rune.ReplacementChar;
                }

                int bytesWritten = rune.EncodeToUtf8(maxUtf8EncodedSpan);
                Span<byte> encodedBytes = maxUtf8EncodedSpan.Slice(0, bytesWritten);
                foreach (byte b in encodedBytes)
                {
                    UriHelper.EscapeAsciiChar(b, ref dest);
                }
            }

            if (surrogatePair)
            {
                // The low surrogate has been handled together with the high one.
                next++;
            }
        }
        else
        {
            // just copy the character
            dest.Append(pInput[next]);
        }
    }

    return dest.ToString();
}
public static unsafe int UnescapePercentEncodedUTF8Sequence(char *input, int length, ref ValueStringBuilder dest, bool isQuery, bool iriParsing) { // The following assertions rely on the input not mutating mid-operation, as is the case currently since callers are working with strings // If we start accepting input such as spans, this method must be audited to ensure no buffer overruns/infinite loops could occur // As an optimization, this method should only be called after the first character is known to be a part of a non-ascii UTF8 sequence Debug.Assert(length >= 3); Debug.Assert(input[0] == '%'); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) != Uri.c_DummyChar); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) >= 128); uint fourByteBuffer = 0; int bytesLeftInBuffer = 0; int totalCharsConsumed = 0; int charsToCopy = 0; int bytesConsumed = 0; RefillBuffer: int i = totalCharsConsumed + (bytesLeftInBuffer * 3); ReadByteFromInput: if ((uint)(length - i) <= 2 || input[i] != '%') { goto NoMoreOrInvalidInput; } uint value = input[i + 1]; if ((uint)((value - 'A') & ~0x20) <= ('F' - 'A')) { value = (value | 0x20) - 'a' + 10; } else if ((value - '8') <= ('9' - '8')) { value -= '0'; } else { goto NoMoreOrInvalidInput; // First character wasn't hex or was <= 7F (Ascii) } uint second = (uint)input[i + 2] - '0'; if (second <= 9) { // second is already [0, 9] } else if ((uint)((second - ('A' - '0')) & ~0x20) <= ('F' - 'A')) { second = ((second + '0') | 0x20) - 'a' + 10; } else { goto NoMoreOrInvalidInput; // Second character wasn't Hex } value = (value << 4) | second; Debug.Assert(value >= 128); // Rotate the buffer and overwrite the last byte if (BitConverter.IsLittleEndian) { fourByteBuffer = (fourByteBuffer >> 8) | (value << 24); } else { fourByteBuffer = (fourByteBuffer << 8) | value; } if (++bytesLeftInBuffer != 4) { i += 3; goto ReadByteFromInput; } DecodeRune: Debug.Assert(totalCharsConsumed % 3 == 0); Debug.Assert(bytesLeftInBuffer == 2 || 
bytesLeftInBuffer == 3 || bytesLeftInBuffer == 4); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00000080 : 0x80000000)) != 0); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00008000 : 0x00800000)) != 0); Debug.Assert(bytesLeftInBuffer < 3 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00800000 : 0x00008000)) != 0); Debug.Assert(bytesLeftInBuffer < 4 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x80000000 : 0x00000080)) != 0); uint temp = fourByteBuffer; // make a copy so that the *copy* (not the original) is marked address-taken if (Rune.DecodeFromUtf8(new ReadOnlySpan <byte>(&temp, bytesLeftInBuffer), out Rune rune, out bytesConsumed) == OperationStatus.Done) { Debug.Assert(bytesConsumed >= 2, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes, likely indicating input was modified concurrently during UnescapePercentEncodedUTF8Sequence's execution"); if (!iriParsing || IriHelper.CheckIriUnicodeRange((uint)rune.Value, isQuery)) { if (charsToCopy != 0) { dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy); charsToCopy = 0; } dest.Append(rune); goto AfterDecodeRune; } }