//
// Normalizes one component of the input according to IRI rules:
//   a) escaped characters that IRI allows in that component are unescaped
//   b) Bidi characters are removed
//
// Callers must only use this when IRI parsing is switched on.
//
// input     - the string that contains the component
// start/end - the bounds of the component inside input, forwarded unchanged to IriHelper
// component - identifies which URI component is being cleaned
//
internal unsafe string EscapeUnescapeIri(string input, int start, int end, UriComponents component)
{
    // Pin the string so that the helper can work on a raw character pointer.
    fixed (char* inputPtr = input)
    {
        string cleaned = IriHelper.EscapeUnescapeIri(inputPtr, start, end, component);
        return cleaned;
    }
}
//
// Writes a decoded percent-encoded UTF-8 run to the destination buffer.
//
// The caller has already decoded "bytes" into "unescapedChars". Invalid UTF-8 sequences may not
// have produced any character, so every decoded character is re-encoded here and lined up with
// the raw bytes again. Bytes that cannot be attributed to a decoded character are written back
// in escaped form.
//
// pDest / dest   - pointer to, and array of, the destination buffer
// destOffset     - write position in the destination; advanced for everything that is written
// unescapedChars - decoded characters; only the first charCount entries are used
// bytes          - raw bytes of the escaped run; only the first byteCount entries are used
// isQuery        - forwarded to the IRI range check
// iriParsing     - when true, characters outside the IRI range stay escaped and Bidi control
//                  characters are dropped
//
internal static unsafe void MatchUTF8Sequence(char *pDest, char[] dest, ref int destOffset, char[] unescapedChars, int charCount, byte[] bytes, int byteCount, bool isQuery, bool iriParsing)
{
    // Number of raw bytes that have been accounted for so far.
    int consumed = 0;

    fixed (char* pChars = unescapedChars)
    {
        for (int charIndex = 0; charIndex < charCount; ++charIndex)
        {
            bool isHighSurr = Char.IsHighSurrogate(pChars[charIndex]);

            // A high surrogate is re-encoded together with the char that follows it.
            byte[] encodedBytes = Encoding.UTF8.GetBytes(unescapedChars, charIndex, isHighSurr ? 2 : 1);
            int encodedBytesLength = encodedBytes.Length;

            // Unicode characters outside the IRI range have to stay escaped.
            bool inIriRange = false;
            if (iriParsing)
            {
                if (isHighSurr)
                {
                    bool surrPair = false;
                    inIriRange = IriHelper.CheckIriUnicodeRange(unescapedChars[charIndex], unescapedChars[charIndex + 1], ref surrPair, isQuery);
                }
                else
                {
                    inIriRange = IriHelper.CheckIriUnicodeRange(unescapedChars[charIndex], isQuery);
                }
            }

            // Keep going until the bytes of this character have been located in the raw input.
            while (true)
            {
                // Everything in front of the first candidate byte is invalid: escape it.
                while (bytes[consumed] != encodedBytes[0])
                {
                    Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                    EscapeAsciiChar((char)bytes[consumed++], dest, ref destOffset);
                }

                // Count how many of the following raw bytes agree with the re-encoded character.
                int matched = 0;
                while (matched < encodedBytesLength && bytes[consumed + matched] == encodedBytes[matched])
                {
                    ++matched;
                }

                if (matched != encodedBytesLength)
                {
                    // Only a prefix agreed: escape that prefix and search again.
                    for (int l = 0; l < matched; ++l)
                    {
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        EscapeAsciiChar((char)bytes[consumed++], dest, ref destOffset);
                    }

                    continue;
                }

                consumed += encodedBytesLength;

                if (!iriParsing)
                {
                    // No IRI restrictions: copy the character (and its low surrogate, if any).
                    Debug.Assert(dest.Length > destOffset);
                    pDest[destOffset++] = pChars[charIndex];
                    if (isHighSurr)
                    {
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pChars[charIndex + 1];
                    }
                }
                else if (!inIriRange)
                {
                    // Not allowed by IRI: keep the character in escaped form.
                    for (int l = 0; l < encodedBytes.Length; ++l)
                    {
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        EscapeAsciiChar((char)encodedBytes[l], dest, ref destOffset);
                    }
                }
                else if (!Uri.IsBidiControlCharacter(pChars[charIndex]))
                {
                    // Allowed by IRI and not a Bidi control character: copy it.
                    Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                    pDest[destOffset++] = pChars[charIndex];
                    if (isHighSurr)
                    {
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pChars[charIndex + 1];
                    }
                }

                // The bytes of this character have been matched.
                break;
            }

            if (isHighSurr)
            {
                // The low surrogate was handled together with the high surrogate.
                charIndex++;
            }
        }
    }

    // Whatever is left after the last decoded character is an invalid trailing sequence.
    while (consumed < byteCount)
    {
        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
        EscapeAsciiChar((char)bytes[consumed++], dest, ref destOffset);
    }
}
//
// Walks pStr[start..end) and appends the processed characters to dest. Depending on
// unescapeMode, percent-encoded sequences are decoded, left untouched or re-escaped, and raw
// characters may be escaped.
//
// rsvd1..rsvd3 - characters that are not unescaped below UnescapeAll and that are escaped when
//                found raw while the Escape flag is set
// unescapeMode - CopyOnly copies the range verbatim; the other flags are tested individually below
// syntax       - only used to decide whether IRI parsing applies
// isQuery      - forwarded to the IRI range checks
//
// Throws UriFormatException for an invalid or truncated escape sequence when
// unescapeMode >= UnescapeAllOrThrow.
//
internal static unsafe void UnescapeString(char *pStr, int start, int end, ref ValueStringBuilder dest, char rsvd1, char rsvd2, char rsvd3, UnescapeMode unescapeMode, UriParser? syntax, bool isQuery)
{
    if ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.CopyOnly)
    {
        dest.Append(pStr + start, end - start);
        return;
    }

    bool mustEscape = false;
    bool iriParsing = Uri.IriParsingStatic(syntax)
        && ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.EscapeUnescape);

    // The mode never changes while scanning, so evaluate its tests once.
    bool mayUnescape = (unescapeMode & UnescapeMode.Unescape) != 0;
    bool mayEscape = (unescapeMode & UnescapeMode.Escape) != 0;
    bool v1ToString = (unescapeMode & UnescapeMode.V1ToStringFlag) != 0;
    bool unescapeAll = unescapeMode >= UnescapeMode.UnescapeAll;
    bool throwOnBadSequence = unescapeMode >= UnescapeMode.UnescapeAllOrThrow;
    bool skipRawChars = (unescapeMode & (UnescapeMode.Unescape | UnescapeMode.UnescapeAll))
        == (UnescapeMode.Unescape | UnescapeMode.UnescapeAll);

    int cursor = start;
    while (cursor < end)
    {
        char current = (char)0;

        // Advance to the next position that has to be escaped or unescaped.
        for (; cursor < end; ++cursor)
        {
            current = pStr[cursor];

            if (current != '%')
            {
                if (skipRawChars)
                {
                    continue;
                }

                if (mayEscape)
                {
                    bool isReserved = current == rsvd1 || current == rsvd2 || current == rsvd3;
                    bool isControl = !v1ToString
                        && (current <= '\x1F' || (current >= '\x7F' && current <= '\x9F'));

                    if (isReserved || isControl)
                    {
                        // A raw reserved/control character has to be escaped.
                        mustEscape = true;
                        break;
                    }
                }

                continue;
            }

            if (!mayUnescape)
            {
                // Re-escape the '%' without looking at anything else.
                mustEscape = true;
                break;
            }

            if (cursor + 2 >= end)
            {
                // Fewer than two characters follow the '%'.
                if (unescapeAll)
                {
                    if (throwOnBadSequence)
                    {
                        // Should be a rare case where the app tries to feed an invalid escaped sequence
                        throw new UriFormatException(SR.net_uri_BadString);
                    }

                    // Keep the '%' as part of a bogus sequence.
                    continue;
                }

                mustEscape = true;
                break;
            }

            current = DecodeHexChars(pStr[cursor + 1], pStr[cursor + 2]);

            if (unescapeAll)
            {
                if (current == Uri.c_DummyChar)
                {
                    if (throwOnBadSequence)
                    {
                        // Should be a rare case where the app tries to feed an invalid escaped sequence
                        throw new UriFormatException(SR.net_uri_BadString);
                    }

                    continue;
                }

                // A good sequence is unescaped when full unescape is requested.
                break;
            }

            if (current == Uri.c_DummyChar)
            {
                if (mayEscape)
                {
                    // Re-escape the '%' of an invalid sequence.
                    mustEscape = true;
                    break;
                }

                // We should throw instead, but v1.0 would just print '%'.
                continue;
            }

            // Without full unescape the following stay escaped: '%' itself, the reserved
            // characters, characters that are not safe to unescape (unless V1ToStringFlag is set),
            // and, with IRI parsing, characters outside the IRI range.
            bool keepEscaped =
                current == '%'
                || current == rsvd1 || current == rsvd2 || current == rsvd3
                || (!v1ToString && IsNotSafeForUnescape(current))
                || (iriParsing
                    && ((current <= '\x9F' && IsNotSafeForUnescape(current))
                        || (current > '\x9F' && !IriHelper.CheckIriUnicodeRange(current, isQuery))));

            if (keepEscaped)
            {
                cursor += 2;
                continue;
            }

            // Unescape this sequence.
            break;
        }

        // Copy the characters that were skipped over.
        for (; start < cursor; ++start)
        {
            dest.Append(pStr[start]);
        }

        if (cursor == end)
        {
            break;
        }

        if (mustEscape)
        {
            EscapeAsciiChar((byte)pStr[cursor], ref dest);
            mustEscape = false;
            cursor++;
        }
        else if (current <= 127)
        {
            // A single ASCII character replaces the three characters of "%XX".
            dest.Append(current);
            cursor += 3;
        }
        else
        {
            // Unicode
            int charactersRead = PercentEncodingHelper.UnescapePercentEncodedUTF8Sequence(
                pStr + cursor,
                end - cursor,
                ref dest,
                isQuery,
                iriParsing);

            Debug.Assert(charactersRead > 0);
            cursor += charactersRead;
        }

        start = cursor;
    }
}
//
// Walks pStr[start..end) and writes the processed characters to dest, starting at destPosition.
// Depending on unescapeMode, percent-encoded sequences are decoded, left untouched or
// re-escaped, and raw characters may be escaped.
//
// dest         - destination buffer; replaced by a larger array when characters have to be escaped
// destPosition - write position in dest; advanced for every character that is written
// rsvd1..rsvd3 - characters that are not unescaped below UnescapeAll and that are escaped when
//                found raw while the Escape flag is set
// unescapeMode - CopyOnly copies the range verbatim; the other flags are tested individually below
// syntax       - only used to decide whether IRI parsing applies
// isQuery      - forwarded to the IRI range checks
//
// Returns the buffer that holds the result: either the dest that was passed in or its larger
// replacement.
//
// Throws UriFormatException for an invalid or truncated escape sequence when
// unescapeMode >= UnescapeAllOrThrow.
//
internal unsafe static char[] UnescapeString(char *pStr, int start, int end, char[] dest, ref int destPosition, char rsvd1, char rsvd2, char rsvd3, UnescapeMode unescapeMode, UriParser syntax, bool isQuery)
{
    byte[] bytes = null;

    // Created on first use and then shared by all non-ASCII sequences of this call, instead of
    // cloning the encoding and allocating its fallbacks for every sequence.
    Encoding noFallbackCharUTF8 = null;

    byte escapedReallocations = 0;
    bool escapeReserved = false;
    int next = start;
    bool iriParsing = Uri.IriParsingStatic(syntax)
        && ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.EscapeUnescape);

    while (true)
    {
        // dest[] may have been replaced, in which case it has to be pinned again
        fixed (char* pDest = dest)
        {
            if ((unescapeMode & UnescapeMode.EscapeUnescape) == UnescapeMode.CopyOnly)
            {
                while (start < end)
                {
                    pDest[destPosition++] = pStr[start++];
                }

                return dest;
            }

            while (true)
            {
                char ch = (char)0;

                // Advance to the next position that has to be escaped or unescaped.
                for (; next < end; ++next)
                {
                    ch = pStr[next];

                    if (ch != '%')
                    {
                        if ((unescapeMode & (UnescapeMode.Unescape | UnescapeMode.UnescapeAll)) == (UnescapeMode.Unescape | UnescapeMode.UnescapeAll))
                        {
                            continue;
                        }

                        if ((unescapeMode & UnescapeMode.Escape) != 0)
                        {
                            // Could actually escape some of the characters
                            if (ch == rsvd1 || ch == rsvd2 || ch == rsvd3)
                            {
                                // found an unescaped reserved character -> escape it
                                escapeReserved = true;
                                break;
                            }

                            if ((unescapeMode & UnescapeMode.V1ToStringFlag) == 0 && (ch <= '\x1F' || (ch >= '\x7F' && ch <= '\x9F')))
                            {
                                // found an unescaped control character -> escape it
                                escapeReserved = true;
                                break;
                            }
                        }

                        continue;
                    }

                    if ((unescapeMode & UnescapeMode.Unescape) == 0)
                    {
                        // re-escape, don't check anything else
                        escapeReserved = true;
                        break;
                    }

                    if (next + 2 >= end)
                    {
                        // Fewer than two characters follow the '%'.
                        if (unescapeMode >= UnescapeMode.UnescapeAll)
                        {
                            if (unescapeMode >= UnescapeMode.UnescapeAllOrThrow)
                            {
                                // Should be a rare case where the app tries to feed an invalid escaped sequence
                                throw new UriFormatException(SR.GetString(SR.net_uri_BadString));
                            }

                            // keep a '%' as part of a bogus sequence
                            continue;
                        }

                        escapeReserved = true;
                        break;
                    }

                    ch = EscapedAscii(pStr[next + 1], pStr[next + 2]);

                    if (unescapeMode >= UnescapeMode.UnescapeAll)
                    {
                        if (ch == Uri.c_DummyChar)
                        {
                            if (unescapeMode >= UnescapeMode.UnescapeAllOrThrow)
                            {
                                // Should be a rare case where the app tries to feed an invalid escaped sequence
                                throw new UriFormatException(SR.GetString(SR.net_uri_BadString));
                            }

                            continue;
                        }

                        // Unescape a good sequence if full unescape is requested
                        break;
                    }

                    if (ch == Uri.c_DummyChar)
                    {
                        if ((unescapeMode & UnescapeMode.Escape) != 0)
                        {
                            // re-escape % from an invalid sequence
                            escapeReserved = true;
                            break;
                        }

                        // we should throw instead, but v1.0 would just print '%'
                        continue;
                    }

                    // Without full unescape the following stay escaped: '%' itself, the reserved
                    // characters, characters that are not safe to unescape (unless V1ToStringFlag
                    // is set), and, with IRI parsing, characters outside the IRI range.
                    bool keepEscaped =
                        ch == '%'
                        || ch == rsvd1 || ch == rsvd2 || ch == rsvd3
                        || ((unescapeMode & UnescapeMode.V1ToStringFlag) == 0 && IsNotSafeForUnescape(ch))
                        || (iriParsing
                            && ((ch <= '\x9F' && IsNotSafeForUnescape(ch))
                                || (ch > '\x9F' && !IriHelper.CheckIriUnicodeRange(ch, isQuery))));

                    if (keepEscaped)
                    {
                        next += 2;
                        continue;
                    }

                    // unescape the sequence
                    break;
                }

                // copy off previous characters from input
                while (start < next)
                {
                    pDest[destPosition++] = pStr[start++];
                }

                if (next != end)
                {
                    // VsWhidbey#87423
                    if (escapeReserved)
                    {
                        // Escape that char.
                        // Since this should be a _really_ rare case, reallocate with a constant size
                        // increase of 30 rsvd-type characters (3 chars each).
                        if (escapedReallocations == 0)
                        {
                            escapedReallocations = 30;
                            char[] newDest = new char[dest.Length + escapedReallocations * 3];
                            Array.Copy(dest, newDest, destPosition);
                            dest = newDest;

                            // Leave the fixed block so that the new dest[] array gets pinned; the
                            // scan then resumes at the same position with the same state.
                            break;
                        }

                        --escapedReallocations;
                        EscapeAsciiChar(pStr[next], dest, ref destPosition);
                        escapeReserved = false;
                        start = ++next;
                        continue;
                    }

                    // unescaping either one Ascii or possibly multiple Unicode
                    if (ch <= '\x7F')
                    {
                        // ASCII: one character replaces the three characters of "%XX"
                        dest[destPosition++] = ch;
                        next += 3;
                        start = next;
                        continue;
                    }

                    // Unicode
                    int byteCount = 1;

                    // lazy initialization of max size, will reuse the array for next sequences
                    if (bytes == null)
                    {
                        bytes = new byte[end - next];
                    }

                    bytes[0] = (byte)ch;
                    next += 3;

                    // Collect the escaped non-ASCII bytes that follow immediately.
                    while (next < end)
                    {
                        // Stop at anything that is not a complete "%XX" triplet.
                        if ((ch = pStr[next]) != '%' || next + 2 >= end)
                        {
                            break;
                        }

                        // already made sure we have 3 characters in str
                        ch = EscapedAscii(pStr[next + 1], pStr[next + 2]);

                        // Stop at an invalid hex sequence or at a byte below 0x80, which is not
                        // part of a multi-byte UTF-8 sequence.
                        if (ch == Uri.c_DummyChar || ch < '\x80')
                        {
                            break;
                        }

                        bytes[byteCount++] = (byte)ch;
                        next += 3;
                    }

                    if (noFallbackCharUTF8 == null)
                    {
                        // Empty replacement strings are configured for both fallbacks.
                        noFallbackCharUTF8 = (Encoding)Encoding.UTF8.Clone();
                        noFallbackCharUTF8.EncoderFallback = new EncoderReplacementFallback("");
                        noFallbackCharUTF8.DecoderFallback = new DecoderReplacementFallback("");
                    }

                    char[] unescapedChars = new char[bytes.Length];
                    int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);

                    start = next;

                    // match exact bytes
                    // Do not unescape chars not allowed by Iri
                    // need to check for invalid utf sequences that may not have given any chars
                    MatchUTF8Sequence(pDest, dest, ref destPosition, unescapedChars, charCount, bytes, byteCount, isQuery, iriParsing);
                }

                if (next == end)
                {
                    return dest;
                }
            }
        }
    }
}
public static unsafe int UnescapePercentEncodedUTF8Sequence(char *input, int length, ref ValueStringBuilder dest, bool isQuery, bool iriParsing) { // The following assertions rely on the input not mutating mid-operation, as is the case currently since callers are working with strings // If we start accepting input such as spans, this method must be audited to ensure no buffer overruns/infinite loops could occur // As an optimization, this method should only be called after the first character is known to be a part of a non-ascii UTF8 sequence Debug.Assert(length >= 3); Debug.Assert(input[0] == '%'); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) != Uri.c_DummyChar); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) >= 128); uint fourByteBuffer = 0; int bytesLeftInBuffer = 0; int totalCharsConsumed = 0; int charsToCopy = 0; int bytesConsumed = 0; RefillBuffer: int i = totalCharsConsumed + (bytesLeftInBuffer * 3); ReadByteFromInput: if ((uint)(length - i) <= 2 || input[i] != '%') { goto NoMoreOrInvalidInput; } uint value = input[i + 1]; if ((uint)((value - 'A') & ~0x20) <= ('F' - 'A')) { value = (value | 0x20) - 'a' + 10; } else if ((value - '8') <= ('9' - '8')) { value -= '0'; } else { goto NoMoreOrInvalidInput; // First character wasn't hex or was <= 7F (Ascii) } uint second = (uint)input[i + 2] - '0'; if (second <= 9) { // second is already [0, 9] } else if ((uint)((second - ('A' - '0')) & ~0x20) <= ('F' - 'A')) { second = ((second + '0') | 0x20) - 'a' + 10; } else { goto NoMoreOrInvalidInput; // Second character wasn't Hex } value = (value << 4) | second; Debug.Assert(value >= 128); // Rotate the buffer and overwrite the last byte if (BitConverter.IsLittleEndian) { fourByteBuffer = (fourByteBuffer >> 8) | (value << 24); } else { fourByteBuffer = (fourByteBuffer << 8) | value; } if (++bytesLeftInBuffer != 4) { i += 3; goto ReadByteFromInput; } DecodeRune: Debug.Assert(totalCharsConsumed % 3 == 0); Debug.Assert(bytesLeftInBuffer == 2 || 
bytesLeftInBuffer == 3 || bytesLeftInBuffer == 4); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00000080 : 0x80000000)) != 0); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00008000 : 0x00800000)) != 0); Debug.Assert(bytesLeftInBuffer < 3 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00800000 : 0x00008000)) != 0); Debug.Assert(bytesLeftInBuffer < 4 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x80000000 : 0x00000080)) != 0); uint temp = fourByteBuffer; // make a copy so that the *copy* (not the original) is marked address-taken if (Rune.DecodeFromUtf8(new ReadOnlySpan <byte>(&temp, bytesLeftInBuffer), out Rune rune, out bytesConsumed) == OperationStatus.Done) { Debug.Assert(bytesConsumed >= 2, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes, likely indicating input was modified concurrently during UnescapePercentEncodedUTF8Sequence's execution"); if (!iriParsing || IriHelper.CheckIriUnicodeRange((uint)rune.Value, isQuery)) { if (charsToCopy != 0) { dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy); charsToCopy = 0; } dest.Append(rune); goto AfterDecodeRune; } }