//
// Scans 'data' for a percent-encoded triplet ("%XY") that should be un-escaped.
//
// A triplet qualifies when X and Y are both hex digits, X lies in '0'..'7'
// (so the encoded value cannot exceed 0x7F), and the character produced by
// UriHelper.EscapedAscii is accepted by UriHelper.Is3986Unreserved.
//
// Returns true as soon as one qualifying triplet is found, false otherwise.
//
private unsafe bool CheckForEscapedUnreserved(string data)
{
    // Last index at which a complete three-character "%XY" triplet can begin.
    int lastTripletStart = data.Length - 3;

    for (int pos = 0; pos <= lastTripletStart; pos++)
    {
        if (data[pos] != '%')
        {
            continue;
        }

        char high = data[pos + 1];
        char low = data[pos + 2];

        if (!IsHexDigit(high) || !IsHexDigit(low))
        {
            continue;
        }

        // Only high nibbles 0..7 can describe a value up to 0x7F.
        if (high < '0' || high > '7')
        {
            continue;
        }

        char decoded = UriHelper.EscapedAscii(high, low);
        if (decoded != c_DummyChar && UriHelper.Is3986Unreserved(decoded))
        {
            return true;
        }
    }

    return false;
}
//
// Reports whether 'data' contains a character above 0x7F, either literally or
// as a percent-encoded triplet.
//
// A '%' followed by at least two more characters is passed to
// UriHelper.EscapedAscii; a result above 0x7F counts as a hit, otherwise the
// whole triplet is skipped. A '%' with fewer than two characters after it is
// stepped over as a single character.
//
private bool CheckForUnicode(string data)
{
    int pos = 0;

    while (pos < data.Length)
    {
        char current = data[pos];

        if (current != '%')
        {
            if (current > 0x7F)
            {
                return true;
            }

            pos++;
            continue;
        }

        // Not enough characters left for a full "%XY" triplet.
        if (pos + 2 >= data.Length)
        {
            pos++;
            continue;
        }

        if (UriHelper.EscapedAscii(data[pos + 1], data[pos + 2]) > 0x7F)
        {
            return true;
        }

        // Step over the '%' and both hex digits.
        pos += 3;
    }

    return false;
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// pInput    - pointer to the source characters; only the range [start, end) is read.
// start     - index of the first character to process.
// end       - index one past the last character to process.
// component - the Uri component being normalized; passed to CheckIsReserved and used to
//             select the query-specific rules (component == UriComponents.Query).
//
// Returns a new string holding the normalized text.
//
// Notes on this implementation:
//  * The destination array is pinned through a GCHandle; the handle is released in a
//    finally block so it is not leaked when an exception escapes the loop.
//  * When the destination has to grow, Buffer.MemoryCopy receives the destination
//    capacity in BYTES (char count * sizeof(char)), as its contract requires.
//  * After an out-of-range surrogate pair has been escaped, the low surrogate is skipped
//    so it is not escaped a second time as a lone surrogate.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    char[] dest = new char[end - start];
    byte[] bytes = null;

    const int percentEncodingLen = 3; // Escaped UTF-8 will take 3 chars: %AB.
    const int bufferCapacityIncrease = 30 * percentEncodingLen;
    int bufferRemaining = 0;

    int next = start;
    int destOffset = 0;
    char ch;
    bool escape = false;
    bool surrogatePair = false;

    // Pin the array to do pointer accesses
    GCHandle destHandle = GCHandle.Alloc(dest, GCHandleType.Pinned);
    try
    {
        char* pDest = (char*)destHandle.AddrOfPinnedObject();

        for (; next < end; ++next)
        {
            escape = false;
            surrogatePair = false;

            if ((ch = pInput[next]) == '%')
            {
                if (next + 2 < end)
                {
                    ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                    // Do not unescape a reserved char
                    if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
                    {
                        // keep as is
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pInput[next++];
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pInput[next++];
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pInput[next];
                        continue;
                    }
                    else if (ch <= '\x7F')
                    {
                        Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        // ASCII: emit the decoded character and skip the two hex digits
                        pDest[destOffset++] = ch;
                        next += 2;
                        continue;
                    }
                    else
                    {
                        // possibly utf8 encoded sequence of unicode

                        // check if safe to unescape according to Iri rules

                        Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

                        int startSeq = next;
                        int byteCount = 1;
                        // lazy initialization of max size, will reuse the array for next sequences
                        if ((object)bytes == null)
                        {
                            bytes = new byte[end - next];
                        }

                        bytes[0] = (byte)ch;
                        next += 3;
                        while (next < end)
                        {
                            // Check on exit criterion
                            if ((ch = pInput[next]) != '%' || next + 2 >= end)
                            {
                                break;
                            }

                            // already made sure we have 3 characters in str
                            ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                            // invalid hex sequence ?
                            if (ch == Uri.c_DummyChar)
                            {
                                break;
                            }
                            // character is not part of a UTF-8 sequence ?
                            else if (ch < '\x80')
                            {
                                break;
                            }
                            else
                            {
                                // a UTF-8 sequence
                                bytes[byteCount++] = (byte)ch;
                                next += 3;
                            }

                            Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                        }
                        next--; // for loop will increment

                        // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
                        Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
                            Encoding.UTF8.CodePage,
                            new EncoderReplacementFallback(""),
                            new DecoderReplacementFallback(""));

                        char[] unescapedChars = new char[bytes.Length];
                        int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);

                        if (charCount != 0)
                        {
                            // If invalid sequences were present in the original escaped string, we need to
                            // copy the escaped versions of those sequences.
                            // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                            // rules.
                            UriHelper.MatchUTF8Sequence(pDest, dest, ref destOffset, unescapedChars, charCount, bytes,
                                byteCount, component == UriComponents.Query, true);
                        }
                        else
                        {
                            // copy escaped sequence as is
                            for (int i = startSeq; i <= next; ++i)
                            {
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                pDest[destOffset++] = pInput[i];
                            }
                        }
                    }
                }
                else
                {
                    // '%' too close to the end to start a "%XY" triplet: copy it verbatim
                    Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                    pDest[destOffset++] = pInput[next];
                }
            }
            else if (ch > '\x7f')
            {
                // unicode

                char ch2;

                if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
                {
                    ch2 = pInput[next + 1];
                    escape = !CheckIriUnicodeRange(ch, ch2, ref surrogatePair, component == UriComponents.Query);
                    if (!escape)
                    {
                        // copy the two chars
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pInput[next++];
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pInput[next];
                    }
                }
                else
                {
                    if (CheckIriUnicodeRange(ch, component == UriComponents.Query))
                    {
                        if (!Uri.IsBidiControlCharacter(ch))
                        {
                            // copy it
                            Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                            pDest[destOffset++] = pInput[next];
                        }
                    }
                    else
                    {
                        // escape it
                        escape = true;
                    }
                }
            }
            else
            {
                // just copy the character
                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                pDest[destOffset++] = pInput[next];
            }

            if (escape)
            {
                const int maxNumberOfBytesEncoded = 4;

                if (bufferRemaining < maxNumberOfBytesEncoded * percentEncodingLen)
                {
                    int newBufferLength = 0;

                    checked
                    {
                        // may need more memory since we didn't anticipate escaping
                        newBufferLength = dest.Length + bufferCapacityIncrease;
                        bufferRemaining += bufferCapacityIncrease;
                    }

                    char[] newDest = new char[newBufferLength];

                    fixed (char* pNewDest = newDest)
                    {
                        // Both size arguments of Buffer.MemoryCopy are expressed in bytes.
                        Buffer.MemoryCopy(
                            (byte*)pDest,
                            (byte*)pNewDest,
                            (long)newBufferLength * sizeof(char),
                            (long)destOffset * sizeof(char));
                    }

                    if (destHandle.IsAllocated)
                    {
                        destHandle.Free();
                    }

                    dest = newDest;

                    // re-pin new dest[] array
                    destHandle = GCHandle.Alloc(dest, GCHandleType.Pinned);
                    pDest = (char*)destHandle.AddrOfPinnedObject();
                }

                byte[] encodedBytes = new byte[maxNumberOfBytesEncoded];
                fixed (byte* pEncodedBytes = encodedBytes)
                {
                    int encodedBytesCount = Encoding.UTF8.GetBytes(pInput + next, surrogatePair ? 2 : 1, pEncodedBytes, maxNumberOfBytesEncoded);
                    Debug.Assert(encodedBytesCount <= maxNumberOfBytesEncoded, "UTF8 encoder should not exceed specified byteCount");

                    bufferRemaining -= encodedBytesCount * percentEncodingLen;

                    for (int count = 0; count < encodedBytesCount; ++count)
                    {
                        UriHelper.EscapeAsciiChar((char)encodedBytes[count], dest, ref destOffset);
                    }
                }

                if (surrogatePair)
                {
                    // Both halves of the pair were just encoded together; step over the low
                    // surrogate so the next iteration does not escape it again on its own.
                    next++;
                }
            }
        }
    }
    finally
    {
        // Always unpin the destination array, including when an exception is propagating.
        if (destHandle.IsAllocated)
        {
            destHandle.Free();
        }
    }

    Debug.Assert(destOffset <= dest.Length, "Destination length met or exceeded destination offset.");
    return new string(dest, 0, destOffset);
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// pInput    - pointer to the source characters; only the range [start, end) is read.
// start     - index of the first character to process.
// end       - index one past the last character to process.
// component - the Uri component being normalized; passed to CheckIsReserved and used to
//             select the query-specific rules (component == UriComponents.Query).
//
// Returns the normalized text produced by the ValueStringBuilder.
//
// Notes on this implementation:
//  * After an out-of-range surrogate pair has been escaped, the low surrogate is skipped
//    so it is not escaped a second time as a lone surrogate.
//  * The scratch buffer for the UTF-8 bytes of an escaped character is stack-allocated
//    once, before the loop, instead of being heap-allocated for every escaped character.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    int size = end - start;
    ValueStringBuilder dest = new ValueStringBuilder(size);
    byte[]? bytes = null;

    // A single char or surrogate pair never needs more than 4 UTF-8 bytes.
    const int MaxNumberOfBytesEncoded = 4;
    byte* pEncodedBytes = stackalloc byte[MaxNumberOfBytesEncoded];

    int next = start;
    char ch;
    bool escape = false;
    bool surrogatePair = false;

    for (; next < end; ++next)
    {
        escape = false;
        surrogatePair = false;

        if ((ch = pInput[next]) == '%')
        {
            if (next + 2 < end)
            {
                ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                // Do not unescape a reserved char
                if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
                {
                    // keep as is
                    dest.Append(pInput[next++]);
                    dest.Append(pInput[next++]);
                    dest.Append(pInput[next]);
                    continue;
                }
                else if (ch <= '\x7F')
                {
                    Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                    // ASCII: emit the decoded character and skip the two hex digits
                    dest.Append(ch);
                    next += 2;
                    continue;
                }
                else
                {
                    // possibly utf8 encoded sequence of unicode

                    // check if safe to unescape according to Iri rules

                    Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

                    int startSeq = next;
                    int byteCount = 1;
                    // lazy initialization of max size, will reuse the array for next sequences
                    if ((object?)bytes == null)
                    {
                        bytes = new byte[end - next];
                    }

                    bytes[0] = (byte)ch;
                    next += 3;
                    while (next < end)
                    {
                        // Check on exit criterion
                        if ((ch = pInput[next]) != '%' || next + 2 >= end)
                        {
                            break;
                        }

                        // already made sure we have 3 characters in str
                        ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                        // invalid hex sequence ?
                        if (ch == Uri.c_DummyChar)
                        {
                            break;
                        }
                        // character is not part of a UTF-8 sequence ?
                        else if (ch < '\x80')
                        {
                            break;
                        }
                        else
                        {
                            // a UTF-8 sequence
                            bytes[byteCount++] = (byte)ch;
                            next += 3;
                        }

                        Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                    }
                    next--; // for loop will increment

                    // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
                    Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
                        Encoding.UTF8.CodePage,
                        new EncoderReplacementFallback(""),
                        new DecoderReplacementFallback(""));

                    char[] unescapedChars = new char[bytes.Length];
                    int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);

                    if (charCount != 0)
                    {
                        // If invalid sequences were present in the original escaped string, we need to
                        // copy the escaped versions of those sequences.
                        // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                        // rules.
                        UriHelper.MatchUTF8Sequence(ref dest, unescapedChars, charCount, bytes,
                            byteCount, component == UriComponents.Query, true);
                    }
                    else
                    {
                        // copy escaped sequence as is
                        for (int i = startSeq; i <= next; ++i)
                        {
                            dest.Append(pInput[i]);
                        }
                    }
                }
            }
            else
            {
                // '%' too close to the end to start a "%XY" triplet: copy it verbatim
                dest.Append(pInput[next]);
            }
        }
        else if (ch > '\x7f')
        {
            // unicode

            char ch2;

            if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
            {
                ch2 = pInput[next + 1];
                escape = !CheckIriUnicodeRange(ch, ch2, ref surrogatePair, component == UriComponents.Query);
                if (!escape)
                {
                    // copy the two chars
                    dest.Append(pInput[next++]);
                    dest.Append(pInput[next]);
                }
            }
            else
            {
                if (CheckIriUnicodeRange(ch, component == UriComponents.Query))
                {
                    if (!UriHelper.IsBidiControlCharacter(ch) || !UriParser.DontKeepUnicodeBidiFormattingCharacters)
                    {
                        // copy it
                        dest.Append(pInput[next]);
                    }
                }
                else
                {
                    // escape it
                    escape = true;
                }
            }
        }
        else
        {
            // just copy the character
            dest.Append(pInput[next]);
        }

        if (escape)
        {
            int encodedBytesCount = Encoding.UTF8.GetBytes(pInput + next, surrogatePair ? 2 : 1, pEncodedBytes, MaxNumberOfBytesEncoded);
            Debug.Assert(encodedBytesCount <= MaxNumberOfBytesEncoded, "UTF8 encoder should not exceed specified byteCount");

            for (int count = 0; count < encodedBytesCount; ++count)
            {
                UriHelper.EscapeAsciiChar((char)pEncodedBytes[count], ref dest);
            }

            if (surrogatePair)
            {
                // Both halves of the pair were just encoded together; step over the low
                // surrogate so the next iteration does not escape it again on its own.
                next++;
            }
        }
    }

    string result = dest.ToString();
    return result;
}
//
// Normalizes the characters in [start, end) of pInput for use inside the given Uri
// component: percent-encoded triplets are un-escaped where the checks below allow it,
// and non-ASCII characters rejected by CheckIriUnicodeRange are percent-encoded as UTF-8.
//
// pInput    - pointer to the source characters.
// start     - index of the first character to process.
// end       - index one past the last character to process.
// component - the Uri component being normalized; passed to CheckIsReserved and used to
//             select the query-specific rules (component == UriComponents.Query).
//
// Returns the normalized text produced by the ValueStringBuilder.
//
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
    bool isQuery = component == UriComponents.Query;

    ValueStringBuilder dest = new ValueStringBuilder(end - start);

    // Scratch storage for runs of escaped non-ASCII bytes; allocated on first use and reused.
    byte[]? utf8Run = null;

    // Holds the UTF-8 form of a single scalar value (at most four bytes).
    Span<byte> scalarUtf8 = stackalloc byte[4];

    int pos = start;
    while (pos < end)
    {
        char current = pInput[pos];

        if (current == '%')
        {
            // Too close to the end for a "%XY" triplet: copy the '%' through untouched.
            if (pos + 2 >= end)
            {
                dest.Append(pInput[pos]);
                pos++;
                continue;
            }

            char decoded = UriHelper.EscapedAscii(pInput[pos + 1], pInput[pos + 2]);

            // Triplets that must stay escaped are copied verbatim.
            if (decoded == Uri.c_DummyChar
                || decoded == '%'
                || CheckIsReserved(decoded, component)
                || UriHelper.IsNotSafeForUnescape(decoded))
            {
                dest.Append(pInput[pos]);
                dest.Append(pInput[pos + 1]);
                dest.Append(pInput[pos + 2]);
                pos += 3;
                continue;
            }

            // Plain ASCII value: emit the decoded character in place of the triplet.
            if (decoded <= '\x7F')
            {
                Debug.Assert(decoded < 0xFF, "Expecting ASCII character.");
                dest.Append(decoded);
                pos += 3;
                continue;
            }

            // Otherwise this may be the first byte of an escaped UTF-8 sequence. Gather
            // every directly following triplet whose value is 0x80 or above.
            Debug.Assert(decoded < 0xFF, "Expecting ASCII character.");

            int runStart = pos;
            int runLength = 1;

            utf8Run ??= new byte[end - pos];
            utf8Run[0] = (byte)decoded;
            pos += 3;

            while (pos < end && pInput[pos] == '%' && pos + 2 < end)
            {
                char continuation = UriHelper.EscapedAscii(pInput[pos + 1], pInput[pos + 2]);

                // Stop at an invalid hex pair or at a value that cannot belong to the run.
                if (continuation == Uri.c_DummyChar || continuation < '\x80')
                {
                    break;
                }

                utf8Run[runLength++] = (byte)continuation;
                pos += 3;

                Debug.Assert(continuation < 0xFF, "Expecting ASCII character.");
            }

            // Empty-string replacement fallbacks make the decoder drop invalid sequences
            // instead of substituting a replacement character.
            Encoding dropInvalidUtf8 = Encoding.GetEncoding(
                Encoding.UTF8.CodePage,
                new EncoderReplacementFallback(""),
                new DecoderReplacementFallback(""));

            char[] decodedChars = new char[utf8Run.Length];
            int decodedCount = dropInvalidUtf8.GetChars(utf8Run, 0, runLength, decodedChars, 0);

            if (decodedCount == 0)
            {
                // Nothing decoded: reproduce the escaped run exactly as it appeared.
                for (int i = runStart; i < pos; i++)
                {
                    dest.Append(pInput[i]);
                }
            }
            else
            {
                // Hand the decoded chars and the raw bytes to the helper, which appends to dest.
                UriHelper.MatchUTF8Sequence(ref dest, decodedChars, decodedCount, utf8Run,
                    runLength, isQuery, true);
            }
        }
        else if (current > '\x7f')
        {
            bool isPair = false;
            char low = '\0';
            bool keepAsIs;

            if (char.IsHighSurrogate(current) && pos + 1 < end)
            {
                low = pInput[pos + 1];
                keepAsIs = CheckIriUnicodeRange(current, low, ref isPair, isQuery);
            }
            else
            {
                keepAsIs = CheckIriUnicodeRange(current, isQuery);
            }

            if (keepAsIs)
            {
                dest.Append(current);
                if (isPair)
                {
                    dest.Append(low);
                }
            }
            else
            {
                // Percent-encode every UTF-8 byte of the scalar value.
                Rune scalar;
                if (isPair)
                {
                    scalar = new Rune(current, low);
                }
                else if (!Rune.TryCreate(current, out scalar))
                {
                    scalar = Rune.ReplacementChar;
                }

                int written = scalar.EncodeToUtf8(scalarUtf8);
                for (int k = 0; k < written; k++)
                {
                    UriHelper.EscapeAsciiChar(scalarUtf8[k], ref dest);
                }
            }

            // A recognized surrogate pair consumes two input characters.
            pos += isPair ? 2 : 1;
        }
        else
        {
            // ASCII other than '%': copy through.
            dest.Append(pInput[pos]);
            pos++;
        }
    }

    return dest.ToString();
}