//
// Checks whether highSurr and lowSurr form a surrogate pair and, if they do,
// whether the supplementary code point they encode may appear unescaped in an IRI.
//
// surrogatePair is always overwritten: true when the two chars are a well-formed
// surrogate pair, false otherwise. The return value is true only for a well-formed
// pair whose code point lies in one of the accepted ranges:
//   U+10000..U+DFFFD  - planes 1-13, excluding the last two code points of each plane
//   U+E1000..U+EFFFD  - plane 14 from offset 0x1000
//   U+F0000..U+FFFFD and U+100000..U+10FFFD - planes 15 and 16, only when isQuery is
//                       true, because the IRI restrictions for the query differ
//
internal static bool CheckIriUnicodeRange(char highSurr, char lowSurr, ref bool surrogatePair, bool isQuery)
{
    surrogatePair = false;

    Debug.Assert(Char.IsHighSurrogate(highSurr));

    if (!Char.IsSurrogatePair(highSurr, lowSurr))
    {
        return false;
    }

    surrogatePair = true;

    // Work on the numeric code point instead of building a temporary string and
    // comparing it ordinally against range bounds; the ordering is the same and
    // no allocation is needed.
    int codePoint = Char.ConvertToUtf32(highSurr, lowSurr);
    int plane = codePoint >> 16;          // 1..16 for any surrogate pair
    int offset = codePoint & 0xFFFF;      // position inside the plane

    // The last two code points of every plane (xFFFE, xFFFF) are never allowed.
    if (offset > 0xFFFD)
    {
        return false;
    }

    if (plane <= 0xD)
    {
        return true;
    }

    if (plane == 0xE)
    {
        // Plane 14 is only accepted from U+E1000 upwards.
        return offset >= 0x1000;
    }

    // Planes 15 and 16 are accepted in the query component only.
    return isQuery;
}
/// <summary>
/// Returns the UTF-32 code point of the character, or surrogate pair, that starts at
/// <paramref name="index"/> in <paramref name="s"/>.
/// </summary>
/// <param name="s">The string to read from.</param>
/// <param name="index">Position of the character (or of the high surrogate of a pair).</param>
/// <returns>
/// The value of the char itself when it is not a surrogate; otherwise the result of the
/// two-char ConvertToUtf32 overload applied to the pair.
/// </returns>
/// <exception cref="ArgumentException">
/// The char at <paramref name="index"/> is a surrogate that does not start a well-formed
/// high/low pair (a low surrogate, a high surrogate at the end of the string, or a high
/// surrogate not followed by a low surrogate).
/// </exception>
public static int ConvertToUtf32(string s, int index)
{
    // Argument validation is delegated to CheckParameter (defined elsewhere).
    CheckParameter(s, index);

    char lead = s[index];
    if (!Char.IsSurrogate(lead))
    {
        return lead;
    }

    bool isLastChar = index == s.Length - 1;
    bool wellFormedPair = Char.IsHighSurrogate(lead) && !isLastChar && Char.IsLowSurrogate(s[index + 1]);
    if (!wellFormedPair)
    {
        throw new ArgumentException(String.Format("The string contains invalid surrogate pair character at {0}", index));
    }

    return ConvertToUtf32(lead, s[index + 1]);
}
/// <summary>
/// Returns the UTF-32 code point of the character, or surrogate pair, that starts at
/// <paramref name="index"/> in <paramref name="s"/>.
/// </summary>
/// <param name="s">The string to read from.</param>
/// <param name="index">Position of the character (or of the high surrogate of a pair).</param>
/// <returns>
/// The value of the char itself when it is not a surrogate; otherwise the result of the
/// two-char ConvertToUtf32 overload applied to the pair.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="index"/> is negative or not less than the length of <paramref name="s"/>.
/// </exception>
/// <exception cref="ArgumentException">
/// The char at <paramref name="index"/> is a surrogate that does not start a well-formed
/// high/low pair (a low surrogate, a high surrogate at the end of the string, or a high
/// surrogate not followed by a low surrogate).
/// </exception>
public static int ConvertToUtf32(string s, int index)
{
    if (s == null)
    {
        throw new ArgumentNullException("s");
    }

    if (index < 0 || index >= s.Length)
    {
        throw new ArgumentOutOfRangeException("index");
    }

    char lead = s[index];
    if (!Char.IsSurrogate(lead))
    {
        // Not a surrogate: the code point is the char value itself.
        return lead;
    }

    // A surrogate is only acceptable here as the high half of a complete pair.
    bool isLastChar = index == s.Length - 1;
    bool wellFormedPair = Char.IsHighSurrogate(lead) && !isLastChar && Char.IsLowSurrogate(s[index + 1]);
    if (!wellFormedPair)
    {
        throw new ArgumentException(String.Format("The string contains invalid surrogate pair character at {0}", index));
    }

    return ConvertToUtf32(lead, s[index + 1]);
}
//
// Need to check for invalid utf sequences that may not have given any chars.
// We got the unescaped chars, we then reencode them and match off the bytes
// to get the invalid sequence bytes that we just copy off
//
// For each decoded char (or surrogate pair) in unescapedChars this method re-encodes it
// as UTF-8 and looks for those bytes in 'bytes', starting at a running position. Bytes
// that are passed over before a match are handed to EscapeAsciiChar; matched chars are
// either written through pDest or, when iriParsing is set and the char is outside the
// IRI range, handed back to EscapeAsciiChar byte by byte.
//
// Parameters:
//   pDest          - pointer through which decoded chars are written at destOffset.
//                    NOTE(review): presumably addresses the pinned storage of 'dest' - confirm at call sites.
//   dest           - array passed to EscapeAsciiChar and used by the overrun assertions.
//   destOffset     - current write position; advanced by every write made here.
//   unescapedChars - chars decoded from 'bytes'; only the first charCount are visited.
//   bytes          - original byte sequence; only the first byteCount are considered.
//   isQuery        - forwarded to IriHelper.CheckIriUnicodeRange.
//   iriParsing     - when true, applies the IRI range and bidi-control filtering below.
//
internal static unsafe void MatchUTF8Sequence(char *pDest, char[] dest, ref int destOffset, char[] unescapedChars, int charCount, byte[] bytes, int byteCount, bool isQuery, bool iriParsing)
{
    // Index of the next unconsumed entry in 'bytes'.
    int count = 0;
    fixed(char *unescapedCharsPtr = unescapedChars)
    {
        for (int j = 0; j < charCount; ++j)
        {
            // A high surrogate is encoded together with the char that follows it.
            // NOTE(review): assumes a high surrogate is never the last element of unescapedChars - confirm.
            bool isHighSurr = Char.IsHighSurrogate(unescapedCharsPtr[j]);
            byte[] encodedBytes = Encoding.UTF8.GetBytes(unescapedChars, j, isHighSurr ? 2 : 1);
            int encodedBytesLength = encodedBytes.Length;
            // we have to keep unicode chars outside Iri range escaped
            bool inIriRange = false;
            if (iriParsing)
            {
                if (!isHighSurr)
                {
                    inIriRange = IriHelper.CheckIriUnicodeRange(unescapedChars[j], isQuery);
                }
                else
                {
                    bool surrPair = false;
                    inIriRange = IriHelper.CheckIriUnicodeRange(unescapedChars[j], unescapedChars[j + 1], ref surrPair, isQuery);
                }
            }
            // Loop until the encoded form of this char has been located in 'bytes'.
            while (true)
            {
                // Escape any invalid bytes that were before this character
                while (bytes[count] != encodedBytes[0])
                {
                    Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                    EscapeAsciiChar((char)bytes[count++], dest, ref destOffset);
                }
                // check if all bytes match
                bool allBytesMatch = true;
                // k is declared outside the loop because the mismatch branch below needs it.
                int k = 0;
                for (; k < encodedBytesLength; ++k)
                {
                    if (bytes[count + k] != encodedBytes[k])
                    {
                        allBytesMatch = false;
                        break;
                    }
                }
                if (allBytesMatch)
                {
                    count += encodedBytesLength;
                    if (iriParsing)
                    {
                        if (!inIriRange)
                        {
                            // need to keep chars not allowed as escaped
                            for (int l = 0; l < encodedBytes.Length; ++l)
                            {
                                Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                                EscapeAsciiChar((char)encodedBytes[l], dest, ref destOffset);
                            }
                        }
                        // An in-range char for which Uri.IsBidiControlCharacter returns true takes
                        // neither branch: its bytes were consumed above but nothing is written.
                        else if (!Uri.IsBidiControlCharacter(unescapedCharsPtr[j]))
                        {
                            //copy chars
                            Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                            pDest[destOffset++] = unescapedCharsPtr[j];
                            if (isHighSurr)
                            {
                                Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                                pDest[destOffset++] = unescapedCharsPtr[j + 1];
                            }
                        }
                    }
                    else
                    {
                        //copy chars
                        Debug.Assert(dest.Length > destOffset);
                        pDest[destOffset++] = unescapedCharsPtr[j];
                        if (isHighSurr)
                        {
                            Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                            pDest[destOffset++] = unescapedCharsPtr[j + 1];
                        }
                    }
                    break; // break out of while (true) since we've matched this char bytes
                }
                else
                {
                    // copy bytes till place where bytes dont match
                    // (the k bytes that did match are escaped and consumed; the search then restarts)
                    for (int l = 0; l < k; ++l)
                    {
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        EscapeAsciiChar((char)bytes[count++], dest, ref destOffset);
                    }
                }
            }
            // The char after a high surrogate was handled together with it; skip it.
            if (isHighSurr)
            {
                j++;
            }
        }
    }
    // Include any trailing invalid sequences
    while (count < byteCount)
    {
        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
        EscapeAsciiChar((char)bytes[count++], dest, ref destOffset);
    }
}
/// <summary>
/// Extension-method form of <see cref="System.Char.IsHighSurrogate(char)"/>.
/// </summary>
/// <param name="c">The character to test.</param>
/// <returns>Whatever <see cref="System.Char.IsHighSurrogate(char)"/> reports for <paramref name="c"/>.</returns>
public static Boolean IsHighSurrogate(this Char c)
{
    bool isHighSurrogate = char.IsHighSurrogate(c);
    return isHighSurrogate;
}
/// <summary>
/// Reports whether a character is a high surrogate, i.e. the leading half of a
/// UTF-16 surrogate pair.
/// </summary>
/// <param name="Char">The character to test.</param>
/// <returns>
/// true when <paramref name="Char"/> lies in the range U+D800 through U+DBFF;
/// otherwise, false.
/// </returns>
public static Boolean IsHighSurrogate(this Char Char)
{
    // The parameter shares its name with the System.Char type; the keyword alias
    // makes it explicit that the static framework method is being called.
    return char.IsHighSurrogate(Char);
}
//
// IRI normalization for strings containing characters that are not allowed or
// escaped characters that should be unescaped in the context of the specified Uri component.
//
// Reads pInput[start..end) and builds a new string in which:
//   - %XX triplets that decode to '%', an invalid value, a char reserved for 'component',
//     or a char that is not safe to unescape are copied unchanged;
//   - %XX triplets that decode to other ASCII chars are replaced by that char;
//   - runs of %XX triplets with values >= 0x80 are decoded as UTF-8 and passed to
//     UriHelper.MatchUTF8Sequence, or copied unchanged when nothing could be decoded;
//   - literal non-ASCII chars are copied when CheckIriUnicodeRange accepts them and
//     percent-encoded (as UTF-8) otherwise;
//   - everything else is copied as is.
//
// The working buffer is pinned with a GCHandle for pointer access; the handle is released
// in a finally block so it is not leaked when an exception escapes.
//
internal static unsafe string EscapeUnescapeIri(char *pInput, int start, int end, UriComponents component)
{
    char[] dest = new char[end - start];
    byte[] bytes = null;

    // Created on first use and then reused for every escaped UTF-8 run in this call.
    Encoding noFallbackCharUTF8 = null;

    const int percentEncodingLen = 3; // Escaped UTF-8 will take 3 chars: %AB.
    const int bufferCapacityIncrease = 30 * percentEncodingLen;
    int bufferRemaining = 0;

    int next = start;
    int destOffset = 0;
    char ch;
    bool escape = false;
    bool surrogatePair = false;

    // Pin the array to do pointer accesses
    GCHandle destHandle = GCHandle.Alloc(dest, GCHandleType.Pinned);
    try
    {
        char *pDest = (char *)destHandle.AddrOfPinnedObject();

        for (; next < end; ++next)
        {
            escape = false;
            surrogatePair = false;

            if ((ch = pInput[next]) == '%')
            {
                if (next + 2 < end)
                {
                    ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                    // Do not unescape a reserved char
                    if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
                    {
                        // keep as is
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pInput[next++];
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pInput[next++];
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pInput[next];
                        continue;
                    }
                    else if (ch <= '\x7F')
                    {
                        Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        //ASCII
                        pDest[destOffset++] = ch;
                        next += 2;
                        continue;
                    }
                    else
                    {
                        // possibly utf8 encoded sequence of unicode

                        // check if safe to unescape according to Iri rules

                        Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

                        int startSeq = next;
                        int byteCount = 1;
                        // lazy initialization of max size, will reuse the array for next sequences
                        if ((object)bytes == null)
                        {
                            bytes = new byte[end - next];
                        }

                        bytes[0] = (byte)ch;
                        next += 3;
                        while (next < end)
                        {
                            // Check on exit criterion
                            if ((ch = pInput[next]) != '%' || next + 2 >= end)
                            {
                                break;
                            }

                            // already made sure we have 3 characters in str
                            ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                            //invalid hex sequence ?
                            if (ch == Uri.c_DummyChar)
                            {
                                break;
                            }
                            // character is not part of a UTF-8 sequence ?
                            else if (ch < '\x80')
                            {
                                break;
                            }
                            else
                            {
                                //a UTF-8 sequence
                                bytes[byteCount++] = (byte)ch;
                                next += 3;
                            }

                            Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                        }
                        next--; // for loop will increment

                        // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
                        if (noFallbackCharUTF8 == null)
                        {
                            noFallbackCharUTF8 = (Encoding)Encoding.UTF8.Clone();
                            noFallbackCharUTF8.EncoderFallback = new EncoderReplacementFallback("");
                            noFallbackCharUTF8.DecoderFallback = new DecoderReplacementFallback("");
                        }

                        char[] unescapedChars = new char[bytes.Length];
                        int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);

                        if (charCount != 0)
                        {
                            // If invalid sequences were present in the original escaped string, we need to
                            // copy the escaped versions of those sequences.
                            // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                            // rules.
                            UriHelper.MatchUTF8Sequence(pDest, dest, ref destOffset, unescapedChars, charCount, bytes,
                                byteCount, component == UriComponents.Query, true);
                        }
                        else
                        {
                            // copy escaped sequence as is
                            for (int i = startSeq; i <= next; ++i)
                            {
                                Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                                pDest[destOffset++] = pInput[i];
                            }
                        }
                    }
                }
                else
                {
                    // '%' too close to the end to start a %XX triplet: copy it literally.
                    Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                    pDest[destOffset++] = pInput[next];
                }
            }
            else if (ch > '\x7f')
            {
                // unicode

                char ch2;

                if ((Char.IsHighSurrogate(ch)) && (next + 1 < end))
                {
                    ch2 = pInput[next + 1];
                    escape = !CheckIriUnicodeRange(ch, ch2, ref surrogatePair, component == UriComponents.Query);
                    if (!escape)
                    {
                        // copy the two chars
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pInput[next++];
                        Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                        pDest[destOffset++] = pInput[next];
                    }
                }
                else
                {
                    if (CheckIriUnicodeRange(ch, component == UriComponents.Query))
                    {
                        if (!Uri.IsBidiControlCharacter(ch) || !UriParser.DontKeepUnicodeBidiFormattingCharacters)
                        {
                            // copy it
                            Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                            pDest[destOffset++] = pInput[next];
                        }
                    }
                    else
                    {
                        // escape it
                        escape = true;
                    }
                }
            }
            else
            {
                // just copy the character
                Debug.Assert(dest.Length > destOffset, "Buffer overrun detected");
                pDest[destOffset++] = pInput[next];
            }

            if (escape)
            {
                const int maxNumberOfBytesEncoded = 4;

                if (bufferRemaining < maxNumberOfBytesEncoded * percentEncodingLen)
                {
                    int newBufferLength = 0;

                    checked
                    {
                        // may need more memory since we didn't anticipate escaping
                        newBufferLength = dest.Length + bufferCapacityIncrease;
                        bufferRemaining += bufferCapacityIncrease;
                    }

                    char[] newDest = new char[newBufferLength];

                    fixed(char *pNewDest = newDest)
                    {
#if !UT_PUBLIC_DEPENDS
                        Buffer.Memcpy((byte *)pNewDest, (byte *)pDest, destOffset * sizeof(char));
#else
                        for (int idx = 0; idx < destOffset; idx++)
                        {
                            pNewDest[idx] = pDest[idx];
                        }
#endif
                    }

                    if (destHandle.IsAllocated)
                    {
                        destHandle.Free();
                    }

                    dest = newDest;

                    // re-pin new dest[] array
                    destHandle = GCHandle.Alloc(dest, GCHandleType.Pinned);
                    pDest = (char *)destHandle.AddrOfPinnedObject();
                }

                byte[] encodedBytes = new byte[maxNumberOfBytesEncoded];
                fixed(byte *pEncodedBytes = encodedBytes)
                {
                    int encodedBytesCount = Encoding.UTF8.GetBytes(pInput + next, surrogatePair ? 2 : 1, pEncodedBytes, maxNumberOfBytesEncoded);
                    Debug.Assert(encodedBytesCount <= maxNumberOfBytesEncoded, "UTF8 encoder should not exceed specified byteCount");

                    bufferRemaining -= encodedBytesCount * percentEncodingLen;

                    for (int count = 0; count < encodedBytesCount; ++count)
                    {
                        UriHelper.EscapeAsciiChar((char)encodedBytes[count], dest, ref destOffset);
                    }
                }

                if (surrogatePair)
                {
                    // Both halves of the pair were encoded together just above. Step over the
                    // low surrogate so the next iteration does not process it a second time
                    // as a stand-alone char.
                    next++;
                }
            }
        }
    }
    finally
    {
        // Always unpin, including when an exception is propagating.
        if (destHandle.IsAllocated)
        {
            destHandle.Free();
        }
    }

    Debug.Assert(destOffset <= dest.Length, "Buffer overrun detected");
    return new string(dest, 0, destOffset);
}