Esempio n. 1
0
        //
        // IRI normalization for strings containing characters that are not allowed or
        // escaped characters that should be unescaped in the context of the specified Uri component.
        //
        // Walks pInput[start..end) once (end is exclusive) and builds the result as follows:
        //   - A "%XX" escape that is not valid hex, or that decodes to '%' or to a char rejected
        //     by CheckIsReserved / UriHelper.IsNotSafeForUnescape, is copied unchanged.
        //   - A "%XX" escape that decodes to any other value <= 0x7F is replaced by that char.
        //   - A run of "%XX" escapes with values >= 0x80 is decoded as UTF-8 and handed to
        //     UriHelper.MatchUTF8Sequence, which writes the decoded and/or re-escaped output.
        //   - A raw char above 0x7F is copied when CheckIriUnicodeRange accepts it (except that
        //     a single char flagged by UriHelper.IsBidiControlCharacter is dropped) and is
        //     percent-encoded as UTF-8 otherwise.
        //   - Any other char is copied unchanged.
        //
        // Returns the normalized text as a new string.
        //
        internal static unsafe string EscapeUnescapeIri(char *pInput, int start, int end, UriComponents component)
        {
            char[]   dest               = new char[end - start];
            byte[]   bytes              = null;
            Encoding noFallbackCharUTF8 = null;

            // Pin the array to do pointer accesses
            GCHandle destHandle = GCHandle.Alloc(dest, GCHandleType.Pinned);

            // The pin must be released on every exit path, including exceptions thrown by the
            // checked arithmetic or the allocations below, hence the try/finally.
            try
            {
                char *pDest = (char *)destHandle.AddrOfPinnedObject();

                const int percentEncodingLen     = 3; // Escaped UTF-8 will take 3 chars: %AB.
                const int bufferCapacityIncrease = 30 * percentEncodingLen;
                int       bufferRemaining        = 0;

                int  destOffset = 0;
                char ch;

                for (int next = start; next < end; ++next)
                {
                    bool escape        = false;
                    bool surrogatePair = false;

                    if ((ch = pInput[next]) == '%')
                    {
                        if (next + 2 < end)
                        {
                            ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                            // Do not unescape a reserved char
                            if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
                            {
                                // keep as is
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                pDest[destOffset++] = pInput[next++];
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                pDest[destOffset++] = pInput[next++];
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                pDest[destOffset++] = pInput[next];
                                continue;
                            }
                            else if (ch <= '\x7F')
                            {
                                Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                //ASCII
                                pDest[destOffset++] = ch;
                                next += 2;
                                continue;
                            }
                            else
                            {
                                // possibly utf8 encoded sequence of unicode

                                // check if safe to unescape according to Iri rules

                                // Two hex digits can legitimately decode to 0xFF, so the bound is inclusive.
                                Debug.Assert(ch <= 0xFF, "Expecting ASCII character.");

                                int startSeq  = next;
                                int byteCount = 1;
                                // lazy initialization of max size, will reuse the array for next sequences
                                if ((object)bytes == null)
                                {
                                    bytes = new byte[end - next];
                                }

                                bytes[0] = (byte)ch;
                                next    += 3;
                                while (next < end)
                                {
                                    // Check on exit criterion
                                    if ((ch = pInput[next]) != '%' || next + 2 >= end)
                                    {
                                        break;
                                    }

                                    // already made sure we have 3 characters in str
                                    ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                                    //invalid hex sequence ?
                                    if (ch == Uri.c_DummyChar)
                                    {
                                        break;
                                    }
                                    // character is not part of a UTF-8 sequence ?
                                    else if (ch < '\x80')
                                    {
                                        break;
                                    }
                                    else
                                    {
                                        //a UTF-8 sequence
                                        bytes[byteCount++] = (byte)ch;
                                        next += 3;
                                    }

                                    Debug.Assert(ch <= 0xFF, "Expecting ASCII character.");
                                }
                                next--; // for loop will increment


                                // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
                                // The encoding object is loop-invariant, so it is built once and reused.
                                if (noFallbackCharUTF8 == null)
                                {
                                    noFallbackCharUTF8 = Encoding.GetEncoding(
                                        Encoding.UTF8.CodePage,
                                        new EncoderReplacementFallback(""),
                                        new DecoderReplacementFallback(""));
                                }

                                char[] unescapedChars = new char[bytes.Length];
                                int    charCount      = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);


                                if (charCount != 0)
                                {
                                    // If invalid sequences were present in the original escaped string, we need to
                                    // copy the escaped versions of those sequences.
                                    // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                                    // rules.
                                    UriHelper.MatchUTF8Sequence(pDest, dest, ref destOffset, unescapedChars, charCount, bytes,
                                                                byteCount, component == UriComponents.Query, true);
                                }
                                else
                                {
                                    // copy escaped sequence as is
                                    for (int i = startSeq; i <= next; ++i)
                                    {
                                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                        pDest[destOffset++] = pInput[i];
                                    }
                                }
                            }
                        }
                        else
                        {
                            // Fewer than two chars follow the '%', so it cannot be an escape: copy it.
                            Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                            pDest[destOffset++] = pInput[next];
                        }
                    }
                    else if (ch > '\x7f')
                    {
                        // unicode

                        if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
                        {
                            char ch2 = pInput[next + 1];
                            escape = !CheckIriUnicodeRange(ch, ch2, ref surrogatePair, component == UriComponents.Query);
                            if (!escape)
                            {
                                // copy the two chars
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                pDest[destOffset++] = pInput[next++];
                                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                pDest[destOffset++] = pInput[next];
                            }
                        }
                        else
                        {
                            if (CheckIriUnicodeRange(ch, component == UriComponents.Query))
                            {
                                if (!UriHelper.IsBidiControlCharacter(ch))
                                {
                                    // copy it
                                    Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                                    pDest[destOffset++] = pInput[next];
                                }
                            }
                            else
                            {
                                // escape it
                                escape = true;
                            }
                        }
                    }
                    else
                    {
                        // just copy the character
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pInput[next];
                    }

                    if (escape)
                    {
                        const int MaxNumberOfBytesEncoded = 4;

                        // dest was sized for a 1:1 copy; escaping expands the text, so make sure
                        // there is room for the worst case (4 bytes -> 12 chars) before writing.
                        if (bufferRemaining < MaxNumberOfBytesEncoded * percentEncodingLen)
                        {
                            int newBufferLength = 0;

                            checked
                            {
                                // may need more memory since we didn't anticipate escaping
                                newBufferLength  = dest.Length + bufferCapacityIncrease;
                                bufferRemaining += bufferCapacityIncrease;
                            }

                            char[] newDest = new char[newBufferLength];

                            fixed(char *pNewDest = newDest)
                            {
                                Buffer.MemoryCopy((byte *)pDest, (byte *)pNewDest, newBufferLength * sizeof(char), destOffset * sizeof(char));
                            }

                            if (destHandle.IsAllocated)
                            {
                                destHandle.Free();
                            }

                            dest = newDest;

                            // re-pin new dest[] array
                            destHandle = GCHandle.Alloc(dest, GCHandleType.Pinned);
                            pDest      = (char *)destHandle.AddrOfPinnedObject();
                        }

                        byte[] encodedBytes = new byte[MaxNumberOfBytesEncoded];
                        fixed(byte *pEncodedBytes = &encodedBytes[0])
                        {
                            int encodedBytesCount = Encoding.UTF8.GetBytes(pInput + next, surrogatePair ? 2 : 1, pEncodedBytes, MaxNumberOfBytesEncoded);

                            Debug.Assert(encodedBytesCount <= MaxNumberOfBytesEncoded, "UTF8 encoder should not exceed specified byteCount");

                            bufferRemaining -= encodedBytesCount * percentEncodingLen;

                            for (int count = 0; count < encodedBytesCount; ++count)
                            {
                                UriHelper.EscapeAsciiChar((char)encodedBytes[count], dest, ref destOffset);
                            }
                        }

                        // Both halves of the pair were encoded together above. Step over the low
                        // surrogate so the next iteration does not escape it a second time on its own.
                        if (surrogatePair)
                        {
                            next++;
                        }
                    }
                }

                Debug.Assert(destOffset <= dest.Length, "Destination length met or exceeded destination offset.");
                return(new string(dest, 0, destOffset));
            }
            finally
            {
                if (destHandle.IsAllocated)
                {
                    destHandle.Free();
                }
            }
        }
Esempio n. 2
0
        //
        // Reconciles the chars decoded from a run of escaped bytes with the bytes themselves.
        // The decoder may have produced no chars for invalid UTF-8 sequences, so every decoded
        // char (or surrogate pair) is re-encoded and located in the original bytes. Bytes that
        // do not line up with a decoded char are written back to dest in escaped form.
        //
        // When iriParsing is set, a decoded char outside the IRI range is kept escaped and a
        // char flagged by UriHelper.IsBidiControlCharacter is dropped; otherwise the decoded
        // char(s) are copied to pDest. destOffset is advanced past everything written.
        //
        internal static unsafe void MatchUTF8Sequence(char *pDest, char[] dest, ref int destOffset, char[] unescapedChars,
                                                      int charCount, byte[] bytes, int byteCount, bool isQuery, bool iriParsing)
        {
            // Index of the next not-yet-consumed byte in the original escaped run.
            int bytePos = 0;

            fixed(char *pChars = unescapedChars)
            {
                int charPos = 0;
                while (charPos < charCount)
                {
                    bool startsPair = char.IsHighSurrogate(pChars[charPos]);
                    int  unitLength = startsPair ? 2 : 1;

                    // UTF-8 form of the current char / surrogate pair; this is what we look for in bytes.
                    byte[] expected = Encoding.UTF8.GetBytes(unescapedChars, charPos, unitLength);

                    // we have to keep unicode chars outside Iri range escaped
                    bool allowedByIri = false;
                    if (iriParsing)
                    {
                        if (startsPair)
                        {
                            bool ignoredPairFlag = false;
                            allowedByIri = IriHelper.CheckIriUnicodeRange(unescapedChars[charPos], unescapedChars[charPos + 1],
                                                                          ref ignoredPairFlag, isQuery);
                        }
                        else
                        {
                            allowedByIri = IriHelper.CheckIriUnicodeRange(unescapedChars[charPos], isQuery);
                        }
                    }

                    // Advance through bytes until the full encoding of the current char is found,
                    // escaping everything that has to be skipped on the way.
                    for (;;)
                    {
                        // Escape any invalid bytes that were before this character
                        while (bytes[bytePos] != expected[0])
                        {
                            Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                            EscapeAsciiChar((char)bytes[bytePos++], dest, ref destOffset);
                        }

                        // Count how many leading bytes agree with the expected encoding.
                        int matched = 0;
                        while (matched < expected.Length && bytes[bytePos + matched] == expected[matched])
                        {
                            matched++;
                        }

                        if (matched == expected.Length)
                        {
                            // all bytes match
                            break;
                        }

                        // Only a prefix matched: escape that prefix and resume the search after it.
                        for (int remaining = matched; remaining > 0; --remaining)
                        {
                            Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                            EscapeAsciiChar((char)bytes[bytePos++], dest, ref destOffset);
                        }
                    }

                    bytePos += expected.Length;

                    if (iriParsing && !allowedByIri)
                    {
                        // need to keep chars not allowed as escaped
                        foreach (byte encodedByte in expected)
                        {
                            Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                            EscapeAsciiChar((char)encodedByte, dest, ref destOffset);
                        }
                    }
                    else if (!iriParsing || !UriHelper.IsBidiControlCharacter(pChars[charPos]))
                    {
                        //copy chars
                        Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                        pDest[destOffset++] = pChars[charPos];

                        if (startsPair)
                        {
                            Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                            pDest[destOffset++] = pChars[charPos + 1];
                        }
                    }

                    // A surrogate pair occupies two slots in unescapedChars.
                    charPos += unitLength;
                }
            }

            // Include any trailing invalid sequences
            for (; bytePos < byteCount; ++bytePos)
            {
                Debug.Assert(dest.Length > destOffset, "Destination length exceeded destination offset.");
                EscapeAsciiChar((char)bytes[bytePos], dest, ref destOffset);
            }
        }
Esempio n. 3
0
        //
        // IRI normalization for strings containing characters that are not allowed or
        // escaped characters that should be unescaped in the context of the specified Uri component.
        //
        // Walks pInput[start..end) once (end is exclusive) and appends to a ValueStringBuilder:
        //   - A "%XX" escape that is not valid hex, or that decodes to '%' or to a char rejected
        //     by CheckIsReserved / UriHelper.IsNotSafeForUnescape, is copied unchanged.
        //   - A "%XX" escape that decodes to any other value <= 0x7F is replaced by that char.
        //   - A run of "%XX" escapes with values >= 0x80 is decoded as UTF-8 and handed to
        //     UriHelper.MatchUTF8Sequence, which appends the decoded and/or re-escaped output.
        //   - A raw char above 0x7F is copied when CheckIriUnicodeRange accepts it (a single char
        //     flagged by UriHelper.IsBidiControlCharacter is dropped when
        //     UriParser.DontKeepUnicodeBidiFormattingCharacters is set) and is percent-encoded
        //     as UTF-8 otherwise.
        //   - Any other char is copied unchanged.
        //
        // Returns the normalized text as a new string.
        //
        internal static unsafe string EscapeUnescapeIri(char *pInput, int start, int end, UriComponents component)
        {
            int size = end - start;
            ValueStringBuilder dest = new ValueStringBuilder(size);

            byte[]?   bytes              = null;
            Encoding? noFallbackCharUTF8 = null;

            char ch;

            for (int next = start; next < end; ++next)
            {
                bool escape        = false;
                bool surrogatePair = false;

                if ((ch = pInput[next]) == '%')
                {
                    if (next + 2 < end)
                    {
                        ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                        // Do not unescape a reserved char
                        if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
                        {
                            // keep as is
                            dest.Append(pInput[next++]);
                            dest.Append(pInput[next++]);
                            dest.Append(pInput[next]);
                            continue;
                        }
                        else if (ch <= '\x7F')
                        {
                            Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
                            //ASCII
                            dest.Append(ch);
                            next += 2;
                            continue;
                        }
                        else
                        {
                            // possibly utf8 encoded sequence of unicode

                            // check if safe to unescape according to Iri rules

                            // Two hex digits can legitimately decode to 0xFF, so the bound is inclusive.
                            Debug.Assert(ch <= 0xFF, "Expecting ASCII character.");

                            int startSeq  = next;
                            int byteCount = 1;
                            // lazy initialization of max size, will reuse the array for next sequences
                            bytes ??= new byte[end - next];

                            bytes[0] = (byte)ch;
                            next    += 3;
                            while (next < end)
                            {
                                // Check on exit criterion
                                if ((ch = pInput[next]) != '%' || next + 2 >= end)
                                {
                                    break;
                                }

                                // already made sure we have 3 characters in str
                                ch = UriHelper.EscapedAscii(pInput[next + 1], pInput[next + 2]);

                                //invalid hex sequence ?
                                if (ch == Uri.c_DummyChar)
                                {
                                    break;
                                }
                                // character is not part of a UTF-8 sequence ?
                                else if (ch < '\x80')
                                {
                                    break;
                                }
                                else
                                {
                                    //a UTF-8 sequence
                                    bytes[byteCount++] = (byte)ch;
                                    next += 3;
                                }

                                Debug.Assert(ch <= 0xFF, "Expecting ASCII character.");
                            }
                            next--; // for loop will increment


                            // Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
                            // The encoding object is loop-invariant, so it is built once and reused.
                            noFallbackCharUTF8 ??= Encoding.GetEncoding(
                                Encoding.UTF8.CodePage,
                                new EncoderReplacementFallback(""),
                                new DecoderReplacementFallback(""));

                            char[] unescapedChars = new char[bytes.Length];
                            int    charCount      = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);


                            if (charCount != 0)
                            {
                                // If invalid sequences were present in the original escaped string, we need to
                                // copy the escaped versions of those sequences.
                                // Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
                                // rules.
                                UriHelper.MatchUTF8Sequence(ref dest, unescapedChars, charCount, bytes,
                                                            byteCount, component == UriComponents.Query, true);
                            }
                            else
                            {
                                // copy escaped sequence as is
                                for (int i = startSeq; i <= next; ++i)
                                {
                                    dest.Append(pInput[i]);
                                }
                            }
                        }
                    }
                    else
                    {
                        // Fewer than two chars follow the '%', so it cannot be an escape: copy it.
                        dest.Append(pInput[next]);
                    }
                }
                else if (ch > '\x7f')
                {
                    // unicode

                    if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
                    {
                        char ch2 = pInput[next + 1];
                        escape = !CheckIriUnicodeRange(ch, ch2, ref surrogatePair, component == UriComponents.Query);
                        if (!escape)
                        {
                            // copy the two chars
                            dest.Append(pInput[next++]);
                            dest.Append(pInput[next]);
                        }
                    }
                    else
                    {
                        if (CheckIriUnicodeRange(ch, component == UriComponents.Query))
                        {
                            if (!UriHelper.IsBidiControlCharacter(ch) || !UriParser.DontKeepUnicodeBidiFormattingCharacters)
                            {
                                // copy it
                                dest.Append(pInput[next]);
                            }
                        }
                        else
                        {
                            // escape it
                            escape = true;
                        }
                    }
                }
                else
                {
                    // just copy the character
                    dest.Append(pInput[next]);
                }

                if (escape)
                {
                    const int MaxNumberOfBytesEncoded = 4;

                    byte[] encodedBytes = new byte[MaxNumberOfBytesEncoded];
                    fixed(byte *pEncodedBytes = &encodedBytes[0])
                    {
                        int encodedBytesCount = Encoding.UTF8.GetBytes(pInput + next, surrogatePair ? 2 : 1, pEncodedBytes, MaxNumberOfBytesEncoded);

                        Debug.Assert(encodedBytesCount <= MaxNumberOfBytesEncoded, "UTF8 encoder should not exceed specified byteCount");

                        for (int count = 0; count < encodedBytesCount; ++count)
                        {
                            UriHelper.EscapeAsciiChar((char)encodedBytes[count], ref dest);
                        }
                    }

                    // Both halves of the pair were encoded together above. Step over the low
                    // surrogate so the next iteration does not escape it a second time on its own.
                    if (surrogatePair)
                    {
                        next++;
                    }
                }
            }

            string result = dest.ToString();

            return(result);
        }