Example #1
0
        /// <summary>
        /// Performs one pass-through step: pulls the next chunk of decoded characters from
        /// the input, writes it unchanged to the output and reports it as processed.
        /// </summary>
        /// <remarks>
        /// Does nothing once end of file has been recorded, or while the input cannot supply
        /// more data. Before the first non-empty chunk is written, an encoding output that
        /// asks to use the same code page as the input is given the input's encoding
        /// (UTF-8 when the input is not a decoding input). The output is flushed when the
        /// input reports end of file.
        /// </remarks>
        public void Run()
        {
            if (endOfFile)
            {
                return;
            }

            char[] chunk        = null;
            int    chunkStart   = 0;
            int    chunkCurrent = 0;
            int    chunkEnd     = 0;

            if (!input.ReadMore(ref chunk, ref chunkStart, ref chunkCurrent, ref chunkEnd))
            {
                // nothing can be decoded until the next input chunk arrives
                return;
            }

            if (input.EndOfFile)
            {
                endOfFile = true;
            }

            int count = chunkEnd - chunkStart;

            if (count != 0)
            {
                if (!gotAnyText)
                {
                    // first text seen: settle the output encoding before anything is written
                    var encodingOutput = output as ConverterEncodingOutput;

                    if (encodingOutput != null && encodingOutput.CodePageSameAsInput)
                    {
                        var decodingInput = input as ConverterDecodingInput;

                        encodingOutput.Encoding = decodingInput != null
                            ? decodingInput.Encoding
                            : Encoding.UTF8;
                    }

                    gotAnyText = true;
                }

                output.Write(chunk, chunkStart, count);

                input.ReportProcessed(count);
            }

            if (endOfFile)
            {
                output.Flush();
            }
        }
Example #2
0
        /// <summary>
        /// Scans the input buffer and builds the next text token.
        /// </summary>
        /// <remarks>
        /// If the previous token is still valid, the characters it covered are first reported
        /// to the input as processed and the token builder is reset. The method then loops,
        /// reading more input when fewer than <c>parseThreshold</c> characters are available,
        /// skipping or compacting away invalid characters, and splitting the text into runs
        /// (non-space, single space, NBSP; other whitespace is delegated to <c>ParseWhitespace</c>).
        /// </remarks>
        /// <returns>
        /// <c>TextTokenId.None</c> when <c>input.ReadMore</c> cannot supply more data yet;
        /// an empty <c>TextTokenId.EncodingChange</c> token when the decoding input flags an encoding change;
        /// an empty <c>TextTokenId.EndOfFile</c> token when end of file is reached with no input left and no token started;
        /// otherwise the id of the text token that was built.
        /// </returns>
        public TextTokenId Parse()
        {
            // ch / charClass describe the character at parseCurrent;
            // chT / charClassT are used for look-ahead and compaction.
            char      ch, chT;
            CharClass charClass, charClassT;
            bool      forceFlushToken;
            int       runStart;

            if (tokenBuilder.Valid)
            {
                // start the new token

                // everything covered by the previous token is now consumed
                input.ReportProcessed(parseCurrent - parseStart);
                parseStart = parseCurrent;

                tokenBuilder.Reset();

                InternalDebug.Assert(tokenBuilder.TotalLength == 0);
            }

            while (true)
            {
                InternalDebug.Assert(parseThreshold > 0);

                // try to read and decode more input data if necessary

                forceFlushToken = false;

                // fewer than parseThreshold characters remain between parseCurrent and parseEnd
                if (parseCurrent + parseThreshold > parseEnd)
                {
                    if (!endOfFile)
                    {
                        if (!input.ReadMore(ref parseBuffer, ref parseStart, ref parseCurrent, ref parseEnd))
                        {
                            // cannot decode more data until next input chunk is available

                            // we may have incomplete token at this point
                            InternalDebug.Assert(!tokenBuilder.Valid);

                            return(TextTokenId.None);
                        }

                        // NOTE: in case of success, ReadMore can move the token in the buffer and / or
                        // switch to a new buffer

                        tokenBuilder.BufferChanged(parseBuffer, parseStart);

                        var decodingInput = input as ConverterDecodingInput;

                        if (decodingInput != null && decodingInput.EncodingChanged)
                        {
                            // reset the flag as required by ConverterInput protocol
                            decodingInput.EncodingChanged = false;

                            // signal encoding change to the caller
                            return(tokenBuilder.MakeEmptyToken(TextTokenId.EncodingChange, decodingInput.Encoding.CodePage));
                        }

                        if (input.EndOfFile)
                        {
                            endOfFile = true;
                        }

                        if (!endOfFile && parseEnd - parseStart < input.MaxTokenSize)
                        {
                            // we have successfully read "something", ensure this something is above threshold
                            continue;
                        }
                    }

                    // end of file or token is too long, need to flush the token as is (split)
                    forceFlushToken = true;
                }

                // we should have read something unless this is EOF
                InternalDebug.Assert(parseEnd > parseCurrent || forceFlushToken);

                // compact, so that the next character (or parseThreshold next characters) are valid.

                // get the next input character

                // NOTE(review): parseCurrent may equal parseEnd here (forced flush), so this read
                // presumably relies on the buffer holding a readable sentinel at parseEnd - confirm
                // against the ConverterInput contract.
                ch        = parseBuffer[parseCurrent];
                charClass = ParseSupport.GetCharClass(ch);

                // slow path: the current character is invalid, or an earlier pass asked for
                // more than one character of look-ahead
                if (ParseSupport.InvalidUnicodeCharacter(charClass) || parseThreshold > 1)
                {
                    // skip leading invalid characters, stopping at parseEnd at the latest
                    while (ParseSupport.InvalidUnicodeCharacter(charClass) && parseCurrent < parseEnd)
                    {
                        ch        = parseBuffer[++parseCurrent];
                        charClass = ParseSupport.GetCharClass(ch);
                    }


                    if (parseThreshold > 1 && parseCurrent + 1 < parseEnd)
                    {
                        InternalDebug.Assert(parseCurrent == parseStart);
                        InternalDebug.Assert(!ParseSupport.InvalidUnicodeCharacter(ParseSupport.GetCharClass(parseBuffer[parseCurrent])));

                        // src scans forward, dst marks where the next valid character belongs;
                        // valid characters are moved left over invalid ones until parseThreshold
                        // contiguous valid characters follow parseCurrent or the data runs out
                        var src = parseCurrent + 1;
                        var dst = parseCurrent + 1;

                        while (src < parseEnd && dst < parseCurrent + parseThreshold)
                        {
                            chT        = parseBuffer[src];
                            charClassT = ParseSupport.GetCharClass(chT);

                            if (!ParseSupport.InvalidUnicodeCharacter(charClassT))
                            {
                                if (src != dst)
                                {
                                    InternalDebug.Assert(ParseSupport.InvalidUnicodeCharacter(ParseSupport.GetCharClass(parseBuffer[dst])));

                                    parseBuffer[dst] = chT;        // move source character
                                    parseBuffer[src] = '\0';       // replace source character with invalid (zero)
                                }

                                dst++;
                            }

                            src++;
                        }

                        // ran out of data before collecting parseThreshold valid characters:
                        // slide the (dst - parseCurrent) collected characters up against parseEnd
                        // (presumably so they stay contiguous with whatever is read next - confirm)
                        if (src == parseEnd && parseCurrent + parseThreshold > dst)
                        {
                            Array.Copy(parseBuffer, parseCurrent, parseBuffer, parseEnd - (dst - parseCurrent), dst - parseCurrent);

                            parseCurrent = parseEnd - (dst - parseCurrent);

                            // reporting all invalid characters consumed
                            input.ReportProcessed(parseCurrent - parseStart);
                            parseStart = parseCurrent;
                        }
                    }


                    if (parseCurrent + parseThreshold > parseEnd)
                    {
                        // we are still below threshold...

                        if (!forceFlushToken)
                        {
                            // go back and try to read more
                            continue;
                        }

                        // this is the end of file

                        if (parseCurrent == parseEnd && !tokenBuilder.IsStarted && endOfFile)
                        {
                            // EOF and token is empty, just return EOF token
                            break;
                        }

                        // this is the end of file, we cannot make it above threshold but still have some input data.
                    }

                    // reset the threshold to its default value
                    parseThreshold = 1;
                }

                // now parse the buffer content

                runStart = parseCurrent;

                InternalDebug.Assert(!tokenBuilder.IsStarted);

                tokenBuilder.StartText(runStart);

                // NOTE(review): the meaning of the literal 9 passed to PrepareToAddMoreRuns is not
                // visible here (presumably the number of run slots to reserve per pass) - confirm.
                while (tokenBuilder.PrepareToAddMoreRuns(9, runStart, RunKind.Text))
                {
                    // advance over URI-class text characters; there is no bounds check in this
                    // loop, so it presumably stops on a non-matching character at parseEnd - confirm
                    while (ParseSupport.TextUriCharacter(charClass))
                    {
                        ch        = parseBuffer[++parseCurrent];
                        charClass = ParseSupport.GetCharClass(ch);
                    }

                    if (ParseSupport.TextNonUriCharacter(charClass))
                    {
                        if (parseCurrent != runStart)
                        {
                            // we have nonempty NWSP run

                            AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                        }

                        runStart = parseCurrent;

                        // consume this character plus any NBSP-class characters directly after it
                        // NOTE(review): the loop condition tests NbspCharacter while the run is added
                        // as NonSpace, unlike the dedicated NBSP branch below - confirm this is intended.
                        do
                        {
                            ch        = parseBuffer[++parseCurrent];
                            charClass = ParseSupport.GetCharClass(ch);
                        }while (ParseSupport.NbspCharacter(charClass));

                        AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                    }
                    else if (ParseSupport.WhitespaceCharacter(charClass))
                    {
                        if (parseCurrent != runStart)
                        {
                            // we have nonempty NWSP run

                            AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                        }

                        runStart = parseCurrent;

                        if (ch == ' ')
                        {
                            // ordinary space

                            // peek at the following character to see whether this space stands alone
                            chT        = parseBuffer[parseCurrent + 1];
                            charClassT = ParseSupport.GetCharClass(chT);

                            if (!ParseSupport.WhitespaceCharacter(charClassT))
                            {
                                // add single space to text run

                                // the peeked character becomes the current one
                                ch        = chT;
                                charClass = charClassT;

                                parseCurrent++;

                                AddTextRun(RunTextType.Space, runStart, parseCurrent);

                                runStart = parseCurrent;

                                continue;
                            }
                        }

                        // this is a potentially collapsible whitespace, accumulate whitespace run(s)

                        ParseWhitespace(ch, charClass);

                        // ParseWhitespace is defined elsewhere; a threshold above 1 after the call
                        // is treated here as a request for more look-ahead
                        if (parseThreshold > 1)
                        {
                            // terminate the text parse loop to read more data
                            break;
                        }

                        runStart = parseCurrent;

                        // reload the current character from parseCurrent after ParseWhitespace
                        ch        = parseBuffer[parseCurrent];
                        charClass = ParseSupport.GetCharClass(ch);
                    }
                    else if (ParseSupport.NbspCharacter(charClass))
                    {
                        if (parseCurrent != runStart)
                        {
                            // flush the pending non-space run before the NBSP run starts
                            AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                        }

                        runStart = parseCurrent;

                        // consume the whole sequence of consecutive NBSP characters
                        do
                        {
                            ch        = parseBuffer[++parseCurrent];
                            charClass = ParseSupport.GetCharClass(ch);
                        }while (ParseSupport.NbspCharacter(charClass));

                        AddTextRun(RunTextType.Nbsp, runStart, parseCurrent);
                    }
                    else
                    {
                        InternalDebug.Assert(ParseSupport.InvalidUnicodeCharacter(charClass));

                        // finish the "non-whitespace" run

                        if (parseCurrent != runStart)
                        {
                            AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                        }

                        if (parseCurrent >= parseEnd)
                        {
                            // end of available input (EOB), flush the current text token
                            break;
                        }

                        // this is just an embedded invalid character, skip any such invalid
                        // characters and try to continue collecting text

                        do
                        {
                            ch        = parseBuffer[++parseCurrent];
                            charClass = ParseSupport.GetCharClass(ch);
                        }while (ParseSupport.InvalidUnicodeCharacter(charClass) && parseCurrent < parseEnd);
                    }

                    // prepare for a new run

                    runStart = parseCurrent;
                }

                if (token.IsEmpty)
                {
                    // text token is empty, we need more data

                    tokenBuilder.Reset();     // reset open text...

                    // reporting everything below parseCurrent consumed
                    input.ReportProcessed(parseCurrent - parseStart);
                    parseStart = parseCurrent;
                    continue;
                }

                // finish the current text token, return anything we have collected so far

                tokenBuilder.EndText();

                return((TextTokenId)token.TokenId);
            }

            // reached only via the EOF break above: no data left and no token started
            return(tokenBuilder.MakeEmptyToken(TextTokenId.EndOfFile));
        }