Example #1
0
        /**
         * Does the actual parsing. Perform this immediately
         * after creating the parser object.
         *
         * Calls doc.StartDocument(), then reads the reader one character at a
         * time and feeds each character to the state machine below.
         *
         * The method returns when:
         *  - the end of the input is reached. In html mode any pending text is
         *    flushed (when the state is TEXT) and doc.EndDocument() is called;
         *    otherwise ThrowException is called with the "missing.end.tag" message.
         *  - not in html mode, the nesting counter is zero after a closing tag
         *    or after a self-closing tag.
         *
         * reader: the source of the characters to parse.
         */
        private void Go(TextReader reader)
        {
            doc.StartDocument();
            while (true)
            {
                // read a new character
                if (previousCharacter == -1)
                {
                    character = reader.Read();
                }
                // or re-examine the character that was pushed back by the ENTITY state
                else
                {
                    character         = previousCharacter;
                    previousCharacter = -1;
                }

                // the end of the file was reached
                if (character == -1)
                {
                    if (html)
                    {
                        if (state == TEXT)
                        {
                            Flush();
                        }
                        doc.EndDocument();
                    }
                    else
                    {
                        ThrowException(MessageLocalization.GetComposedMessage("missing.end.tag"));
                    }
                    return;
                }

                // dealing with  \n and \r:
                // a '\r' is handed to the state machine as '\n' and sets eol;
                // a '\n' that directly follows it is skipped so "\r\n" counts once.
                if (character == '\n' && eol)
                {
                    eol = false;
                    continue;
                }
                else if (eol)
                {
                    // any other character after a '\r' only clears the flag
                    eol = false;
                }
                else if (character == '\n')
                {
                    lines++;
                    columns = 0;
                }
                else if (character == '\r')
                {
                    eol       = true;
                    character = '\n';
                    lines++;
                    columns = 0;
                }
                else
                {
                    columns++;
                }

                switch (state)
                {
                // we are in an unknown state before there's actual content
                case UNKNOWN:
                    if (character == '<')
                    {
                        SaveState(TEXT);
                        state = TAG_ENCOUNTERED;
                    }
                    break;

                // we can encounter any content
                case TEXT:
                    if (character == '<')
                    {
                        Flush();
                        SaveState(state);
                        state = TAG_ENCOUNTERED;
                    }
                    else if (character == '&')
                    {
                        SaveState(state);
                        entity.Length = 0;
                        state         = ENTITY;
                        nowhite       = true;
                    }
                    else if (character == ' ')
                    {
                        // A space is kept only when the previous character was not
                        // whitespace; this holds in html mode and otherwise alike.
                        if (nowhite)
                        {
                            text.Append(' ');
                        }
                        nowhite = false;
                    }
                    else if (char.IsWhiteSpace((char)character))
                    {
                        if (html)
                        {
                            // totally ignore other whitespace
                        }
                        else
                        {
                            if (nowhite)
                            {
                                text.Append((char)character);
                            }
                            nowhite = false;
                        }
                    }
                    else
                    {
                        text.Append((char)character);
                        nowhite = true;
                    }
                    break;

                // we have just seen a < and are wondering what we are looking at
                // <foo>, </foo>, <!-- ... --->, etc.
                case TAG_ENCOUNTERED:
                    InitTag();
                    if (character == '/')
                    {
                        state = IN_CLOSETAG;
                    }
                    else if (character == '?')
                    {
                        // the saved state is popped here; its value is not used
                        RestoreState();
                        state = PI;
                    }
                    else
                    {
                        text.Append((char)character);
                        state = EXAMIN_TAG;
                    }
                    break;

                // we are processing something like this <foo ... >.
                // It could still be a <!-- ... --> or something.
                case EXAMIN_TAG:
                    if (character == '>')
                    {
                        DoTag();
                        ProcessTag(true);
                        InitTag();
                        state = RestoreState();
                    }
                    else if (character == '/')
                    {
                        state = SINGLE_TAG;
                    }
                    else if (character == '-' && text.ToString().Equals("!-"))
                    {
                        Flush();
                        state = COMMENT;
                    }
                    else if (character == '[' && text.ToString().Equals("![CDATA"))
                    {
                        Flush();
                        state = CDATA;
                    }
                    else if (character == 'E' && text.ToString().Equals("!DOCTYP"))
                    {
                        Flush();
                        state = PI;
                    }
                    else if (char.IsWhiteSpace((char)character))
                    {
                        DoTag();
                        state = TAG_EXAMINED;
                    }
                    else
                    {
                        text.Append((char)character);
                    }
                    break;

                // we know the name of the tag now.
                case TAG_EXAMINED:
                    if (character == '>')
                    {
                        ProcessTag(true);
                        InitTag();
                        state = RestoreState();
                    }
                    else if (character == '/')
                    {
                        state = SINGLE_TAG;
                    }
                    else if (char.IsWhiteSpace((char)character))
                    {
                        // empty
                    }
                    else
                    {
                        // first character of an attribute name
                        text.Append((char)character);
                        state = ATTRIBUTE_KEY;
                    }
                    break;

                // we are processing a closing tag: e.g. </foo>
                case IN_CLOSETAG:
                    if (character == '>')
                    {
                        DoTag();
                        ProcessTag(false);
                        if (!html && nested == 0)
                        {
                            // NOTE(review): unlike SINGLE_TAG below, this path returns
                            // without calling doc.EndDocument() - confirm this is intended.
                            return;
                        }
                        state = RestoreState();
                    }
                    else
                    {
                        // whitespace inside a closing tag is dropped
                        if (!char.IsWhiteSpace((char)character))
                        {
                            text.Append((char)character);
                        }
                    }
                    break;

                // we have just seen something like this: <foo a="b"/
                // and are looking for the final >.
                case SINGLE_TAG:
                    if (character != '>')
                    {
                        ThrowException(MessageLocalization.GetComposedMessage("expected.gt.for.tag.lt.1.gt", tag));
                    }
                    DoTag();
                    // a self-closing tag is reported as a start followed by an end
                    ProcessTag(true);
                    ProcessTag(false);
                    InitTag();
                    if (!html && nested == 0)
                    {
                        doc.EndDocument();
                        return;
                    }
                    state = RestoreState();
                    break;

                // we are processing CDATA
                case CDATA:
                    // Ordinal comparison: the culture-sensitive overload may ignore
                    // some characters, which would make the 2-character trim below wrong.
                    if (character == '>' &&
                        text.ToString().EndsWith("]]", StringComparison.Ordinal))
                    {
                        text.Length = text.Length - 2;
                        Flush();
                        state = RestoreState();
                    }
                    else
                    {
                        text.Append((char)character);
                    }
                    break;

                // we are processing a comment.  We are inside
                // the <!-- .... --> looking for the -->.
                case COMMENT:
                    // Ordinal comparison for the same reason as in the CDATA state.
                    if (character == '>' &&
                        text.ToString().EndsWith("--", StringComparison.Ordinal))
                    {
                        text.Length = text.Length - 2;
                        Flush();
                        state = RestoreState();
                    }
                    else
                    {
                        text.Append((char)character);
                    }
                    break;

                // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
                case PI:
                    if (character == '>')
                    {
                        state = RestoreState();
                        if (state == TEXT)
                        {
                            state = UNKNOWN;
                        }
                    }
                    break;

                // we are processing an entity, e.g. &lt;, &#187;, etc.
                case ENTITY:
                    if (character == ';')
                    {
                        state = RestoreState();
                        String cent = entity.ToString();
                        entity.Length = 0;
                        char ce = EntitiesToUnicode.DecodeEntity(cent);
                        if (ce == '\0')
                        {
                            // '\0' from DecodeEntity: emit the entity text unchanged
                            text.Append('&').Append(cent).Append(';');
                        }
                        else
                        {
                            text.Append(ce);
                        }
                    }
                    // Not an entity after all: the character is neither '#', a digit
                    // nor an ASCII letter, or 7 characters are already buffered.
                    else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') &&
                              (character < 'A' || character > 'Z')) || entity.Length >= 7)
                    {
                        state             = RestoreState();
                        // push the character back so the restored state examines it
                        previousCharacter = character;
                        text.Append('&').Append(entity.ToString());
                        entity.Length = 0;
                    }
                    else
                    {
                        entity.Append((char)character);
                    }
                    break;

                // We are processing the quoted right-hand side of an element's attribute.
                // quoteCharacter == ' ' marks an unquoted value (set in ATTRIBUTE_VALUE, html only).
                case QUOTE:
                    if (html && quoteCharacter == ' ' && character == '>')
                    {
                        Flush();
                        ProcessTag(true);
                        InitTag();
                        state = RestoreState();
                    }
                    else if (html && quoteCharacter == ' ' && char.IsWhiteSpace((char)character))
                    {
                        Flush();
                        state = TAG_EXAMINED;
                    }
                    else if (html && quoteCharacter == ' ')
                    {
                        text.Append((char)character);
                    }
                    else if (character == quoteCharacter)
                    {
                        Flush();
                        state = TAG_EXAMINED;
                    }
                    else if (" \r\n\u0009".IndexOf((char)character) >= 0)
                    {
                        // space, CR, LF and tab inside a quoted value become a single space each
                        text.Append(' ');
                    }
                    else if (character == '&')
                    {
                        SaveState(state);
                        state         = ENTITY;
                        entity.Length = 0;
                    }
                    else
                    {
                        text.Append((char)character);
                    }
                    break;

                // we are reading the name of an attribute
                case ATTRIBUTE_KEY:
                    if (char.IsWhiteSpace((char)character))
                    {
                        Flush();
                        state = ATTRIBUTE_EQUAL;
                    }
                    else if (character == '=')
                    {
                        Flush();
                        state = ATTRIBUTE_VALUE;
                    }
                    else if (html && character == '>')
                    {
                        // the buffered attribute name is discarded
                        text.Length = 0;
                        ProcessTag(true);
                        InitTag();
                        state = RestoreState();
                    }
                    else
                    {
                        text.Append((char)character);
                    }
                    break;

                // we have read an attribute name and are looking for the '='
                case ATTRIBUTE_EQUAL:
                    if (character == '=')
                    {
                        state = ATTRIBUTE_VALUE;
                    }
                    else if (char.IsWhiteSpace((char)character))
                    {
                        // empty
                    }
                    else if (html && character == '>')
                    {
                        text.Length = 0;
                        ProcessTag(true);
                        InitTag();
                        state = RestoreState();
                    }
                    else if (html && character == '/')
                    {
                        Flush();
                        state = SINGLE_TAG;
                    }
                    else if (html)
                    {
                        // html only: another attribute name starts without a '='
                        Flush();
                        text.Append((char)character);
                        state = ATTRIBUTE_KEY;
                    }
                    else
                    {
                        ThrowException(MessageLocalization.GetComposedMessage("error.in.attribute.processing"));
                    }
                    break;

                // we have seen the '=' and are looking for the attribute value
                case ATTRIBUTE_VALUE:
                    if (character == '"' || character == '\'')
                    {
                        quoteCharacter = character;
                        state          = QUOTE;
                    }
                    else if (char.IsWhiteSpace((char)character))
                    {
                        // empty
                    }
                    else if (html && character == '>')
                    {
                        Flush();
                        ProcessTag(true);
                        InitTag();
                        state = RestoreState();
                    }
                    else if (html)
                    {
                        // html only: unquoted value, marked by a space as quote character
                        text.Append((char)character);
                        quoteCharacter = ' ';
                        state          = QUOTE;
                    }
                    else
                    {
                        ThrowException(MessageLocalization.GetComposedMessage("error.in.attribute.processing"));
                    }
                    break;
                }
            }
        }
        /// <summary>
        /// Does the actual parsing. Perform this immediately
        /// after creating the parser object.
        /// </summary>
        /// <remarks>
        /// Calls Doc.StartDocument(), then reads the reader one character at a time
        /// and feeds each character to the state machine below. The method returns
        /// at the end of the input (in Html mode pending text is flushed when the
        /// state is Text and Doc.EndDocument() is called; otherwise throwException
        /// is called with "Missing end tag"), or, when not in Html mode, as soon as
        /// Nested is zero after a closing tag or a self-closing tag.
        /// </remarks>
        /// <param name="reader">The source of the characters to parse.</param>
        private void go(TextReader reader)
        {
            Doc.StartDocument();
            while (true)
            {
                // read a new character
                if (PreviousCharacter == -1)
                {
                    Character = reader.Read();
                }
                // or re-examine the character that was pushed back by the Entity state
                else
                {
                    Character         = PreviousCharacter;
                    PreviousCharacter = -1;
                }

                // the end of the file was reached
                if (Character == -1)
                {
                    if (Html)
                    {
                        if (State == Text)
                        {
                            flush();
                        }

                        Doc.EndDocument();
                    }
                    else
                    {
                        throwException("Missing end tag");
                    }
                    return;
                }

                // dealing with  \n and \r:
                // a '\r' is handed to the state machine as '\n' and sets Eol;
                // a '\n' that directly follows it is skipped so "\r\n" counts once.
                if (Character == '\n' && Eol)
                {
                    Eol = false;
                    continue;
                }
                else if (Eol)
                {
                    // any other character after a '\r' only clears the flag
                    Eol = false;
                }
                else if (Character == '\n')
                {
                    Lines++;
                    Columns = 0;
                }
                else if (Character == '\r')
                {
                    Eol       = true;
                    Character = '\n';
                    Lines++;
                    Columns = 0;
                }
                else
                {
                    Columns++;
                }

                switch (State)
                {
                // we are in an unknown state before there's actual content
                case Unknown:
                    if (Character == '<')
                    {
                        saveState(Text);
                        State = TagEncountered;
                    }
                    break;

                // we can encounter any content
                case Text:
                    if (Character == '<')
                    {
                        flush();
                        saveState(State);
                        State = TagEncountered;
                    }
                    else if (Character == '&')
                    {
                        saveState(State);
                        entity.Length = 0;
                        State         = Entity;
                        // An entity counts as non-whitespace content; without this,
                        // whitespace right after it is dropped ("x &amp; y" -> "x &y").
                        Nowhite       = true;
                    }
                    else if (char.IsWhiteSpace((char)Character))
                    {
                        // keep whitespace only when the previous character was not whitespace
                        if (Nowhite)
                        {
                            text.Append((char)Character);
                        }

                        Nowhite = false;
                    }
                    else
                    {
                        text.Append((char)Character);
                        Nowhite = true;
                    }
                    break;

                // we have just seen a < and are wondering what we are looking at
                // <foo>, </foo>, <!-- ... --->, etc.
                case TagEncountered:
                    initTag();
                    if (Character == '/')
                    {
                        State = InClosetag;
                    }
                    else if (Character == '?')
                    {
                        // the saved state is popped here; its value is not used
                        restoreState();
                        State = Pi;
                    }
                    else
                    {
                        text.Append((char)Character);
                        State = ExaminTag;
                    }
                    break;

                // we are processing something like this <foo ... >.
                // It could still be a <!-- ... --> or something.
                case ExaminTag:
                    if (Character == '>')
                    {
                        doTag();
                        processTag(true);
                        initTag();
                        State = restoreState();
                    }
                    else if (Character == '/')
                    {
                        State = SingleTag;
                    }
                    else if (Character == '-' && text.ToString().Equals("!-"))
                    {
                        flush();
                        State = Comment;
                    }
                    else if (Character == '[' && text.ToString().Equals("![CDATA"))
                    {
                        flush();
                        State = Cdata;
                    }
                    else if (Character == 'E' && text.ToString().Equals("!DOCTYP"))
                    {
                        flush();
                        State = Pi;
                    }
                    else if (char.IsWhiteSpace((char)Character))
                    {
                        doTag();
                        State = TagExamined;
                    }
                    else
                    {
                        text.Append((char)Character);
                    }
                    break;

                // we know the name of the tag now.
                case TagExamined:
                    if (Character == '>')
                    {
                        processTag(true);
                        initTag();
                        State = restoreState();
                    }
                    else if (Character == '/')
                    {
                        State = SingleTag;
                    }
                    else if (char.IsWhiteSpace((char)Character))
                    {
                        // empty
                    }
                    else
                    {
                        // first character of an attribute name
                        text.Append((char)Character);
                        State = AttributeKey;
                    }
                    break;

                // we are processing a closing tag: e.g. </foo>
                case InClosetag:
                    if (Character == '>')
                    {
                        doTag();
                        processTag(false);
                        if (!Html && Nested == 0)
                        {
                            // NOTE(review): unlike SingleTag below, this path returns
                            // without calling Doc.EndDocument() - confirm this is intended.
                            return;
                        }

                        State = restoreState();
                    }
                    else
                    {
                        // whitespace inside a closing tag is dropped
                        if (!char.IsWhiteSpace((char)Character))
                        {
                            text.Append((char)Character);
                        }
                    }
                    break;

                // we have just seen something like this: <foo a="b"/
                // and are looking for the final >.
                case SingleTag:
                    if (Character != '>')
                    {
                        throwException($"Expected > for tag: <{Tag}/>");
                    }

                    doTag();
                    // a self-closing tag is reported as a start followed by an end
                    processTag(true);
                    processTag(false);
                    initTag();
                    if (!Html && Nested == 0)
                    {
                        Doc.EndDocument();
                        return;
                    }
                    State = restoreState();
                    break;

                // we are processing CDATA
                case Cdata:
                    // Ordinal comparison: the culture-sensitive overload may ignore
                    // some characters, which would make the 2-character trim below wrong.
                    if (Character == '>' &&
                        text.ToString().EndsWith("]]", StringComparison.Ordinal))
                    {
                        text.Length = text.Length - 2;
                        flush();
                        State = restoreState();
                    }
                    else
                    {
                        text.Append((char)Character);
                    }

                    break;

                // we are processing a comment.  We are inside
                // the <!-- .... --> looking for the -->.
                case Comment:
                    // Ordinal comparison for the same reason as in the Cdata state.
                    if (Character == '>' &&
                        text.ToString().EndsWith("--", StringComparison.Ordinal))
                    {
                        text.Length = text.Length - 2;
                        flush();
                        State = restoreState();
                    }
                    else
                    {
                        text.Append((char)Character);
                    }

                    break;

                // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
                case Pi:
                    if (Character == '>')
                    {
                        State = restoreState();
                        if (State == Text)
                        {
                            State = Unknown;
                        }
                    }
                    break;

                // we are processing an entity, e.g. &lt;, &#187;, etc.
                case Entity:
                    if (Character == ';')
                    {
                        State = restoreState();
                        var cent = entity.ToString();
                        entity.Length = 0;
                        var ce = EntitiesToUnicode.DecodeEntity(cent);
                        if (ce == '\0')
                        {
                            // '\0' from DecodeEntity: emit the entity text unchanged
                            text.Append('&').Append(cent).Append(';');
                        }
                        else
                        {
                            text.Append(ce);
                        }
                    }
                    // Not an entity after all: the character is neither '#', a digit
                    // nor an ASCII letter, or 7 characters are already buffered.
                    else if ((Character != '#' && (Character < '0' || Character > '9') && (Character < 'a' || Character > 'z') &&
                              (Character < 'A' || Character > 'Z')) || entity.Length >= 7)
                    {
                        State             = restoreState();
                        // push the character back so the restored state examines it
                        PreviousCharacter = Character;
                        text.Append('&').Append(entity);
                        entity.Length = 0;
                    }
                    else
                    {
                        entity.Append((char)Character);
                    }
                    break;

                // We are processing the quoted right-hand side of an element's attribute.
                // QuoteCharacter == ' ' marks an unquoted value (set in AttributeValue, Html only).
                case Quote:
                    if (Html && QuoteCharacter == ' ' && Character == '>')
                    {
                        flush();
                        processTag(true);
                        initTag();
                        State = restoreState();
                    }
                    else if (Html && QuoteCharacter == ' ' && char.IsWhiteSpace((char)Character))
                    {
                        flush();
                        State = TagExamined;
                    }
                    else if (Html && QuoteCharacter == ' ')
                    {
                        text.Append((char)Character);
                    }
                    else if (Character == QuoteCharacter)
                    {
                        flush();
                        State = TagExamined;
                    }
                    // IndexOf(char) is an ordinal search and avoids allocating a
                    // one-character string for every attribute character.
                    else if (" \r\n\u0009".IndexOf((char)Character) >= 0)
                    {
                        // space, CR, LF and tab inside a quoted value become a single space each
                        text.Append(' ');
                    }
                    else if (Character == '&')
                    {
                        saveState(State);
                        State         = Entity;
                        entity.Length = 0;
                    }
                    else
                    {
                        text.Append((char)Character);
                    }
                    break;

                // we are reading the name of an attribute
                case AttributeKey:
                    if (char.IsWhiteSpace((char)Character))
                    {
                        flush();
                        State = AttributeEqual;
                    }
                    else if (Character == '=')
                    {
                        flush();
                        State = AttributeValue;
                    }
                    else if (Html && Character == '>')
                    {
                        // the buffered attribute name is discarded
                        text.Length = 0;
                        processTag(true);
                        initTag();
                        State = restoreState();
                    }
                    else
                    {
                        text.Append((char)Character);
                    }
                    break;

                // we have read an attribute name and are looking for the '='
                case AttributeEqual:
                    if (Character == '=')
                    {
                        State = AttributeValue;
                    }
                    else if (char.IsWhiteSpace((char)Character))
                    {
                        // empty
                    }
                    else if (Html && Character == '>')
                    {
                        text.Length = 0;
                        processTag(true);
                        initTag();
                        State = restoreState();
                    }
                    else if (Html && Character == '/')
                    {
                        flush();
                        State = SingleTag;
                    }
                    else if (Html)
                    {
                        // Html only: another attribute name starts without a '='
                        flush();
                        text.Append((char)Character);
                        State = AttributeKey;
                    }
                    else
                    {
                        throwException("Error in attribute processing.");
                    }
                    break;

                // we have seen the '=' and are looking for the attribute value
                case AttributeValue:
                    if (Character == '"' || Character == '\'')
                    {
                        QuoteCharacter = Character;
                        State          = Quote;
                    }
                    else if (char.IsWhiteSpace((char)Character))
                    {
                        // empty
                    }
                    else if (Html && Character == '>')
                    {
                        flush();
                        processTag(true);
                        initTag();
                        State = restoreState();
                    }
                    else if (Html)
                    {
                        // Html only: unquoted value, marked by a space as quote character
                        text.Append((char)Character);
                        QuoteCharacter = ' ';
                        State          = Quote;
                    }
                    else
                    {
                        throwException("Error in attribute processing");
                    }
                    break;
                }
            }
        }