// Example #1
0
        /// <summary>
        /// Tokenize <c>Html</c> from position 0 through <c>EndPos</c> and lazily yield the DOM objects
        /// that result, according to the parser options that are set.
        /// </summary>
        ///
        /// <remarks>
        /// The walk is iterative: an explicit stack of <c>IterationData</c> carriers stands in for
        /// recursion. Whenever <c>FinishTagOpener</c> returns true, the current carrier is pushed and a
        /// child carrier created with <c>AddNewChild</c> takes over; once a carrier reaches
        /// <c>TokenizerState.Finished</c> (or runs past <c>EndPos</c>) the next carrier is popped.
        /// Options consulted here are <c>WrapRootTextNodes</c>, <c>IsDocument</c>,
        /// <c>GenerateOptionalElements</c> and <c>SpecialTagActionDelegate</c>.
        /// Tag-name prefix checks ("!DOCTYPE", "![CDATA[", "!--") use ordinal comparison so the result
        /// does not depend on the current culture.
        /// Because this is an iterator method, nothing is parsed until the result is enumerated.
        /// </remarks>
        ///
        /// <returns>
        /// A lazily evaluated sequence of parsed objects (described by the original documentation as the
        /// top-level elements): literals produced by <c>TryGetLiteral</c>, objects handed back by
        /// <c>FinishTagOpener</c>, elements whose carrier has no grandparent carrier at the moment they
        /// are closed, and whatever <c>CloseElement</c> returns for content that was never closed.
        /// </returns>

        protected IEnumerable<IDomObject> ParseImplementation()
        {
            int pos = 0;
            Stack<IterationData> stack = new Stack<IterationData>();

            while (pos <= EndPos)
            {
                // Start a new root carrier at the position where the previous pass stopped.
                IterationData current = new IterationData();
                if (WrapRootTextNodes)
                {
                    current.WrapLiterals = true;
                }

                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {
                    current = stack.Pop();

                    // NOTE: every "continue" inside the switch below re-tests this loop's condition.
                    while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
                    {
                        switch (current.TokenizerState)
                        {
                        case TokenizerState.Default:
                            if (current.FindNextTag(Html))
                            {
                                // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                                current.TokenizerState = TokenizerState.TagStart;
                            }
                            break;

                        case TokenizerState.TagStart:
                            // Emit any literal the carrier has pending before handling the tag itself.
                            IDomObject literal;
                            if (current.TryGetLiteral(this, out literal))
                            {
                                yield return literal;
                            }

                            int tagStartPos = current.Pos;

                            string newTag = current.GetTagOpener(Html);

                            if (newTag == String.Empty)
                            {
                                // It's a tag closer. Make sure it's the right one.
                                current.Pos = tagStartPos + 1;
                                ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                                // Ignore empty tags, or closing tags found when no parent is open
                                bool isProperClose = closeTagId == current.ParentTagID();
                                if (closeTagId == 0)
                                {
                                    // ignore empty tags
                                    continue;
                                }
                                else
                                {
                                    // locate match for this closer up the hierarchy
                                    IterationData actualParent = null;

                                    if (!isProperClose)
                                    {
                                        actualParent = current.Parent;
                                        while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                        {
                                            actualParent = actualParent.Parent;
                                        }
                                    }
                                    // if no matching close tag was found up the tree, ignore it
                                    // otherwise always close this and repeat at the same position until the match is found
                                    if (!isProperClose && actualParent == null)
                                    {
                                        current.InsertionMode = InsertionMode.Invalid;
                                        continue;
                                    }
                                }
                                // element is closed

                                if (current.Parent.Parent == null)
                                {
                                    yield return current.Parent.Element;
                                }
                                current.TokenizerState = TokenizerState.Finished;
                                if (isProperClose)
                                {
                                    // Matching closer: the parent carrier resumes after the close tag.
                                    current.Parent.Reset(current.Pos);
                                }
                                else
                                {
                                    // Closer belongs to an ancestor: the parent carrier re-reads the same
                                    // close tag so that each level in between gets closed in turn.
                                    current.Parent.Reset(tagStartPos);
                                }
                                // already been returned before we added the children
                                continue;
                            }
                            else if (newTag[0] == '!')
                            {
                                // Special (non-element) node: doctype, CDATA section or comment.
                                IDomSpecialElement specialElement;
                                if (newTag.StartsWith("!DOCTYPE", System.StringComparison.OrdinalIgnoreCase))
                                {
                                    specialElement  = new DomDocumentType();
                                    current.Element = specialElement;
                                }
                                else if (newTag.StartsWith("![CDATA[", System.StringComparison.OrdinalIgnoreCase))
                                {
                                    specialElement  = new DomCData();
                                    current.Element = specialElement;
                                    // skip past "<![CDATA[" (9 characters from the tag start)
                                    current.Pos     = tagStartPos + 9;
                                }
                                else
                                {
                                    specialElement  = new DomComment();
                                    current.Element = specialElement;
                                    if (newTag.StartsWith("!--", System.StringComparison.Ordinal))
                                    {
                                        ((DomComment)specialElement).IsQuoted = true;
                                        // skip past "<!--" (4 characters from the tag start)
                                        current.Pos = tagStartPos + 4;
                                    }
                                    else
                                    {
                                        current.Pos = tagStartPos + 1;
                                    }
                                }

                                // Quoted comments end at "-->"; every other special node ends at ">".
                                string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";

                                int tagEndPos = Html.Seek(endTag, current.Pos);
                                if (tagEndPos < 0)
                                {
                                    // if a tag is unclosed entirely, then just find a new line.
                                    tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                                }
                                if (tagEndPos < 0)
                                {
                                    // Never closed, no newline - junk, treat it like such
                                    tagEndPos = EndPos;
                                }

                                specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                                current.Pos = tagEndPos;
                            }
                            else
                            {
                                // seems to be a new element tag, parse it.

                                ushort newTagId = HtmlData.Tokenize(newTag);

                                // Before we keep going see if this is an implicit close
                                ushort parentTagId = current.ParentTagID();

                                int lastPos = current.Pos;

                                if (parentTagId == 0 && IsDocument)
                                {
                                    // Document mode with no open parent: anything other than <html>
                                    // gets wrapped in a generated <html> element first.
                                    if (newTagId != HtmlData.tagHTML)
                                    {
                                        current.Element = DomElement.Create(HtmlData.tagHTML);
                                        current         = current.AddNewChild();
                                        parentTagId     = HtmlData.tagHTML;
                                    }
                                }

                                if (parentTagId != 0)
                                {
                                    ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                    while (action != HtmlData.tagActionNothing)
                                    {
                                        if (action == HtmlData.tagActionClose)
                                        {
                                            // track the next parent up the chain
                                            IterationData newNode = current.Parent;

                                            // same tag for a repeater like li occurred - treat like a close tag

                                            if (newNode.Parent == null)
                                            {
                                                yield return newNode.Element;
                                            }

                                            current.TokenizerState = TokenizerState.Finished;

                                            if (newNode.Parent != null && newNode.Parent.Element != null)
                                            {
                                                // Ask again one level up; only climb when that level
                                                // also requires an action for the new tag.
                                                action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                                if (action != HtmlData.tagActionNothing)
                                                {
                                                    current = newNode;
                                                }
                                            }
                                            else
                                            {
                                                action = HtmlData.tagActionNothing;
                                            }
                                        }
                                        else
                                        {
                                            // Any other action value is passed to AddNewParent, but only
                                            // when optional elements are to be generated.
                                            if (GenerateOptionalElements)
                                            {
                                                stack.Push(current);
                                                current = current.AddNewParent(action, lastPos);
                                            }
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    if (current.TokenizerState == TokenizerState.Finished)
                                    {
                                        // An implicit close happened: the parent carrier re-reads this
                                        // opening tag from its start.
                                        current.Parent.Reset(tagStartPos);
                                        continue;
                                    }
                                }


                                current.Element = DomElement.Create(newTagId);


                                if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                {
                                    current.InsertionMode  = InsertionMode.Text;
                                    current.TokenizerState = TokenizerState.Default;
                                }

                                // Parse attribute data
                                while (current.Pos <= EndPos)
                                {
                                    if (!current.GetTagAttribute(Html))
                                    {
                                        break;
                                    }
                                }
                            }

                            IDomObject el;

                            // When FinishTagOpener returns true, keep this carrier on the stack and
                            // continue parsing in a new child carrier.
                            if (current.FinishTagOpener(Html, out el))
                            {
                                stack.Push(current);
                                current = current.AddNewChild();
                            }

                            if (el != null)
                            {
                                yield return el;
                            }

                            break;
                        }
                    }


                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because a top-level tag was unclosed.
                    // This will wrap up any straggling text and close any open tags after it.

                    if (current.TokenizerState != TokenizerState.Finished)
                    {
                        foreach (var closedElement in current.CloseElement(this))
                        {
                            yield return closedElement;
                        }
                    }
                }
                pos = current.Pos;
            }
        }
// Example #2
0
        /// <summary>
        /// Tokenize <c>Html</c> from position 0 through <c>EndPos</c> and lazily yield the DOM objects
        /// that result, according to the parser options that are set.
        /// </summary>
        ///
        /// <remarks>
        /// The walk is iterative: an explicit stack of <c>IterationData</c> carriers stands in for
        /// recursion. Whenever <c>FinishTagOpener</c> returns true, the current carrier is pushed and a
        /// child carrier created with <c>AddNewChild</c> takes over; once a carrier reaches
        /// <c>TokenizerState.Finished</c> (or runs past <c>EndPos</c>) the next carrier is popped.
        /// Options consulted here are <c>WrapRootTextNodes</c>, <c>IsDocument</c>,
        /// <c>GenerateOptionalElements</c> and <c>SpecialTagActionDelegate</c>.
        /// Tag-name prefix checks ("!DOCTYPE", "![CDATA[", "!--") use ordinal comparison so the result
        /// does not depend on the current culture.
        /// Because this is an iterator method, nothing is parsed until the result is enumerated.
        /// </remarks>
        ///
        /// <returns>
        /// A lazily evaluated sequence of parsed objects (described by the original documentation as the
        /// top-level elements): literals produced by <c>TryGetLiteral</c>, objects handed back by
        /// <c>FinishTagOpener</c>, elements whose carrier has no grandparent carrier at the moment they
        /// are closed, and whatever <c>CloseElement</c> returns for content that was never closed.
        /// </returns>

        protected IEnumerable<IDomObject> ParseImplementation()
        {
            int pos = 0;
            Stack<IterationData> stack = new Stack<IterationData>();

            while (pos <= EndPos)
            {
                // Start a new root carrier at the position where the previous pass stopped.
                IterationData current = new IterationData();
                if (WrapRootTextNodes)
                {
                    current.WrapLiterals = true;
                }

                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {
                    current = stack.Pop();

                    // NOTE: every "continue" inside the switch below re-tests this loop's condition.
                    while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
                    {
                        switch (current.TokenizerState)
                        {
                            case TokenizerState.Default:
                                if (current.FindNextTag(Html))
                                {
                                    // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                                    current.TokenizerState = TokenizerState.TagStart;
                                }
                                break;
                            case TokenizerState.TagStart:
                                // Emit any literal the carrier has pending before handling the tag itself.
                                IDomObject literal;
                                if (current.TryGetLiteral(this, out literal))
                                {
                                    yield return literal;
                                }

                                int tagStartPos = current.Pos;

                                string newTag = current.GetTagOpener(Html);

                                if (newTag == String.Empty)
                                {
                                    // It's a tag closer. Make sure it's the right one.
                                    current.Pos = tagStartPos + 1;
                                    ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                                    // Ignore empty tags, or closing tags found when no parent is open
                                    bool isProperClose = closeTagId == current.ParentTagID();
                                    if (closeTagId == 0)
                                    {
                                        // ignore empty tags
                                        continue;
                                    }
                                    else
                                    {
                                        // locate match for this closer up the hierarchy
                                        IterationData actualParent = null;

                                        if (!isProperClose)
                                        {
                                            actualParent = current.Parent;
                                            while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                            {
                                                actualParent = actualParent.Parent;
                                            }
                                        }
                                        // if no matching close tag was found up the tree, ignore it
                                        // otherwise always close this and repeat at the same position until the match is found
                                        if (!isProperClose && actualParent == null)
                                        {
                                            current.InsertionMode = InsertionMode.Invalid;
                                            continue;
                                        }
                                    }
                                    // element is closed

                                    if (current.Parent.Parent == null)
                                    {
                                        yield return current.Parent.Element;
                                    }
                                    current.TokenizerState = TokenizerState.Finished;
                                    if (isProperClose)
                                    {
                                        // Matching closer: the parent carrier resumes after the close tag.
                                        current.Parent.Reset(current.Pos);
                                    }
                                    else
                                    {
                                        // Closer belongs to an ancestor: the parent carrier re-reads the same
                                        // close tag so that each level in between gets closed in turn.
                                        current.Parent.Reset(tagStartPos);
                                    }
                                    // already been returned before we added the children
                                    continue;
                                }
                                else if (newTag[0] == '!')
                                {
                                    // Special (non-element) node: doctype, CDATA section or comment.
                                    IDomSpecialElement specialElement;
                                    if (newTag.StartsWith("!DOCTYPE", System.StringComparison.OrdinalIgnoreCase))
                                    {
                                        specialElement = new DomDocumentType();
                                        current.Element = specialElement;
                                    }
                                    else if (newTag.StartsWith("![CDATA[", System.StringComparison.OrdinalIgnoreCase))
                                    {
                                        specialElement = new DomCData();
                                        current.Element = specialElement;
                                        // skip past "<![CDATA[" (9 characters from the tag start)
                                        current.Pos = tagStartPos + 9;
                                    }
                                    else
                                    {
                                        specialElement = new DomComment();
                                        current.Element = specialElement;
                                        if (newTag.StartsWith("!--", System.StringComparison.Ordinal))
                                        {
                                            ((DomComment)specialElement).IsQuoted = true;
                                            // skip past "<!--" (4 characters from the tag start)
                                            current.Pos = tagStartPos + 4;
                                        }
                                        else
                                        {
                                            current.Pos = tagStartPos + 1;
                                        }
                                    }

                                    // Quoted comments end at "-->"; every other special node ends at ">".
                                    string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";

                                    int tagEndPos = Html.Seek(endTag, current.Pos);
                                    if (tagEndPos < 0)
                                    {
                                        // if a tag is unclosed entirely, then just find a new line.
                                        tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                                    }
                                    if (tagEndPos < 0)
                                    {
                                        // Never closed, no newline - junk, treat it like such
                                        tagEndPos = EndPos;
                                    }

                                    specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                                    current.Pos = tagEndPos;
                                }
                                else
                                {
                                    // seems to be a new element tag, parse it.

                                    ushort newTagId = HtmlData.Tokenize(newTag);

                                    // Before we keep going see if this is an implicit close
                                    ushort parentTagId = current.ParentTagID();

                                    int lastPos = current.Pos;

                                    if (parentTagId == 0 && IsDocument)
                                    {
                                        // Document mode with no open parent: anything other than <html>
                                        // gets wrapped in a generated <html> element first.
                                        if (newTagId != HtmlData.tagHTML)
                                        {
                                            current.Element = DomElement.Create(HtmlData.tagHTML);
                                            current = current.AddNewChild();
                                            parentTagId = HtmlData.tagHTML;
                                        }
                                    }

                                    if (parentTagId != 0)
                                    {
                                        ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                        while (action != HtmlData.tagActionNothing)
                                        {
                                            if (action == HtmlData.tagActionClose)
                                            {
                                                // track the next parent up the chain
                                                IterationData newNode = current.Parent;

                                                // same tag for a repeater like li occurred - treat like a close tag

                                                if (newNode.Parent == null)
                                                {
                                                    yield return newNode.Element;
                                                }

                                                current.TokenizerState = TokenizerState.Finished;

                                                if (newNode.Parent != null && newNode.Parent.Element != null)
                                                {
                                                    // Ask again one level up; only climb when that level
                                                    // also requires an action for the new tag.
                                                    action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                                    if (action != HtmlData.tagActionNothing)
                                                    {
                                                        current = newNode;
                                                    }
                                                }
                                                else
                                                {
                                                    action = HtmlData.tagActionNothing;
                                                }
                                            }
                                            else
                                            {
                                                // Any other action value is passed to AddNewParent, but only
                                                // when optional elements are to be generated.
                                                if (GenerateOptionalElements)
                                                {
                                                    stack.Push(current);
                                                    current = current.AddNewParent(action, lastPos);
                                                }
                                                action = HtmlData.tagActionNothing;
                                            }
                                        }
                                        if (current.TokenizerState == TokenizerState.Finished)
                                        {
                                            // An implicit close happened: the parent carrier re-reads this
                                            // opening tag from its start.
                                            current.Parent.Reset(tagStartPos);
                                            continue;
                                        }
                                    }


                                    current.Element = DomElement.Create(newTagId);


                                    if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                    {
                                        current.InsertionMode = InsertionMode.Text;
                                        current.TokenizerState = TokenizerState.Default;
                                    }

                                    // Parse attribute data
                                    while (current.Pos <= EndPos)
                                    {
                                        if (!current.GetTagAttribute(Html))
                                        {
                                            break;
                                        }
                                    }
                                }

                                IDomObject el;

                                // When FinishTagOpener returns true, keep this carrier on the stack and
                                // continue parsing in a new child carrier.
                                if (current.FinishTagOpener(Html, out el))
                                {
                                    stack.Push(current);
                                    current = current.AddNewChild();
                                }

                                if (el != null)
                                {
                                    yield return el;
                                }

                                break;
                        }
                    }


                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because a top-level tag was unclosed.
                    // This will wrap up any straggling text and close any open tags after it.

                    if (current.TokenizerState != TokenizerState.Finished)
                    {
                        foreach (var closedElement in current.CloseElement(this))
                        {
                            yield return closedElement;
                        }
                    }
                }
                pos = current.Pos;
            }

        }