Reference data about HTML tags and attributes; methods to test tokens for certain properties; and the tokenizer.
Пример #1
0
        /// <summary>
        /// Returns a literal object for the text between HtmlStart (the last position of the end of a
        /// tag) and the current position. If !AllowLiterals then it's wrapped in a span.
        /// </summary>
        ///
        /// <param name="factory">
        /// The HTML factory to operate against
        /// </param>
        /// <param name="literal">
        /// [out] The literal.
        /// </param>
        ///
        /// <returns>
        /// true if it succeeds, false if it fails.
        /// </returns>

        public bool TryGetLiteral(HtmlElementFactory factory, out IDomObject literal)
        {
            if (Pos <= HtmlStart)
            {
                literal = null;
                return(false);
            }

            // There's plain text -return it as a literal.

            DomText lit;

            switch (InsertionMode)
            {
            case InsertionMode.Invalid:
                lit = new DomInvalidElement();
                break;

            case InsertionMode.Text:
                InsertionMode = InsertionMode.Default;
                lit           = new DomInnerText();
                break;

            default:
                lit = new DomText();
                break;
            }
            literal = lit;

            if (factory.IsBound)
            {
                lit.SetTextIndex(factory.Document, factory.Document.DocumentIndex.TokenizeString(HtmlStart, Pos - HtmlStart));
            }
            else
            {
                string text = factory.Html.SubstringBetween(HtmlStart, Pos);
                literal.NodeValue = HtmlData.HtmlDecode(text);
            }

            if (WrapLiterals)
            {
                DomElement wrapper = DomElement.Create("span");
                wrapper.ChildNodesInternal.AddAlways(literal);
                literal = wrapper;
            }


            if (Parent != null)
            {
                ((DomElement)Parent.Element).ChildNodesInternal.AddAlways(literal);
                Reset();
                return(false);
            }
            else
            {
                TokenizerState = TokenizerState.Finished;
                return(true);
            }
        }
Пример #2
0
        /// <summary>
        /// Parse the HTML, and return it, based on options set.
        /// </summary>
        ///
        /// <returns>
        /// An enumerator of the top-level elements.
        /// </returns>

        protected IEnumerable <IDomObject> ParseImplementation()
        {
            int pos = 0;
            Stack <IterationData> stack = new Stack <IterationData>();

            while (pos <= EndPos)
            {
                IterationData current = new IterationData();
                if (WrapRootTextNodes)
                {
                    current.WrapLiterals = true;
                }

                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {
                    current = stack.Pop();

                    while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
                    {
                        char c = Html[current.Pos];
                        switch (current.TokenizerState)
                        {
                        case TokenizerState.Default:
                            if (current.FindNextTag(Html))
                            {
                                // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceeed to finish
                                current.TokenizerState = TokenizerState.TagStart;
                            }
                            break;

                        case TokenizerState.TagStart:
                            IDomObject literal;
                            if (current.TryGetLiteral(this, out literal))
                            {
                                yield return(literal);
                            }

                            int tagStartPos = current.Pos;

                            string newTag = current.GetTagOpener(Html);

                            if (newTag == String.Empty)
                            {
                                // It's a tag closer. Make sure it's the right one.
                                current.Pos = tagStartPos + 1;
                                ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                                // Ignore empty tags, or closing tags found when no parent is open
                                bool isProperClose = closeTagId == current.ParentTagID();
                                if (closeTagId == 0)
                                {
                                    // ignore empty tags
                                    continue;
                                }
                                else
                                {
                                    // locate match for this closer up the heirarchy
                                    IterationData actualParent = null;

                                    if (!isProperClose)
                                    {
                                        actualParent = current.Parent;
                                        while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                        {
                                            actualParent = actualParent.Parent;
                                        }
                                    }
                                    // if no matching close tag was found up the tree, ignore it
                                    // otherwise always close this and repeat at the same position until the match is found
                                    if (!isProperClose && actualParent == null)
                                    {
                                        current.InsertionMode = InsertionMode.Invalid;
                                        continue;
                                    }
                                }
                                // element is closed

                                if (current.Parent.Parent == null)
                                {
                                    yield return(current.Parent.Element);
                                }
                                current.TokenizerState = TokenizerState.Finished;
                                if (isProperClose)
                                {
                                    current.Parent.Reset(current.Pos);
                                }
                                else
                                {
                                    current.Parent.Reset(tagStartPos);
                                }
                                // already been returned before we added the children
                                continue;
                            }
                            else if (newTag[0] == '!')
                            {
                                IDomSpecialElement specialElement = null;
                                string             newTagUpper    = newTag.ToUpper();
                                if (newTagUpper.StartsWith("!DOCTYPE"))
                                {
                                    specialElement  = new DomDocumentType();
                                    current.Element = specialElement;
                                }
                                else if (newTagUpper.StartsWith("![CDATA["))
                                {
                                    specialElement  = new DomCData();
                                    current.Element = specialElement;
                                    current.Pos     = tagStartPos + 9;
                                }
                                else
                                {
                                    specialElement  = new DomComment();
                                    current.Element = specialElement;
                                    if (newTag.StartsWith("!--"))
                                    {
                                        ((DomComment)specialElement).IsQuoted = true;
                                        current.Pos = tagStartPos + 4;
                                    }
                                    else
                                    {
                                        current.Pos = tagStartPos + 1;
                                    }
                                }

                                string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";

                                int tagEndPos = Html.Seek(endTag, current.Pos);
                                if (tagEndPos < 0)
                                {
                                    // if a tag is unclosed entirely, then just find a new line.
                                    tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                                }
                                if (tagEndPos < 0)
                                {
                                    // Never closed, no newline - junk, treat it like such
                                    tagEndPos = EndPos;
                                }

                                specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                                current.Pos = tagEndPos;
                            }
                            else
                            {
                                // seems to be a new element tag, parse it.

                                ushort newTagId = HtmlData.Tokenize(newTag);

                                // Before we keep going see if this is an implicit close
                                ushort parentTagId = current.ParentTagID();

                                int lastPos = current.Pos;

                                if (parentTagId == 0 && IsDocument)
                                {
                                    if (newTagId != HtmlData.tagHTML)
                                    {
                                        current.Element = DomElement.Create(HtmlData.tagHTML);
                                        current         = current.AddNewChild();
                                        parentTagId     = HtmlData.tagHTML;
                                    }
                                }

                                if (parentTagId != 0)
                                {
                                    ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                    while (action != HtmlData.tagActionNothing)
                                    {
                                        if (action == HtmlData.tagActionClose)
                                        {
                                            // track the next parent up the chain

                                            var newNode = (current.Parent != null) ?
                                                          current.Parent : null;

                                            // same tag for a repeater like li occcurred - treat like a close tag

                                            if (current.Parent.Parent == null)
                                            {
                                                yield return(current.Parent.Element);
                                            }

                                            current.TokenizerState = TokenizerState.Finished;
                                            //current.Parent.Reset(tagStartPos);

                                            if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                            {
                                                action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                                if (action != HtmlData.tagActionNothing)
                                                {
                                                    current = newNode;
                                                }
                                            }
                                            else
                                            {
                                                action = HtmlData.tagActionNothing;
                                            }
                                        }
                                        else
                                        {
                                            if (GenerateOptionalElements)
                                            {
                                                stack.Push(current);
                                                current = current.AddNewParent(action, lastPos);
                                            }
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    if (current.TokenizerState == TokenizerState.Finished)
                                    {
                                        current.Parent.Reset(tagStartPos);
                                        continue;
                                    }
                                }


                                current.Element = DomElement.Create(newTagId);


                                if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                {
                                    current.InsertionMode  = InsertionMode.Text;
                                    current.TokenizerState = TokenizerState.Default;
                                }

                                // Parse attribute data
                                while (current.Pos <= EndPos)
                                {
                                    if (!current.GetTagAttribute(Html))
                                    {
                                        break;
                                    }
                                }
                            }

                            IDomObject el;

                            if (current.FinishTagOpener(Html, out el))
                            {
                                stack.Push(current);
                                current = current.AddNewChild();
                            }

                            if (el != null)
                            {
                                yield return(el);
                            }

                            break;
                        }
                    }


                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because  top-level tag was unclosed.
                    // THis will wrap up any straggling text and close any open tags after it.

                    if (current.TokenizerState != TokenizerState.Finished)
                    {
                        foreach (var el in current.CloseElement(this))
                        {
                            yield return(el);
                        }
                    }
                }
                pos = current.Pos;
            }
        }