Beispiel #1
0
        // Opens structurig element such as Div or Table etc.
        private void OpenStructuringElement(XmlElement htmlElement)
        {
            // Close all pending inline elements
            // All block elements are considered as delimiters for inline elements
            // which forces all inline elements to be closed and re-opened in the following
            // structural element (if any).
            // By doing that we guarantee that all inline elements appear only within most nested blocks
            if (HtmlSchema.IsBlockElement(htmlElement.LocalName))
            {
                while (_openedElements.Count > 0 && HtmlSchema.IsInlineElement(_openedElements.Peek().LocalName))
                {
                    XmlElement htmlInlineElement = _openedElements.Pop();
                    InvariantAssert(_openedElements.Count > 0, "OpenStructuringElement: stack of opened elements cannot become empty here");

                    _pendingInlineElements.Push(CreateElementCopy(htmlInlineElement));
                }
            }

            // Add this block element to its parent
            if (_openedElements.Count > 0)
            {
                XmlElement htmlParent = _openedElements.Peek();

                // Check some known block elements for auto-closing (LI and P)
                if (HtmlSchema.ClosesOnNextElementStart(htmlParent.LocalName, htmlElement.LocalName))
                {
                    _openedElements.Pop();
                    htmlParent = _openedElements.Count > 0 ? _openedElements.Peek() : null;
                }

                if (htmlParent != null)
                {
                    // NOTE:
                    // Actually we never expect null - it would mean two top-level P or LI (without a parent).
                    // In such weird case we will loose all paragraphs except the first one...
                    htmlParent.AppendChild(htmlElement);
                }
            }

            // Push it onto a stack
            _openedElements.Push(htmlElement);
        }
        /// <summary>
        /// returns true if the current element closes when the new element, whose name has just been read, starts
        /// </summary>
        /// <param name="currentElementName">
        /// string representing current element name
        /// </param>
        /// <param name="elementName"></param>
        /// string representing name of the next element that will start
        internal static bool ClosesOnNextElementStart(string currentElementName, string nextElementName)
        {
            Debug.Assert(currentElementName == currentElementName.ToLower());
            switch (currentElementName)
            {
            case "colgroup":
                return(_htmlElementsClosingColgroup.Contains(nextElementName) && HtmlSchema.IsBlockElement(nextElementName));

            case "dd":
                return(_htmlElementsClosingDD.Contains(nextElementName) && HtmlSchema.IsBlockElement(nextElementName));

            case "dt":
                return(_htmlElementsClosingDT.Contains(nextElementName) && HtmlSchema.IsBlockElement(nextElementName));

            case "li":
                return(_htmlElementsClosingLI.Contains(nextElementName));

            case "p":
                return(HtmlSchema.IsBlockElement(nextElementName));

            case "tbody":
                return(_htmlElementsClosingTbody.Contains(nextElementName));

            case "tfoot":
                return(_htmlElementsClosingTfoot.Contains(nextElementName));

            case "thead":
                return(_htmlElementsClosingThead.Contains(nextElementName));

            case "tr":
                return(_htmlElementsClosingTR.Contains(nextElementName));

            case "td":
                return(_htmlElementsClosingTD.Contains(nextElementName));

            case "th":
                return(_htmlElementsClosingTH.Contains(nextElementName));
            }
            return(false);
        }
Beispiel #3
0
        /// <summary>
        /// Parses the stream of html tokens starting
        /// from the name of top-level element.
        /// Returns XmlElement representing the top-level
        /// html element
        /// </summary>
        private XmlElement ParseHtmlContent()
        {
            // Create artificial root elelemt to be able to group multiple
            // top-level elements.  We create "html" element which may be a
            // duplicate of the real HTML element (a duplicate HTML element
            // is ok since HtmlConverter will process it without a problem).
            XmlElement htmlRootElement = _document.CreateElement("html", XhtmlNamespace);

            OpenStructuringElement(htmlRootElement);

            while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EOF)
            {
                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.OpeningTagStart)
                {
                    _htmlLexicalAnalyzer.GetNextTagToken();
                    if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
                    {
                        string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLower();
                        _htmlLexicalAnalyzer.GetNextTagToken();

                        // Create an element
                        XmlElement htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);

                        // Parse element attributes
                        ParseAttributes(htmlElement);

                        if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd || HtmlSchema.IsEmptyElement(htmlElementName))
                        {
                            // It is an element without content (because of explicit slash or based on implicit knowledge aboout html)
                            AddEmptyElement(htmlElement);
                        }
                        else if (HtmlSchema.IsInlineElement(htmlElementName))
                        {
                            // Elements known as formatting are pushed to some special
                            // pending stack, which allows them to be transferred
                            // over block tags - by doing this we convert
                            // overlapping tags into normal heirarchical element structure.
                            OpenInlineElement(htmlElement);
                        }
                        else if (HtmlSchema.IsBlockElement(htmlElementName) || HtmlSchema.IsKnownOpenableElement(htmlElementName))
                        {
                            // This includes no-scope elements
                            OpenStructuringElement(htmlElement);
                        }
                        else
                        {
                            // Do nothing. Skip the whole opening tag.
                            // Ignoring all unknown elements on their start tags.
                            // Thus we will ignore them on closinng tag as well.
                            // Anyway we don't know what to do withthem on conversion to Xaml.
                        }
                    }
                    else
                    {
                        // Note that the token following opening angle bracket must be a name - lexical analyzer must guarantee that.
                        // Otherwise - we skip the angle bracket and continue parsing the content as if it is just text.
                        // TODO: Add the following asserion here, right? or output "<" as a text run instead?:
                        // InvariantAssert(false, "Angle bracket without a following name is not expected");
                    }
                }
                else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.ClosingTagStart)
                {
                    _htmlLexicalAnalyzer.GetNextTagToken();
                    if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
                    {
                        string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLower();

                        // Skip the name token. Assume that the following token is end of tag,
                        // but do not check this. If it is not true, we simply ignore one token
                        // - this is our recovery from bad xml in this case.
                        _htmlLexicalAnalyzer.GetNextTagToken();

                        CloseElement(htmlElementName);
                    }
                }
                else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Text)
                {
                    AddTextContent(_htmlLexicalAnalyzer.NextToken);
                }
                else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Comment)
                {
                    AddComment(_htmlLexicalAnalyzer.NextToken);
                }

                _htmlLexicalAnalyzer.GetNextContentToken();
            }

            // Get rid of the artificial root element
            if (htmlRootElement.FirstChild is XmlElement &&
                htmlRootElement.FirstChild == htmlRootElement.LastChild &&
                htmlRootElement.FirstChild.LocalName.ToLower() == "html")
            {
                htmlRootElement = (XmlElement)htmlRootElement.FirstChild;
            }

            return(htmlRootElement);
        }