/// <summary>
/// Opens a structuring element such as Div or Table etc.:
/// attaches it to the current parent (if any) and pushes it
/// onto the stack of opened elements.
/// </summary>
/// <param name="htmlElement">The structuring element being opened.</param>
private void OpenStructuringElement(XmlElement htmlElement)
{
    string elementName = htmlElement.LocalName;

    // A block element acts as a delimiter for inline elements: every inline
    // element currently on top of the stack is closed here and remembered as
    // pending, so that it can be re-opened inside the following structural
    // element (if any). This guarantees that inline elements appear only
    // within the most nested blocks.
    if (HtmlSchema.IsBlockElement(elementName))
    {
        while (_openedElements.Count > 0)
        {
            if (!HtmlSchema.IsInlineElement(_openedElements.Peek().LocalName))
            {
                break;
            }

            XmlElement closedInline = _openedElements.Pop();
            InvariantAssert(_openedElements.Count > 0, "OpenStructuringElement: stack of opened elements cannot become empty here");

            // A copy is kept as pending; the popped original is not reused.
            XmlElement pendingCopy = CreateElementCopy(closedInline);
            _pendingInlineElements.Push(pendingCopy);
        }
    }

    // Attach the new element to its parent, if there is one.
    if (_openedElements.Count > 0)
    {
        XmlElement parent = _openedElements.Peek();

        // Some known block elements (LI and P) are auto-closed by the start
        // of the next element; in that case the grandparent becomes the parent.
        bool parentAutoCloses = HtmlSchema.ClosesOnNextElementStart(parent.LocalName, elementName);
        if (parentAutoCloses)
        {
            _openedElements.Pop();
            parent = null;
            if (_openedElements.Count > 0)
            {
                parent = _openedElements.Peek();
            }
        }

        // NOTE:
        // A null parent is never actually expected - it would mean two
        // top-level P or LI elements (without a parent). In such a weird case
        // all paragraphs except the first one would be lost.
        if (parent != null)
        {
            parent.AppendChild(htmlElement);
        }
    }

    // The element is now the innermost opened one.
    _openedElements.Push(htmlElement);
}
/// <summary>
/// Handles a closing tag: closes the named element if it is either
/// pending (an empty inline) or currently opened. An unbalanced closing
/// tag for an element that was never opened is ignored.
/// </summary>
/// <param name="htmlElementName">Name of the element being closed.</param>
private void CloseElement(string htmlElementName)
{
    // The artificial root element must always be on the stack.
    InvariantAssert(_openedElements.Count > 0, "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");

    // Case 1: the element is opened but still waiting to be added to a parent.
    bool closesPendingInline =
        _pendingInlineElements.Count > 0 &&
        _pendingInlineElements.Peek().LocalName == htmlElementName;

    if (closesPendingInline)
    {
        // Closing an empty inline element.
        // Note that HtmlConverter will skip empty inlines, but for
        // completeness they are kept here on the parser level.
        XmlElement emptyInline = _pendingInlineElements.Pop();
        InvariantAssert(_openedElements.Count > 0, "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");
        _openedElements.Peek().AppendChild(emptyInline);
        return;
    }

    // Case 2: the element was never opened - ignore the unbalanced closing tag.
    if (!IsElementOpened(htmlElementName))
    {
        return;
    }

    // Case 3: unwind the stack down to (and including) the named element.
    // The last entry - the artificial root - is never popped.
    while (_openedElements.Count > 1)
    {
        XmlElement popped = _openedElements.Pop();
        string poppedName = popped.LocalName;

        if (poppedName == htmlElementName)
        {
            break;
        }

        // Unbalanced inlines are transferred to the next element's content.
        if (HtmlSchema.IsInlineElement(poppedName))
        {
            _pendingInlineElements.Push(CreateElementCopy(popped));
        }
    }
}
/// <summary>
/// Parses the stream of html tokens starting
/// from the name of top-level element.
/// Returns XmlElement representing the top-level
/// html element
/// </summary>
/// <remarks>
/// Tag names are normalized to lower case with the invariant culture so
/// that name matching does not depend on the current thread culture
/// (a culture-sensitive lowering would, for example, turn "DIV" into
/// "dıv" under a Turkish culture).
/// </remarks>
/// <returns>
/// The artificial "html" root element holding everything parsed, or its
/// only child when that child is itself an element named "html".
/// </returns>
private XmlElement ParseHtmlContent()
{
    // Create artificial root element to be able to group multiple
    // top-level elements. We create "html" element which may be a
    // duplicate of the real HTML element (a duplicate HTML element
    // is ok since HtmlConverter will process it without a problem).
    XmlElement htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
    OpenStructuringElement(htmlRootElement);

    while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EOF)
    {
        if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.OpeningTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                // Culture-invariant lowering: element names are identifiers, not linguistic text.
                string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();
                _htmlLexicalAnalyzer.GetNextTagToken();

                // Create an element
                XmlElement htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);

                // Parse element attributes
                ParseAttributes(htmlElement);

                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd || HtmlSchema.IsEmptyElement(htmlElementName))
                {
                    // It is an element without content (because of explicit slash or based on implicit knowledge about html)
                    AddEmptyElement(htmlElement);
                }
                else if (HtmlSchema.IsInlineElement(htmlElementName))
                {
                    // Elements known as formatting are pushed to some special
                    // pending stack, which allows them to be transferred
                    // over block tags - by doing this we convert
                    // overlapping tags into normal hierarchical element structure.
                    OpenInlineElement(htmlElement);
                }
                else if (HtmlSchema.IsBlockElement(htmlElementName) || HtmlSchema.IsKnownOpenableElement(htmlElementName))
                {
                    // This includes no-scope elements
                    OpenStructuringElement(htmlElement);
                }
                else
                {
                    // Do nothing. Skip the whole opening tag.
                    // Ignoring all unknown elements on their start tags.
                    // Thus we will ignore them on closing tag as well.
                    // Anyway we don't know what to do with them on conversion to Xaml.
                }
            }
            else
            {
                // Note that the token following opening angle bracket must be a name - lexical analyzer must guarantee that.
                // Otherwise - we skip the angle bracket and continue parsing the content as if it is just text.
                // TODO: Add the following assertion here, right? or output "<" as a text run instead?:
                // InvariantAssert(false, "Angle bracket without a following name is not expected");
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.ClosingTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                // Must be lowered the same way as opening tag names so that the two match.
                string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();

                // Skip the name token. Assume that the following token is end of tag,
                // but do not check this. If it is not true, we simply ignore one token
                // - this is our recovery from bad xml in this case.
                _htmlLexicalAnalyzer.GetNextTagToken();

                CloseElement(htmlElementName);
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Text)
        {
            AddTextContent(_htmlLexicalAnalyzer.NextToken);
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Comment)
        {
            AddComment(_htmlLexicalAnalyzer.NextToken);
        }

        _htmlLexicalAnalyzer.GetNextContentToken();
    }

    // Get rid of the artificial root element when its single child is the real "html" element
    if (htmlRootElement.FirstChild is XmlElement &&
        htmlRootElement.FirstChild == htmlRootElement.LastChild &&
        htmlRootElement.FirstChild.LocalName.ToLowerInvariant() == "html")
    {
        htmlRootElement = (XmlElement)htmlRootElement.FirstChild;
    }

    return htmlRootElement;
}