/// <summary>
/// Parses the stream of html tokens starting
/// from the name of the top-level element.
/// </summary>
/// <returns>
/// The XmlElement representing the top-level html element. When the parsed
/// content consists of exactly one element named "html", that element is
/// returned; otherwise an artificial "html" root that groups all parsed
/// top-level nodes is returned.
/// </returns>
/// <remarks>
/// Element names are lower-cased with the invariant culture so that tag
/// recognition does not depend on the current thread culture (for example,
/// a culture-sensitive lower-casing of "TITLE" under tr-TR yields "tıtle").
/// </remarks>
private XmlElement ParseHtmlContent()
{
    // Create an artificial root element to be able to group multiple top-level elements.
    // We create an "html" element which may be a duplicate of the real HTML element, which is ok,
    // as HtmlConverter will swallow it painlessly.
    XmlElement htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
    OpenStructuringElement(htmlRootElement);

    while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.Eof)
    {
        switch (_htmlLexicalAnalyzer.NextTokenType)
        {
            case HtmlTokenType.OpeningTagStart:
                _htmlLexicalAnalyzer.GetNextTagToken();
                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
                {
                    // Culture-invariant normalization: tag names are identifiers, not linguistic text.
                    string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();
                    _htmlLexicalAnalyzer.GetNextTagToken();

                    // Create an element
                    XmlElement htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);

                    // Parse element attributes
                    ParseAttributes(htmlElement);

                    if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd
                        || HtmlSchema.IsEmptyElement(htmlElementName))
                    {
                        // It is an element without content (because of an explicit slash
                        // or based on implicit knowledge about html)
                        AddEmptyElement(htmlElement);
                    }
                    else if (HtmlSchema.IsInlineElement(htmlElementName))
                    {
                        // Elements known as formatting are pushed to some special
                        // pending stack, which allows them to be transferred
                        // over block tags - by doing this we convert
                        // overlapping tags into a normal hierarchical element structure.
                        OpenInlineElement(htmlElement);
                    }
                    else if (HtmlSchema.IsBlockElement(htmlElementName)
                        || HtmlSchema.IsKnownOpenableElement(htmlElementName))
                    {
                        // This includes no-scope elements
                        OpenStructuringElement(htmlElement);
                    }

                    // Any other (unrecognized) element is dropped: it is created above
                    // but never added to the tree.
                }
                break;

            case HtmlTokenType.ClosingTagStart:
                _htmlLexicalAnalyzer.GetNextTagToken();
                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
                {
                    string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();

                    // Skip the name token. Assume that the following token is end of tag,
                    // but do not check this. If it is not true, we simply ignore one token
                    // - this is our recovery from bad xml in this case.
                    _htmlLexicalAnalyzer.GetNextTagToken();

                    CloseElement(htmlElementName);
                }
                break;

            case HtmlTokenType.Text:
                AddTextContent(_htmlLexicalAnalyzer.NextToken);
                break;

            case HtmlTokenType.Comment:
                AddComment(_htmlLexicalAnalyzer.NextToken);
                break;
        }

        // Advance to the next content token regardless of which token type was just handled.
        _htmlLexicalAnalyzer.GetNextContentToken();
    }

    // Get rid of the artificial root element when it wraps exactly one real "html" element.
    if (htmlRootElement.FirstChild is XmlElement
        && htmlRootElement.FirstChild == htmlRootElement.LastChild
        && htmlRootElement.FirstChild.LocalName.ToLowerInvariant() == "html")
    {
        htmlRootElement = (XmlElement)htmlRootElement.FirstChild;
    }

    return htmlRootElement;
}