/// <summary>
/// Parses the stream of html tokens produced by the lexical analyzer
/// until the end-of-file token is reached.
/// </summary>
/// <returns>
/// The top-level element of the parsed tree. This is the artificial "html"
/// root created here, unless that root ended up with exactly one child
/// element that is itself named "html", in which case that child is returned.
/// </returns>
private XmlElement ParseHtmlContent()
{
    // Create an artificial root element to be able to group multiple top-level elements.
    // We create an "html" element which may be a duplicate of the real HTML element, which is ok,
    // as HtmlConverter will swallow it painlessly.
    var htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
    OpenStructuringElement(htmlRootElement);

    while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.Eof)
    {
        if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.OpeningTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                // Element names are identifiers, not linguistic text: normalize them with the
                // invariant culture so the result does not depend on the current thread culture
                // (e.g. Turkish casing would turn "TITLE" into "tıtle").
                var htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();
                _htmlLexicalAnalyzer.GetNextTagToken();

                // Create an element
                var htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);

                // Parse element attributes
                ParseAttributes(htmlElement);

                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd ||
                    HtmlSchema.IsEmptyElement(htmlElementName))
                {
                    // It is an element without content (because of an explicit slash
                    // or based on implicit knowledge about html)
                    AddEmptyElement(htmlElement);
                }
                else if (HtmlSchema.IsInlineElement(htmlElementName))
                {
                    // Elements known as formatting are pushed to a special pending stack,
                    // which allows them to be transferred over block tags - by doing this
                    // we convert overlapping tags into a normal hierarchical element structure.
                    OpenInlineElement(htmlElement);
                }
                else if (HtmlSchema.IsBlockElement(htmlElementName) ||
                         HtmlSchema.IsKnownOpenableElement(htmlElementName))
                {
                    // This includes no-scope elements
                    OpenStructuringElement(htmlElement);
                }

                // Any other element name is not opened: its start tag is simply skipped.
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.ClosingTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                // Same invariant normalization as for opening tags, so that the
                // closing name matches the name the element was created with.
                var htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();

                // Skip the name token. Assume that the following token is end of tag,
                // but do not check this. If it is not true, we simply ignore one token
                // - this is our recovery from bad xml in this case.
                _htmlLexicalAnalyzer.GetNextTagToken();

                CloseElement(htmlElementName);
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Text)
        {
            AddTextContent(_htmlLexicalAnalyzer.NextToken);
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Comment)
        {
            AddComment(_htmlLexicalAnalyzer.NextToken);
        }

        _htmlLexicalAnalyzer.GetNextContentToken();
    }

    // Get rid of the artificial root element when its only child is an "html" element
    if (htmlRootElement.FirstChild is XmlElement &&
        htmlRootElement.FirstChild == htmlRootElement.LastChild &&
        htmlRootElement.FirstChild.LocalName.ToLowerInvariant() == "html")
    {
        htmlRootElement = (XmlElement)htmlRootElement.FirstChild;
    }

    return htmlRootElement;
}
// ---------------------------------------------------------------------
//
// Private methods
//
// ---------------------------------------------------------------------

#region Private Methods

/// <summary>
/// Advances the reading position by one character and makes the next
/// available character accessible through the NextCharacter property.
/// When that character is an ampersand starting a numeric (&amp;#DDDD;) or
/// named (&amp;name;) entity, the entity is consumed and NextCharacter becomes
/// the decoded character, with IsNextCharacterEntity set to true.
/// </summary>
/// <remarks>
/// A malformed or unknown entity is skipped: the character following the
/// consumed text becomes NextCharacter and IsNextCharacterEntity is false.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Thrown when called while the current character code is already -1 (end of stream).
/// </exception>
private void GetNextCharacter()
{
    if (_nextCharacterCode == -1)
    {
        throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
    }

    _previousCharacter = NextCharacter;

    NextCharacter = _lookAheadCharacter;
    _nextCharacterCode = _lookAheadCharacterCode;

    // next character is not an entity as of now
    IsNextCharacterEntity = false;

    ReadLookAheadCharacter();

    if (NextCharacter != '&')
    {
        return;
    }

    if (_lookAheadCharacter == '#')
    {
        // numeric entity - parse digits - &#DDDDD;
        var entityCode = 0;
        ReadLookAheadCharacter();

        // The largest numeric entity is 7 characters. Only ASCII digits are accepted:
        // char.IsDigit also matches non-ASCII decimal digits, for which
        // (code - '0') is not a digit value and could overflow the accumulator.
        for (var i = 0; i < 7 && _lookAheadCharacter >= '0' && _lookAheadCharacter <= '9'; i++)
        {
            entityCode = 10 * entityCode + (_lookAheadCharacterCode - '0');
            ReadLookAheadCharacter();
        }

        if (_lookAheadCharacter == ';')
        {
            // correct format - advance
            ReadLookAheadCharacter();

            // A plain (char) cast would silently truncate a code above char.MaxValue
            // to its low 16 bits; substitute '?' for out-of-range codes instead.
            NextCharacter = entityCode <= char.MaxValue ? (char)entityCode : '?';
            _nextCharacterCode = NextCharacter;

            // as far as we are concerned, this is an entity
            IsNextCharacterEntity = true;
        }
        else
        {
            // not an entity - set the next character to the current look-ahead character;
            // the ampersand, '#' and any digits read so far are dropped
            NextCharacter = _lookAheadCharacter;
            _nextCharacterCode = _lookAheadCharacterCode;
            ReadLookAheadCharacter();
            IsNextCharacterEntity = false;
        }
    }
    else if (char.IsLetter(_lookAheadCharacter))
    {
        // entity is written as a string
        var entity = "";

        // maximum length of string entities is 10 characters
        for (var i = 0; i < 10 && char.IsLetterOrDigit(_lookAheadCharacter); i++)
        {
            entity += _lookAheadCharacter;
            ReadLookAheadCharacter();
        }

        if (_lookAheadCharacter == ';')
        {
            // advance
            ReadLookAheadCharacter();

            if (HtmlSchema.IsEntity(entity))
            {
                NextCharacter = HtmlSchema.EntityCharacterValue(entity);
                _nextCharacterCode = NextCharacter;
                IsNextCharacterEntity = true;
            }
            else
            {
                // invalid entity - skip the whole thing and move on to the next character
                NextCharacter = _lookAheadCharacter;
                _nextCharacterCode = _lookAheadCharacterCode;
                ReadLookAheadCharacter();

                // not an entity
                IsNextCharacterEntity = false;
            }
        }
        else
        {
            // skip whatever we read after the ampersand, set the next character and move on.
            // The character code must be kept in sync with NextCharacter here as in every
            // other branch; otherwise it would still hold the ampersand's code and the
            // -1 end-of-stream check at the top of this method would never see the end.
            NextCharacter = _lookAheadCharacter;
            _nextCharacterCode = _lookAheadCharacterCode;
            ReadLookAheadCharacter();
            IsNextCharacterEntity = false;
        }
    }
}