/// <summary>
/// Parse the HTML, and return it, based on options set.
/// </summary>
///
/// <remarks>
/// Implemented as an iterator: objects are yielded lazily while the tokenizer walks
/// <c>Html</c> from position 0 through <c>EndPos</c>. Each open element is tracked by an
/// <c>IterationData</c> frame; frames that still need work are kept on a local stack.
/// </remarks>
///
/// <returns>
/// An enumerator of the top-level elements.
/// </returns>
protected IEnumerable<IDomObject> ParseImplementation()
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos)
    {
        // Start a fresh root-level frame at the current position.
        IterationData current = new IterationData();
        if (WrapRootTextNodes)
        {
            current.WrapLiterals = true;
        }
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0)
        {
            current = stack.Pop();

            while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
            {
                switch (current.TokenizerState)
                {
                    case TokenizerState.Default:
                        if (current.FindNextTag(Html))
                        {
                            // even if we fell through from ReadTextOnly (e.g. was never closed),
                            // we should proceed to finish
                            current.TokenizerState = TokenizerState.TagStart;
                        }
                        break;

                    case TokenizerState.TagStart:
                        // Emit any literal the frame reports before handling the tag itself.
                        IDomObject literal;
                        if (current.TryGetLiteral(this, out literal))
                        {
                            yield return literal;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = current.GetTagOpener(Html);

                        if (newTag == String.Empty)
                        {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                            // Ignore empty tags, or closing tags found when no parent is open
                            bool isProperClose = closeTagId == current.ParentTagID();
                            if (closeTagId == 0)
                            {
                                // ignore empty tags
                                continue;
                            }
                            else
                            {
                                // locate match for this closer up the hierarchy
                                IterationData actualParent = null;
                                if (!isProperClose)
                                {
                                    actualParent = current.Parent;
                                    while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                    {
                                        actualParent = actualParent.Parent;
                                    }
                                }

                                // if no matching close tag was found up the tree, ignore it
                                // otherwise always close this and repeat at the same position until the match is found
                                if (!isProperClose && actualParent == null)
                                {
                                    current.InsertionMode = InsertionMode.Invalid;
                                    continue;
                                }
                            }

                            // element is closed; a parent frame with no parent of its own is top-level
                            if (current.Parent.Parent == null)
                            {
                                yield return current.Parent.Element;
                            }
                            current.TokenizerState = TokenizerState.Finished;

                            if (isProperClose)
                            {
                                // Matching closer: the parent resumes after it.
                                current.Parent.Reset(current.Pos);
                            }
                            else
                            {
                                // Closer belongs to an ancestor: the parent resumes at the same tag
                                // so the closer is seen again one level up.
                                current.Parent.Reset(tagStartPos);
                            }

                            // already been returned before we added the children
                            continue;
                        }
                        else if (newTag[0] == '!')
                        {
                            // Special markup: doctype, CDATA or comment. Ordinal comparisons are used
                            // because these are fixed markup tokens, not linguistic text.
                            IDomSpecialElement specialElement = null;

                            if (newTag.StartsWith("!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                            {
                                specialElement = new DomDocumentType();
                                current.Element = specialElement;
                            }
                            else if (newTag.StartsWith("![CDATA[", StringComparison.OrdinalIgnoreCase))
                            {
                                specialElement = new DomCData();
                                current.Element = specialElement;
                                // presumably skips the 9 characters of "<![CDATA[" (assumes tagStartPos is at '<')
                                current.Pos = tagStartPos + 9;
                            }
                            else
                            {
                                specialElement = new DomComment();
                                current.Element = specialElement;
                                if (newTag.StartsWith("!--", StringComparison.Ordinal))
                                {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    current.Pos = tagStartPos + 4;
                                }
                                else
                                {
                                    current.Pos = tagStartPos + 1;
                                }
                            }

                            // Quoted comments end with "-->", everything else with ">".
                            string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted)
                                ? "-->"
                                : ">";

                            int tagEndPos = Html.Seek(endTag, current.Pos);
                            if (tagEndPos < 0)
                            {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0)
                            {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }

                            specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        }
                        else
                        {
                            // seems to be a new element tag, parse it.
                            ushort newTagId = HtmlData.Tokenize(newTag);

                            // Before we keep going see if this is an implicit close
                            ushort parentTagId = current.ParentTagID();
                            int lastPos = current.Pos;

                            if (parentTagId == 0 && IsDocument)
                            {
                                // A document whose first element is not <html> gets an html element
                                // created for it, and parsing continues in a child frame of it.
                                if (newTagId != HtmlData.tagHTML)
                                {
                                    current.Element = DomElement.Create(HtmlData.tagHTML);
                                    current = current.AddNewChild();
                                    parentTagId = HtmlData.tagHTML;
                                }
                            }

                            if (parentTagId != 0)
                            {
                                ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                while (action != HtmlData.tagActionNothing)
                                {
                                    if (action == HtmlData.tagActionClose)
                                    {
                                        // track the next parent up the chain
                                        var newNode = current.Parent;

                                        // same tag for a repeater like li occurred - treat like a close tag
                                        if (current.Parent.Parent == null)
                                        {
                                            yield return current.Parent.Element;
                                        }
                                        current.TokenizerState = TokenizerState.Finished;
                                        // The parent is not reset here; that happens once, after this loop.

                                        if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                        {
                                            // Ask again one level up; keep climbing while an action is required.
                                            action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                            if (action != HtmlData.tagActionNothing)
                                            {
                                                current = newNode;
                                            }
                                        }
                                        else
                                        {
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    else
                                    {
                                        // Any other action value is passed to AddNewParent to create the
                                        // missing parent, but only when optional elements are generated.
                                        if (GenerateOptionalElements)
                                        {
                                            stack.Push(current);
                                            current = current.AddNewParent(action, lastPos);
                                        }
                                        action = HtmlData.tagActionNothing;
                                    }
                                }

                                if (current.TokenizerState == TokenizerState.Finished)
                                {
                                    // Implicitly closed: the parent re-reads this same opening tag.
                                    current.Parent.Reset(tagStartPos);
                                    continue;
                                }
                            }

                            current.Element = DomElement.Create(newTagId);

                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                            {
                                current.InsertionMode = InsertionMode.Text;
                                current.TokenizerState = TokenizerState.Default;
                            }

                            // Parse attribute data
                            while (current.Pos <= EndPos)
                            {
                                if (!current.GetTagAttribute(Html))
                                {
                                    break;
                                }
                            }
                        }

                        // When FinishTagOpener returns true, keep this frame for later and
                        // continue parsing in a new child frame.
                        IDomObject el;
                        if (current.FinishTagOpener(Html, out el))
                        {
                            stack.Push(current);
                            current = current.AddNewChild();
                        }
                        if (el != null)
                        {
                            yield return el;
                        }
                        break;
                }
            }

            // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because top-level tag was unclosed.
            // This will wrap up any straggling text and close any open tags after it.
            if (current.TokenizerState != TokenizerState.Finished)
            {
                foreach (var el in current.CloseElement(this))
                {
                    yield return el;
                }
            }
        }

        pos = current.Pos;
    }
}
/// <summary>
/// Parse the HTML, and return it, based on options set.
/// </summary>
///
/// <remarks>
/// Implemented as an iterator: objects are yielded lazily while the tokenizer walks
/// <c>Html</c> from position 0 through <c>EndPos</c>. Each open element is tracked by an
/// <c>IterationData</c> frame; frames that still need work are kept on a local stack.
/// </remarks>
///
/// <returns>
/// An enumerator of the top-level elements.
/// </returns>
protected IEnumerable<IDomObject> ParseImplementation()
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos)
    {
        // Start a fresh root-level frame at the current position.
        IterationData current = new IterationData();
        if (WrapRootTextNodes)
        {
            current.WrapLiterals = true;
        }
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0)
        {
            current = stack.Pop();

            while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
            {
                switch (current.TokenizerState)
                {
                    case TokenizerState.Default:
                        if (current.FindNextTag(Html))
                        {
                            // even if we fell through from ReadTextOnly (e.g. was never closed),
                            // we should proceed to finish
                            current.TokenizerState = TokenizerState.TagStart;
                        }
                        break;

                    case TokenizerState.TagStart:
                        // Emit any literal the frame reports before handling the tag itself.
                        IDomObject literal;
                        if (current.TryGetLiteral(this, out literal))
                        {
                            yield return literal;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = current.GetTagOpener(Html);

                        if (newTag == String.Empty)
                        {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                            // Ignore empty tags, or closing tags found when no parent is open
                            bool isProperClose = closeTagId == current.ParentTagID();
                            if (closeTagId == 0)
                            {
                                // ignore empty tags
                                continue;
                            }
                            else
                            {
                                // locate match for this closer up the hierarchy
                                IterationData actualParent = null;
                                if (!isProperClose)
                                {
                                    actualParent = current.Parent;
                                    while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                    {
                                        actualParent = actualParent.Parent;
                                    }
                                }

                                // if no matching close tag was found up the tree, ignore it
                                // otherwise always close this and repeat at the same position until the match is found
                                if (!isProperClose && actualParent == null)
                                {
                                    current.InsertionMode = InsertionMode.Invalid;
                                    continue;
                                }
                            }

                            // element is closed; a parent frame with no parent of its own is top-level
                            if (current.Parent.Parent == null)
                            {
                                yield return current.Parent.Element;
                            }
                            current.TokenizerState = TokenizerState.Finished;

                            if (isProperClose)
                            {
                                // Matching closer: the parent resumes after it.
                                current.Parent.Reset(current.Pos);
                            }
                            else
                            {
                                // Closer belongs to an ancestor: the parent resumes at the same tag
                                // so the closer is seen again one level up.
                                current.Parent.Reset(tagStartPos);
                            }

                            // already been returned before we added the children
                            continue;
                        }
                        else if (newTag[0] == '!')
                        {
                            // Special markup: doctype, CDATA or comment. Ordinal comparisons are used
                            // because these are fixed markup tokens, not linguistic text.
                            IDomSpecialElement specialElement = null;

                            if (newTag.StartsWith("!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                            {
                                specialElement = new DomDocumentType();
                                current.Element = specialElement;
                            }
                            else if (newTag.StartsWith("![CDATA[", StringComparison.OrdinalIgnoreCase))
                            {
                                specialElement = new DomCData();
                                current.Element = specialElement;
                                // presumably skips the 9 characters of "<![CDATA[" (assumes tagStartPos is at '<')
                                current.Pos = tagStartPos + 9;
                            }
                            else
                            {
                                specialElement = new DomComment();
                                current.Element = specialElement;
                                if (newTag.StartsWith("!--", StringComparison.Ordinal))
                                {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    current.Pos = tagStartPos + 4;
                                }
                                else
                                {
                                    current.Pos = tagStartPos + 1;
                                }
                            }

                            // Quoted comments end with "-->", everything else with ">".
                            string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted)
                                ? "-->"
                                : ">";

                            int tagEndPos = Html.Seek(endTag, current.Pos);
                            if (tagEndPos < 0)
                            {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0)
                            {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }

                            specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        }
                        else
                        {
                            // seems to be a new element tag, parse it.
                            ushort newTagId = HtmlData.Tokenize(newTag);

                            // Before we keep going see if this is an implicit close
                            ushort parentTagId = current.ParentTagID();
                            int lastPos = current.Pos;

                            if (parentTagId == 0 && IsDocument)
                            {
                                // A document whose first element is not <html> gets an html element
                                // created for it, and parsing continues in a child frame of it.
                                if (newTagId != HtmlData.tagHTML)
                                {
                                    current.Element = DomElement.Create(HtmlData.tagHTML);
                                    current = current.AddNewChild();
                                    parentTagId = HtmlData.tagHTML;
                                }
                            }

                            if (parentTagId != 0)
                            {
                                ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                while (action != HtmlData.tagActionNothing)
                                {
                                    if (action == HtmlData.tagActionClose)
                                    {
                                        // track the next parent up the chain
                                        var newNode = current.Parent;

                                        // same tag for a repeater like li occurred - treat like a close tag
                                        if (current.Parent.Parent == null)
                                        {
                                            yield return current.Parent.Element;
                                        }
                                        current.TokenizerState = TokenizerState.Finished;
                                        // The parent is not reset here; that happens once, after this loop.

                                        if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                        {
                                            // Ask again one level up; keep climbing while an action is required.
                                            action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                            if (action != HtmlData.tagActionNothing)
                                            {
                                                current = newNode;
                                            }
                                        }
                                        else
                                        {
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    else
                                    {
                                        // Any other action value is passed to AddNewParent to create the
                                        // missing parent, but only when optional elements are generated.
                                        if (GenerateOptionalElements)
                                        {
                                            stack.Push(current);
                                            current = current.AddNewParent(action, lastPos);
                                        }
                                        action = HtmlData.tagActionNothing;
                                    }
                                }

                                if (current.TokenizerState == TokenizerState.Finished)
                                {
                                    // Implicitly closed: the parent re-reads this same opening tag.
                                    current.Parent.Reset(tagStartPos);
                                    continue;
                                }
                            }

                            current.Element = DomElement.Create(newTagId);

                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                            {
                                current.InsertionMode = InsertionMode.Text;
                                current.TokenizerState = TokenizerState.Default;
                            }

                            // Parse attribute data
                            while (current.Pos <= EndPos)
                            {
                                if (!current.GetTagAttribute(Html))
                                {
                                    break;
                                }
                            }
                        }

                        // When FinishTagOpener returns true, keep this frame for later and
                        // continue parsing in a new child frame.
                        IDomObject el;
                        if (current.FinishTagOpener(Html, out el))
                        {
                            stack.Push(current);
                            current = current.AddNewChild();
                        }
                        if (el != null)
                        {
                            yield return el;
                        }
                        break;
                }
            }

            // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because top-level tag was unclosed.
            // This will wrap up any straggling text and close any open tags after it.
            if (current.TokenizerState != TokenizerState.Finished)
            {
                foreach (var el in current.CloseElement(this))
                {
                    yield return el;
                }
            }
        }

        pos = current.Pos;
    }
}