/// <summary>
/// Scans <c>BaseHtml</c> from position 0 through <c>EndPos</c> and lazily yields DOM objects
/// as they are completed. Nesting is tracked with an explicit stack of
/// <c>IterationData</c> states instead of recursion: an element that has children is pushed
/// back on the stack and a child state continues from the same position.
/// </summary>
/// <param name="allowLiterals">
/// Value assigned to <c>AllowLiterals</c> on each root-level iteration state. Child states
/// always get <c>AllowLiterals = true</c>.
/// </param>
/// <returns>
/// A deferred sequence containing: non-null results of <c>GetLiteral</c>, parentless objects
/// that have no children, and elements whose state has no parent, yielded when they are closed.
/// </returns>
protected IEnumerable<IDomObject> Parse(bool allowLiterals)
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos) {
        IterationData current = new IterationData();
        current.AllowLiterals = allowLiterals;
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0) {
            current = stack.Pop();

            while (!current.Finished && current.Pos <= EndPos) {
                switch (current.Step) {
                    case 0:
                        // Step 0: locate the next '<'.
                        current.Pos = CharIndexOf(BaseHtml, '<', current.Pos);
                        if (current.Pos < 0) {
                            // done - no new tags found
                            current.Pos = EndPos + 1;
                        } else {
                            // In a literal block (script/textarea) skip every '<' that is not the
                            // closing tag of the parent element.
                            if (current.ReadTextOnly) {
                                int endPos = current.Pos;
                                while (endPos >= 0) {
                                    int caretPos = CharIndexOf(BaseHtml, '>', endPos + 1);
                                    if (caretPos > 0) {
                                        string tag = BaseHtml.SubstringBetween(endPos + 1, caretPos).Trim();
                                        // Case-insensitive, like every other NodeName comparison in
                                        // this method; the old code lowercased only one side.
                                        if (String.Equals(tag, "/" + current.Parent.Element.NodeName, StringComparison.OrdinalIgnoreCase)) {
                                            // this is the end tag -- exit the block
                                            current.Pos = endPos;
                                            break;
                                        }
                                    }
                                    endPos = CharIndexOf(BaseHtml, '<', endPos + 1);
                                }
                            }
                            // Even if the literal block was never closed, proceed to tag handling.
                            current.Step = 1;
                        }
                        break;

                    case 1:
                        // Step 1: current.Pos is at a '<'. Flush any text collected before it first.
                        if (current.Pos > current.HtmlStart) {
                            IDomObject literal = GetLiteral(current);
                            if (literal != null) {
                                yield return literal;
                            }
                            continue;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = GetTagOpener(current);
                        // Invariant casing: tag names are protocol text, not linguistic text
                        // (culture-sensitive ToLower breaks e.g. "TITLE" under tr-TR).
                        string newTagLower = newTag.ToLowerInvariant();

                        // When a parent state exists, its element is the currently open tag.
                        string parentTag = String.Empty;
                        if (current.Parent != null) {
                            parentTag = current.Parent.Element.NodeName.ToLowerInvariant();
                        }

                        if (newTag == String.Empty) {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            string closeTag = GetCloseTag(current);

                            if (closeTag == String.Empty) {
                                // ignore empty tags
                                continue;
                            }

                            bool isProperClose = String.Equals(closeTag, parentTag, StringComparison.OrdinalIgnoreCase);
                            if (!isProperClose) {
                                // Locate a match for this closer further up the hierarchy.
                                IterationData actualParent = current.Parent;
                                while (actualParent != null
                                    && !String.Equals(actualParent.Element.NodeName, closeTag, StringComparison.OrdinalIgnoreCase)) {
                                    actualParent = actualParent.Parent;
                                }
                                if (actualParent == null) {
                                    // No open ancestor matches: ignore the stray closer.
                                    current.Invalid = true;
                                    continue;
                                }
                            }

                            // Element is closed. Only elements whose state has no parent are yielded;
                            // nested ones were already appended to their parent element.
                            if (current.Parent.Parent == null) {
                                yield return current.Parent.Element;
                            }
                            current.Finished = true;
                            // On a mismatched closer the parent resumes at the same tag, so the
                            // closer is re-examined one level up until the match is reached.
                            current.Parent.Reset(isProperClose ? current.Pos : tagStartPos);
                            continue;
                        }

                        // Before we keep going, see if this opener implicitly closes its parent
                        // (same tag repeated, e.g. li). Both sides are compared lowercased.
                        if (parentTag != String.Empty) {
                            if (TagHasImplicitClose(parentTag, newTagLower) && parentTag == newTagLower) {
                                if (current.Parent.Parent == null) {
                                    yield return current.Parent.Element;
                                }
                                current.Parent.Reset(tagStartPos);
                                current.Finished = true;
                                continue;
                            }
                        }

                        // Seems to be a new tag. Parse it.
                        IDomSpecialElement specialElement = null;
                        if (newTagLower[0] == '!') {
                            if (newTagLower.StartsWith("!doctype", StringComparison.Ordinal)) {
                                specialElement = new DomDocumentType();
                                current.Object = specialElement;
                            } else if (newTagLower.StartsWith("![cdata[", StringComparison.Ordinal)) {
                                specialElement = new DomCData();
                                current.Object = specialElement;
                                current.Pos = tagStartPos + 9;
                            } else {
                                specialElement = new DomComment();
                                current.Object = specialElement;
                                if (newTagLower.StartsWith("!--", StringComparison.Ordinal)) {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    current.Pos = tagStartPos + 4;
                                } else {
                                    current.Pos = tagStartPos + 1;
                                }
                            }
                        } else {
                            current.Object = new DomElement(newTag);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed) {
                                current.ReadTextOnly = true;
                                current.Step = 0;
                            }
                        }

                        if (current.Object is IDomSpecialElement) {
                            // Non-element types carry their data inside the tag construct.
                            string endTag = (current.Object is IDomComment && ((IDomComment)current.Object).IsQuoted) ? "-->" : ">";
                            int tagEndPos = BaseHtml.Seek(endTag, current.Pos);
                            if (tagEndPos < 0) {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = BaseHtml.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0) {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = BaseHtml.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        } else {
                            // Parse attribute data until GetTagAttribute reports there is no more.
                            while (current.Pos <= EndPos) {
                                if (!GetTagAttribute(current)) {
                                    break;
                                }
                            }
                        }

                        bool hasChildren = MoveOutsideTag(current);

                        // If there are children, push this state back on the stack and continue with
                        // a new child state from this position. Children attach themselves to the
                        // parent element as they are created, which avoids recursion.
                        if (current.Parent != null) {
                            current.Parent.Element.AppendChild(current.Object);
                        } else if (!hasChildren) {
                            yield return current.Object;
                        }

                        if (!hasChildren) {
                            current.Reset();
                            continue;
                        }

                        stack.Push(current);

                        IterationData subItem = new IterationData();
                        subItem.Parent = current;
                        subItem.AllowLiterals = true;
                        subItem.Reset(current.Pos);
                        subItem.ReadTextOnly = current.ReadTextOnly;
                        current = subItem;
                        break;
                }
            }

            // Catchall for unclosed tags: an unfinished state here means input ran out first.
            // Wrap up any straggling text, then close the open parent.
            if (!current.Finished) {
                if (current.Pos > current.HtmlStart) {
                    IDomObject trailingLiteral = GetLiteral(current);
                    if (trailingLiteral != null) {
                        yield return trailingLiteral;
                    }
                }
                if (current.Parent != null) {
                    if (current.Parent.Parent == null) {
                        yield return current.Parent.Element;
                    }
                    current.Parent.Reset(current.Pos);
                    current.Finished = true;
                }
            }
        }
        pos = current.Pos;
    }
}
/// <summary>
/// Parses <c>Html</c> from position 0 through <c>EndPos</c> and lazily yields DOM objects as
/// they are completed. Open elements are tracked with an explicit stack of
/// <c>IterationData</c> states instead of recursion.
/// </summary>
/// <remarks>
/// Behaviour depends on members read here: <c>WrapRootTextNodes</c> (sets
/// <c>WrapLiterals</c> on root states), <c>IsDocument</c> (wraps a leading non-html element in
/// a generated html element), <c>GenerateOptionalElements</c> (allows
/// <c>AddNewParent</c> when <c>SpecialTagActionDelegate</c> requests one).
/// </remarks>
/// <returns>
/// A deferred sequence of the objects produced by <c>TryGetLiteral</c>,
/// <c>FinishTagOpener</c> and <c>CloseElement</c>, plus elements whose state has no parent,
/// yielded when they are closed.
/// </returns>
protected IEnumerable<IDomObject> ParseImplementation()
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos) {
        IterationData current = new IterationData();
        if (WrapRootTextNodes) {
            current.WrapLiterals = true;
        }
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0) {
            current = stack.Pop();

            while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos) {
                switch (current.TokenizerState) {
                    case TokenizerState.Default:
                        if (current.FindNextTag(Html)) {
                            // Even if we fell through from a text-only block that was never
                            // closed, proceed to tag handling.
                            current.TokenizerState = TokenizerState.TagStart;
                        }
                        break;

                    case TokenizerState.TagStart:
                        IDomObject literal;
                        if (current.TryGetLiteral(this, out literal)) {
                            yield return literal;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = current.GetTagOpener(Html);

                        if (newTag == String.Empty) {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));
                            bool isProperClose = closeTagId == current.ParentTagID();

                            if (closeTagId == 0) {
                                // ignore empty tags
                                continue;
                            }

                            if (!isProperClose) {
                                // Locate a match for this closer further up the hierarchy.
                                IterationData actualParent = current.Parent;
                                while (actualParent != null && actualParent.Element.NodeNameID != closeTagId) {
                                    actualParent = actualParent.Parent;
                                }
                                if (actualParent == null) {
                                    // No open ancestor matches: ignore the stray closer.
                                    current.InsertionMode = InsertionMode.Invalid;
                                    continue;
                                }
                            }

                            // Element is closed. Only elements whose state has no parent are yielded.
                            if (current.Parent.Parent == null) {
                                yield return current.Parent.Element;
                            }
                            current.TokenizerState = TokenizerState.Finished;
                            // On a mismatched closer the parent resumes at the same tag, so the
                            // closer is re-examined one level up until the match is reached.
                            current.Parent.Reset(isProperClose ? current.Pos : tagStartPos);
                            continue;
                        } else if (newTag[0] == '!') {
                            // Doctype, CDATA or comment. Ordinal comparisons: markup prefixes are
                            // not linguistic text, and no upper-cased copy is needed.
                            IDomSpecialElement specialElement = null;
                            if (newTag.StartsWith("!DOCTYPE", StringComparison.OrdinalIgnoreCase)) {
                                specialElement = new DomDocumentType();
                                current.Element = specialElement;
                            } else if (newTag.StartsWith("![CDATA[", StringComparison.OrdinalIgnoreCase)) {
                                specialElement = new DomCData();
                                current.Element = specialElement;
                                current.Pos = tagStartPos + 9;
                            } else {
                                specialElement = new DomComment();
                                current.Element = specialElement;
                                if (newTag.StartsWith("!--", StringComparison.Ordinal)) {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    current.Pos = tagStartPos + 4;
                                } else {
                                    current.Pos = tagStartPos + 1;
                                }
                            }

                            string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";
                            int tagEndPos = Html.Seek(endTag, current.Pos);
                            if (tagEndPos < 0) {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0) {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        } else {
                            // Seems to be a new element tag, parse it.
                            ushort newTagId = HtmlData.Tokenize(newTag);
                            ushort parentTagId = current.ParentTagID();
                            int lastPos = current.Pos;

                            // No open parent in document mode: wrap anything but html in a
                            // generated html element.
                            if (parentTagId == 0 && IsDocument && newTagId != HtmlData.tagHTML) {
                                current.Element = DomElement.Create(HtmlData.tagHTML);
                                current = current.AddNewChild();
                                parentTagId = HtmlData.tagHTML;
                            }

                            if (parentTagId != 0) {
                                // Ask whether this opener implicitly closes the parent or needs a
                                // generated parent; repeat up the chain while closes are requested.
                                ushort action = SpecialTagActionDelegate(parentTagId, newTagId);
                                while (action != HtmlData.tagActionNothing) {
                                    if (action == HtmlData.tagActionClose) {
                                        // Track the next parent up the chain.
                                        IterationData newNode = current.Parent;

                                        // Treat like a close tag for the parent.
                                        if (newNode.Parent == null) {
                                            yield return newNode.Element;
                                        }
                                        current.TokenizerState = TokenizerState.Finished;

                                        if (newNode.Parent != null && newNode.Parent.Element != null) {
                                            action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                            if (action != HtmlData.tagActionNothing) {
                                                current = newNode;
                                            }
                                        } else {
                                            action = HtmlData.tagActionNothing;
                                        }
                                    } else {
                                        // Any other action value is passed to AddNewParent.
                                        if (GenerateOptionalElements) {
                                            stack.Push(current);
                                            current = current.AddNewParent(action, lastPos);
                                        }
                                        action = HtmlData.tagActionNothing;
                                    }
                                }

                                if (current.TokenizerState == TokenizerState.Finished) {
                                    // Implicitly closed: the parent re-reads this same opener.
                                    current.Parent.Reset(tagStartPos);
                                    continue;
                                }
                            }

                            current.Element = DomElement.Create(newTagId);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed) {
                                current.InsertionMode = InsertionMode.Text;
                                current.TokenizerState = TokenizerState.Default;
                            }

                            // Parse attribute data until GetTagAttribute reports there is no more.
                            while (current.Pos <= EndPos) {
                                if (!current.GetTagAttribute(Html)) {
                                    break;
                                }
                            }
                        }

                        // When FinishTagOpener returns true, keep this state on the stack and
                        // continue with a child state.
                        IDomObject el;
                        if (current.FinishTagOpener(Html, out el)) {
                            stack.Push(current);
                            current = current.AddNewChild();
                        }
                        if (el != null) {
                            yield return el;
                        }
                        break;
                }
            }

            // Catchall for unclosed tags: an unfinished state here means input ran out first.
            // CloseElement wraps up straggling text and closes what is still open.
            if (current.TokenizerState != TokenizerState.Finished) {
                foreach (var closed in current.CloseElement(this)) {
                    yield return closed;
                }
            }
        }
        pos = current.Pos;
    }
}
/// <summary>
/// Scans <c>BaseHtml</c> from position 0 through <c>EndPos</c> and lazily yields DOM objects
/// as they are completed. Nesting is tracked with an explicit stack of
/// <c>IterationData</c> states instead of recursion: an element that has children is pushed
/// back on the stack and a child state continues from the same position.
/// </summary>
/// <param name="allowLiterals">
/// Value assigned to <c>AllowLiterals</c> on each root-level iteration state. Child states
/// always get <c>AllowLiterals = true</c>.
/// </param>
/// <returns>
/// A deferred sequence containing: non-null results of <c>GetLiteral</c>, parentless objects
/// that have no children, and elements whose state has no parent, yielded when they are closed.
/// </returns>
protected IEnumerable<IDomObject> Parse(bool allowLiterals)
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos) {
        IterationData current = new IterationData();
        current.AllowLiterals = allowLiterals;
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0) {
            current = stack.Pop();

            while (!current.Finished && current.Pos <= EndPos) {
                switch (current.Step) {
                    case 0:
                        // Step 0: locate the next '<'.
                        current.Pos = CharIndexOf(BaseHtml, '<', current.Pos);
                        if (current.Pos < 0) {
                            // done - no new tags found
                            current.Pos = EndPos + 1;
                        } else {
                            // In a literal block (script/textarea) skip every '<' that is not the
                            // closing tag of the parent element.
                            if (current.ReadTextOnly) {
                                int endPos = current.Pos;
                                while (endPos >= 0) {
                                    int caretPos = CharIndexOf(BaseHtml, '>', endPos + 1);
                                    if (caretPos > 0) {
                                        string tag = BaseHtml.SubstringBetween(endPos + 1, caretPos).Trim();
                                        // Case-insensitive, like every other NodeName comparison in
                                        // this method; the old code lowercased only one side.
                                        if (String.Equals(tag, "/" + current.Parent.Element.NodeName, StringComparison.OrdinalIgnoreCase)) {
                                            // this is the end tag -- exit the block
                                            current.Pos = endPos;
                                            break;
                                        }
                                    }
                                    endPos = CharIndexOf(BaseHtml, '<', endPos + 1);
                                }
                            }
                            // Even if the literal block was never closed, proceed to tag handling.
                            current.Step = 1;
                        }
                        break;

                    case 1:
                        // Step 1: current.Pos is at a '<'. Flush any text collected before it first.
                        if (current.Pos > current.HtmlStart) {
                            IDomObject literal = GetLiteral(current);
                            if (literal != null) {
                                yield return literal;
                            }
                            continue;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = GetTagOpener(current);
                        // Invariant casing: tag names are protocol text, not linguistic text
                        // (culture-sensitive ToLower breaks e.g. "TITLE" under tr-TR).
                        string newTagLower = newTag.ToLowerInvariant();

                        // When a parent state exists, its element is the currently open tag.
                        string parentTag = String.Empty;
                        if (current.Parent != null) {
                            parentTag = current.Parent.Element.NodeName.ToLowerInvariant();
                        }

                        if (newTag == String.Empty) {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            string closeTag = GetCloseTag(current);

                            if (closeTag == String.Empty) {
                                // ignore empty tags
                                continue;
                            }

                            bool isProperClose = String.Equals(closeTag, parentTag, StringComparison.OrdinalIgnoreCase);
                            if (!isProperClose) {
                                // Locate a match for this closer further up the hierarchy.
                                IterationData actualParent = current.Parent;
                                while (actualParent != null
                                    && !String.Equals(actualParent.Element.NodeName, closeTag, StringComparison.OrdinalIgnoreCase)) {
                                    actualParent = actualParent.Parent;
                                }
                                if (actualParent == null) {
                                    // No open ancestor matches: ignore the stray closer.
                                    current.Invalid = true;
                                    continue;
                                }
                            }

                            // Element is closed. Only elements whose state has no parent are yielded;
                            // nested ones were already appended to their parent element.
                            if (current.Parent.Parent == null) {
                                yield return current.Parent.Element;
                            }
                            current.Finished = true;
                            // On a mismatched closer the parent resumes at the same tag, so the
                            // closer is re-examined one level up until the match is reached.
                            current.Parent.Reset(isProperClose ? current.Pos : tagStartPos);
                            continue;
                        }

                        // Before we keep going, see if this opener implicitly closes its parent
                        // (same tag repeated, e.g. li). Both sides are compared lowercased.
                        if (parentTag != String.Empty) {
                            if (TagHasImplicitClose(parentTag, newTagLower) && parentTag == newTagLower) {
                                if (current.Parent.Parent == null) {
                                    yield return current.Parent.Element;
                                }
                                current.Parent.Reset(tagStartPos);
                                current.Finished = true;
                                continue;
                            }
                        }

                        // Seems to be a new tag. Parse it.
                        IDomSpecialElement specialElement = null;
                        if (newTagLower[0] == '!') {
                            if (newTagLower.StartsWith("!doctype", StringComparison.Ordinal)) {
                                specialElement = new DomDocumentType();
                                current.Object = specialElement;
                            } else if (newTagLower.StartsWith("![cdata[", StringComparison.Ordinal)) {
                                specialElement = new DomCData();
                                current.Object = specialElement;
                                current.Pos = tagStartPos + 9;
                            } else {
                                specialElement = new DomComment();
                                current.Object = specialElement;
                                if (newTagLower.StartsWith("!--", StringComparison.Ordinal)) {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    current.Pos = tagStartPos + 4;
                                } else {
                                    current.Pos = tagStartPos + 1;
                                }
                            }
                        } else {
                            current.Object = new DomElement(newTag);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed) {
                                current.ReadTextOnly = true;
                                current.Step = 0;
                            }
                        }

                        if (current.Object is IDomSpecialElement) {
                            // Non-element types carry their data inside the tag construct.
                            string endTag = (current.Object is IDomComment && ((IDomComment)current.Object).IsQuoted) ? "-->" : ">";
                            int tagEndPos = BaseHtml.Seek(endTag, current.Pos);
                            if (tagEndPos < 0) {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = BaseHtml.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0) {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = BaseHtml.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        } else {
                            // Parse attribute data until GetTagAttribute reports there is no more.
                            while (current.Pos <= EndPos) {
                                if (!GetTagAttribute(current)) {
                                    break;
                                }
                            }
                        }

                        bool hasChildren = MoveOutsideTag(current);

                        // If there are children, push this state back on the stack and continue with
                        // a new child state from this position. Children attach themselves to the
                        // parent element as they are created, which avoids recursion.
                        if (current.Parent != null) {
                            current.Parent.Element.AppendChild(current.Object);
                        } else if (!hasChildren) {
                            yield return current.Object;
                        }

                        if (!hasChildren) {
                            current.Reset();
                            continue;
                        }

                        stack.Push(current);

                        IterationData subItem = new IterationData();
                        subItem.Parent = current;
                        subItem.AllowLiterals = true;
                        subItem.Reset(current.Pos);
                        subItem.ReadTextOnly = current.ReadTextOnly;
                        current = subItem;
                        break;
                }
            }

            // Catchall for unclosed tags: an unfinished state here means input ran out first.
            // Wrap up any straggling text, then close the open parent.
            if (!current.Finished) {
                if (current.Pos > current.HtmlStart) {
                    IDomObject trailingLiteral = GetLiteral(current);
                    if (trailingLiteral != null) {
                        yield return trailingLiteral;
                    }
                }
                if (current.Parent != null) {
                    if (current.Parent.Parent == null) {
                        yield return current.Parent.Element;
                    }
                    current.Parent.Reset(current.Pos);
                    current.Finished = true;
                }
            }
        }
        pos = current.Pos;
    }
}
/// <summary>
/// Parses <c>Html</c> from position 0 through <c>EndPos</c> and lazily yields DOM objects as
/// they are completed. Open elements are tracked with an explicit stack of
/// <c>IterationData</c> states instead of recursion.
/// </summary>
/// <remarks>
/// Behaviour depends on members read here: <c>WrapRootTextNodes</c> (sets
/// <c>WrapLiterals</c> on root states), <c>IsDocument</c> (wraps a leading non-html element in
/// a generated html element), <c>GenerateOptionalElements</c> (allows
/// <c>AddNewParent</c> when <c>SpecialTagActionDelegate</c> requests one).
/// </remarks>
/// <returns>
/// A deferred sequence of the objects produced by <c>TryGetLiteral</c>,
/// <c>FinishTagOpener</c> and <c>CloseElement</c>, plus elements whose state has no parent,
/// yielded when they are closed.
/// </returns>
protected IEnumerable<IDomObject> ParseImplementation()
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos) {
        IterationData current = new IterationData();
        if (WrapRootTextNodes) {
            current.WrapLiterals = true;
        }
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0) {
            current = stack.Pop();

            while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos) {
                switch (current.TokenizerState) {
                    case TokenizerState.Default:
                        if (current.FindNextTag(Html)) {
                            // Even if we fell through from a text-only block that was never
                            // closed, proceed to tag handling.
                            current.TokenizerState = TokenizerState.TagStart;
                        }
                        break;

                    case TokenizerState.TagStart:
                        IDomObject literal;
                        if (current.TryGetLiteral(this, out literal)) {
                            yield return literal;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = current.GetTagOpener(Html);

                        if (newTag == String.Empty) {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));
                            bool isProperClose = closeTagId == current.ParentTagID();

                            if (closeTagId == 0) {
                                // ignore empty tags
                                continue;
                            }

                            if (!isProperClose) {
                                // Locate a match for this closer further up the hierarchy.
                                IterationData actualParent = current.Parent;
                                while (actualParent != null && actualParent.Element.NodeNameID != closeTagId) {
                                    actualParent = actualParent.Parent;
                                }
                                if (actualParent == null) {
                                    // No open ancestor matches: ignore the stray closer.
                                    current.InsertionMode = InsertionMode.Invalid;
                                    continue;
                                }
                            }

                            // Element is closed. Only elements whose state has no parent are yielded.
                            if (current.Parent.Parent == null) {
                                yield return current.Parent.Element;
                            }
                            current.TokenizerState = TokenizerState.Finished;
                            // On a mismatched closer the parent resumes at the same tag, so the
                            // closer is re-examined one level up until the match is reached.
                            current.Parent.Reset(isProperClose ? current.Pos : tagStartPos);
                            continue;
                        } else if (newTag[0] == '!') {
                            // Doctype, CDATA or comment. Ordinal comparisons: markup prefixes are
                            // not linguistic text, and no upper-cased copy is needed.
                            IDomSpecialElement specialElement = null;
                            if (newTag.StartsWith("!DOCTYPE", StringComparison.OrdinalIgnoreCase)) {
                                specialElement = new DomDocumentType();
                                current.Element = specialElement;
                            } else if (newTag.StartsWith("![CDATA[", StringComparison.OrdinalIgnoreCase)) {
                                specialElement = new DomCData();
                                current.Element = specialElement;
                                current.Pos = tagStartPos + 9;
                            } else {
                                specialElement = new DomComment();
                                current.Element = specialElement;
                                if (newTag.StartsWith("!--", StringComparison.Ordinal)) {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    current.Pos = tagStartPos + 4;
                                } else {
                                    current.Pos = tagStartPos + 1;
                                }
                            }

                            string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";
                            int tagEndPos = Html.Seek(endTag, current.Pos);
                            if (tagEndPos < 0) {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0) {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        } else {
                            // Seems to be a new element tag, parse it.
                            ushort newTagId = HtmlData.Tokenize(newTag);
                            ushort parentTagId = current.ParentTagID();
                            int lastPos = current.Pos;

                            // No open parent in document mode: wrap anything but html in a
                            // generated html element.
                            if (parentTagId == 0 && IsDocument && newTagId != HtmlData.tagHTML) {
                                current.Element = DomElement.Create(HtmlData.tagHTML);
                                current = current.AddNewChild();
                                parentTagId = HtmlData.tagHTML;
                            }

                            if (parentTagId != 0) {
                                // Ask whether this opener implicitly closes the parent or needs a
                                // generated parent; repeat up the chain while closes are requested.
                                ushort action = SpecialTagActionDelegate(parentTagId, newTagId);
                                while (action != HtmlData.tagActionNothing) {
                                    if (action == HtmlData.tagActionClose) {
                                        // Track the next parent up the chain.
                                        IterationData newNode = current.Parent;

                                        // Treat like a close tag for the parent.
                                        if (newNode.Parent == null) {
                                            yield return newNode.Element;
                                        }
                                        current.TokenizerState = TokenizerState.Finished;

                                        if (newNode.Parent != null && newNode.Parent.Element != null) {
                                            action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                            if (action != HtmlData.tagActionNothing) {
                                                current = newNode;
                                            }
                                        } else {
                                            action = HtmlData.tagActionNothing;
                                        }
                                    } else {
                                        // Any other action value is passed to AddNewParent.
                                        if (GenerateOptionalElements) {
                                            stack.Push(current);
                                            current = current.AddNewParent(action, lastPos);
                                        }
                                        action = HtmlData.tagActionNothing;
                                    }
                                }

                                if (current.TokenizerState == TokenizerState.Finished) {
                                    // Implicitly closed: the parent re-reads this same opener.
                                    current.Parent.Reset(tagStartPos);
                                    continue;
                                }
                            }

                            current.Element = DomElement.Create(newTagId);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed) {
                                current.InsertionMode = InsertionMode.Text;
                                current.TokenizerState = TokenizerState.Default;
                            }

                            // Parse attribute data until GetTagAttribute reports there is no more.
                            while (current.Pos <= EndPos) {
                                if (!current.GetTagAttribute(Html)) {
                                    break;
                                }
                            }
                        }

                        // When FinishTagOpener returns true, keep this state on the stack and
                        // continue with a child state.
                        IDomObject el;
                        if (current.FinishTagOpener(Html, out el)) {
                            stack.Push(current);
                            current = current.AddNewChild();
                        }
                        if (el != null) {
                            yield return el;
                        }
                        break;
                }
            }

            // Catchall for unclosed tags: an unfinished state here means input ran out first.
            // CloseElement wraps up straggling text and closes what is still open.
            if (current.TokenizerState != TokenizerState.Finished) {
                foreach (var closed in current.CloseElement(this)) {
                    yield return closed;
                }
            }
        }
        pos = current.Pos;
    }
}