/// <summary>
/// Builds a text node from the source text lying between <c>current.HtmlStart</c> and
/// <c>current.Pos</c>. The node is wrapped in a <c>span</c> element when
/// <c>current.AllowLiterals</c> is false.
/// </summary>
/// <param name="current">
/// The iteration state describing the pending text. It is reset when the node is attached
/// to a parent, or flagged as finished when it is not.
/// </param>
/// <returns>
/// Null when the node was appended to the parent's element; otherwise the node (or its
/// wrapping span) so that the caller can emit it.
/// </returns>
protected IDomObject GetLiteral(IterationData current)
{
    // Choose the concrete text node type from the state flags.
    DomText textNode;
    if (current.Invalid)
    {
        textNode = new DomInvalidElement();
    }
    else if (!current.ReadTextOnly)
    {
        textNode = new DomText();
    }
    else
    {
        // The text-only flag is consumed by the node it produces.
        current.ReadTextOnly = false;
        textNode = new DomInnerText();
    }

    // Unbound: store the decoded text on the node. Bound: hand the node a token
    // produced by the document for the same character range.
    if (!isBound)
    {
        textNode.NodeValue = Objects.HtmlDecode(BaseHtml.SubstringBetween(current.HtmlStart, current.Pos));
    }
    else
    {
        textNode.SetTextIndex(Document, Document.TokenizeString(current.HtmlStart, current.Pos - current.HtmlStart));
    }

    IDomObject result = textNode;
    if (!current.AllowLiterals)
    {
        IDomElement span = new DomElement("span");
        span.AppendChild(textNode);
        result = span;
    }

    if (current.Parent == null)
    {
        // No parent to attach to: the caller receives the node directly.
        current.Finished = true;
        return result;
    }

    current.Parent.Element.AppendChild(result);
    current.Reset();
    return null;
}
/// <summary>
/// Creates an IterationData whose parent is this instance and returns it.
/// </summary>
/// <param name="pos">
/// The value assigned to the child's <c>Pos</c>.
/// </param>
/// <returns>
/// The new child. It shares this instance's <c>InsertionMode</c>.
/// </returns>
public IterationData AddNewChild(int pos)
{
    IterationData child = new IterationData();
    child.Parent = this;
    child.Pos = pos;
    // NOTE(review): HtmlStart is taken from this (parent) instance's Pos, not from the
    // pos argument - confirm that this is intended when the two differ.
    child.HtmlStart = this.Pos;
    child.InsertionMode = this.InsertionMode;
    return child;
}
/// <summary>
/// Scans forward from <c>current.Pos</c> to the next '&lt;', skips any spaces after it, and
/// reads up to (not including) the first character for which <c>isHtmlTagEnd</c> is true.
/// On success <c>current.Pos</c> is left on that stop character.
/// </summary>
/// <param name="current">The iteration state whose position is advanced.</param>
/// <returns>
/// The trimmed text between the '&lt;' and the stop character, or an empty string when the
/// input ends before a stop character is found.
/// </returns>
protected string GetTagOpener(IterationData current)
{
    // Phase 1: advance to the opening caret.
    while (current.Pos <= EndPos && BaseHtml[current.Pos] != '<')
    {
        current.Pos++;
    }
    if (current.Pos > EndPos)
    {
        return String.Empty;
    }

    int nameStart = current.Pos + 1;
    current.Pos++;

    // Phase 2: skip spaces between the caret and the name.
    while (current.Pos <= EndPos && BaseHtml[current.Pos] == ' ')
    {
        current.Pos++;
    }

    // Phase 3: read until a tag-end character.
    while (current.Pos <= EndPos)
    {
        if (isHtmlTagEnd(BaseHtml[current.Pos]))
        {
            return BaseHtml.SubstringBetween(nameStart, current.Pos).Trim();
        }
        current.Pos++;
    }

    return String.Empty;
}
/// <summary>
/// Moves <c>current.Pos</c> to the character following the next '&gt;' (or past
/// <c>EndPos</c> when there is none) and sets <c>current.HtmlStart</c> to the position
/// after the one <c>current.Pos</c> held on entry.
/// </summary>
/// <param name="current">The iteration state for the tag being opened.</param>
/// <returns>
/// True when a '&gt;' was found, it is not immediately preceded by '/', and
/// <c>current.Object</c> allows inner HTML or inner text; false otherwise.
/// </returns>
protected bool MoveOutsideTag(IterationData current)
{
    int caretIndex = CharIndexOf(BaseHtml, '>', current.Pos);
    current.HtmlStart = current.Pos + 1;

    if (caretIndex <= 0)
    {
        // No usable closing caret: consume the remainder of the input.
        current.Pos = EndPos + 1;
        return false;
    }

    current.Pos = caretIndex + 1;

    // A '/' directly before the caret marks a self-closed tag.
    if (BaseHtml[caretIndex - 1] == '/')
    {
        return false;
    }

    return current.Object.InnerHtmlAllowed || current.Object.InnerTextAllowed;
}
/// <summary>
/// Reads the name of a closing tag starting at <c>current.Pos</c>, then advances
/// <c>current.Pos</c> past the next '&gt;' (or past <c>EndPos</c> if none is found).
/// </summary>
/// <param name="current">The iteration state whose position is advanced.</param>
/// <returns>
/// The tag name, or an empty string when the input ends before a name has been started
/// and terminated.
/// </returns>
protected string GetCloseTag(IterationData current)
{
    // Skip forward to the first character that can start a tag name.
    while (current.Pos <= EndPos
        && !CharacterData.IsType(BaseHtml[current.Pos], CharacterType.HtmlTagNameStart))
    {
        current.Pos++;
    }
    if (current.Pos > EndPos)
    {
        return "";
    }

    int nameStart = current.Pos;
    current.Pos++;

    // Consume the remaining name characters.
    while (current.Pos <= EndPos
        && CharacterData.IsType(BaseHtml[current.Pos], CharacterType.HtmlTagNameExceptStart))
    {
        current.Pos++;
    }
    if (current.Pos > EndPos)
    {
        // The name ran to the end of the input without a terminator: no name is reported.
        return "";
    }

    string name = BaseHtml.SubstringBetween(nameStart, current.Pos);

    // Move past the closing caret.
    while (current.Pos <= EndPos)
    {
        char ch = BaseHtml[current.Pos];
        current.Pos++;
        if (ch == '>')
        {
            break;
        }
    }

    return name ?? "";
}
/// <summary>
/// Returns the text from <c>current.Pos</c> up to the next '&lt;' and moves
/// <c>current.Pos</c> onto that '&lt;'. When there is no further '&lt;', returns the rest of
/// <c>BaseHtml</c> and moves <c>current.Pos</c> to <c>BaseHtml.Length</c>.
/// </summary>
/// <param name="current">The iteration state whose position is advanced.</param>
/// <returns>
/// The text that was skipped, or an empty string when <c>current.Pos</c> is already on a '&lt;'.
/// </returns>
protected string GetOpenText(IterationData current)
{
    int textStart = current.Pos;
    int nextCaret = CharIndexOf(BaseHtml, '<', textStart);

    if (nextCaret > textStart)
    {
        current.Pos = nextCaret;
        return BaseHtml.SubstringBetween(textStart, nextCaret);
    }

    if (nextCaret != -1)
    {
        // Already positioned on an opening caret: there is no text to return.
        return String.Empty;
    }

    current.Pos = BaseHtml.Length;
    return BaseHtml.SubstringBetween(textStart, current.Pos);
}
/// <summary>
/// Finds the next '&lt;' at or after <c>current.Pos</c>, skips spaces following it, and
/// collects characters until <c>isHtmlTagEnd</c> reports a stop character. On success
/// <c>current.Pos</c> rests on that stop character.
/// </summary>
/// <param name="current">The iteration state whose position is advanced.</param>
/// <returns>
/// The trimmed text found after the '&lt;', or an empty string when the end of the input is
/// reached first.
/// </returns>
protected string GetTagOpener(IterationData current)
{
    const int SeekCaret = 0;
    const int SkipSpaces = 1;
    const int ReadName = 2;

    int phase = SeekCaret;
    int nameStart = -1;

    while (current.Pos <= EndPos)
    {
        char ch = BaseHtml[current.Pos];

        if (phase == SeekCaret)
        {
            if (ch == '<')
            {
                nameStart = current.Pos + 1;
                phase = SkipSpaces;
            }
            current.Pos++;
        }
        else if (phase == SkipSpaces)
        {
            // Spaces after the caret are skipped; anything else is re-examined as name text.
            if (ch == ' ')
            {
                current.Pos++;
            }
            else
            {
                phase = ReadName;
            }
        }
        else
        {
            if (isHtmlTagEnd(ch))
            {
                return BaseHtml.SubstringBetween(nameStart, current.Pos).Trim();
            }
            current.Pos++;
        }
    }

    return String.Empty;
}
/// <summary>
/// Reads a single attribute (<c>name</c>, <c>name=value</c>, <c>name="value"</c> or
/// <c>name='value'</c>) starting at <c>current.Pos</c> and applies it to
/// <c>current.Element</c>.
/// Start: position inside a tag opening construct.
/// End: position after the last character of the attribute when one was read; when no
/// attribute name is found the scan stops on the character for which <c>isTagChar</c>
/// returned true, or past <c>EndPos</c>.
/// </summary>
/// <param name="current">The iteration state; its position is advanced and its element receives the attribute.</param>
/// <returns>
/// True if an attribute name was read (even when the attribute was not set because the
/// element already has a non-empty value for it); false if no attribute name was found.
/// </returns>
protected bool GetTagAttribute(IterationData current)
{
    bool finished = false;
    // Scanner state: 0 = seek name start, 1 = read name, 2 = seek '=',
    // 3 = seek value start, 4 = read value.
    int step = 0;
    string aName = null;
    string aValue = null;
    int nameStart = -1;
    int valStart = -1;
    bool isQuoted = false;
    char quoteChar = ' ';
    while (!finished && current.Pos <= EndPos)
    {
        char c = BaseHtml[current.Pos];
        switch (step)
        {
            case 0: // find name
                if (CharacterData.IsType(c, CharacterType.HtmlTagNameStart))
                {
                    step = 1;
                    nameStart = current.Pos;
                    current.Pos++;
                }
                else if (isTagChar(c))
                {
                    // Stop without consuming the character: there are no more attributes.
                    finished = true;
                }
                else
                {
                    current.Pos++;
                }
                break;
            case 1:
                // The name ends at the first character that cannot be part of a name; that
                // character is not consumed here and is re-examined in step 2.
                // NOTE(review): if the input ends while still in this step, aName is never
                // assigned and the method returns false.
                if (!CharacterData.IsType(c, CharacterType.HtmlTagNameExceptStart))
                {
                    step = 2;
                    aName = BaseHtml.SubstringBetween(nameStart, current.Pos);
                }
                else
                {
                    current.Pos++;
                }
                break;
            case 2: // find value
                switch(c)
                {
                    case '=':
                        step = 3;
                        current.Pos++;
                        break;
                    case ' ':
                        current.Pos++;
                        break;
                    default: // anything else means new attribute
                        finished = true;
                        break;
                }
                break;
            case 3: // find quote start
                switch(c)
                {
                    // NOTE(review): a backslash ends the attribute here; confirm whether
                    // '/' (self-closing tag) was intended instead.
                    case '\\':
                    case '>':
                        finished = true;
                        break;
                    case ' ':
                        current.Pos++;
                        break;
                    case '"':
                    case '\'':
                        // Quoted value: it starts after the quote and ends at the matching quote.
                        isQuoted = true;
                        valStart = current.Pos+1;
                        current.Pos++;
                        quoteChar = c;
                        step = 4;
                        break;
                    default:
                        // Unquoted value: it starts at this character, which step 4 re-examines.
                        valStart = current.Pos;
                        step = 4;
                        break;
                }
                // any non-whitespace is part of the attribute
                break;
            case 4: // parse the attribute until whitespace or closing quote
                if ((isQuoted && c == quoteChar) || (!isQuoted && isHtmlTagEnd(c)))
                {
                    aValue = BaseHtml.SubstringBetween(valStart, current.Pos);
                    if (isQuoted)
                    {
                        // Consume the closing quote; an unquoted terminator is left in place.
                        isQuoted = false;
                        current.Pos++;
                    }
                    finished = true;
                }
                else
                {
                    current.Pos++;
                }
                break;
        }
    }
    if (aName != null)
    {
        // 12-15-11 - don't replace a valid attribute with a bad one
        var curVal = current.Element.GetAttribute(aName);
        if (string.IsNullOrEmpty(curVal))
        {
            if (aValue == null)
            {
                // No value was read: set the attribute by name only.
                current.Element.SetAttribute(aName);
            }
            else
            {
                current.Element.SetAttribute(aName, aValue);
            }
        }
        return true;
    }
    else
    {
        return false;
    }
}
/// <summary>
/// Consumes plain text starting at <c>current.Pos</c>. The text runs up to the next '&lt;',
/// or to the end of <c>BaseHtml</c> when no '&lt;' remains; <c>current.Pos</c> is moved to
/// the end of the consumed text.
/// </summary>
/// <param name="current">The iteration state whose position is advanced.</param>
/// <returns>
/// The consumed text; empty when <c>current.Pos</c> already points at a '&lt;'.
/// </returns>
protected string GetOpenText(IterationData current)
{
    int caretIndex = CharIndexOf(BaseHtml, '<', current.Pos);
    string text = String.Empty;

    if (caretIndex > current.Pos)
    {
        int from = current.Pos;
        current.Pos = caretIndex;
        text = BaseHtml.SubstringBetween(from, caretIndex);
    }
    else if (caretIndex == -1)
    {
        // No further tags: everything that is left is text.
        int from = current.Pos;
        current.Pos = BaseHtml.Length;
        text = BaseHtml.SubstringBetween(from, current.Pos);
    }

    return text;
}
/// <summary>
/// Creates the text node for the characters between <c>current.HtmlStart</c> and
/// <c>current.Pos</c>, wrapping it in a <c>span</c> element when
/// <c>current.AllowLiterals</c> is false.
/// </summary>
/// <param name="current">
/// The iteration state that describes the pending text; it is reset after the node has
/// been attached to a parent, or marked finished when there is no parent.
/// </param>
/// <returns>
/// The created node (or its span wrapper) when there is no parent to attach it to;
/// otherwise null.
/// </returns>
protected IDomObject GetLiteral(IterationData current)
{
    DomText literal;
    if (current.Invalid)
    {
        literal = new DomInvalidElement();
    }
    else if (current.ReadTextOnly)
    {
        // The text-only flag applies to a single node, so clear it once used.
        current.ReadTextOnly = false;
        literal = new DomInnerText();
    }
    else
    {
        literal = new DomText();
    }

    if (isBound)
    {
        // Bound: the node is given a token obtained from the document for this range.
        literal.SetTextIndex(Document, Document.TokenizeString(current.HtmlStart, current.Pos - current.HtmlStart));
    }
    else
    {
        // Unbound: the decoded text is stored on the node itself.
        string raw = BaseHtml.SubstringBetween(current.HtmlStart, current.Pos);
        literal.NodeValue = Objects.HtmlDecode(raw);
    }

    IDomObject output;
    if (current.AllowLiterals)
    {
        output = literal;
    }
    else
    {
        IDomElement holder = new DomElement("span");
        holder.AppendChild(literal);
        output = holder;
    }

    if (current.Parent != null)
    {
        current.Parent.Element.AppendChild(output);
        current.Reset();
        return null;
    }

    current.Finished = true;
    return output;
}
/// <summary>
/// Extracts the name of a closing tag, scanning from <c>current.Pos</c>, and then moves
/// <c>current.Pos</c> beyond the following '&gt;' (or beyond <c>EndPos</c> when none exists).
/// </summary>
/// <param name="current">The iteration state whose position is advanced.</param>
/// <returns>
/// The tag name, or an empty string if the input ended before a complete name was read.
/// </returns>
protected string GetCloseTag(IterationData current)
{
    const int SeekName = 0;
    const int ReadName = 1;
    const int SeekCaret = 2;

    int phase = SeekName;
    int nameStart = 0;
    string tagName = null;
    bool done = false;

    while (!done && current.Pos <= EndPos)
    {
        char ch = BaseHtml[current.Pos];

        if (phase == SeekName)
        {
            if (CharacterData.IsType(ch, CharacterType.HtmlTagNameStart))
            {
                nameStart = current.Pos;
                phase = ReadName;
            }
            current.Pos++;
        }
        else if (phase == ReadName)
        {
            if (CharacterData.IsType(ch, CharacterType.HtmlTagNameExceptStart))
            {
                current.Pos++;
            }
            else
            {
                // Name terminator: keep the position so the next phase inspects it.
                tagName = BaseHtml.SubstringBetween(nameStart, current.Pos);
                phase = SeekCaret;
            }
        }
        else
        {
            done = ch == '>';
            current.Pos++;
        }
    }

    return tagName ?? "";
}
/// <summary>
/// Tokenizes <c>Html</c> from position 0 through <c>EndPos</c> and lazily yields DOM objects
/// as they are completed. Nesting is tracked with an explicit stack of
/// <c>IterationData</c> items instead of recursion.
/// </summary>
///
/// <returns>
/// An enumerator of the top-level elements.
/// </returns>
protected IEnumerable <IDomObject> ParseImplementation()
{
    int pos = 0;
    Stack <IterationData> stack = new Stack <IterationData>();
    // Each pass of this loop starts a new root-level item at the position where the
    // previous one stopped.
    while (pos <= EndPos)
    {
        IterationData current = new IterationData();
        if (WrapRootTextNodes)
        {
            current.WrapLiterals = true;
        }
        current.Reset(pos);
        stack.Push(current);
        while (stack.Count != 0)
        {
            current = stack.Pop();
            while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
            {
                // NOTE(review): c is not referenced anywhere in this method.
                char c = Html[current.Pos];
                switch (current.TokenizerState)
                {
                    case TokenizerState.Default:
                        if (current.FindNextTag(Html))
                        {
                            // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                            current.TokenizerState = TokenizerState.TagStart;
                        }
                        break;
                    case TokenizerState.TagStart:
                        IDomObject literal;
                        if (current.TryGetLiteral(this, out literal))
                        {
                            yield return(literal);
                        }
                        int tagStartPos = current.Pos;
                        string newTag = current.GetTagOpener(Html);
                        if (newTag == String.Empty)
                        {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));
                            // Ignore empty tags, or closing tags found when no parent is open
                            bool isProperClose = closeTagId == current.ParentTagID();
                            if (closeTagId == 0)
                            {
                                // ignore empty tags
                                continue;
                            }
                            else
                            {
                                // locate match for this closer up the hierarchy
                                IterationData actualParent = null;
                                if (!isProperClose)
                                {
                                    actualParent = current.Parent;
                                    while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                    {
                                        actualParent = actualParent.Parent;
                                    }
                                }
                                // if no matching close tag was found up the tree, ignore it
                                // otherwise always close this and repeat at the same position until the match is found
                                if (!isProperClose && actualParent == null)
                                {
                                    current.InsertionMode = InsertionMode.Invalid;
                                    continue;
                                }
                            }
                            // element is closed
                            if (current.Parent.Parent == null)
                            {
                                // The parent is a root-level item, so its element is emitted now.
                                yield return(current.Parent.Element);
                            }
                            current.TokenizerState = TokenizerState.Finished;
                            if (isProperClose)
                            {
                                // Resume the parent after the closing tag.
                                current.Parent.Reset(current.Pos);
                            }
                            else
                            {
                                // Resume the parent AT the closing tag so it is processed again
                                // one level up.
                                current.Parent.Reset(tagStartPos);
                            }
                            // already been returned before we added the children
                            continue;
                        }
                        else if (newTag[0] == '!')
                        {
                            // Doctype, CDATA or comment.
                            IDomSpecialElement specialElement = null;
                            string newTagUpper = newTag.ToUpper();
                            if (newTagUpper.StartsWith("!DOCTYPE"))
                            {
                                specialElement = new DomDocumentType();
                                current.Element = specialElement;
                            }
                            else if (newTagUpper.StartsWith("![CDATA["))
                            {
                                specialElement = new DomCData();
                                current.Element = specialElement;
                                // 9 is the length of "<![CDATA["
                                current.Pos = tagStartPos + 9;
                            }
                            else
                            {
                                specialElement = new DomComment();
                                current.Element = specialElement;
                                if (newTag.StartsWith("!--"))
                                {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    // 4 is the length of "<!--"
                                    current.Pos = tagStartPos + 4;
                                }
                                else
                                {
                                    current.Pos = tagStartPos + 1;
                                }
                            }
                            // Quoted comments end at "-->", everything else at the next ">".
                            string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";
                            int tagEndPos = Html.Seek(endTag, current.Pos);
                            if (tagEndPos < 0)
                            {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0)
                            {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        }
                        else
                        {
                            // seems to be a new element tag, parse it.
                            ushort newTagId = HtmlData.Tokenize(newTag);
                            // Before we keep going see if this is an implicit close
                            ushort parentTagId = current.ParentTagID();
                            int lastPos = current.Pos;
                            if (parentTagId == 0 && IsDocument)
                            {
                                // The parent tag ID is 0 in a document and the tag is not HTML:
                                // create an HTML element first and continue inside a child of it.
                                if (newTagId != HtmlData.tagHTML)
                                {
                                    current.Element = DomElement.Create(HtmlData.tagHTML);
                                    current = current.AddNewChild();
                                    parentTagId = HtmlData.tagHTML;
                                }
                            }
                            if (parentTagId != 0)
                            {
                                ushort action = SpecialTagActionDelegate(parentTagId, newTagId);
                                while (action != HtmlData.tagActionNothing)
                                {
                                    if (action == HtmlData.tagActionClose)
                                    {
                                        // track the next parent up the chain
                                        // NOTE(review): this conditional always evaluates to
                                        // current.Parent; the next statement dereferences
                                        // current.Parent without a null check.
                                        var newNode = (current.Parent != null) ? current.Parent : null;
                                        // same tag for a repeater like li occurred - treat like a close tag
                                        if (current.Parent.Parent == null)
                                        {
                                            yield return(current.Parent.Element);
                                        }
                                        current.TokenizerState = TokenizerState.Finished;
                                        //current.Parent.Reset(tagStartPos);
                                        if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                        {
                                            // Ask again one level up; keep closing while an action is returned.
                                            action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                            if (action != HtmlData.tagActionNothing)
                                            {
                                                current = newNode;
                                            }
                                        }
                                        else
                                        {
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    else
                                    {
                                        // Any other action value is passed to AddNewParent when
                                        // GenerateOptionalElements is set.
                                        if (GenerateOptionalElements)
                                        {
                                            stack.Push(current);
                                            current = current.AddNewParent(action, lastPos);
                                        }
                                        action = HtmlData.tagActionNothing;
                                    }
                                }
                                if (current.TokenizerState == TokenizerState.Finished)
                                {
                                    // An implicit close happened: the parent resumes at this tag.
                                    current.Parent.Reset(tagStartPos);
                                    continue;
                                }
                            }
                            current.Element = DomElement.Create(newTagId);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                            {
                                // Element that allows inner text but not inner HTML.
                                current.InsertionMode = InsertionMode.Text;
                                current.TokenizerState = TokenizerState.Default;
                            }
                            // Parse attribute data
                            while (current.Pos <= EndPos)
                            {
                                if (!current.GetTagAttribute(Html))
                                {
                                    break;
                                }
                            }
                        }
                        IDomObject el;
                        if (current.FinishTagOpener(Html, out el))
                        {
                            // Keep this item on the stack and continue inside a new child of it.
                            stack.Push(current);
                            current = current.AddNewChild();
                        }
                        if (el != null)
                        {
                            yield return(el);
                        }
                        break;
                }
            }
            // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because top-level tag was unclosed.
            // This will wrap up any straggling text and close any open tags after it.
            if (current.TokenizerState != TokenizerState.Finished)
            {
                foreach (var el in current.CloseElement(this))
                {
                    yield return(el);
                }
            }
        }
        pos = current.Pos;
    }
}
/// <summary>
/// Reads a single attribute (<c>name</c>, <c>name=value</c>, <c>name="value"</c> or
/// <c>name='value'</c>) starting at <c>current.Pos</c> and applies it to
/// <c>current.Element</c>.
/// Start: position inside a tag opening construct.
/// End: position after the last character of the attribute when one was read; when no
/// attribute name is found the scan stops on the character for which <c>isTagChar</c>
/// returned true, or past <c>EndPos</c>.
/// </summary>
/// <param name="current">The iteration state; its position is advanced and its element receives the attribute.</param>
/// <returns>
/// True if an attribute name was read (even when the attribute was not set because the
/// element already has a non-empty value for it); false if no attribute name was found.
/// </returns>
protected bool GetTagAttribute(IterationData current)
{
    bool finished = false;
    // Scanner state: 0 = seek name start, 1 = read name, 2 = seek '=',
    // 3 = seek value start, 4 = read value.
    int step = 0;
    string aName = null;
    string aValue = null;
    int nameStart = -1;
    int valStart = -1;
    bool isQuoted = false;
    char quoteChar = ' ';
    while (!finished && current.Pos <= EndPos)
    {
        char c = BaseHtml[current.Pos];
        switch (step)
        {
            case 0: // find name
                if (CharacterData.IsType(c, CharacterType.HtmlTagNameStart))
                {
                    step = 1;
                    nameStart = current.Pos;
                    current.Pos++;
                }
                else if (isTagChar(c))
                {
                    // Stop without consuming the character: there are no more attributes.
                    finished = true;
                }
                else
                {
                    current.Pos++;
                }
                break;
            case 1:
                // The name ends at the first character that cannot be part of a name; that
                // character is not consumed here and is re-examined in step 2.
                // NOTE(review): if the input ends while still in this step, aName is never
                // assigned and the method returns false.
                if (!CharacterData.IsType(c, CharacterType.HtmlTagNameExceptStart))
                {
                    step = 2;
                    aName = BaseHtml.SubstringBetween(nameStart, current.Pos);
                }
                else
                {
                    current.Pos++;
                }
                break;
            case 2: // find value
                switch (c)
                {
                    case '=':
                        step = 3;
                        current.Pos++;
                        break;
                    case ' ':
                        current.Pos++;
                        break;
                    default: // anything else means new attribute
                        finished = true;
                        break;
                }
                break;
            case 3: // find quote start
                switch (c)
                {
                    // NOTE(review): a backslash ends the attribute here; confirm whether
                    // '/' (self-closing tag) was intended instead.
                    case '\\':
                    case '>':
                        finished = true;
                        break;
                    case ' ':
                        current.Pos++;
                        break;
                    case '"':
                    case '\'':
                        // Quoted value: it starts after the quote and ends at the matching quote.
                        isQuoted = true;
                        valStart = current.Pos + 1;
                        current.Pos++;
                        quoteChar = c;
                        step = 4;
                        break;
                    default:
                        // Unquoted value: it starts at this character, which step 4 re-examines.
                        valStart = current.Pos;
                        step = 4;
                        break;
                }
                // any non-whitespace is part of the attribute
                break;
            case 4: // parse the attribute until whitespace or closing quote
                if ((isQuoted && c == quoteChar) || (!isQuoted && isHtmlTagEnd(c)))
                {
                    aValue = BaseHtml.SubstringBetween(valStart, current.Pos);
                    if (isQuoted)
                    {
                        // Consume the closing quote; an unquoted terminator is left in place.
                        isQuoted = false;
                        current.Pos++;
                    }
                    finished = true;
                }
                else
                {
                    current.Pos++;
                }
                break;
        }
    }
    if (aName != null)
    {
        // 12-15-11 - don't replace a valid attribute with a bad one
        var curVal = current.Element.GetAttribute(aName);
        if (string.IsNullOrEmpty(curVal))
        {
            if (aValue == null)
            {
                // No value was read: set the attribute by name only.
                current.Element.SetAttribute(aName);
            }
            else
            {
                current.Element.SetAttribute(aName, aValue);
            }
        }
        return(true);
    }
    else
    {
        return(false);
    }
}
/// <summary>
/// Tokenizes <c>BaseHtml</c> from position 0 through <c>EndPos</c> and lazily yields
/// root-level DOM objects as they are completed. Nesting is tracked with an explicit stack
/// of <c>IterationData</c> items instead of recursion: children append themselves to their
/// parent's element, and a parent is yielded once it is closed and has no parent itself.
/// </summary>
/// <param name="allowLiterals">
/// Assigned to <c>AllowLiterals</c> of every root-level item; child items always get
/// <c>AllowLiterals = true</c>.
/// </param>
/// <returns>A lazily evaluated sequence of root-level DOM objects.</returns>
protected IEnumerable <IDomObject> Parse(bool allowLiterals)
{
    int pos = 0;
    Stack <IterationData> stack = new Stack <IterationData>();
    // Each pass of this loop starts a new root-level item at the position where the
    // previous one stopped.
    while (pos <= EndPos)
    {
        IterationData current = new IterationData();
        current.AllowLiterals = allowLiterals;
        current.Reset(pos);
        stack.Push(current);
        while (stack.Count != 0)
        {
            current = stack.Pop();
            //Debug.Assert(current.Object == null);
            while (!current.Finished && current.Pos <= EndPos)
            {
                // NOTE(review): c is not referenced anywhere in this method.
                char c = BaseHtml[current.Pos];
                // Step 0 = locate the next tag; step 1 = emit pending text, then handle the tag.
                switch (current.Step)
                {
                    case 0:
                        current.Pos = CharIndexOf(BaseHtml, '<', current.Pos);
                        if (current.Pos < 0)
                        {
                            // done - no new tags found
                            current.Pos = EndPos + 1;
                        }
                        else
                        {
                            // deal with when we're in a literal block (script/textarea)
                            if (current.ReadTextOnly)
                            {
                                int endPos = current.Pos;
                                while (endPos >= 0)
                                {
                                    // keep going until we find the closing tag for this element
                                    int caretPos = CharIndexOf(BaseHtml, '>', endPos + 1);
                                    if (caretPos > 0)
                                    {
                                        // NOTE(review): tag is lower-cased but NodeName is used as-is
                                        // here, while other comparisons in this method lower-case
                                        // NodeName - confirm NodeName is always lower case.
                                        string tag = BaseHtml.SubstringBetween(endPos + 1, caretPos).Trim().ToLower();
                                        if (tag == "/" + current.Parent.Element.NodeName)
                                        {
                                            // this is the end tag -- exit the block
                                            current.Pos = endPos;
                                            break;
                                        }
                                    }
                                    endPos = CharIndexOf(BaseHtml, '<', endPos + 1);
                                }
                            }
                            // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                            current.Step = 1;
                        }
                        break;
                    case 1:
                        // Text accumulated before this tag is turned into a literal first; the tag
                        // itself is handled on the next pass of the loop.
                        if (current.Pos > current.HtmlStart)
                        {
                            IDomObject literal = GetLiteral(current);
                            if (literal != null)
                            {
                                yield return(literal);
                            }
                            continue;
                        }
                        int tagStartPos = current.Pos;
                        string newTag;
                        newTag = GetTagOpener(current);
                        string newTagLower = newTag.ToLower();
                        // when Element exists, it's because a previous iteration created it: it's our parent
                        string parentTag = String.Empty;
                        if (current.Parent != null)
                        {
                            parentTag = current.Parent.Element.NodeName.ToLower();
                        }
                        if (newTag == String.Empty)
                        {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            string closeTag = GetCloseTag(current);
                            // Ignore empty tags, or closing tags found when no parent is open
                            bool isProperClose = closeTag.ToLower() == parentTag;
                            if (closeTag == String.Empty)
                            {
                                // ignore empty tags
                                continue;
                            }
                            else
                            {
                                // locate match for this closer up the hierarchy
                                IterationData actualParent = null;
                                if (!isProperClose)
                                {
                                    actualParent = current.Parent;
                                    while (actualParent != null && actualParent.Element.NodeName.ToLower() != closeTag.ToLower())
                                    {
                                        actualParent = actualParent.Parent;
                                    }
                                }
                                // if no matching close tag was found up the tree, ignore it
                                // otherwise always close this and repeat at the same position until the match is found
                                if (!isProperClose && actualParent == null)
                                {
                                    // The stray closer becomes part of the pending text, which
                                    // GetLiteral turns into a DomInvalidElement.
                                    current.Invalid = true;
                                    continue;
                                }
                            }
                            // element is closed
                            if (current.Parent.Parent == null)
                            {
                                // The parent is a root-level item, so its element is emitted now.
                                yield return(current.Parent.Element);
                            }
                            current.Finished = true;
                            if (isProperClose)
                            {
                                // Resume the parent after the closing tag.
                                current.Parent.Reset(current.Pos);
                            }
                            else
                            {
                                // Resume the parent AT the closing tag so it is processed again
                                // one level up.
                                current.Parent.Reset(tagStartPos);
                            }
                            // already been returned before we added the children
                            continue;
                        }
                        // Before we keep going see if this is an implicit close
                        if (parentTag != String.Empty)
                        {
                            // NOTE(review): parentTag is lower-cased but newTag is not, so an
                            // upper-case repeated tag would not match here - confirm intent.
                            if (TagHasImplicitClose(parentTag, newTag) && parentTag == newTag)
                            {
                                // same tag for a repeater like li occurred - treat like a close tag
                                if (current.Parent.Parent == null)
                                {
                                    yield return(current.Parent.Element);
                                }
                                current.Parent.Reset(tagStartPos);
                                current.Finished = true;
                                continue;
                            }
                        }
                        // seems to be a new tag. Parse it
                        IDomSpecialElement specialElement = null;
                        if (newTagLower[0] == '!')
                        {
                            if (newTagLower.StartsWith("!doctype"))
                            {
                                specialElement = new DomDocumentType();
                                current.Object = specialElement;
                            }
                            else if (newTagLower.StartsWith("![cdata["))
                            {
                                specialElement = new DomCData();
                                current.Object = specialElement;
                                // 9 is the length of "<![CDATA["
                                current.Pos = tagStartPos + 9;
                            }
                            else
                            {
                                specialElement = new DomComment();
                                current.Object = specialElement;
                                if (newTagLower.StartsWith("!--"))
                                {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    // 4 is the length of "<!--"
                                    current.Pos = tagStartPos + 4;
                                }
                                else
                                {
                                    current.Pos = tagStartPos + 1;
                                }
                            }
                        }
                        else
                        {
                            current.Object = new DomElement(newTag);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                            {
                                // Element that allows inner text but not inner HTML: its content is
                                // read as raw text up to the matching closing tag (see case 0).
                                current.ReadTextOnly = true;
                                current.Step = 0;
                            }
                        }
                        // Handle non-element/text types -- they have data inside the tag construct
                        if (current.Object is IDomSpecialElement)
                        {
                            // Quoted comments end at "-->", everything else at the next ">".
                            string endTag = (current.Object is IDomComment && ((IDomComment)current.Object).IsQuoted) ? "-->" : ">";
                            int tagEndPos = BaseHtml.Seek(endTag, current.Pos);
                            if (tagEndPos < 0)
                            {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = BaseHtml.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0)
                            {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = BaseHtml.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        }
                        else
                        {
                            // Parse attribute data
                            while (current.Pos <= EndPos)
                            {
                                if (!GetTagAttribute(current))
                                {
                                    break;
                                }
                            }
                        }
                        bool hasChildren = MoveOutsideTag(current);
                        // tricky part: if there are children, push ourselves back on the stack and start with a new object
                        // from this position. The children will add themselves as they are created, avoiding recursion.
                        // When the close tag is found, the parent will be yielded if it's a root element.
                        // I think there's a slightly better way to do this, capturing all the yield logic at the end of the
                        // stack but it works for now.
                        if (current.Parent != null)
                        {
                            current.Parent.Element.AppendChild(current.Object);
                        }
                        else if (!hasChildren)
                        {
                            yield return(current.Object);
                        }
                        if (!hasChildren)
                        {
                            current.Reset();
                            continue;
                        }
                        stack.Push(current);
                        IterationData subItem = new IterationData();
                        subItem.Parent = current;
                        subItem.AllowLiterals = true;
                        subItem.Reset(current.Pos);
                        subItem.ReadTextOnly = current.ReadTextOnly;
                        current = subItem;
                        break;
                }
            }
            // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because top-level tag was unclosed.
            // This will wrap up any straggling text and close any open tags after it.
            if (!current.Finished)
            {
                if (current.Pos > current.HtmlStart)
                {
                    IDomObject literal = GetLiteral(current);
                    if (literal != null)
                    {
                        yield return(literal);
                    }
                }
                if (current.Parent != null)
                {
                    if (current.Parent.Parent == null)
                    {
                        yield return(current.Parent.Element);
                    }
                    current.Parent.Reset(current.Pos);
                    current.Finished = true;
                }
            }
        }
        pos = current.Pos;
    }
}
/// <summary>
/// Produces a child IterationData linked to this instance.
/// </summary>
/// <param name="pos">
/// The position stored in the child's <c>Pos</c>.
/// </param>
/// <returns>
/// A new IterationData with this instance as <c>Parent</c> and the same
/// <c>InsertionMode</c>.
/// </returns>
public IterationData AddNewChild(int pos)
{
    // NOTE(review): the child's HtmlStart comes from this instance's Pos rather than from
    // the pos argument - verify that this is intentional for callers passing another value.
    int parentPos = this.Pos;

    IterationData result = new IterationData();
    result.Parent = this;
    result.Pos = pos;
    result.HtmlStart = parentPos;
    result.InsertionMode = this.InsertionMode;
    return result;
}
/// <summary>
/// Advances <c>current.Pos</c> to just after the next '&gt;', or past <c>EndPos</c> when no
/// '&gt;' is found. <c>current.HtmlStart</c> is set to one past the position held on entry.
/// </summary>
/// <param name="current">The iteration state for the tag being opened.</param>
/// <returns>
/// False when no '&gt;' was found or the tag is self-closed ('/' directly before '&gt;');
/// otherwise whether <c>current.Object</c> allows inner HTML or inner text.
/// </returns>
protected bool MoveOutsideTag(IterationData current)
{
    int closeIndex = CharIndexOf(BaseHtml, '>', current.Pos);
    current.HtmlStart = current.Pos + 1;

    bool found = closeIndex > 0;
    if (!found)
    {
        current.Pos = EndPos + 1;
        return false;
    }

    current.Pos = closeIndex + 1;

    bool selfClosed = BaseHtml[closeIndex - 1] == '/';
    if (selfClosed)
    {
        return false;
    }

    if (current.Object.InnerHtmlAllowed)
    {
        return true;
    }
    return current.Object.InnerTextAllowed;
}
/// <summary>
/// Tokenizes <c>BaseHtml</c> from position 0 through <c>EndPos</c> and lazily yields
/// root-level DOM objects as they are completed. Nesting is tracked with an explicit stack
/// of <c>IterationData</c> items instead of recursion: children append themselves to their
/// parent's element, and a parent is yielded once it is closed and has no parent itself.
/// </summary>
/// <param name="allowLiterals">
/// Assigned to <c>AllowLiterals</c> of every root-level item; child items always get
/// <c>AllowLiterals = true</c>.
/// </param>
/// <returns>A lazily evaluated sequence of root-level DOM objects.</returns>
protected IEnumerable<IDomObject> Parse(bool allowLiterals)
{
    int pos=0;
    Stack<IterationData> stack = new Stack<IterationData>();
    // Each pass of this loop starts a new root-level item at the position where the
    // previous one stopped.
    while (pos <= EndPos)
    {
        IterationData current = new IterationData();
        current.AllowLiterals = allowLiterals;
        current.Reset(pos);
        stack.Push(current);
        while (stack.Count != 0)
        {
            current = stack.Pop();
            //Debug.Assert(current.Object == null);
            while (!current.Finished && current.Pos <= EndPos)
            {
                // NOTE(review): c is not referenced anywhere in this method.
                char c = BaseHtml[current.Pos];
                // Step 0 = locate the next tag; step 1 = emit pending text, then handle the tag.
                switch (current.Step)
                {
                    case 0:
                        current.Pos = CharIndexOf(BaseHtml, '<', current.Pos);
                        if (current.Pos < 0)
                        {
                            // done - no new tags found
                            current.Pos = EndPos + 1;
                        }
                        else
                        {
                            // deal with when we're in a literal block (script/textarea)
                            if (current.ReadTextOnly)
                            {
                                int endPos = current.Pos;
                                while (endPos >= 0)
                                {
                                    // keep going until we find the closing tag for this element
                                    int caretPos = CharIndexOf(BaseHtml, '>', endPos + 1);
                                    if (caretPos > 0)
                                    {
                                        // NOTE(review): tag is lower-cased but NodeName is used as-is
                                        // here, while other comparisons in this method lower-case
                                        // NodeName - confirm NodeName is always lower case.
                                        string tag = BaseHtml.SubstringBetween(endPos + 1, caretPos).Trim().ToLower();
                                        if (tag == "/" +current.Parent.Element.NodeName)
                                        {
                                            // this is the end tag -- exit the block
                                            current.Pos=endPos;
                                            break;
                                        }
                                    }
                                    endPos = CharIndexOf(BaseHtml, '<', endPos + 1);
                                }
                            }
                            // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                            current.Step=1;
                        }
                        break;
                    case 1:
                        // Text accumulated before this tag is turned into a literal first; the tag
                        // itself is handled on the next pass of the loop.
                        if (current.Pos > current.HtmlStart)
                        {
                            IDomObject literal = GetLiteral(current);
                            if (literal != null)
                            {
                                yield return literal;
                            }
                            continue;
                        }
                        int tagStartPos = current.Pos;
                        string newTag;
                        newTag = GetTagOpener(current);
                        string newTagLower = newTag.ToLower();
                        // when Element exists, it's because a previous iteration created it: it's our parent
                        string parentTag = String.Empty;
                        if (current.Parent != null)
                        {
                            parentTag = current.Parent.Element.NodeName.ToLower();
                        }
                        if (newTag == String.Empty)
                        {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            string closeTag = GetCloseTag(current);
                            // Ignore empty tags, or closing tags found when no parent is open
                            bool isProperClose = closeTag.ToLower() == parentTag;
                            if (closeTag == String.Empty)
                            {
                                // ignore empty tags
                                continue;
                            }
                            else
                            {
                                // locate match for this closer up the hierarchy
                                IterationData actualParent =null;
                                if (!isProperClose)
                                {
                                    actualParent = current.Parent;
                                    while (actualParent != null && actualParent.Element.NodeName.ToLower() != closeTag.ToLower())
                                    {
                                        actualParent = actualParent.Parent;
                                    }
                                }
                                // if no matching close tag was found up the tree, ignore it
                                // otherwise always close this and repeat at the same position until the match is found
                                if (!isProperClose && actualParent == null)
                                {
                                    // The stray closer becomes part of the pending text, which
                                    // GetLiteral turns into a DomInvalidElement.
                                    current.Invalid = true;
                                    continue;
                                }
                            }
                            // element is closed
                            if (current.Parent.Parent == null)
                            {
                                // The parent is a root-level item, so its element is emitted now.
                                yield return current.Parent.Element;
                            }
                            current.Finished = true;
                            if (isProperClose)
                            {
                                // Resume the parent after the closing tag.
                                current.Parent.Reset(current.Pos);
                            }
                            else
                            {
                                // Resume the parent AT the closing tag so it is processed again
                                // one level up.
                                current.Parent.Reset(tagStartPos);
                            }
                            // already been returned before we added the children
                            continue;
                        }
                        // Before we keep going see if this is an implicit close
                        if (parentTag != String.Empty)
                        {
                            // NOTE(review): parentTag is lower-cased but newTag is not, so an
                            // upper-case repeated tag would not match here - confirm intent.
                            if (TagHasImplicitClose(parentTag,newTag) && parentTag == newTag)
                            {
                                // same tag for a repeater like li occurred - treat like a close tag
                                if (current.Parent.Parent == null)
                                {
                                    yield return current.Parent.Element;
                                }
                                current.Parent.Reset(tagStartPos);
                                current.Finished = true;
                                continue;
                            }
                        }
                        // seems to be a new tag. Parse it
                        IDomSpecialElement specialElement = null;
                        if (newTagLower[0] == '!')
                        {
                            if (newTagLower.StartsWith("!doctype"))
                            {
                                specialElement = new DomDocumentType();
                                current.Object = specialElement;
                            }
                            else if (newTagLower.StartsWith("![cdata["))
                            {
                                specialElement = new DomCData();
                                current.Object = specialElement;
                                // 9 is the length of "<![CDATA["
                                current.Pos = tagStartPos + 9;
                            }
                            else
                            {
                                specialElement = new DomComment();
                                current.Object = specialElement;
                                if (newTagLower.StartsWith("!--"))
                                {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    // 4 is the length of "<!--"
                                    current.Pos = tagStartPos + 4;
                                }
                                else
                                {
                                    current.Pos = tagStartPos+1;
                                }
                            }
                        }
                        else
                        {
                            current.Object = new DomElement(newTag);
                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                            {
                                // Element that allows inner text but not inner HTML: its content is
                                // read as raw text up to the matching closing tag (see case 0).
                                current.ReadTextOnly = true;
                                current.Step = 0;
                            }
                        }
                        // Handle non-element/text types -- they have data inside the tag construct
                        if (current.Object is IDomSpecialElement)
                        {
                            // Quoted comments end at "-->", everything else at the next ">".
                            string endTag = (current.Object is IDomComment && ((IDomComment)current.Object).IsQuoted) ? "-->" : ">";
                            int tagEndPos = BaseHtml.Seek(endTag, current.Pos);
                            if (tagEndPos <0)
                            {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = BaseHtml.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0)
                            {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }
                            specialElement.NonAttributeData = BaseHtml.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        }
                        else
                        {
                            // Parse attribute data
                            while (current.Pos <= EndPos)
                            {
                                if (!GetTagAttribute(current)) break;
                            }
                        }
                        bool hasChildren = MoveOutsideTag(current);
                        // tricky part: if there are children, push ourselves back on the stack and start with a new object
                        // from this position. The children will add themselves as they are created, avoiding recursion.
                        // When the close tag is found, the parent will be yielded if it's a root element.
                        // I think there's a slightly better way to do this, capturing all the yield logic at the end of the
                        // stack but it works for now.
                        if (current.Parent != null)
                        {
                            current.Parent.Element.AppendChild(current.Object);
                        }
                        else if (!hasChildren)
                        {
                            yield return current.Object;
                        }
                        if (!hasChildren)
                        {
                            current.Reset();
                            continue;
                        }
                        stack.Push(current);
                        IterationData subItem = new IterationData();
                        subItem.Parent = current;
                        subItem.AllowLiterals = true;
                        subItem.Reset(current.Pos);
                        subItem.ReadTextOnly = current.ReadTextOnly;
                        current = subItem;
                        break;
                }
            }
            // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because top-level tag was unclosed.
            // This will wrap up any straggling text and close any open tags after it.
            if (!current.Finished)
            {
                if (current.Pos > current.HtmlStart)
                {
                    IDomObject literal = GetLiteral(current);
                    if (literal != null)
                    {
                        yield return literal;
                    }
                }
                if (current.Parent != null)
                {
                    if (current.Parent.Parent == null)
                    {
                        yield return current.Parent.Element;
                    }
                    current.Parent.Reset(current.Pos);
                    current.Finished = true;
                }
            }
        }
        pos = current.Pos;
    }
}
/// <summary>
/// Parse the HTML, and return it, based on options set.
/// </summary>
///
/// <remarks>
/// Nesting is tracked with an explicit stack of <c>IterationData</c> carriers rather than
/// recursion: whenever <c>FinishTagOpener</c> reports <c>true</c> the current carrier is pushed
/// and parsing continues in a new child carrier. The sequence is produced lazily (iterator
/// method), so nothing is parsed until the result is enumerated. Objects are yielded from four
/// places: literals produced by <c>TryGetLiteral</c>, the parent element when a carrier whose
/// parent has no parent is closed, the object handed back by <c>FinishTagOpener</c>, and
/// anything returned by <c>CloseElement</c> for carriers left unfinished at end of input.
/// </remarks>
///
/// <returns>
/// An enumerator of the top-level elements.
/// </returns>
protected IEnumerable<IDomObject> ParseImplementation()
{
    int pos = 0;
    Stack<IterationData> stack = new Stack<IterationData>();

    while (pos <= EndPos)
    {
        // Start a fresh root-level carrier at the current position.
        IterationData current = new IterationData();
        if (WrapRootTextNodes)
        {
            current.WrapLiterals = true;
        }
        current.Reset(pos);
        stack.Push(current);

        while (stack.Count != 0)
        {
            current = stack.Pop();

            while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
            {
                switch (current.TokenizerState)
                {
                    case TokenizerState.Default:
                        if (current.FindNextTag(Html))
                        {
                            // even if we fell through from ReadTextOnly (e.g. was never closed),
                            // we should proceed to finish
                            current.TokenizerState = TokenizerState.TagStart;
                        }
                        break;

                    case TokenizerState.TagStart:
                        // Flush any pending text before handling the tag itself.
                        IDomObject literal;
                        if (current.TryGetLiteral(this, out literal))
                        {
                            yield return literal;
                        }

                        int tagStartPos = current.Pos;
                        string newTag = current.GetTagOpener(Html);

                        if (newTag == String.Empty)
                        {
                            // It's a tag closer. Make sure it's the right one.
                            current.Pos = tagStartPos + 1;
                            ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                            // Ignore empty tags, or closing tags found when no parent is open
                            bool isProperClose = closeTagId == current.ParentTagID();
                            if (closeTagId == 0)
                            {
                                // ignore empty tags
                                continue;
                            }
                            else
                            {
                                // locate match for this closer up the hierarchy
                                IterationData actualParent = null;
                                if (!isProperClose)
                                {
                                    actualParent = current.Parent;
                                    while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                    {
                                        actualParent = actualParent.Parent;
                                    }
                                }

                                // if no matching close tag was found up the tree, ignore it
                                // otherwise always close this and repeat at the same position until the match is found
                                if (!isProperClose && actualParent == null)
                                {
                                    current.InsertionMode = InsertionMode.Invalid;
                                    continue;
                                }
                            }

                            // element is closed
                            if (current.Parent.Parent == null)
                            {
                                yield return current.Parent.Element;
                            }
                            current.TokenizerState = TokenizerState.Finished;

                            if (isProperClose)
                            {
                                // Matching closer: the parent resumes after the close tag.
                                current.Parent.Reset(current.Pos);
                            }
                            else
                            {
                                // The closer belongs to an ancestor: the parent resumes at the start
                                // of this same close tag so it is evaluated again one level up.
                                current.Parent.Reset(tagStartPos);
                            }

                            // already been returned before we added the children
                            continue;
                        }
                        else if (newTag[0] == '!')
                        {
                            // Non-element constructs: doctype, CDATA, comments.
                            // Markup keywords are plain ASCII, so they are matched ordinally; the
                            // result must not vary with the current thread culture.
                            IDomSpecialElement specialElement = null;
                            if (newTag.StartsWith("!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                            {
                                specialElement = new DomDocumentType();
                                current.Element = specialElement;
                            }
                            else if (newTag.StartsWith("![CDATA[", StringComparison.OrdinalIgnoreCase))
                            {
                                specialElement = new DomCData();
                                current.Element = specialElement;
                                // 9 == length of "<![CDATA[" (assuming tagStartPos is at the '<')
                                current.Pos = tagStartPos + 9;
                            }
                            else
                            {
                                specialElement = new DomComment();
                                current.Element = specialElement;
                                if (newTag.StartsWith("!--", StringComparison.Ordinal))
                                {
                                    ((DomComment)specialElement).IsQuoted = true;
                                    // 4 == length of "<!--" (assuming tagStartPos is at the '<')
                                    current.Pos = tagStartPos + 4;
                                }
                                else
                                {
                                    current.Pos = tagStartPos + 1;
                                }
                            }

                            string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";
                            int tagEndPos = Html.Seek(endTag, current.Pos);
                            if (tagEndPos < 0)
                            {
                                // if a tag is unclosed entirely, then just find a new line.
                                tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                            }
                            if (tagEndPos < 0)
                            {
                                // Never closed, no newline - junk, treat it like such
                                tagEndPos = EndPos;
                            }

                            specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                            current.Pos = tagEndPos;
                        }
                        else
                        {
                            // seems to be a new element tag, parse it.
                            ushort newTagId = HtmlData.Tokenize(newTag);

                            // Before we keep going see if this is an implicit close
                            ushort parentTagId = current.ParentTagID();
                            int lastPos = current.Pos;

                            if (parentTagId == 0 && IsDocument)
                            {
                                if (newTagId != HtmlData.tagHTML)
                                {
                                    // Document mode with no open parent and a non-html first tag:
                                    // synthesize an html element to act as the parent.
                                    // NOTE(review): unlike the other transitions below, this carrier is
                                    // not pushed onto the stack before switching to its child - confirm
                                    // this is intended.
                                    current.Element = DomElement.Create(HtmlData.tagHTML);
                                    current = current.AddNewChild();
                                    parentTagId = HtmlData.tagHTML;
                                }
                            }

                            if (parentTagId != 0)
                            {
                                ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                while (action != HtmlData.tagActionNothing)
                                {
                                    if (action == HtmlData.tagActionClose)
                                    {
                                        // track the next parent up the chain
                                        IterationData newNode = current.Parent;

                                        // same tag for a repeater like li occurred - treat like a close tag
                                        if (current.Parent.Parent == null)
                                        {
                                            yield return current.Parent.Element;
                                        }
                                        current.TokenizerState = TokenizerState.Finished;
                                        // The parent carrier is reset once, after this loop, rather
                                        // than on every pass.

                                        if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                        {
                                            // See whether the new tag also affects the next ancestor.
                                            action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                            if (action != HtmlData.tagActionNothing)
                                            {
                                                current = newNode;
                                            }
                                        }
                                        else
                                        {
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    else
                                    {
                                        // Any other action value is handed to AddNewParent, which
                                        // supplies the new carrier to continue in.
                                        if (GenerateOptionalElements)
                                        {
                                            stack.Push(current);
                                            current = current.AddNewParent(action, lastPos);
                                        }
                                        action = HtmlData.tagActionNothing;
                                    }
                                }

                                if (current.TokenizerState == TokenizerState.Finished)
                                {
                                    // An implicit close happened: the parent re-reads this same
                                    // opening tag from its start.
                                    current.Parent.Reset(tagStartPos);
                                    continue;
                                }
                            }

                            current.Element = DomElement.Create(newTagId);

                            if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                            {
                                // Text-only element: switch the carrier to text insertion mode.
                                current.InsertionMode = InsertionMode.Text;
                                current.TokenizerState = TokenizerState.Default;
                            }

                            // Parse attribute data
                            while (current.Pos <= EndPos)
                            {
                                if (!current.GetTagAttribute(Html))
                                {
                                    break;
                                }
                            }
                        }

                        IDomObject el;
                        if (current.FinishTagOpener(Html, out el))
                        {
                            // Keep this carrier on the stack and continue in a new child carrier.
                            stack.Push(current);
                            current = current.AddNewChild();
                        }

                        if (el != null)
                        {
                            yield return el;
                        }
                        break;
                }
            }

            // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because
            // top-level tag was unclosed. This will wrap up any straggling text and close any open
            // tags after it.
            if (current.TokenizerState != TokenizerState.Finished)
            {
                foreach (var closed in current.CloseElement(this))
                {
                    yield return closed;
                }
            }
        }

        pos = current.Pos;
    }
}