/// <summary>
        /// Builds a text node from the markup between current.HtmlStart and current.Pos, then either
        /// appends it to the parent element or hands it back to the caller.
        /// If !AllowLiterals the node is wrapped in a span element first.
        /// </summary>
        /// <param name="current">
        /// Iteration state: supplies the text range, the Invalid / ReadTextOnly / AllowLiterals flags
        /// and the (optional) parent that receives the node.
        /// </param>
        /// <returns>
        /// Null when the node was appended to current.Parent.Element; otherwise the node itself
        /// (or its span wrapper), in which case current.Finished has been set.
        /// </returns>
        protected IDomObject GetLiteral(IterationData current)
        {
            // Choose the concrete node type. Invalid wins over ReadTextOnly, and the
            // ReadTextOnly flag is only cleared when it is actually consumed.
            DomText textNode;
            if (current.Invalid)
            {
                textNode = new DomInvalidElement();
            }
            else if (!current.ReadTextOnly)
            {
                textNode = new DomText();
            }
            else
            {
                current.ReadTextOnly = false;
                textNode = new DomInnerText();
            }

            // Attach the content: bound parsers store a token index, unbound ones store decoded text.
            int start = current.HtmlStart;
            if (!isBound)
            {
                textNode.NodeValue = Objects.HtmlDecode(BaseHtml.SubstringBetween(start, current.Pos));
            }
            else
            {
                textNode.SetTextIndex(Document, Document.TokenizeString(start, current.Pos - start));
            }

            // Wrap in a span when bare literals are not permitted at this level.
            IDomObject result = textNode;
            if (!current.AllowLiterals)
            {
                IDomElement span = new DomElement("span");
                span.AppendChild(textNode);
                result = span;
            }

            // No parent: this is a root-level node, so return it and mark the iteration finished.
            if (current.Parent == null)
            {
                current.Finished = true;
                return result;
            }

            current.Parent.Element.AppendChild(result);
            current.Reset();
            return null;
        }
Example #2
0
        /// <summary>
        /// Creates a new IterationData whose Parent is this instance, and returns it.
        /// </summary>
        ///
        /// <param name="pos">
        /// The value assigned to the child's Pos.
        /// </param>
        ///
        /// <returns>
        /// The new child. Its HtmlStart and InsertionMode are copied from this instance's Pos and
        /// InsertionMode.
        /// </returns>

        public IterationData AddNewChild(int pos)
        {
            var child = new IterationData();

            child.Parent = this;
            child.Pos = pos;
            // NOTE(review): HtmlStart is taken from this instance's Pos, not from the pos argument -- confirm this is intended.
            child.HtmlStart = this.Pos;
            child.InsertionMode = this.InsertionMode;

            return child;
        }
        /// <summary>
        /// Scans forward from current.Pos to the next opening angle bracket and reads the tag name after it.
        /// Start: at or before the opening angle bracket of a tag
        /// End: on the first character for which isHtmlTagEnd is true (that character is not consumed),
        ///      or past EndPos when the input runs out first
        /// </summary>
        /// <param name="current">Iteration state; its Pos is advanced while scanning.</param>
        /// <returns>
        /// The trimmed text between the angle bracket and the stop character (possibly empty);
        /// String.Empty when the input ends before a stop character is reached.
        /// </returns>
        protected string GetTagOpener(IterationData current)
        {
            // Phase 1: advance to the opening angle bracket.
            while (current.Pos <= EndPos && BaseHtml[current.Pos] != '<')
            {
                current.Pos++;
            }
            if (current.Pos > EndPos)
            {
                return String.Empty;
            }

            // Step over the bracket; the name is measured from the character right after it.
            current.Pos++;
            int nameStart = current.Pos;

            // Phase 2: skip spaces between the bracket and the name -- probably not allowed, but harmless.
            while (current.Pos <= EndPos && BaseHtml[current.Pos] == ' ')
            {
                current.Pos++;
            }

            // Phase 3: read up to (not including) the first tag-end character.
            while (current.Pos <= EndPos)
            {
                if (isHtmlTagEnd(BaseHtml[current.Pos]))
                {
                    return BaseHtml.SubstringBetween(nameStart, current.Pos).Trim();
                }
                current.Pos++;
            }

            return String.Empty;
        }
        /// <summary>
        /// Moves current.Pos to the first character after the next '>' found from the current position.
        /// current.HtmlStart is set to the old position plus one. When no '>' is found at an index
        /// greater than zero, current.Pos is moved past EndPos instead.
        /// </summary>
        /// <param name="current">Iteration state positioned inside a tag opener.</param>
        /// <returns>
        /// True when the character before '>' is not '/' and current.Object allows inner HTML or inner text;
        /// false otherwise.
        /// </returns>
        protected bool MoveOutsideTag(IterationData current)
        {
            int closeIndex = CharIndexOf(BaseHtml, '>', current.Pos);
            current.HtmlStart = current.Pos + 1;

            // No usable closing bracket: consume the rest of the input.
            if (closeIndex <= 0)
            {
                current.Pos = EndPos + 1;
                return false;
            }

            current.Pos = closeIndex + 1;

            // A '/' right before the bracket marks a self-closed tag, which has no children.
            if (BaseHtml[closeIndex - 1] == '/')
            {
                return false;
            }

            return current.Object.InnerHtmlAllowed || current.Object.InnerTextAllowed;
        }
        /// <summary>
        /// Reads the name of a closing tag and moves past its closing angle bracket.
        /// Start: Expects the position to be after the opening angle bracket of a close tag.
        /// End: Position after the closing angle bracket, or past EndPos if the input runs out first.
        /// </summary>
        /// <param name="current">Iteration state; its Pos is advanced while scanning.</param>
        /// <returns>
        /// The tag name, or an empty string when no name start character is found or the input ends
        /// before the name is terminated.
        /// </returns>
        protected string GetCloseTag(IterationData current)
        {
            // Phase 1: skip ahead to the first character that may start a tag name.
            while (current.Pos <= EndPos
                   && !CharacterData.IsType(BaseHtml[current.Pos], CharacterType.HtmlTagNameStart))
            {
                current.Pos++;
            }
            if (current.Pos > EndPos)
            {
                return "";
            }

            int nameStart = current.Pos;
            current.Pos++;

            // Phase 2: consume the remaining name characters; the terminator itself is not consumed here.
            while (current.Pos <= EndPos
                   && CharacterData.IsType(BaseHtml[current.Pos], CharacterType.HtmlTagNameExceptStart))
            {
                current.Pos++;
            }
            if (current.Pos > EndPos)
            {
                // The name ran to the end of the input without a terminator: nothing is reported.
                return "";
            }

            string tagName = BaseHtml.SubstringBetween(nameStart, current.Pos);

            // Phase 3: consume everything up to and including the closing angle bracket.
            while (current.Pos <= EndPos)
            {
                char ch = BaseHtml[current.Pos];
                current.Pos++;
                if (ch == '>')
                {
                    break;
                }
            }

            return tagName ?? "";
        }
        /// <summary>
        /// Returns the text between the current position and the next opening angle bracket, moving
        /// current.Pos onto that bracket. When no bracket is found (index -1) the rest of BaseHtml is
        /// returned and current.Pos is set to BaseHtml.Length.
        /// </summary>
        /// <param name="current">Iteration state; its Pos is advanced past the returned text.</param>
        /// <returns>The text that was skipped, or String.Empty when nothing was consumed.</returns>
        protected string GetOpenText(IterationData current)
        {
            int from = current.Pos;
            int nextTag = CharIndexOf(BaseHtml, '<', from);

            // There is text before the next tag.
            if (nextTag > from)
            {
                current.Pos = nextTag;
                return BaseHtml.SubstringBetween(from, nextTag);
            }

            // A tag starts right here: no text to return and the position stays put.
            if (nextTag != -1)
            {
                return String.Empty;
            }

            // No further tag at all: everything that is left is text.
            current.Pos = BaseHtml.Length;
            return BaseHtml.SubstringBetween(from, current.Pos);
        }
        /// <summary>
        /// Finds the next opening angle bracket at or after current.Pos and returns the tag name following it.
        /// Start: at or before the opening angle bracket of a tag
        /// End: on the first stop character as decided by isHtmlTagEnd (e.g. space after the tag name);
        ///      past EndPos if no stop character exists
        /// </summary>
        /// <param name="current">Iteration state; its Pos is advanced while scanning.</param>
        /// <returns>
        /// Tag name (trimmed, possibly empty), or String.Empty when the input is exhausted first.
        /// </returns>
        protected string GetTagOpener(IterationData current)
        {
            // phase 0 = looking for '<', 1 = skipping spaces after it, 2 = reading the name
            int phase = 0;
            int nameStart = -1;

            while (current.Pos <= EndPos)
            {
                char ch = BaseHtml[current.Pos];

                if (phase == 0)
                {
                    if (ch == '<')
                    {
                        nameStart = current.Pos + 1;
                        phase = 1;
                    }
                    current.Pos++;
                }
                else if (phase == 1)
                {
                    // skip whitespace between the bracket and the name -- probably not allowed but can't hurt
                    if (ch != ' ')
                    {
                        // re-examine the same character as part of the name
                        phase = 2;
                    }
                    else
                    {
                        current.Pos++;
                    }
                }
                else
                {
                    if (isHtmlTagEnd(ch))
                    {
                        return BaseHtml.SubstringBetween(nameStart, current.Pos).Trim();
                    }
                    current.Pos++;
                }
            }

            return String.Empty;
        }
        /// <summary>
        /// Parses one attribute (a name, optionally followed by =value) from inside a tag opener and
        /// stores it on current.Element unless that element already has a non-empty value for the name.
        /// Start: Position inside a tag opening construct
        /// End: position after last character of tag construct {x=["|']y["|]]} or just {x}) and adds attribute if successful
        ///      position ON closing caret of tag opener if failed
        /// </summary>
        /// <param name="current">Iteration state; Pos is advanced as characters are consumed and Element receives the attribute.</param>
        /// <returns>
        /// Returns true if an attribute name was parsed (even when an existing value was kept), false if no more attributes were found
        /// </returns>
        protected bool GetTagAttribute(IterationData current)
        {
            bool finished = false;
            // Scanner state: 0 = find name, 1 = read name, 2 = look for '=', 3 = find value start, 4 = read value
            int step = 0;
            string aName = null;
            string aValue = null;
            int nameStart = -1;
            int valStart = -1;
            bool isQuoted = false;
            char quoteChar = ' ';

            while (!finished && current.Pos <= EndPos)
            {
                char c = BaseHtml[current.Pos];
                switch (step)
                {
                    case 0: // find name
                        if (CharacterData.IsType(c, CharacterType.HtmlTagNameStart))
                        {
                            step = 1;
                            nameStart = current.Pos;
                            current.Pos++;
                        }
                        else if (isTagChar(c))
                        {
                            // stop without consuming; presumably c is part of the tag's closing syntax -- see isTagChar
                            finished = true;
                        }
                        else
                        {
                            // anything else before a name is skipped
                            current.Pos++;
                        }

                        break;
                    case 1:
                        // read name characters; the first non-name character ends the name and is not consumed
                        if (!CharacterData.IsType(c, CharacterType.HtmlTagNameExceptStart))
                        {
                            step = 2;
                            aName = BaseHtml.SubstringBetween(nameStart, current.Pos);
                        }
                        else
                        {
                            current.Pos++;
                        }
                        break;
                    case 2: // find value
                        switch(c) {
                            case '=':
                                step = 3;
                                current.Pos++;
                                break;
                            case ' ':
                                current.Pos++;
                                break;
                            default:
                                // anything else means new attribute
                                finished = true;
                                break;
                        }
                        break;
                    case 3: // find quote start
                        switch(c) {
                            case '\\':
                            case '>':
                                // a backslash or the closing caret right after '=' ends the attribute with no value
                                finished = true;
                                break;
                            case ' ':
                                current.Pos++;
                                break;
                            case '"':
                            case '\'':
                                // quoted value: starts after the quote and ends at the matching quote character
                                isQuoted = true;
                                valStart = current.Pos+1;
                                current.Pos++;
                                quoteChar = c;
                                step = 4;
                                break;
                            default:
                                // unquoted value: the current character is its first character
                                valStart = current.Pos;
                                step = 4;
                                break;
                        }
                        // any non-whitespace is part of the attribute

                        break;
                    case 4: // parse the attribute until whitespace or closing quote
                        if ((isQuoted && c == quoteChar) ||
                            (!isQuoted && isHtmlTagEnd(c)))
                        {
                            aValue = BaseHtml.SubstringBetween(valStart, current.Pos);
                            if (isQuoted)
                            {
                                // step past the closing quote; an unquoted terminator is left in place
                                isQuoted = false;
                                current.Pos++;
                            }
                            finished = true;
                        }
                        else
                        {
                            current.Pos++;
                        }
                        break;
                }
            }
            // NOTE(review): if the input ends (Pos > EndPos) while still in step 1, aName is never assigned and
            // false is returned; if it ends in step 4, aValue stays null and the attribute is set without a value.
            if (aName != null)
            {
                // 12-15-11 - don't replace a valid attribute with a bad one

                var curVal = current.Element.GetAttribute(aName);
                if (string.IsNullOrEmpty(curVal))
                {
                    if (aValue == null)
                    {
                        // name-only attribute
                        current.Element.SetAttribute(aName);
                    }
                    else
                    {
                        current.Element.SetAttribute(aName, aValue);
                    }
                }
                return true;
            }
            else
            {
                return false;
            }
        }
 /// <summary>
 /// Consumes the text that precedes the next opening angle bracket. When a bracket is found past
 /// the current position, current.Pos is moved onto it; when none is found (index -1), current.Pos
 /// is set to BaseHtml.Length.
 /// </summary>
 /// <param name="current">Iteration state; its Pos is advanced past the returned text.</param>
 /// <returns>The consumed text, or String.Empty when nothing was consumed.</returns>
 protected string GetOpenText(IterationData current)
 {
     int start = current.Pos;
     int tagPos = CharIndexOf(BaseHtml, '<', start);
     bool textBeforeTag = tagPos > start;

     // A bracket at (or before) the current position means there is no text to hand back.
     if (!textBeforeTag && tagPos != -1)
     {
         return String.Empty;
     }

     // Either stop on the bracket, or take everything when there is no bracket left.
     int end = textBeforeTag ? tagPos : BaseHtml.Length;
     current.Pos = end;
     return BaseHtml.SubstringBetween(start, end);
 }
        /// <summary>
        /// Creates a literal (text) node for the text between HtmlStart (the last position of the end of a tag)
        /// and the current position. The node is appended to the parent element when there is one,
        /// otherwise it is returned. If !AllowLiterals the node is wrapped in a span.
        /// </summary>
        /// <param name="current">
        /// Iteration state providing the text range, the node-type flags and the optional parent.
        /// </param>
        /// <returns>
        /// Null if the node was added to current.Parent.Element; otherwise the node (or its span wrapper),
        /// with current.Finished set to true.
        /// </returns>
        protected IDomObject GetLiteral(IterationData current)
        {
            DomText literal;

            if (!current.Invalid && current.ReadTextOnly)
            {
                // text-only content: the flag is consumed by the node it produces
                current.ReadTextOnly = false;
                literal = new DomInnerText();
            }
            else if (current.Invalid)
            {
                literal = new DomInvalidElement();
            }
            else
            {
                literal = new DomText();
            }

            if (!isBound)
            {
                // unbound: keep the decoded text directly on the node
                string raw = BaseHtml.SubstringBetween(current.HtmlStart, current.Pos);
                literal.NodeValue = Objects.HtmlDecode(raw);
            }
            else
            {
                // bound: the document tokenizes the range and the node stores the resulting index
                int length = current.Pos - current.HtmlStart;
                literal.SetTextIndex(Document, Document.TokenizeString(current.HtmlStart, length));
            }

            IDomObject output;
            if (current.AllowLiterals)
            {
                output = literal;
            }
            else
            {
                IDomElement span = new DomElement("span");
                span.AppendChild(literal);
                output = span;
            }

            bool hasParent = current.Parent != null;
            if (!hasParent)
            {
                current.Finished = true;
                return output;
            }

            current.Parent.Element.AppendChild(output);
            current.Reset();
            return null;
        }
 /// <summary>
 /// Reads a closing tag's name and consumes the rest of the tag.
 /// Start: Expects the position to be after an opening caret for a close tag.
 /// End: Position after closing caret (or past EndPos when the input ends first).
 /// </summary>
 /// <param name="current">Iteration state; its Pos is advanced while scanning.</param>
 /// <returns>The tag name, or an empty string when no complete name was read.</returns>
 protected string GetCloseTag(IterationData current)
 {
     string tagName = null;
     int firstChar = 0;
     // state 0 = find start of name, 1 = read name, 2 = skip to closing caret
     int state = 0;
     bool closed = false;

     while (!closed && current.Pos <= EndPos)
     {
         char ch = BaseHtml[current.Pos];

         if (state == 0)
         {
             if (CharacterData.IsType(ch, CharacterType.HtmlTagNameStart))
             {
                 firstChar = current.Pos;
                 state = 1;
             }
             current.Pos++;
         }
         else if (state == 1)
         {
             if (CharacterData.IsType(ch, CharacterType.HtmlTagNameExceptStart))
             {
                 current.Pos++;
             }
             else
             {
                 // the terminator is not consumed here; state 2 looks at it again
                 tagName = BaseHtml.SubstringBetween(firstChar, current.Pos);
                 state = 2;
             }
         }
         else
         {
             closed = ch == '>';
             current.Pos++;
         }
     }

     return tagName ?? "";
 }
Example #12
0
        /// <summary>
        /// Parse the HTML, and return it, based on options set.
        /// Walks Html from position 0 through EndPos, keeping open elements on a stack of IterationData,
        /// and lazily yields objects as they are completed.
        /// </summary>
        ///
        /// <returns>
        /// An enumerator of the top-level elements.
        /// </returns>

        protected IEnumerable <IDomObject> ParseImplementation()
        {
            int pos = 0;
            Stack <IterationData> stack = new Stack <IterationData>();

            // Each pass of this loop starts a fresh root-level IterationData at pos.
            while (pos <= EndPos)
            {
                IterationData current = new IterationData();
                if (WrapRootTextNodes)
                {
                    current.WrapLiterals = true;
                }

                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {
                    current = stack.Pop();

                    while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
                    {
                        // NOTE(review): c is not read anywhere below in this method.
                        char c = Html[current.Pos];
                        switch (current.TokenizerState)
                        {
                        case TokenizerState.Default:
                            if (current.FindNextTag(Html))
                            {
                                // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                                current.TokenizerState = TokenizerState.TagStart;
                            }
                            break;

                        case TokenizerState.TagStart:
                            // Emit any literal that TryGetLiteral reports before handling the tag itself.
                            IDomObject literal;
                            if (current.TryGetLiteral(this, out literal))
                            {
                                yield return(literal);
                            }

                            // Remembered so the position can be rewound to the start of this tag.
                            int tagStartPos = current.Pos;

                            string newTag = current.GetTagOpener(Html);

                            if (newTag == String.Empty)
                            {
                                // It's a tag closer. Make sure it's the right one.
                                current.Pos = tagStartPos + 1;
                                ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                                // Ignore empty tags, or closing tags found when no parent is open
                                bool isProperClose = closeTagId == current.ParentTagID();
                                if (closeTagId == 0)
                                {
                                    // ignore empty tags
                                    continue;
                                }
                                else
                                {
                                    // locate match for this closer up the hierarchy
                                    IterationData actualParent = null;

                                    if (!isProperClose)
                                    {
                                        actualParent = current.Parent;
                                        while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                        {
                                            actualParent = actualParent.Parent;
                                        }
                                    }
                                    // if no matching close tag was found up the tree, ignore it
                                    // otherwise always close this and repeat at the same position until the match is found
                                    if (!isProperClose && actualParent == null)
                                    {
                                        current.InsertionMode = InsertionMode.Invalid;
                                        continue;
                                    }
                                }
                                // element is closed

                                // A parent with no parent of its own is a top-level element: yield it.
                                if (current.Parent.Parent == null)
                                {
                                    yield return(current.Parent.Element);
                                }
                                current.TokenizerState = TokenizerState.Finished;
                                if (isProperClose)
                                {
                                    // resume the parent after this closing tag
                                    current.Parent.Reset(current.Pos);
                                }
                                else
                                {
                                    // resume the parent at the start of this closing tag so it is examined again one level up
                                    current.Parent.Reset(tagStartPos);
                                }
                                // already been returned before we added the children
                                continue;
                            }
                            else if (newTag[0] == '!')
                            {
                                // Special constructs: DOCTYPE, CDATA, or a comment (everything else starting with '!').
                                IDomSpecialElement specialElement = null;
                                string             newTagUpper    = newTag.ToUpper();
                                if (newTagUpper.StartsWith("!DOCTYPE"))
                                {
                                    specialElement  = new DomDocumentType();
                                    current.Element = specialElement;
                                }
                                else if (newTagUpper.StartsWith("![CDATA["))
                                {
                                    specialElement  = new DomCData();
                                    current.Element = specialElement;
                                    // 9 = length of "<![CDATA["
                                    current.Pos     = tagStartPos + 9;
                                }
                                else
                                {
                                    specialElement  = new DomComment();
                                    current.Element = specialElement;
                                    if (newTag.StartsWith("!--"))
                                    {
                                        ((DomComment)specialElement).IsQuoted = true;
                                        // 4 = length of "<!--"
                                        current.Pos = tagStartPos + 4;
                                    }
                                    else
                                    {
                                        current.Pos = tagStartPos + 1;
                                    }
                                }

                                // Quoted comments end with "-->"; everything else ends at the next ">".
                                string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";

                                int tagEndPos = Html.Seek(endTag, current.Pos);
                                if (tagEndPos < 0)
                                {
                                    // if a tag is unclosed entirely, then just find a new line.
                                    tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                                }
                                if (tagEndPos < 0)
                                {
                                    // Never closed, no newline - junk, treat it like such
                                    tagEndPos = EndPos;
                                }

                                specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                                current.Pos = tagEndPos;
                            }
                            else
                            {
                                // seems to be a new element tag, parse it.

                                ushort newTagId = HtmlData.Tokenize(newTag);

                                // Before we keep going see if this is an implicit close
                                ushort parentTagId = current.ParentTagID();

                                int lastPos = current.Pos;

                                // Parsing a document with no open parent: anything but <html> gets an html element created around it.
                                if (parentTagId == 0 && IsDocument)
                                {
                                    if (newTagId != HtmlData.tagHTML)
                                    {
                                        current.Element = DomElement.Create(HtmlData.tagHTML);
                                        current         = current.AddNewChild();
                                        parentTagId     = HtmlData.tagHTML;
                                    }
                                }

                                if (parentTagId != 0)
                                {
                                    // Ask what the parent/new-tag combination requires; any value other than
                                    // tagActionNothing or tagActionClose is passed to AddNewParent below.
                                    ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                    while (action != HtmlData.tagActionNothing)
                                    {
                                        if (action == HtmlData.tagActionClose)
                                        {
                                            // track the next parent up the chain

                                            var newNode = (current.Parent != null) ?
                                                          current.Parent : null;

                                            // same tag for a repeater like li occurred - treat like a close tag

                                            if (current.Parent.Parent == null)
                                            {
                                                yield return(current.Parent.Element);
                                            }

                                            current.TokenizerState = TokenizerState.Finished;
                                            //current.Parent.Reset(tagStartPos);

                                            // Re-evaluate against the grandparent; keep climbing while it also calls for an action.
                                            if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                            {
                                                action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                                if (action != HtmlData.tagActionNothing)
                                                {
                                                    current = newNode;
                                                }
                                            }
                                            else
                                            {
                                                action = HtmlData.tagActionNothing;
                                            }
                                        }
                                        else
                                        {
                                            // Only acted on when optional elements are to be generated.
                                            if (GenerateOptionalElements)
                                            {
                                                stack.Push(current);
                                                current = current.AddNewParent(action, lastPos);
                                            }
                                            action = HtmlData.tagActionNothing;
                                        }
                                    }
                                    if (current.TokenizerState == TokenizerState.Finished)
                                    {
                                        // The current element was implicitly closed: the parent resumes at the start of this tag.
                                        current.Parent.Reset(tagStartPos);
                                        continue;
                                    }
                                }


                                current.Element = DomElement.Create(newTagId);


                                // Elements that allow inner text but not inner HTML switch to text insertion mode.
                                if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                {
                                    current.InsertionMode  = InsertionMode.Text;
                                    current.TokenizerState = TokenizerState.Default;
                                }

                                // Parse attribute data
                                while (current.Pos <= EndPos)
                                {
                                    if (!current.GetTagAttribute(Html))
                                    {
                                        break;
                                    }
                                }
                            }

                            IDomObject el;

                            // When FinishTagOpener returns true, keep this element on the stack and continue in a new child.
                            if (current.FinishTagOpener(Html, out el))
                            {
                                stack.Push(current);
                                current = current.AddNewChild();
                            }

                            if (el != null)
                            {
                                yield return(el);
                            }

                            break;
                        }
                    }


                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because a top-level tag was unclosed.
                    // This will wrap up any straggling text and close any open tags after it.

                    if (current.TokenizerState != TokenizerState.Finished)
                    {
                        foreach (var el in current.CloseElement(this))
                        {
                            yield return(el);
                        }
                    }
                }
                // The next root-level pass starts where the last processed item stopped.
                pos = current.Pos;
            }
        }
        /// <summary>
        /// Scans one attribute inside a tag opener and, if a name is found, adds it to <c>current.Element</c>.
        /// Start: Position inside a tag opening construct
        /// End: position after last character of the attribute construct ({x=["|']y["|']} or just {x}) when an attribute was read;
        ///      position ON the character that ended the scan (the one accepted by isTagChar, presumably the closing
        ///      caret of the tag opener) when no attribute was found
        /// </summary>
        /// <remarks>
        /// The scan is a small state machine: 0 = find the name start, 1 = read the name, 2 = look for '=',
        /// 3 = look for the start of the value, 4 = read the value.
        /// ASCII whitespace (space, tab, LF, CR, FF) is skipped around '=' so that attributes formatted across
        /// lines or with tabs are read as one name/value pair.
        /// If the input ends while a name or an unquoted value is being read, what was read so far is kept
        /// instead of being dropped. An unterminated quoted value is still treated as "no value", because its
        /// intended extent cannot be known.
        /// </remarks>
        /// <param name="current">
        /// Iteration state; its Pos is advanced by the scan and its Element receives the attribute.
        /// </param>
        /// <returns>
        /// Returns true if an attribute name was read (even when it was not stored because the element already
        /// has a non-empty value for that name), false if no more attributes were found
        /// </returns>
        protected bool GetTagAttribute(IterationData current)
        {
            bool   finished  = false;
            int    step      = 0;
            string aName     = null;
            string aValue    = null;
            int    nameStart = -1;
            int    valStart  = -1;
            bool   isQuoted  = false;
            char   quoteChar = ' ';

            while (!finished && current.Pos <= EndPos)
            {
                char c = BaseHtml[current.Pos];
                switch (step)
                {
                case 0:     // find name
                    if (CharacterData.IsType(c, CharacterType.HtmlTagNameStart))
                    {
                        step      = 1;
                        nameStart = current.Pos;
                        current.Pos++;
                    }
                    else if (isTagChar(c))
                    {
                        // end of the tag opener: stop without consuming the character
                        finished = true;
                    }
                    else
                    {
                        current.Pos++;
                    }

                    break;

                case 1:     // read name
                    if (!CharacterData.IsType(c, CharacterType.HtmlTagNameExceptStart))
                    {
                        step  = 2;
                        aName = BaseHtml.SubstringBetween(nameStart, current.Pos);
                    }
                    else
                    {
                        current.Pos++;
                    }
                    break;

                case 2:     // find value
                    switch (c)
                    {
                    case '=':
                        step = 3;
                        current.Pos++;
                        break;

                    // any ASCII whitespace may separate the name from '='
                    case ' ':
                    case '\t':
                    case '\n':
                    case '\r':
                    case '\f':
                        current.Pos++;
                        break;

                    default:
                        // anything else means new attribute
                        finished = true;
                        break;
                    }
                    break;

                case 3:     // find quote start
                    switch (c)
                    {
                    // NOTE(review): a backslash ends the attribute here; '/' may have been intended -- left unchanged
                    case '\\':
                    case '>':
                        finished = true;
                        break;

                    // any ASCII whitespace may separate '=' from the value
                    case ' ':
                    case '\t':
                    case '\n':
                    case '\r':
                    case '\f':
                        current.Pos++;
                        break;

                    case '"':
                    case '\'':
                        isQuoted = true;
                        valStart = current.Pos + 1;
                        current.Pos++;
                        quoteChar = c;
                        step      = 4;
                        break;

                    default:
                        // any other character is the first character of an unquoted value
                        valStart = current.Pos;
                        step     = 4;
                        break;
                    }

                    break;

                case 4:     // parse the attribute until whitespace or closing quote
                    if ((isQuoted && c == quoteChar) ||
                        (!isQuoted && isHtmlTagEnd(c)))
                    {
                        aValue = BaseHtml.SubstringBetween(valStart, current.Pos);
                        if (isQuoted)
                        {
                            // step over the closing quote
                            isQuoted = false;
                            current.Pos++;
                        }
                        finished = true;
                    }
                    else
                    {
                        current.Pos++;
                    }
                    break;
                }
            }

            if (!finished)
            {
                // The input ran out (current.Pos is past EndPos) before a terminating character was seen.
                // Keep what was read rather than silently dropping it.
                if (step == 1)
                {
                    aName = BaseHtml.SubstringBetween(nameStart, current.Pos);
                }
                else if (step == 4 && !isQuoted)
                {
                    aValue = BaseHtml.SubstringBetween(valStart, current.Pos);
                }
            }

            if (aName != null)
            {
                // 12-15-11 - don't replace a valid attribute with a bad one

                var curVal = current.Element.GetAttribute(aName);
                if (string.IsNullOrEmpty(curVal))
                {
                    if (aValue == null)
                    {
                        current.Element.SetAttribute(aName);
                    }
                    else
                    {
                        current.Element.SetAttribute(aName, aValue);
                    }
                }
                return(true);
            }
            else
            {
                return(false);
            }
        }
        /// <summary>
        /// Lazily parses BaseHtml from position 0 through EndPos and yields the top-level DOM objects it finds.
        /// Nested nodes are not yielded individually: they are appended to their parent element, and a top-level
        /// element is yielded when it is closed (explicitly, implicitly, or because the input ran out).
        /// </summary>
        /// <remarks>
        /// The parser avoids recursion by keeping an explicit stack of IterationData. When an element with children
        /// is opened, its IterationData is pushed back on the stack and a child IterationData (whose Parent is the
        /// open element's data) takes over scanning. Step 0 locates the next '&lt;'; step 1 first flushes any pending
        /// text as a literal and then interprets the tag at the current position.
        /// </remarks>
        /// <param name="allowLiterals">
        /// Copied to AllowLiterals of every top-level IterationData (GetLiteral wraps text in a span when it is false).
        /// Child IterationData created for nested content always have AllowLiterals set to true.
        /// </param>
        /// <returns>A deferred sequence of the top-level objects, in document order.</returns>
        protected IEnumerable <IDomObject> Parse(bool allowLiterals)
        {
            int pos = 0;
            Stack <IterationData> stack = new Stack <IterationData>();

            // Each pass of the outer loop starts a fresh top-level scan at pos
            while (pos <= EndPos)
            {
                IterationData current = new IterationData();
                current.AllowLiterals = allowLiterals;
                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {
                    current = stack.Pop();
                    //Debug.Assert(current.Object == null);

                    while (!current.Finished && current.Pos <= EndPos)
                    {
                        // NOTE(review): c is not referenced anywhere below in this method
                        char c = BaseHtml[current.Pos];
                        switch (current.Step)
                        {
                        case 0:
                            // Step 0: advance to the next tag start
                            current.Pos = CharIndexOf(BaseHtml, '<', current.Pos);
                            if (current.Pos < 0)
                            {
                                // done - no new tags found
                                current.Pos = EndPos + 1;
                            }
                            else
                            {
                                // deal with when we're in a literal block (script/textarea)
                                if (current.ReadTextOnly)
                                {
                                    int endPos = current.Pos;
                                    while (endPos >= 0)
                                    {
                                        // keep going until we find the closing tag for this element
                                        int caretPos = CharIndexOf(BaseHtml, '>', endPos + 1);
                                        if (caretPos > 0)
                                        {
                                            // NOTE(review): tag is lower-cased, but NodeName is compared as-is here while other
                                            // comparisons in this method lower-case NodeName first -- confirm NodeName is lower-case
                                            string tag = BaseHtml.SubstringBetween(endPos + 1, caretPos).Trim().ToLower();
                                            if (tag == "/" + current.Parent.Element.NodeName)
                                            {
                                                // this is the end tag -- exit the block
                                                current.Pos = endPos;
                                                break;
                                            }
                                        }
                                        endPos = CharIndexOf(BaseHtml, '<', endPos + 1);
                                    }
                                }
                                // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                                current.Step = 1;
                            }
                            break;

                        case 1:
                            // Step 1: text precedes the tag -- emit it first, then come back to this position
                            if (current.Pos > current.HtmlStart)
                            {
                                // GetLiteral returns null when it appended the text to a parent element
                                IDomObject literal = GetLiteral(current);
                                if (literal != null)
                                {
                                    yield return(literal);
                                }

                                continue;
                            }

                            int    tagStartPos = current.Pos;
                            string newTag;

                            newTag = GetTagOpener(current);

                            string newTagLower = newTag.ToLower();

                            // when Element exists, it's because a previous iteration created it: it's our parent
                            string parentTag = String.Empty;
                            if (current.Parent != null)
                            {
                                parentTag = current.Parent.Element.NodeName.ToLower();
                            }

                            if (newTag == String.Empty)
                            {
                                // It's a tag closer. Make sure it's the right one.
                                current.Pos = tagStartPos + 1;
                                string closeTag = GetCloseTag(current);

                                // Ignore empty tags, or closing tags found when no parent is open
                                bool isProperClose = closeTag.ToLower() == parentTag;
                                if (closeTag == String.Empty)
                                {
                                    // ignore empty tags
                                    continue;
                                }
                                else
                                {
                                    // locate match for this closer up the hierarchy
                                    IterationData actualParent = null;

                                    if (!isProperClose)
                                    {
                                        actualParent = current.Parent;
                                        while (actualParent != null && actualParent.Element.NodeName.ToLower() != closeTag.ToLower())
                                        {
                                            actualParent = actualParent.Parent;
                                        }
                                    }
                                    // if no matching close tag was found up the tree, ignore it
                                    // otherwise always close this and repeat at the same position until the match is found
                                    if (!isProperClose && actualParent == null)
                                    {
                                        // the next literal produced for this data becomes a DomInvalidElement (see GetLiteral)
                                        current.Invalid = true;
                                        continue;
                                    }
                                }
                                // element is closed
                                // (current.Parent cannot be null here: with no parent, parentTag is empty and the
                                // ancestor search finds nothing, so one of the branches above has already continued)

                                if (current.Parent.Parent == null)
                                {
                                    // the element being closed is top-level, so hand it to the caller
                                    yield return(current.Parent.Element);
                                }
                                current.Finished = true;
                                if (isProperClose)
                                {
                                    // resume the parent just after the closing tag
                                    current.Parent.Reset(current.Pos);
                                }
                                else
                                {
                                    // resume the parent ON the closing tag so it is processed again one level up
                                    current.Parent.Reset(tagStartPos);
                                }
                                // already been returned before we added the children
                                continue;
                            }
                            // Before we keep going see if this is an implicit close
                            if (parentTag != String.Empty)
                            {
                                // NOTE(review): parentTag is lower-cased but newTag is not, so this equality is case-sensitive
                                // with respect to the tag text as written
                                if (TagHasImplicitClose(parentTag, newTag) &&
                                    parentTag == newTag)
                                {
                                    // same tag for a repeater like li occurred - treat like a close tag
                                    if (current.Parent.Parent == null)
                                    {
                                        yield return(current.Parent.Element);
                                    }
                                    // resume the parent at the start of the new tag so it is opened as a sibling
                                    current.Parent.Reset(tagStartPos);
                                    current.Finished = true;

                                    continue;
                                }
                            }
                            // seems to be a new tag. Parse it

                            IDomSpecialElement specialElement = null;

                            if (newTagLower[0] == '!')
                            {
                                // "<!..." constructs: doctype, CDATA, or comment
                                if (newTagLower.StartsWith("!doctype"))
                                {
                                    specialElement = new DomDocumentType();
                                    current.Object = specialElement;
                                }
                                else if (newTagLower.StartsWith("![cdata["))
                                {
                                    specialElement = new DomCData();
                                    current.Object = specialElement;
                                    // skip past "<![CDATA[" (9 characters)
                                    current.Pos    = tagStartPos + 9;
                                }
                                else
                                {
                                    specialElement = new DomComment();
                                    current.Object = specialElement;
                                    if (newTagLower.StartsWith("!--"))
                                    {
                                        ((DomComment)specialElement).IsQuoted = true;
                                        // skip past "<!--" (4 characters)
                                        current.Pos = tagStartPos + 4;
                                    }
                                    else
                                    {
                                        // skip past "<" only
                                        current.Pos = tagStartPos + 1;
                                    }
                                }
                            }
                            else
                            {
                                current.Object = new DomElement(newTag);

                                // elements that allow inner text but not inner HTML have their content read as raw text
                                if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                {
                                    current.ReadTextOnly = true;
                                    current.Step         = 0;
                                }
                            }

                            // Handle non-element/text types -- they have data inside the tag construct

                            if (current.Object is IDomSpecialElement)
                            {
                                // quoted comments end at "-->", every other special element at the next ">"
                                string endTag = (current.Object is IDomComment && ((IDomComment)current.Object).IsQuoted) ? "-->" : ">";

                                int tagEndPos = BaseHtml.Seek(endTag, current.Pos);
                                if (tagEndPos < 0)
                                {
                                    // if a tag is unclosed entirely, then just find a new line.
                                    tagEndPos = BaseHtml.Seek(System.Environment.NewLine, current.Pos);
                                }
                                if (tagEndPos < 0)
                                {
                                    // Never closed, no newline - junk, treat it like such
                                    tagEndPos = EndPos;
                                }

                                specialElement.NonAttributeData = BaseHtml.SubstringBetween(current.Pos, tagEndPos);
                                current.Pos = tagEndPos;
                            }
                            else
                            {
                                // Parse attribute data
                                while (current.Pos <= EndPos)
                                {
                                    if (!GetTagAttribute(current))
                                    {
                                        break;
                                    }
                                }
                            }

                            bool hasChildren = MoveOutsideTag(current);

                            // tricky part: if there are children, push ourselves back on the stack and start with a new object
                            // from this position. The children will add themselves as they are created, avoiding recursion.
                            // When the close tag is found, the parent will be yielded if it's a root element.
                            // I think there's a slightly better way to do this, capturing all the yield logic at the end of the
                            // stack but it works for now.

                            if (current.Parent != null)
                            {
                                current.Parent.Element.AppendChild(current.Object);
                            }
                            else if (!hasChildren)
                            {
                                // childless top-level object: nothing more will be added to it, so yield it now
                                yield return(current.Object);
                            }

                            if (!hasChildren)
                            {
                                // keep scanning for siblings with the same iteration data
                                current.Reset();
                                continue;
                            }

                            stack.Push(current);

                            // Descend: scan the element's content with a child iteration data
                            IterationData subItem = new IterationData();
                            subItem.Parent        = current;
                            subItem.AllowLiterals = true;
                            subItem.Reset(current.Pos);
                            subItem.ReadTextOnly = current.ReadTextOnly;
                            current = subItem;
                            break;
                        }
                    }
                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because a top-level tag was unclosed.
                    // This will wrap up any straggling text and close any open tags after it.
                    if (!current.Finished)
                    {
                        if (current.Pos > current.HtmlStart)
                        {
                            IDomObject literal = GetLiteral(current);
                            if (literal != null)
                            {
                                yield return(literal);
                            }
                        }

                        if (current.Parent != null)
                        {
                            if (current.Parent.Parent == null)
                            {
                                // the unclosed element is top-level: yield it as it stands
                                yield return(current.Parent.Element);
                            }
                            // the parent is still on the stack; resume it at the current position
                            current.Parent.Reset(current.Pos);
                            current.Finished = true;
                        }
                    }
                }
                // continue the next top-level scan where the last iteration data stopped
                pos = current.Pos;
            }
        }
Example #15
0
        /// <summary>
        /// Creates a new IterationData whose Parent is this instance and returns it.
        /// </summary>
        ///
        /// <param name="pos">
        /// Value assigned to the child's Pos.
        /// </param>
        ///
        /// <returns>
        /// The new child IterationData. Its HtmlStart is taken from this instance's Pos (not from
        /// <paramref name="pos"/>), and its InsertionMode is copied from this instance.
        /// </returns>

        public IterationData AddNewChild(int pos)
        {
            var child = new IterationData();

            // Same assignment order as an object initializer would use
            child.Parent = this;
            child.Pos = pos;
            child.HtmlStart = this.Pos;
            child.InsertionMode = this.InsertionMode;

            return child;
        }
        /// <summary>
        /// Moves current.Pos to the first character after the next '>' found at or after the current position.
        /// When no such caret is found, current.Pos is moved past EndPos instead.
        /// current.HtmlStart is set to the position following the pre-move Pos in either case.
        /// </summary>
        /// <param name="current">Iteration state positioned inside a tag construct.</param>
        /// <returns>
        /// True if the tag can have children: a closing caret was found, the character before it is not '/',
        /// and current.Object allows inner HTML or inner text. False otherwise.
        /// </returns>
        protected bool MoveOutsideTag(IterationData current)
        {
            int closingCaret = CharIndexOf(BaseHtml, '>', current.Pos);

            current.HtmlStart = current.Pos + 1;

            // No caret located: consume the rest of the input
            if (closingCaret <= 0)
            {
                current.Pos = EndPos + 1;
                return false;
            }

            current.Pos = closingCaret + 1;

            // "/>" marks a self-closing tag, which has no children
            if (BaseHtml[closingCaret - 1] == '/')
            {
                return false;
            }

            return current.Object.InnerHtmlAllowed || current.Object.InnerTextAllowed;
        }
        /// <summary>
        /// When CsQuery is provided, an initial indexing context can be used
        /// </summary>
        /// <param name="csq"></param>
        /// <param name="allowLiterals"></param>
        /// <returns></returns>
        protected IEnumerable<IDomObject> Parse(bool allowLiterals)
        {
            int pos=0;
            Stack<IterationData> stack = new Stack<IterationData>();

            while (pos <= EndPos)
            {
                IterationData current = new IterationData();
                current.AllowLiterals = allowLiterals;
                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {

                    current = stack.Pop();
                    //Debug.Assert(current.Object == null);

                    while (!current.Finished && current.Pos <= EndPos)
                    {
                        char c = BaseHtml[current.Pos];
                        switch (current.Step)
                        {
                            case 0:
                                current.Pos = CharIndexOf(BaseHtml, '<', current.Pos);
                                if (current.Pos  < 0)
                                {
                                    // done - no new tags found
                                    current.Pos = EndPos + 1;
                                }
                                else {
                                    // deal with when we're in a literal block (script/textarea)
                                    if (current.ReadTextOnly)
                                    {
                                        int endPos = current.Pos;
                                        while (endPos >= 0)
                                        {
                                            // keep going until we find the closing tag for this element
                                            int caretPos = CharIndexOf(BaseHtml, '>', endPos + 1);
                                            if (caretPos > 0)
                                            {
                                                string tag = BaseHtml.SubstringBetween(endPos + 1, caretPos).Trim().ToLower();
                                                if (tag == "/" +current.Parent.Element.NodeName)
                                                {
                                                    // this is the end tag -- exit the block
                                                    current.Pos=endPos;
                                                    break;
                                                }
                                            }
                                            endPos = CharIndexOf(BaseHtml, '<', endPos + 1);
                                        }
                                    }
                                    // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceeed to finish
                                    current.Step=1;
                                }
                                break;
                            case 1:
                                if (current.Pos > current.HtmlStart)
                                {
                                    IDomObject literal = GetLiteral(current);
                                    if (literal != null)
                                    {
                                        yield return literal;
                                    }

                                    continue;
                                }

                                int tagStartPos = current.Pos;
                                string newTag;

                                newTag = GetTagOpener(current);

                                string newTagLower = newTag.ToLower();

                                // when Element exists, it's because a previous iteration created it: it's our parent
                                string parentTag = String.Empty;
                                if (current.Parent != null)
                                {
                                    parentTag = current.Parent.Element.NodeName.ToLower();
                                }

                                if (newTag == String.Empty)
                                {
                                    // It's a tag closer. Make sure it's the right one.
                                    current.Pos = tagStartPos + 1;
                                    string closeTag = GetCloseTag(current);

                                    // Ignore empty tags, or closing tags found when no parent is open
                                    bool isProperClose = closeTag.ToLower() == parentTag;
                                    if (closeTag == String.Empty)
                                    {
                                        // ignore empty tags
                                        continue;
                                    }
                                    else
                                    {
                                        // locate match for this closer up the heirarchy
                                        IterationData actualParent =null;

                                        if (!isProperClose)
                                        {
                                            actualParent = current.Parent;
                                            while (actualParent != null && actualParent.Element.NodeName.ToLower() != closeTag.ToLower())
                                            {
                                                actualParent = actualParent.Parent;
                                            }
                                        }
                                        // if no matching close tag was found up the tree, ignore it
                                        // otherwise always close this and repeat at the same position until the match is found
                                        if (!isProperClose && actualParent == null)
                                        {
                                            current.Invalid = true;
                                            continue;
                                        }
                                    }
                                   // element is closed

                                    if (current.Parent.Parent == null)
                                    {
                                        yield return current.Parent.Element;
                                    }
                                    current.Finished = true;
                                    if (isProperClose)
                                    {
                                        current.Parent.Reset(current.Pos);
                                    }
                                    else
                                    {
                                        current.Parent.Reset(tagStartPos);
                                    }
                                    // already been returned before we added the children
                                    continue;
                                }
                                // Before we keep going see if this is an implicit close
                                if (parentTag != String.Empty)
                                {
                                    if (TagHasImplicitClose(parentTag,newTag)
                                        && parentTag == newTag)
                                    {
                                        // same tag for a repeater like li occcurred - treat like a close tag
                                        if (current.Parent.Parent == null)
                                        {
                                            yield return current.Parent.Element;
                                        }
                                        current.Parent.Reset(tagStartPos);
                                        current.Finished = true;

                                        continue;
                                    }
                                }
                                // seems to be a new tag. Parse it

                                IDomSpecialElement specialElement = null;

                                if (newTagLower[0] == '!')
                                {
                                    if (newTagLower.StartsWith("!doctype"))
                                    {
                                        specialElement = new DomDocumentType();
                                        current.Object = specialElement;
                                    }
                                    else if (newTagLower.StartsWith("![cdata["))
                                    {
                                        specialElement = new DomCData();
                                        current.Object = specialElement;
                                        current.Pos = tagStartPos + 9;
                                    }
                                    else
                                    {
                                        specialElement = new DomComment();
                                        current.Object = specialElement;
                                        if (newTagLower.StartsWith("!--"))
                                        {
                                            ((DomComment)specialElement).IsQuoted = true;
                                            current.Pos = tagStartPos + 4;
                                        } else {
                                            current.Pos = tagStartPos+1;
                                        }
                                    }
                                }
                                else
                                {
                                    current.Object = new DomElement(newTag);

                                    if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                    {
                                        current.ReadTextOnly = true;
                                        current.Step = 0;
                                    }
                                }

                                // Handle non-element/text types -- they have data inside the tag construct

                                if (current.Object is IDomSpecialElement)
                                {
                                    string endTag = (current.Object is IDomComment && ((IDomComment)current.Object).IsQuoted) ? "-->" : ">";

                                    int tagEndPos = BaseHtml.Seek(endTag, current.Pos);
                                    if (tagEndPos <0)
                                    {
                                        // if a tag is unclosed entirely, then just find a new line.
                                        tagEndPos = BaseHtml.Seek(System.Environment.NewLine, current.Pos);
                                    }
                                    if (tagEndPos < 0)
                                    {
                                        // Never closed, no newline - junk, treat it like such
                                        tagEndPos = EndPos;
                                    }

                                    specialElement.NonAttributeData = BaseHtml.SubstringBetween(current.Pos, tagEndPos);
                                    current.Pos = tagEndPos;
                                }
                                else
                                {
                                    // Parse attribute data
                                    while (current.Pos <= EndPos)
                                    {
                                        if (!GetTagAttribute(current)) break;
                                    }
                                }

                                bool hasChildren = MoveOutsideTag(current);

                                // tricky part: if there are children, push ourselves back on the stack and start with a new object
                                // from this position. The children will add themselves as they are created, avoiding recursion.
                                // When the close tag is found, the parent will be yielded if it's a root element.
                                // I think there's a slightly better way to do this, capturing all the yield logic at the end of the
                                // stack but it works for now.

                                if (current.Parent != null)
                                {
                                    current.Parent.Element.AppendChild(current.Object);
                                } else if (!hasChildren) {
                                    yield return current.Object;
                                }

                                if (!hasChildren)
                                {
                                    current.Reset();
                                    continue;
                                }

                                stack.Push(current);

                                IterationData subItem = new IterationData();
                                subItem.Parent = current;
                                subItem.AllowLiterals = true;
                                subItem.Reset(current.Pos);
                                subItem.ReadTextOnly = current.ReadTextOnly;
                                current = subItem;
                                break;

                        }
                    }
                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because  top-level tag was unclosed.
                    // THis will wrap up any straggling text and close any open tags after it.
                    if (!current.Finished)
                    {
                        if (current.Pos > current.HtmlStart)
                        {
                            IDomObject literal = GetLiteral(current);
                            if (literal != null)
                            {
                                yield return literal;
                            }
                        }

                        if (current.Parent != null)
                        {
                            if (current.Parent.Parent == null)
                            {
                                yield return current.Parent.Element;
                            }
                            current.Parent.Reset(current.Pos);
                            current.Finished = true;
                        }

                    }
                }
                pos = current.Pos;
            }
        }
Example #18
0
        /// <summary>
        /// Parse the HTML, and return it, based on options set.
        /// </summary>
        ///
        /// <remarks>
        /// This is a lazy iterator: objects are yielded as the scan over <c>Html</c> advances from position 0
        /// up to and including <c>EndPos</c>. Nesting is tracked with an explicit
        /// <see cref="Stack{T}"/> of <c>IterationData</c> rather than by recursion: when an opening tag has
        /// content, the current carrier is pushed and scanning continues in a new child carrier.
        /// The options read here are <c>WrapRootTextNodes</c>, <c>IsDocument</c> and
        /// <c>GenerateOptionalElements</c>.
        /// Tag-name prefix checks ("!DOCTYPE", "![CDATA[", "!--") use ordinal comparisons so that recognition
        /// of markup does not depend on the current thread culture.
        /// </remarks>
        ///
        /// <returns>
        /// An enumerator of the top-level elements.
        /// </returns>
        protected IEnumerable<IDomObject> ParseImplementation()
        {
            int pos=0;
            Stack<IterationData> stack = new Stack<IterationData>();

            while (pos <= EndPos)
            {
                // Each pass of the outer loop starts a fresh root-level carrier at the position where the
                // previous pass stopped.
                IterationData current = new IterationData();
                if (WrapRootTextNodes)
                {
                    current.WrapLiterals = true;
                }

                current.Reset(pos);
                stack.Push(current);

                while (stack.Count != 0)
                {

                    current = stack.Pop();

                    while (current.TokenizerState != TokenizerState.Finished && current.Pos <= EndPos)
                    {
                        switch (current.TokenizerState)
                        {
                            case TokenizerState.Default:
                                if (current.FindNextTag(Html)) {

                                    // even if we fell through from ReadTextOnly (e.g. was never closed), we should proceed to finish
                                    current.TokenizerState = TokenizerState.TagStart;
                                }
                                break;
                            case TokenizerState.TagStart:
                                // Emit any pending text first; TryGetLiteral only hands back an object when
                                // there is something for this method to yield.
                                IDomObject literal;
                                if (current.TryGetLiteral(this, out literal))
                                {
                                    yield return literal;
                                }

                                // Remember where the tag began so the parser can rewind to it when the tag
                                // turns out to (implicitly) close one or more ancestors.
                                int tagStartPos = current.Pos;

                                string newTag=current.GetTagOpener(Html);

                                if (newTag == String.Empty)
                                {
                                    // It's a tag closer. Make sure it's the right one.
                                    current.Pos = tagStartPos + 1;
                                    ushort closeTagId = HtmlData.Tokenize(current.GetCloseTag(Html));

                                    // Ignore empty tags, or closing tags found when no parent is open
                                    bool isProperClose = closeTagId == current.ParentTagID();
                                    if (closeTagId == 0)
                                    {
                                        // ignore empty tags
                                        continue;
                                    }
                                    else
                                    {
                                        // locate match for this closer up the hierarchy
                                        IterationData actualParent =null;

                                        if (!isProperClose)
                                        {
                                            actualParent = current.Parent;
                                            while (actualParent != null && actualParent.Element.NodeNameID != closeTagId)
                                            {
                                                actualParent = actualParent.Parent;
                                            }
                                        }
                                        // if no matching close tag was found up the tree, ignore it
                                        // otherwise always close this and repeat at the same position until the match is found
                                        if (!isProperClose && actualParent == null)
                                        {
                                            current.InsertionMode = InsertionMode.Invalid;
                                            continue;
                                        }
                                    }
                                    // element is closed

                                    // Only elements sitting directly at the root are yielded to the caller.
                                    if (current.Parent.Parent == null)
                                    {
                                        yield return current.Parent.Element;
                                    }
                                    current.TokenizerState = TokenizerState.Finished ;
                                    if (isProperClose)
                                    {
                                        // Matching closer: the parent resumes after the closing tag.
                                        current.Parent.Reset(current.Pos);
                                    }
                                    else
                                    {
                                        // Closer belongs to an ancestor further up: rewind so the parent sees
                                        // the same closing tag again and closes itself in turn.
                                        current.Parent.Reset(tagStartPos);
                                    }
                                    // already been returned before we added the children
                                    continue;
                                }
                                else if (newTag[0] == '!')
                                {
                                    // Special constructs: doctype, CDATA and comments. Prefix checks are
                                    // ordinal so they behave the same under every culture (e.g. Turkish
                                    // casing rules would otherwise break the "!doctype" match).
                                    IDomSpecialElement specialElement = null;
                                    if (newTag.StartsWith("!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                                    {
                                        specialElement = new DomDocumentType();
                                        current.Element = specialElement;
                                    }
                                    else if (newTag.StartsWith("![CDATA[", StringComparison.OrdinalIgnoreCase))
                                    {
                                        specialElement = new DomCData();
                                        current.Element = specialElement;
                                        // skip past "<![CDATA[" (9 characters)
                                        current.Pos = tagStartPos + 9;
                                    }
                                    else
                                    {
                                        specialElement = new DomComment();
                                        current.Element = specialElement;
                                        if (newTag.StartsWith("!--", StringComparison.Ordinal))
                                        {
                                            ((DomComment)specialElement).IsQuoted = true;
                                            // skip past "<!--" (4 characters)
                                            current.Pos = tagStartPos + 4;
                                        } else {
                                            current.Pos = tagStartPos+1;
                                        }
                                    }

                                    // Quoted comments end at "-->"; everything else ends at the next ">".
                                    string endTag = (current.Element is IDomComment && ((IDomComment)current.Element).IsQuoted) ? "-->" : ">";

                                    int tagEndPos = Html.Seek(endTag, current.Pos);
                                    if (tagEndPos < 0)
                                    {
                                        // if a tag is unclosed entirely, then just find a new line.
                                        tagEndPos = Html.Seek(System.Environment.NewLine, current.Pos);
                                    }
                                    if (tagEndPos < 0)
                                    {
                                        // Never closed, no newline - junk, treat it like such
                                        tagEndPos = EndPos;
                                    }

                                    specialElement.NonAttributeData = Html.SubstringBetween(current.Pos, tagEndPos);
                                    current.Pos = tagEndPos;

                                }
                                else
                                {

                                    // seems to be a new element tag, parse it.

                                    ushort newTagId = HtmlData.Tokenize(newTag);

                                    // Before we keep going see if this is an implicit close
                                    ushort parentTagId = current.ParentTagID();

                                    int lastPos = current.Pos;

                                    // Parsing a full document with no open parent: anything other than an
                                    // html tag gets an html element created around it first.
                                    if (parentTagId ==0 && IsDocument) {
                                        if (newTagId != HtmlData.tagHTML) {
                                            current.Element =DomElement.Create(HtmlData.tagHTML);
                                            current = current.AddNewChild();
                                            parentTagId = HtmlData.tagHTML;
                                        }
                                    }

                                    if (parentTagId != 0)
                                    {
                                        ushort action = SpecialTagActionDelegate(parentTagId, newTagId);

                                        while (action != HtmlData.tagActionNothing)
                                        {
                                            if (action == HtmlData.tagActionClose)
                                            {

                                                // track the next parent up the chain

                                                var newNode = current.Parent;

                                                // same tag for a repeater like li occurred - treat like a close tag

                                                if (current.Parent.Parent == null)
                                                {
                                                    yield return current.Parent.Element;
                                                }

                                                current.TokenizerState = TokenizerState.Finished;

                                                // Re-evaluate against the grandparent: the new tag may need
                                                // to close more than one level.
                                                if (newNode != null && newNode.Parent != null && newNode.Parent.Element != null)
                                                {
                                                    action = SpecialTagActionDelegate(newNode.Parent.Element.NodeNameID, newTagId);
                                                    if (action != HtmlData.tagActionNothing)
                                                    {
                                                        current = newNode;
                                                    }
                                                }
                                                else
                                                {
                                                    action = HtmlData.tagActionNothing;
                                                }
                                            }
                                            else
                                            {
                                                // Any other action value is passed to AddNewParent as the
                                                // element to insert, but only when optional elements are
                                                // being generated.
                                                if (GenerateOptionalElements)
                                                {
                                                    stack.Push(current);
                                                    current = current.AddNewParent(action, lastPos);

                                                }
                                                action = HtmlData.tagActionNothing;

                                            }
                                        }
                                        if (current.TokenizerState == TokenizerState.Finished)
                                        {
                                            // An implicit close happened: rewind the parent to the start of
                                            // this tag so it is parsed again in the parent's context.
                                            current.Parent.Reset(tagStartPos);
                                            continue;
                                        }

                                    }


                                    current.Element = DomElement.Create(newTagId);


                                    // Elements that allow inner text but not inner HTML have their content
                                    // read as text.
                                    if (!current.Element.InnerHtmlAllowed && current.Element.InnerTextAllowed)
                                    {
                                        current.InsertionMode = InsertionMode.Text;
                                        current.TokenizerState = TokenizerState.Default;
                                    }

                                    // Parse attribute data
                                    while (current.Pos <= EndPos)
                                    {
                                        if (!current.GetTagAttribute(Html)) break;
                                    }
                                }

                                IDomObject el;

                                // When FinishTagOpener returns true, park this carrier on the stack and
                                // continue scanning inside a new child carrier (this is what avoids recursion).
                                if (current.FinishTagOpener(Html, out el))
                                {
                                    stack.Push(current);
                                    current = current.AddNewChild();
                                }

                                if (el != null)
                                {
                                    yield return el;
                                }

                                break;

                        }
                    }


                    // Catchall for unclosed tags -- if there's an "unfinished" carrier here, it's because a top-level tag was unclosed.
                    // This will wrap up any straggling text and close any open tags after it.

                    if (current.TokenizerState != TokenizerState.Finished)
                    {
                        foreach (var el in current.CloseElement(this)) {
                            yield return el;
                        }

                    }
                }
                pos = current.Pos;
            }

        }