Ejemplo n.º 1
0
        /// <summary>
        /// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> up to
        /// <paramref name="htmlEnd"/> and yields one <see cref="HTMLTag"/> for every start or
        /// end tag encountered. Comments, DOCTYPE-like declarations and other non-tag markup
        /// are skipped, as is the content of script, style, title and textarea elements.
        /// </summary>
        /// <param name="html">The markup to scan.</param>
        /// <param name="htmlStart">Index at which scanning begins.</param>
        /// <param name="htmlEnd">
        /// Upper bound of the scanned range; scanning stops once the position reaches it and it
        /// is passed as the limit to every search helper.
        /// </param>
        /// <returns>
        /// The tags in the order they appear. Enumeration simply ends when the input runs out in
        /// the middle of a construct; a partially parsed tag is never yielded.
        /// </returns>
        private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
        {
            while (htmlStart < htmlEnd)
            {
                // Every construct of interest begins with '<'; stop when none is left.
                int pos = IndexOf(html, htmlStart, htmlEnd, '<');
                if (pos == -1)
                {
                    yield break;
                }

                HTMLTag tag = new HTMLTag();
                tag.Offset = pos;
                htmlStart  = pos + 1;
                tag.IsEnd  = StartsWith(html, htmlStart, htmlEnd, '/');

                // Only '<' (or "</") directly followed by a letter is parsed as a tag.
                if (StartsWithLetter(html, tag.IsEnd ? (htmlStart + 1) : htmlStart, htmlEnd))
                {
                    // Parse tag name
                    if (tag.IsEnd)
                    {
                        htmlStart += 1;
                    }
                    // NOTE(review): the boolean argument of IndexOfAny presumably makes whitespace
                    // a terminator as well - confirm against the helper.
                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    tag.Name  = GetSectionLower(html, htmlStart, pos);
                    htmlStart = pos;

                    // Parse attributes
                    bool isTagComplete = false;
                    do
                    {
                        while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                        {
                            htmlStart++;
                        }

                        // Re-evaluated on every pass, so the flag only stays set when the '/'
                        // is seen on the pass that also finds the closing '>'.
                        tag.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                        if (tag.IsSelfClosing)
                        {
                            htmlStart += 1;
                        }

                        if (StartsWith(html, htmlStart, htmlEnd, '>'))
                        {
                            htmlStart    += 1;
                            isTagComplete = true;
                        }
                        else if (!tag.IsSelfClosing)
                        {
                            // (A '/' that is not followed by '>' has already been consumed above
                            // and is otherwise ignored; the loop just goes around again.)
                            HTMLAttribute attribute = new HTMLAttribute();
                            attribute.Offset = htmlStart;

                            // Parse attribute name. The search starts one character in, so the
                            // name always contains at least one character.
                            pos = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                            if (pos == -1)
                            {
                                yield break;
                            }
                            attribute.Name = GetSectionLower(html, htmlStart, pos);
                            htmlStart      = pos;

                            while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                            {
                                htmlStart++;
                            }
                            if (StartsWith(html, htmlStart, htmlEnd, '='))
                            {
                                // Parse attribute value
                                htmlStart += 1;
                                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                                {
                                    htmlStart++;
                                }
                                if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\''))
                                {
                                    // Quoted value: runs to the next occurrence of the same quote.
                                    char quoteChar = html[htmlStart];
                                    htmlStart += 1;
                                    pos        = IndexOf(html, htmlStart, htmlEnd, quoteChar);
                                    if (pos == -1)
                                    {
                                        yield break;
                                    }
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    htmlStart       = pos + 1;
                                }
                                else
                                {
                                    // Unquoted value: runs to the terminator found by IndexOfAny,
                                    // which is left in place for the next pass to handle.
                                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                                    if (pos == -1)
                                    {
                                        yield break;
                                    }
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    htmlStart       = pos;
                                }
                            }
                            else
                            {
                                // Attribute without '=' gets an empty value.
                                attribute.Value = String.Empty;
                            }

                            attribute.Length = htmlStart - attribute.Offset;

                            // Added only when GetAttribute finds no attribute of that name yet,
                            // so an earlier occurrence is kept over a later duplicate.
                            if (tag.GetAttribute(attribute.Name) == null)
                            {
                                tag.Attributes.Add(attribute);
                            }
                        }
                    } while (!isTagComplete);
                    tag.Length = htmlStart - tag.Offset;

                    // Yield result
                    yield return tag;

                    // Skip contents of special tags whose contents are to be treated as raw text
                    if (!tag.IsEnd && !tag.IsSelfClosing && tag.NameEqualsAny("script", "style", "title", "textarea"))
                    {
                        string endTagText  = "/" + tag.Name;
                        bool   foundEndTag = false;
                        do
                        {
                            pos = IndexOf(html, htmlStart, htmlEnd, '<');
                            if (pos == -1)
                            {
                                yield break;
                            }
                            htmlStart = pos + 1;

                            // The end tag name must be followed by whitespace, '/' or '>' so that
                            // a longer name sharing the same prefix does not match.
                            if (StartsWith(html, htmlStart, htmlEnd, endTagText, true) &&
                                (StartsWithWhiteSpace(html, htmlStart + endTagText.Length, htmlEnd) ||
                                 StartsWithAny(html, htmlStart + endTagText.Length, htmlEnd, '/', '>')))
                            {
                                // Step back onto the '<' so the outer loop parses and yields
                                // the end tag itself.
                                htmlStart  -= 1;
                                foundEndTag = true;
                            }
                        } while (!foundEndTag);
                    }
                }
                // "!--" directly followed by ">" or "->" is an abruptly closed empty comment that
                // ends at that '>'. Both forms are excluded here and handled by the branch below;
                // otherwise the scan for the terminator would start too late and swallow
                // everything up to some later comment end.
                else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) &&
                         !StartsWith(html, htmlStart + 3, htmlEnd, '>') &&
                         !StartsWith(html, htmlStart + 3, htmlEnd, "->", false))
                {
                    // Skip comment
                    htmlStart += 3;
                    bool foundEnd = false;
                    do
                    {
                        pos = IndexOf(html, htmlStart, htmlEnd, '-');
                        if (pos == -1)
                        {
                            yield break;
                        }
                        htmlStart = pos + 1;

                        // A '-' followed by "->" or "-!>" terminates the comment.
                        if (StartsWith(html, htmlStart, htmlEnd, "->", false))
                        {
                            htmlStart += 2;
                            foundEnd   = true;
                        }
                        else if (StartsWith(html, htmlStart, htmlEnd, "-!>", false))
                        {
                            htmlStart += 3;
                            foundEnd   = true;
                        }
                    } while (!foundEnd);
                }
                else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!'))
                {
                    // Skip bogus comment or DOCTYPE
                    htmlStart += 1;
                    pos        = IndexOf(html, htmlStart, htmlEnd, '>');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    htmlStart = pos + 1;
                }
                // Anything else: the '<' is plain text; the search resumes after it.
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> up to
        /// <paramref name="htmlEnd"/> and yields one <see cref="HTMLTag"/> for every start or
        /// end tag encountered. Comments, DOCTYPE-like declarations and other non-tag markup
        /// are skipped, as is the content of script, style, title and textarea elements.
        /// </summary>
        /// <param name="html">The markup to scan.</param>
        /// <param name="htmlStart">Index at which scanning begins.</param>
        /// <param name="htmlEnd">
        /// Upper bound of the scanned range; scanning stops once the position reaches it and it
        /// is passed as the limit to every search helper.
        /// </param>
        /// <returns>
        /// The tags in the order they appear. Enumeration simply ends when the input runs out in
        /// the middle of a construct; a partially parsed tag is never yielded.
        /// </returns>
        private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
        {
            while (htmlStart < htmlEnd) {
                // Every construct of interest begins with '<'; stop when none is left.
                int pos = IndexOf(html, htmlStart, htmlEnd, '<');
                if (pos == -1) yield break;

                HTMLTag tag = new HTMLTag();
                tag.Offset = pos;
                htmlStart = pos + 1;
                tag.IsEnd = StartsWith(html, htmlStart, htmlEnd, '/');

                // Only '<' (or "</") directly followed by a letter is parsed as a tag.
                if (StartsWithLetter(html, tag.IsEnd ? (htmlStart + 1) : htmlStart, htmlEnd)) {
                    // Parse tag name
                    if (tag.IsEnd) htmlStart += 1;
                    // NOTE(review): the boolean argument of IndexOfAny presumably makes whitespace
                    // a terminator as well - confirm against the helper.
                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
                    if (pos == -1) yield break;
                    tag.Name = GetSectionLower(html, htmlStart, pos);
                    htmlStart = pos;

                    // Parse attributes
                    bool isTagComplete = false;
                    do {
                        while (StartsWithWhiteSpace(html, htmlStart, htmlEnd)) htmlStart++;

                        // Re-evaluated on every pass, so the flag only stays set when the '/'
                        // is seen on the pass that also finds the closing '>'.
                        tag.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                        if (tag.IsSelfClosing) htmlStart += 1;

                        if (StartsWith(html, htmlStart, htmlEnd, '>')) {
                            htmlStart += 1;
                            isTagComplete = true;
                        }
                        else if (!tag.IsSelfClosing) {
                            // (A '/' that is not followed by '>' has already been consumed above
                            // and is otherwise ignored; the loop just goes around again.)
                            HTMLAttribute attribute = new HTMLAttribute();
                            attribute.Offset = htmlStart;

                            // Parse attribute name. The search starts one character in, so the
                            // name always contains at least one character.
                            pos = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                            if (pos == -1) yield break;
                            attribute.Name = GetSectionLower(html, htmlStart, pos);
                            htmlStart = pos;

                            while (StartsWithWhiteSpace(html, htmlStart, htmlEnd)) htmlStart++;
                            if (StartsWith(html, htmlStart, htmlEnd, '=')) {
                                // Parse attribute value
                                htmlStart += 1;
                                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd)) htmlStart++;
                                if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\'')) {
                                    // Quoted value: runs to the next occurrence of the same quote.
                                    char quoteChar = html[htmlStart];
                                    htmlStart += 1;
                                    pos = IndexOf(html, htmlStart, htmlEnd, quoteChar);
                                    if (pos == -1) yield break;
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    htmlStart = pos + 1;
                                }
                                else {
                                    // Unquoted value: runs to the terminator found by IndexOfAny,
                                    // which is left in place for the next pass to handle.
                                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                                    if (pos == -1) yield break;
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    htmlStart = pos;
                                }
                            }
                            else {
                                // Attribute without '=' gets an empty value.
                                attribute.Value = String.Empty;
                            }

                            attribute.Length = htmlStart - attribute.Offset;

                            // Added only when GetAttribute finds no attribute of that name yet,
                            // so an earlier occurrence is kept over a later duplicate.
                            if (tag.GetAttribute(attribute.Name) == null) {
                                tag.Attributes.Add(attribute);
                            }
                        }
                    } while (!isTagComplete);
                    tag.Length = htmlStart - tag.Offset;

                    // Yield result
                    yield return tag;

                    // Skip contents of special tags whose contents are to be treated as raw text
                    if (!tag.IsEnd && !tag.IsSelfClosing && tag.NameEqualsAny("script", "style", "title", "textarea")) {
                        string endTagText = "/" + tag.Name;
                        bool foundEndTag = false;
                        do {
                            pos = IndexOf(html, htmlStart, htmlEnd, '<');
                            if (pos == -1) yield break;
                            htmlStart = pos + 1;

                            // The end tag name must be followed by whitespace, '/' or '>' so that
                            // a longer name sharing the same prefix does not match.
                            if (StartsWith(html, htmlStart, htmlEnd, endTagText, true) &&
                                (StartsWithWhiteSpace(html, htmlStart + endTagText.Length, htmlEnd) ||
                                 StartsWithAny(html, htmlStart + endTagText.Length, htmlEnd, '/', '>')))
                            {
                                // Step back onto the '<' so the outer loop parses and yields
                                // the end tag itself.
                                htmlStart -= 1;
                                foundEndTag = true;
                            }
                        } while (!foundEndTag);
                    }
                }
                // "!--" directly followed by ">" or "->" is an abruptly closed empty comment that
                // ends at that '>'. Both forms are excluded here and handled by the branch below;
                // otherwise the scan for the terminator would start too late and swallow
                // everything up to some later comment end.
                else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) &&
                         !StartsWith(html, htmlStart + 3, htmlEnd, '>') &&
                         !StartsWith(html, htmlStart + 3, htmlEnd, "->", false)) {
                    // Skip comment
                    htmlStart += 3;
                    bool foundEnd = false;
                    do {
                        pos = IndexOf(html, htmlStart, htmlEnd, '-');
                        if (pos == -1) yield break;
                        htmlStart = pos + 1;

                        // A '-' followed by "->" or "-!>" terminates the comment.
                        if (StartsWith(html, htmlStart, htmlEnd, "->", false)) {
                            htmlStart += 2;
                            foundEnd = true;
                        }
                        else if (StartsWith(html, htmlStart, htmlEnd, "-!>", false)) {
                            htmlStart += 3;
                            foundEnd = true;
                        }
                    } while (!foundEnd);
                }
                else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!')) {
                    // Skip bogus comment or DOCTYPE
                    htmlStart += 1;
                    pos = IndexOf(html, htmlStart, htmlEnd, '>');
                    if (pos == -1) yield break;
                    htmlStart = pos + 1;
                }
                // Anything else: the '<' is plain text; the search resumes after it.
            }
        }