Пример #1
0
        /// <summary>
        /// Appends three groups of <see cref="ReplaceType.Other"/> replacements to
        /// <paramref name="replaceList"/>: line-feed characters (when the platform newline is not
        /// <c>"\n"</c>), <c>base</c> start tags (replaced with an empty string), and the
        /// <c>href</c>/<c>src</c> attributes of <c>a</c>, <c>link</c>, <c>img</c> and <c>script</c>
        /// start tags (rewritten with the URL returned by <c>GetAbsoluteURL</c>).
        /// </summary>
        /// <param name="htmlParser">Parser that supplies the preprocessed HTML text and the start tags.</param>
        /// <param name="pageURL">URL of the page; passed to <c>GetAbsoluteURL</c> and used to detect same-page fragment links.</param>
        /// <param name="replaceList">List that receives the new entries. Attributes whose offset is already used by an entry present on entry to this method are skipped.</param>
        public static void AddOtherReplaces(HTMLParser htmlParser, string pageURL, List <ReplaceInfo> replaceList)
        {
            // Remember which offsets were already taken before this method adds anything.
            HashSet<int> claimedOffsets = new HashSet<int>();
            for (int i = 0; i < replaceList.Count; i++)
            {
                claimedOffsets.Add(replaceList[i].Offset);
            }

            // Swap every '\n' for the platform newline when the two differ.
            if (Environment.NewLine != "\n")
            {
                for (int newlinePos = htmlParser.PreprocessedHTML.IndexOf('\n', 0);
                     newlinePos != -1;
                     newlinePos = htmlParser.PreprocessedHTML.IndexOf('\n', newlinePos + 1))
                {
                    ReplaceInfo newlineReplace = new ReplaceInfo();
                    newlineReplace.Offset = newlinePos;
                    newlineReplace.Length = 1;
                    newlineReplace.Type   = ReplaceType.Other;
                    newlineReplace.Value  = Environment.NewLine;
                    replaceList.Add(newlineReplace);
                }
            }

            // Blank out every <base> start tag.
            foreach (HTMLTag baseTag in htmlParser.FindStartTags("base"))
            {
                ReplaceInfo baseReplace = new ReplaceInfo();
                baseReplace.Offset = baseTag.Offset;
                baseReplace.Length = baseTag.Length;
                baseReplace.Type   = ReplaceType.Other;
                baseReplace.Value  = String.Empty;
                replaceList.Add(baseReplace);
            }

            // Rewrite the URL-carrying attribute of links, images, scripts and stylesheets.
            foreach (HTMLTag tag in htmlParser.FindStartTags("a", "img", "script", "link"))
            {
                bool isAnchor = tag.NameEquals("a");

                // a/link carry their URL in href; img/script carry it in src.
                string urlAttributeName;
                if (isAnchor || tag.NameEquals("link"))
                {
                    urlAttributeName = "href";
                }
                else if (tag.NameEquals("img") || tag.NameEquals("script"))
                {
                    urlAttributeName = "src";
                }
                else
                {
                    continue;
                }

                HTMLAttribute urlAttribute = tag.GetAttribute(urlAttributeName);
                if (urlAttribute == null || existingOffsetTaken(claimedOffsets, urlAttribute.Offset))
                {
                    continue;
                }

                // Make attribute's URL absolute
                string resolvedURL = GetAbsoluteURL(pageURL, HttpUtility.HtmlDecode(urlAttribute.Value));
                if (resolvedURL == null)
                {
                    continue;
                }

                // For links to anchors on the current page, use just the fragment
                bool pointsIntoSamePage =
                    isAnchor &&
                    resolvedURL.Length > pageURL.Length &&
                    resolvedURL.StartsWith(pageURL, StringComparison.Ordinal) &&
                    resolvedURL[pageURL.Length] == '#';
                if (pointsIntoSamePage)
                {
                    resolvedURL = resolvedURL.Substring(pageURL.Length);
                }

                ReplaceInfo attributeReplace = new ReplaceInfo();
                attributeReplace.Offset = urlAttribute.Offset;
                attributeReplace.Length = urlAttribute.Length;
                attributeReplace.Type   = ReplaceType.Other;
                attributeReplace.Value  = urlAttribute.Name + "=\"" + HttpUtility.HtmlAttributeEncode(resolvedURL) + "\"";
                replaceList.Add(attributeReplace);
            }
        }

        // True when the given offset was already used by a replacement that existed on entry.
        private static bool existingOffsetTaken(HashSet<int> claimedOffsets, int offset)
        {
            return claimedOffsets.Contains(offset);
        }
Пример #2
0
        /// <summary>
        /// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> up to
        /// <paramref name="htmlEnd"/> and yields one <see cref="HTMLTag"/> for each start or end tag found.
        /// </summary>
        /// <remarks>
        /// Comments and other markup starting with <c>&lt;?</c>, <c>&lt;!</c> or <c>&lt;/</c> (when not
        /// followed by a letter) are skipped without yielding anything. After a non-self-closing
        /// <c>script</c>, <c>style</c>, <c>title</c> or <c>textarea</c> start tag, everything up to the
        /// matching end tag is skipped. Whenever a required delimiter cannot be found, enumeration ends
        /// via <c>yield break</c>; a partially parsed tag is not yielded.
        /// </remarks>
        /// <param name="html">Text to scan.</param>
        /// <param name="htmlStart">Index where scanning begins; the parameter doubles as the moving cursor.</param>
        /// <param name="htmlEnd">Upper bound handed to every helper; the main loop runs while <c>htmlStart &lt; htmlEnd</c>, so it appears to be exclusive (confirm against the helpers).</param>
        /// <returns>The tags in the order they occur in the text.</returns>
        private static IEnumerable <HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
        {
            while (htmlStart < htmlEnd)
            {
                // Every construct handled here begins with '<'; stop when none is left.
                int pos = IndexOf(html, htmlStart, htmlEnd, '<');
                if (pos == -1)
                {
                    yield break;
                }

                HTMLTag tag = new HTMLTag();
                // The tag's offset is the position of its '<'; the cursor moves just past it.
                tag.Offset = pos;
                htmlStart  = pos + 1;
                // A '/' directly after '<' marks an end tag.
                tag.IsEnd  = StartsWith(html, htmlStart, htmlEnd, '/');
                // Only "<" or "</" immediately followed by a letter is treated as a real tag.
                if (StartsWithLetter(html, tag.IsEnd ? (htmlStart + 1) : htmlStart, htmlEnd))
                {
                    // Parse tag name
                    if (tag.IsEnd)
                    {
                        // Step over the '/' of "</".
                        htmlStart += 1;
                    }
                    // NOTE(review): the `true` argument presumably makes whitespace a delimiter
                    // in addition to '/' and '>' - confirm against IndexOfAny.
                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    // NOTE(review): GetSectionLower presumably returns html[htmlStart..pos) lower-cased - confirm.
                    tag.Name  = GetSectionLower(html, htmlStart, pos);
                    htmlStart = pos;

                    // Parse attributes
                    // Each pass consumes leading whitespace and then either the closing '>',
                    // a stray '/', or one attribute.
                    bool isTagComplete = false;
                    do
                    {
                        while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                        {
                            htmlStart++;
                        }
                        // Re-evaluated on every pass, so the flag ends up true only when the
                        // '/' sits immediately before the closing '>'.
                        tag.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                        if (tag.IsSelfClosing)
                        {
                            htmlStart += 1;
                        }
                        if (StartsWith(html, htmlStart, htmlEnd, '>'))
                        {
                            // End of the tag: move past '>' and leave the loop.
                            htmlStart    += 1;
                            isTagComplete = true;
                        }
                        else if (tag.IsSelfClosing)
                        {
                            // A '/' that is not followed by '>' has already been consumed above;
                            // nothing else to do for it.
                        }
                        else
                        {
                            HTMLAttribute attribute = new HTMLAttribute();
                            attribute.Offset = htmlStart;

                            // Parse attribute name
                            // The search starts at htmlStart + 1, so the first character always
                            // belongs to the name, even if it is one of the delimiters.
                            pos = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                            if (pos == -1)
                            {
                                yield break;
                            }
                            attribute.Name = GetSectionLower(html, htmlStart, pos);
                            htmlStart      = pos;

                            // Whitespace is allowed between the name and '='.
                            while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                            {
                                htmlStart++;
                            }
                            if (StartsWith(html, htmlStart, htmlEnd, '='))
                            {
                                // Parse attribute value
                                htmlStart += 1;
                                // Whitespace is also allowed between '=' and the value.
                                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                                {
                                    htmlStart++;
                                }
                                if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\''))
                                {
                                    // Quoted value: everything up to the matching quote character.
                                    char quoteChar = html[htmlStart];
                                    htmlStart += 1;
                                    pos        = IndexOf(html, htmlStart, htmlEnd, quoteChar);
                                    if (pos == -1)
                                    {
                                        yield break;
                                    }
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    // Move past the closing quote.
                                    htmlStart       = pos + 1;
                                }
                                else
                                {
                                    // Unquoted value: runs up to '>' (and presumably whitespace,
                                    // given the `true` argument - confirm against IndexOfAny).
                                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                                    if (pos == -1)
                                    {
                                        yield break;
                                    }
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    htmlStart       = pos;
                                }
                            }
                            else
                            {
                                // Attribute without '=': the value is the empty string, not null.
                                attribute.Value = String.Empty;
                            }

                            // Length covers everything from the first name character to the end
                            // of the value, including any closing quote.
                            attribute.Length = htmlStart - attribute.Offset;
                            // Keep only the first attribute with a given name; later ones are dropped.
                            // (Presumes GetAttribute looks the name up in tag.Attributes - confirm.)
                            if (tag.GetAttribute(attribute.Name) == null)
                            {
                                tag.Attributes.Add(attribute);
                            }
                        }
                    } while (!isTagComplete);
                    // Length spans from '<' through the closing '>'.
                    tag.Length = htmlStart - tag.Offset;

                    // Yield result
                    // The code below runs only when the consumer asks for the next tag.
                    yield return(tag);

                    // Skip contents of special tags whose contents are to be treated as raw text
                    if (!tag.IsEnd && !tag.IsSelfClosing && tag.NameEqualsAny("script", "style", "title", "textarea"))
                    {
                        bool foundEndTag = false;
                        do
                        {
                            // Any '<' inside the raw text is a candidate for the end tag.
                            pos = IndexOf(html, htmlStart, htmlEnd, '<');
                            if (pos == -1)
                            {
                                yield break;
                            }
                            htmlStart = pos + 1;
                            string endTagText = "/" + tag.Name;
                            // The candidate must read "/name" followed by whitespace, '/' or '>'.
                            // NOTE(review): the trailing `true` presumably requests a
                            // case-insensitive match - confirm against StartsWith.
                            if (StartsWith(html, htmlStart, htmlEnd, endTagText, true) &&
                                (StartsWithWhiteSpace(html, htmlStart + endTagText.Length, htmlEnd) ||
                                 StartsWithAny(html, htmlStart + endTagText.Length, htmlEnd, '/', '>')))
                            {
                                // Step back onto the '<' so the outer loop parses and yields
                                // the end tag like any other tag.
                                htmlStart  -= 1;
                                foundEndTag = true;
                            }
                        } while (!foundEndTag);
                    }
                }
                // "<!--" opens a comment, except for "<!-->", which falls through to the next branch.
                else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) && !StartsWith(html, htmlStart + 3, htmlEnd, '>'))
                {
                    // Skip comment
                    htmlStart += 3;
                    bool foundEnd = false;
                    do
                    {
                        // Find the next '-' and test whether it starts "-->" or "--!>".
                        pos = IndexOf(html, htmlStart, htmlEnd, '-');
                        if (pos == -1)
                        {
                            yield break;
                        }
                        htmlStart = pos + 1;
                        if (StartsWith(html, htmlStart, htmlEnd, "->", false))
                        {
                            htmlStart += 2;
                            foundEnd   = true;
                        }
                        else if (StartsWith(html, htmlStart, htmlEnd, "-!>", false))
                        {
                            htmlStart += 3;
                            foundEnd   = true;
                        }
                    } while (!foundEnd);
                }
                else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!'))
                {
                    // Skip bogus comment or DOCTYPE
                    // Everything up to and including the next '>' is discarded.
                    htmlStart += 1;
                    pos        = IndexOf(html, htmlStart, htmlEnd, '>');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    htmlStart = pos + 1;
                }
                // Any other character after '<' needs no handling: the cursor is already past
                // the '<', so the next pass simply looks for the following one.
            }
        }
Пример #3
0
        /// <summary>
        /// Returns the <c>Value</c> of the attribute that <c>GetAttribute</c> finds for
        /// <paramref name="attributeName"/>.
        /// </summary>
        /// <param name="attributeName">Name handed to <c>GetAttribute</c>.</param>
        /// <returns>The attribute's value, or <c>null</c> when <c>GetAttribute</c> returns <c>null</c>.</returns>
        public string GetAttributeValue(string attributeName)
        {
            HTMLAttribute found = GetAttribute(attributeName);
            if (found == null)
            {
                // No such attribute: report that with null rather than an empty string.
                return null;
            }

            return found.Value;
        }
Пример #4
0
        /// <summary>
        /// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> up to
        /// <paramref name="htmlEnd"/> and yields one <see cref="HTMLTag"/> for each start or end tag found.
        /// </summary>
        /// <remarks>
        /// Comments and other markup starting with <c>&lt;?</c>, <c>&lt;!</c> or <c>&lt;/</c> (when not
        /// followed by a letter) are skipped without yielding anything. After a non-self-closing
        /// <c>script</c>, <c>style</c>, <c>title</c> or <c>textarea</c> start tag, everything up to the
        /// matching end tag is skipped. Whenever a required delimiter cannot be found, enumeration ends
        /// via <c>yield break</c>; a partially parsed tag is not yielded.
        /// </remarks>
        /// <param name="html">Text to scan.</param>
        /// <param name="htmlStart">Index where scanning begins; the parameter doubles as the moving cursor.</param>
        /// <param name="htmlEnd">Upper bound handed to every helper; the main loop runs while <c>htmlStart &lt; htmlEnd</c>, so it appears to be exclusive (confirm against the helpers).</param>
        /// <returns>The tags in the order they occur in the text.</returns>
        private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
        {
            while (htmlStart < htmlEnd) {
                // Every construct handled here begins with '<'; stop when none is left.
                int pos = IndexOf(html, htmlStart, htmlEnd, '<');
                if (pos == -1) yield break;

                HTMLTag tag = new HTMLTag();
                // The tag's offset is the position of its '<'; the cursor moves just past it.
                tag.Offset = pos;
                htmlStart = pos + 1;
                // A '/' directly after '<' marks an end tag.
                tag.IsEnd = StartsWith(html, htmlStart, htmlEnd, '/');
                // Only "<" or "</" immediately followed by a letter is treated as a real tag.
                if (StartsWithLetter(html, tag.IsEnd ? (htmlStart + 1) : htmlStart, htmlEnd)) {
                    // Parse tag name
                    // For an end tag, first step over the '/' of "</".
                    if (tag.IsEnd) htmlStart += 1;
                    // NOTE(review): the `true` argument presumably makes whitespace a delimiter
                    // in addition to '/' and '>' - confirm against IndexOfAny.
                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
                    if (pos == -1) yield break;
                    // NOTE(review): GetSectionLower presumably returns html[htmlStart..pos) lower-cased - confirm.
                    tag.Name = GetSectionLower(html, htmlStart, pos);
                    htmlStart = pos;

                    // Parse attributes
                    // Each pass consumes leading whitespace and then either the closing '>',
                    // a stray '/', or one attribute.
                    bool isTagComplete = false;
                    do {
                        while (StartsWithWhiteSpace(html, htmlStart, htmlEnd)) htmlStart++;
                        // Re-evaluated on every pass, so the flag ends up true only when the
                        // '/' sits immediately before the closing '>'.
                        tag.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                        if (tag.IsSelfClosing) htmlStart += 1;
                        if (StartsWith(html, htmlStart, htmlEnd, '>')) {
                            // End of the tag: move past '>' and leave the loop.
                            htmlStart += 1;
                            isTagComplete = true;
                        }
                        // A '/' that is not followed by '>' has already been consumed above;
                        // nothing else to do for it.
                        else if (tag.IsSelfClosing) { }
                        else {
                            HTMLAttribute attribute = new HTMLAttribute();
                            attribute.Offset = htmlStart;

                            // Parse attribute name
                            // The search starts at htmlStart + 1, so the first character always
                            // belongs to the name, even if it is one of the delimiters.
                            pos = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                            if (pos == -1) yield break;
                            attribute.Name = GetSectionLower(html, htmlStart, pos);
                            htmlStart = pos;

                            // Whitespace is allowed between the name and '='.
                            while (StartsWithWhiteSpace(html, htmlStart, htmlEnd)) htmlStart++;
                            if (StartsWith(html, htmlStart, htmlEnd, '=')) {
                                // Parse attribute value
                                htmlStart += 1;
                                // Whitespace is also allowed between '=' and the value.
                                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd)) htmlStart++;
                                if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\'')) {
                                    // Quoted value: everything up to the matching quote character.
                                    char quoteChar = html[htmlStart];
                                    htmlStart += 1;
                                    pos = IndexOf(html, htmlStart, htmlEnd, quoteChar);
                                    if (pos == -1) yield break;
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    // Move past the closing quote.
                                    htmlStart = pos + 1;
                                }
                                else {
                                    // Unquoted value: runs up to '>' (and presumably whitespace,
                                    // given the `true` argument - confirm against IndexOfAny).
                                    pos = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                                    if (pos == -1) yield break;
                                    attribute.Value = GetSection(html, htmlStart, pos);
                                    htmlStart = pos;
                                }
                            }
                            else {
                                // Attribute without '=': the value is the empty string, not null.
                                attribute.Value = String.Empty;
                            }

                            // Length covers everything from the first name character to the end
                            // of the value, including any closing quote.
                            attribute.Length = htmlStart - attribute.Offset;
                            // Keep only the first attribute with a given name; later ones are dropped.
                            // (Presumes GetAttribute looks the name up in tag.Attributes - confirm.)
                            if (tag.GetAttribute(attribute.Name) == null) {
                                tag.Attributes.Add(attribute);
                            }
                        }
                    } while (!isTagComplete);
                    // Length spans from '<' through the closing '>'.
                    tag.Length = htmlStart - tag.Offset;

                    // Yield result
                    // The code below runs only when the consumer asks for the next tag.
                    yield return tag;

                    // Skip contents of special tags whose contents are to be treated as raw text
                    if (!tag.IsEnd && !tag.IsSelfClosing && tag.NameEqualsAny("script", "style", "title", "textarea")) {
                        bool foundEndTag = false;
                        do {
                            // Any '<' inside the raw text is a candidate for the end tag.
                            pos = IndexOf(html, htmlStart, htmlEnd, '<');
                            if (pos == -1) yield break;
                            htmlStart = pos + 1;
                            string endTagText = "/" + tag.Name;
                            // The candidate must read "/name" followed by whitespace, '/' or '>'.
                            // NOTE(review): the trailing `true` presumably requests a
                            // case-insensitive match - confirm against StartsWith.
                            if (StartsWith(html, htmlStart, htmlEnd, endTagText, true) &&
                                (StartsWithWhiteSpace(html, htmlStart + endTagText.Length, htmlEnd) ||
                                 StartsWithAny(html, htmlStart + endTagText.Length, htmlEnd, '/', '>')))
                            {
                                // Step back onto the '<' so the outer loop parses and yields
                                // the end tag like any other tag.
                                htmlStart -= 1;
                                foundEndTag = true;
                            }
                        } while (!foundEndTag);
                    }
                }
                // "<!--" opens a comment, except for "<!-->", which falls through to the next branch.
                else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) && !StartsWith(html, htmlStart + 3, htmlEnd, '>')) {
                    // Skip comment
                    htmlStart += 3;
                    bool foundEnd = false;
                    do {
                        // Find the next '-' and test whether it starts "-->" or "--!>".
                        pos = IndexOf(html, htmlStart, htmlEnd, '-');
                        if (pos == -1) yield break;
                        htmlStart = pos + 1;
                        if (StartsWith(html, htmlStart, htmlEnd, "->", false)) {
                            htmlStart += 2;
                            foundEnd = true;
                        }
                        else if (StartsWith(html, htmlStart, htmlEnd, "-!>", false)) {
                            htmlStart += 3;
                            foundEnd = true;
                        }
                    } while (!foundEnd);
                }
                else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!')) {
                    // Skip bogus comment or DOCTYPE
                    // Everything up to and including the next '>' is discarded.
                    htmlStart += 1;
                    pos = IndexOf(html, htmlStart, htmlEnd, '>');
                    if (pos == -1) yield break;
                    htmlStart = pos + 1;
                }
                // Any other character after '<' needs no handling: the cursor is already past
                // the '<', so the next pass simply looks for the following one.
            }
        }