/// <summary>
/// Appends the miscellaneous replacements for a page to <paramref name="replaceList"/>:
/// one per '\n' when the platform newline is not "\n", one that blanks out every
/// <c>base</c> start tag, and one per rewritten <c>href</c>/<c>src</c> attribute on
/// <c>a</c>, <c>link</c>, <c>img</c> and <c>script</c> start tags.
/// </summary>
/// <param name="htmlParser">Parser that supplies the preprocessed HTML text and its start tags.</param>
/// <param name="pageURL">
/// URL of the page; handed to <c>GetAbsoluteURL</c> and used to recognise links that
/// point at a fragment of this same page.
/// </param>
/// <param name="replaceList">
/// List that receives the new entries. An attribute whose offset already appears in this
/// list when the method is entered is left alone.
/// </param>
public static void AddOtherReplaces(HTMLParser htmlParser, string pageURL, List<ReplaceInfo> replaceList)
{
    // Snapshot of the offsets that were present before this method adds anything;
    // entries added below are deliberately not part of it.
    HashSet<int> claimedOffsets = new HashSet<int>();
    foreach (ReplaceInfo existing in replaceList)
    {
        claimedOffsets.Add(existing.Offset);
    }

    // Swap every '\n' for the platform newline when the two differ.
    if (Environment.NewLine != "\n")
    {
        for (int lineFeedAt = htmlParser.PreprocessedHTML.IndexOf('\n', 0);
             lineFeedAt != -1;
             lineFeedAt = htmlParser.PreprocessedHTML.IndexOf('\n', lineFeedAt + 1))
        {
            ReplaceInfo newlineReplace = new ReplaceInfo
            {
                Offset = lineFeedAt,
                Length = 1,
                Type = ReplaceType.Other,
                Value = Environment.NewLine
            };
            replaceList.Add(newlineReplace);
        }
    }

    // Replace each <base> start tag with nothing.
    foreach (HTMLTag baseTag in htmlParser.FindStartTags("base"))
    {
        ReplaceInfo baseRemoval = new ReplaceInfo
        {
            Offset = baseTag.Offset,
            Length = baseTag.Length,
            Type = ReplaceType.Other,
            Value = String.Empty
        };
        replaceList.Add(baseRemoval);
    }

    foreach (HTMLTag tag in htmlParser.FindStartTags("a", "img", "script", "link"))
    {
        // <a> and <link> carry their URL in href; <img> and <script> carry it in src.
        bool isAnchor = tag.NameEquals("a");
        string urlAttributeName;
        if (isAnchor || tag.NameEquals("link"))
        {
            urlAttributeName = "href";
        }
        else if (tag.NameEquals("img") || tag.NameEquals("script"))
        {
            urlAttributeName = "src";
        }
        else
        {
            continue;
        }

        HTMLAttribute attribute = tag.GetAttribute(urlAttributeName);
        if (attribute == null || claimedOffsets.Contains(attribute.Offset))
        {
            continue;
        }

        // Resolve the decoded attribute value against the page URL.
        string newURL = GetAbsoluteURL(pageURL, HttpUtility.HtmlDecode(attribute.Value));
        if (newURL == null)
        {
            continue;
        }

        // An anchor whose resolved URL is the page URL followed by '#...' keeps only the fragment.
        bool pointsIntoThisPage =
            isAnchor &&
            newURL.Length > pageURL.Length &&
            newURL.StartsWith(pageURL, StringComparison.Ordinal) &&
            newURL[pageURL.Length] == '#';
        if (pointsIntoThisPage)
        {
            newURL = newURL.Substring(pageURL.Length);
        }

        string rewrittenAttribute = attribute.Name + "=\"" + HttpUtility.HtmlAttributeEncode(newURL) + "\"";
        replaceList.Add(
            new ReplaceInfo
            {
                Offset = attribute.Offset,
                Length = attribute.Length,
                Type = ReplaceType.Other,
                Value = rewrittenAttribute
            });
    }
}
/// <summary>
/// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> until
/// <paramref name="htmlEnd"/> is reached and yields an <c>HTMLTag</c> for every start
/// or end tag it finds. Comments, DOCTYPEs and other "&lt;!", "&lt;?" and "&lt;/" constructs
/// are skipped, as is the content of script, style, title and textarea elements.
/// </summary>
/// <param name="html">Text to scan.</param>
/// <param name="htmlStart">Index at which scanning begins.</param>
/// <param name="htmlEnd">Index at which scanning stops (the loop runs while htmlStart &lt; htmlEnd).</param>
/// <returns>
/// The tags in document order. Enumeration simply ends, without yielding the tag in
/// progress, when the input is cut off in the middle of a construct.
/// </returns>
private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
{
    while (htmlStart < htmlEnd)
    {
        int pos = IndexOf(html, htmlStart, htmlEnd, '<');
        if (pos == -1)
        {
            yield break;
        }

        HTMLTag tag = new HTMLTag();
        tag.Offset = pos;
        htmlStart = pos + 1;
        tag.IsEnd = StartsWith(html, htmlStart, htmlEnd, '/');

        // Only "<" + letter or "</" + letter opens a real tag.
        if (StartsWithLetter(html, tag.IsEnd ? (htmlStart + 1) : htmlStart, htmlEnd))
        {
            // Parse tag name
            if (tag.IsEnd)
            {
                htmlStart += 1;
            }
            // NOTE(review): the 'true' flag presumably makes IndexOfAny stop at whitespace too - confirm.
            pos = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
            if (pos == -1)
            {
                yield break;
            }
            tag.Name = GetSectionLower(html, htmlStart, pos);
            htmlStart = pos;

            // Parse attributes
            bool isTagComplete = false;
            do
            {
                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                {
                    htmlStart++;
                }

                // Recomputed on every pass, so only a '/' directly before the closing '>'
                // leaves the tag flagged as self-closing.
                tag.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                if (tag.IsSelfClosing)
                {
                    htmlStart += 1;
                }

                if (StartsWith(html, htmlStart, htmlEnd, '>'))
                {
                    htmlStart += 1;
                    isTagComplete = true;
                }
                else if (tag.IsSelfClosing)
                {
                    // Stray '/' that is not followed by '>': already consumed, keep scanning.
                }
                else
                {
                    HTMLAttribute attribute = new HTMLAttribute();
                    attribute.Offset = htmlStart;

                    // Parse attribute name. The search starts one character in, so the
                    // first character always belongs to the name.
                    pos = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    attribute.Name = GetSectionLower(html, htmlStart, pos);
                    htmlStart = pos;

                    while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                    {
                        htmlStart++;
                    }

                    if (StartsWith(html, htmlStart, htmlEnd, '='))
                    {
                        // Parse attribute value
                        htmlStart += 1;
                        while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                        {
                            htmlStart++;
                        }

                        if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\''))
                        {
                            // Quoted value: runs to the matching quote, which is consumed.
                            char quoteChar = html[htmlStart];
                            htmlStart += 1;
                            pos = IndexOf(html, htmlStart, htmlEnd, quoteChar);
                            if (pos == -1)
                            {
                                yield break;
                            }
                            attribute.Value = GetSection(html, htmlStart, pos);
                            htmlStart = pos + 1;
                        }
                        else
                        {
                            // Unquoted value: the terminator itself is left for the next pass.
                            pos = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                            if (pos == -1)
                            {
                                yield break;
                            }
                            attribute.Value = GetSection(html, htmlStart, pos);
                            htmlStart = pos;
                        }
                    }
                    else
                    {
                        attribute.Value = String.Empty;
                    }

                    attribute.Length = htmlStart - attribute.Offset;

                    // When a name is repeated, only its first occurrence is kept.
                    if (tag.GetAttribute(attribute.Name) == null)
                    {
                        tag.Attributes.Add(attribute);
                    }
                }
            } while (!isTagComplete);

            tag.Length = htmlStart - tag.Offset;

            // Yield result
            yield return tag;

            // Skip contents of special tags whose contents are to be treated as raw text
            if (!tag.IsEnd && !tag.IsSelfClosing && tag.NameEqualsAny("script", "style", "title", "textarea"))
            {
                string endTagText = "/" + tag.Name;
                bool foundEndTag = false;
                do
                {
                    pos = IndexOf(html, htmlStart, htmlEnd, '<');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    htmlStart = pos + 1;

                    if (StartsWith(html, htmlStart, htmlEnd, endTagText, true) &&
                        (StartsWithWhiteSpace(html, htmlStart + endTagText.Length, htmlEnd) ||
                         StartsWithAny(html, htmlStart + endTagText.Length, htmlEnd, '/', '>')))
                    {
                        // Step back onto the '<' so the outer loop parses and yields the end tag.
                        htmlStart -= 1;
                        foundEndTag = true;
                    }
                } while (!foundEndTag);
            }
        }
        else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) &&
                 !StartsWith(html, htmlStart + 3, htmlEnd, '>') &&
                 !StartsWith(html, htmlStart + 3, htmlEnd, "->", false))
        {
            // Skip comment. "<!-->" and "<!--->" are empty comments that end at their own
            // '>', so both are excluded here and handled by the branch below. Without the
            // "->" exclusion, "<!--->" would consume its last dash and then search for a
            // later "-->", hiding every tag in between.
            htmlStart += 3;
            bool foundEnd = false;
            do
            {
                pos = IndexOf(html, htmlStart, htmlEnd, '-');
                if (pos == -1)
                {
                    yield break;
                }
                htmlStart = pos + 1;

                // With one '-' consumed, "->" completes "-->" and "-!>" completes "--!>".
                if (StartsWith(html, htmlStart, htmlEnd, "->", false))
                {
                    htmlStart += 2;
                    foundEnd = true;
                }
                else if (StartsWith(html, htmlStart, htmlEnd, "-!>", false))
                {
                    htmlStart += 3;
                    foundEnd = true;
                }
            } while (!foundEnd);
        }
        else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!'))
        {
            // Skip bogus comment or DOCTYPE
            htmlStart += 1;
            pos = IndexOf(html, htmlStart, htmlEnd, '>');
            if (pos == -1)
            {
                yield break;
            }
            htmlStart = pos + 1;
        }
    }
}
/// <summary>
/// Returns the <c>Value</c> of the attribute that <c>GetAttribute</c> finds for
/// <paramref name="attributeName"/>, or <c>null</c> when it finds none.
/// </summary>
/// <param name="attributeName">Name passed through to <c>GetAttribute</c>.</param>
public string GetAttributeValue(string attributeName)
{
    HTMLAttribute match = GetAttribute(attributeName);
    if (match == null)
    {
        return null;
    }

    return match.Value;
}
/// <summary>
/// Walks <paramref name="html"/> from <paramref name="htmlStart"/> up to
/// <paramref name="htmlEnd"/> and lazily yields one <c>HTMLTag</c> per start or end tag.
/// Comments, DOCTYPEs and other "&lt;!", "&lt;?" and "&lt;/" constructs produce no tag, and
/// the content of script, style, title and textarea elements is not scanned for tags.
/// </summary>
/// <param name="html">Text to scan.</param>
/// <param name="htmlStart">Index of the first character to examine.</param>
/// <param name="htmlEnd">Index at which scanning stops (the loop runs while htmlStart &lt; htmlEnd).</param>
/// <returns>
/// Tags in the order they occur. If the text ends in the middle of a tag, comment or
/// raw-text element, enumeration ends without yielding anything further.
/// </returns>
private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
{
    while (htmlStart < htmlEnd)
    {
        int pos = IndexOf(html, htmlStart, htmlEnd, '<');
        if (pos == -1)
        {
            yield break;
        }

        HTMLTag tag = new HTMLTag();
        tag.Offset = pos;
        htmlStart = pos + 1;
        tag.IsEnd = StartsWith(html, htmlStart, htmlEnd, '/');

        // A tag needs a letter right after "<" (or after "</" for an end tag).
        if (StartsWithLetter(html, tag.IsEnd ? (htmlStart + 1) : htmlStart, htmlEnd))
        {
            // Parse tag name
            if (tag.IsEnd)
            {
                htmlStart += 1;
            }
            // NOTE(review): the boolean argument presumably adds whitespace to the stop set - confirm.
            pos = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
            if (pos == -1)
            {
                yield break;
            }
            tag.Name = GetSectionLower(html, htmlStart, pos);
            htmlStart = pos;

            // Parse attributes
            bool isTagComplete = false;
            do
            {
                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                {
                    htmlStart++;
                }

                // The flag is overwritten each time round, so it ends up true only when
                // the '/' sits immediately before the closing '>'.
                tag.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                if (tag.IsSelfClosing)
                {
                    htmlStart += 1;
                }

                if (StartsWith(html, htmlStart, htmlEnd, '>'))
                {
                    htmlStart += 1;
                    isTagComplete = true;
                }
                else if (tag.IsSelfClosing)
                {
                    // A '/' in the middle of the tag: it has been consumed, nothing else to do.
                }
                else
                {
                    HTMLAttribute attribute = new HTMLAttribute();
                    attribute.Offset = htmlStart;

                    // Parse attribute name. Searching from htmlStart + 1 means the first
                    // character is always taken as part of the name.
                    pos = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    attribute.Name = GetSectionLower(html, htmlStart, pos);
                    htmlStart = pos;

                    while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                    {
                        htmlStart++;
                    }

                    if (StartsWith(html, htmlStart, htmlEnd, '='))
                    {
                        // Parse attribute value
                        htmlStart += 1;
                        while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                        {
                            htmlStart++;
                        }

                        if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\''))
                        {
                            // Quoted: take everything up to the same quote character and step past it.
                            char quoteChar = html[htmlStart];
                            htmlStart += 1;
                            pos = IndexOf(html, htmlStart, htmlEnd, quoteChar);
                            if (pos == -1)
                            {
                                yield break;
                            }
                            attribute.Value = GetSection(html, htmlStart, pos);
                            htmlStart = pos + 1;
                        }
                        else
                        {
                            // Unquoted: stop at the terminator but do not consume it.
                            pos = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                            if (pos == -1)
                            {
                                yield break;
                            }
                            attribute.Value = GetSection(html, htmlStart, pos);
                            htmlStart = pos;
                        }
                    }
                    else
                    {
                        attribute.Value = String.Empty;
                    }

                    attribute.Length = htmlStart - attribute.Offset;

                    // A repeated attribute name is ignored; the first one wins.
                    if (tag.GetAttribute(attribute.Name) == null)
                    {
                        tag.Attributes.Add(attribute);
                    }
                }
            } while (!isTagComplete);

            tag.Length = htmlStart - tag.Offset;

            // Yield result
            yield return tag;

            // Skip contents of special tags whose contents are to be treated as raw text
            if (!tag.IsEnd && !tag.IsSelfClosing && tag.NameEqualsAny("script", "style", "title", "textarea"))
            {
                string endTagText = "/" + tag.Name;
                bool foundEndTag = false;
                do
                {
                    pos = IndexOf(html, htmlStart, htmlEnd, '<');
                    if (pos == -1)
                    {
                        yield break;
                    }
                    htmlStart = pos + 1;

                    bool nameMatches = StartsWith(html, htmlStart, htmlEnd, endTagText, true);
                    if (nameMatches &&
                        (StartsWithWhiteSpace(html, htmlStart + endTagText.Length, htmlEnd) ||
                         StartsWithAny(html, htmlStart + endTagText.Length, htmlEnd, '/', '>')))
                    {
                        // Rewind to the '<' so the matching end tag is parsed by the outer loop.
                        htmlStart -= 1;
                        foundEndTag = true;
                    }
                } while (!foundEndTag);
            }
        }
        else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) &&
                 !StartsWith(html, htmlStart + 3, htmlEnd, '>') &&
                 !StartsWith(html, htmlStart + 3, htmlEnd, "->", false))
        {
            // Skip comment. The empty comments "<!-->" and "<!--->" close at their own '>',
            // so they are kept out of this branch and skipped by the one below. If "<!--->"
            // were scanned here, its final dash would be consumed and the search would run
            // on to some later "-->", swallowing the tags in between.
            htmlStart += 3;
            bool foundEnd = false;
            do
            {
                pos = IndexOf(html, htmlStart, htmlEnd, '-');
                if (pos == -1)
                {
                    yield break;
                }
                htmlStart = pos + 1;

                // One '-' is already consumed: "->" finishes "-->", "-!>" finishes "--!>".
                if (StartsWith(html, htmlStart, htmlEnd, "->", false))
                {
                    htmlStart += 2;
                    foundEnd = true;
                }
                else if (StartsWith(html, htmlStart, htmlEnd, "-!>", false))
                {
                    htmlStart += 3;
                    foundEnd = true;
                }
            } while (!foundEnd);
        }
        else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!'))
        {
            // Skip bogus comment or DOCTYPE
            htmlStart += 1;
            pos = IndexOf(html, htmlStart, htmlEnd, '>');
            if (pos == -1)
            {
                yield break;
            }
            htmlStart = pos + 1;
        }
    }
}