/// <summary>
/// Decides whether a link tag points at an image by inspecting its resolved URL.
/// </summary>
/// <param name="linkTag">Tag whose "href" attribute is examined.</param>
/// <returns>
/// <c>true</c> when the absolute URL built from the HTML-decoded "href" value is not null and
/// contains <c>ImageURLKeyword</c> (ordinal, case-insensitive); otherwise <c>false</c>.
/// </returns>
protected virtual bool IsImage(HTMLTag linkTag)
{
    string href = linkTag.GetAttributeValueOrEmpty("href");
    string url = General.GetAbsoluteURL(_url, HttpUtility.HtmlDecode(href));
    if (url == null)
    {
        return false;
    }

    return url.IndexOf(ImageURLKeyword, StringComparison.OrdinalIgnoreCase) != -1;
}
/// <summary>
/// Creates a range described by the given pair of tags.
/// </summary>
/// <param name="startTag">Tag stored in <c>StartTag</c>.</param>
/// <param name="endTag">Tag stored in <c>EndTag</c>.</param>
public HTMLTagRange(HTMLTag startTag, HTMLTag endTag)
{
    this.StartTag = startTag;
    this.EndTag = endTag;
}
/// <summary>
/// Returns the section of the preprocessed HTML that begins at the start tag's offset.
/// </summary>
/// <param name="startTag">Tag whose <c>Offset</c> marks the beginning of the section.</param>
/// <param name="endTag">Tag whose <c>EndOffset</c> marks the end; not read when
/// <paramref name="startTag"/> is self-closing.</param>
/// <returns>The text produced by <c>GetSection</c> for the computed bounds.</returns>
public string GetHTML(HTMLTag startTag, HTMLTag endTag)
{
    int sectionStart = startTag.Offset;

    int sectionEnd;
    if (startTag.IsSelfClosing)
    {
        // A self-closing tag is its own end, so the section stops where the tag itself stops.
        sectionEnd = startTag.EndOffset;
    }
    else
    {
        sectionEnd = endTag.EndOffset;
    }

    return GetSection(_preprocessedHTML, sectionStart, sectionEnd);
}
/// <summary>
/// Checks a tag's "class" attribute for the given class name.
/// </summary>
/// <param name="tag">Tag whose "class" attribute value is read.</param>
/// <param name="targetClassName">Class name to look for.</param>
/// <returns>
/// <c>false</c> when the tag has no "class" attribute value; otherwise the result of the
/// string-based <c>ClassAttributeValueHas</c> overload applied to that value.
/// </returns>
public static bool ClassAttributeValueHas(HTMLTag tag, string targetClassName)
{
    string classValue = tag.GetAttributeValue("class");
    if (classValue == null)
    {
        return false;
    }

    return ClassAttributeValueHas(classValue, targetClassName);
}
/// <summary>
/// Finds the end tag matching <paramref name="tag"/> with no stop-before boundary.
/// </summary>
/// <param name="tag">Start tag to match.</param>
/// <returns>Whatever the two-argument overload returns when called with a null boundary.</returns>
public HTMLTag FindCorrespondingEndTag(HTMLTag tag)
{
    HTMLTag matchingEndTag = FindCorrespondingEndTag(tag, null);
    return matchingEndTag;
}
/// <summary>
/// Builds a tag range from <paramref name="tag"/> to its corresponding end tag.
/// </summary>
/// <param name="tag">Start tag of the range.</param>
/// <param name="stopBeforeTag">Boundary forwarded to <c>FindCorrespondingEndTag</c>.</param>
/// <returns>
/// A new <see cref="HTMLTagRange"/>, or <c>null</c> when either <paramref name="tag"/> or the
/// located end tag is null.
/// </returns>
public HTMLTagRange CreateTagRange(HTMLTag tag, HTMLTag stopBeforeTag)
{
    HTMLTag closingTag = FindCorrespondingEndTag(tag, stopBeforeTag);
    if (tag == null || closingTag == null)
    {
        return null;
    }

    return new HTMLTagRange(tag, closingTag);
}
/// <summary>
/// Finds start tags with any of the given names between two boundary tags.
/// </summary>
/// <param name="startAfterTag">Lower boundary forwarded to <c>FindTags</c>.</param>
/// <param name="stopBeforeTag">Upper boundary forwarded to <c>FindTags</c>.</param>
/// <param name="names">Tag names to match.</param>
/// <returns>The sequence produced by <c>FindTags</c> with <c>isEndTag</c> set to <c>false</c>.</returns>
public IEnumerable<HTMLTag> FindStartTags(HTMLTag startAfterTag, HTMLTag stopBeforeTag, params string[] names)
{
    const bool wantEndTags = false;
    return FindTags(wantEndTags, startAfterTag, stopBeforeTag, names);
}
/// <summary>
/// Finds end tags with any of the given names between two boundary tags.
/// </summary>
/// <param name="startAfterTag">Lower boundary forwarded to <c>FindTags</c>.</param>
/// <param name="stopBeforeTag">Upper boundary forwarded to <c>FindTags</c>.</param>
/// <param name="names">Tag names to match.</param>
/// <returns>The sequence produced by <c>FindTags</c> with <c>isEndTag</c> set to <c>true</c>.</returns>
public IEnumerable<HTMLTag> FindEndTags(HTMLTag startAfterTag, HTMLTag stopBeforeTag, params string[] names)
{
    const bool wantEndTags = true;
    return FindTags(wantEndTags, startAfterTag, stopBeforeTag, names);
}
/// <summary>
/// Locates the end tag that closes <paramref name="tag"/>, accounting for nested tags of the
/// same name.
/// </summary>
/// <param name="tag">Start tag to match. A null tag yields null.</param>
/// <param name="stopBeforeTag">When not null, the search ends at the tag just before this one;
/// otherwise it runs to the last tag in the list.</param>
/// <returns>
/// <paramref name="tag"/> itself when it is self-closing, the matching end tag when one is
/// found inside the search window, or <c>null</c> otherwise.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="tag"/> is an end tag.</exception>
public HTMLTag FindCorrespondingEndTag(HTMLTag tag, HTMLTag stopBeforeTag)
{
    if (tag == null)
    {
        return null;
    }
    if (tag.IsEnd)
    {
        throw new ArgumentException("Tag must be a start tag.");
    }
    if (tag.IsSelfClosing)
    {
        return tag;
    }

    int index = GetTagIndex(tag) + 1;

    int lastIndex;
    if (stopBeforeTag != null)
    {
        lastIndex = GetTagIndex(stopBeforeTag) - 1;
    }
    else
    {
        lastIndex = _tags.Count - 1;
    }

    // Number of same-named elements currently open; starts at 1 for the tag being matched.
    int openCount = 1;
    while (index <= lastIndex)
    {
        HTMLTag candidate = _tags[index];
        index++;

        if (candidate.IsSelfClosing || !candidate.NameEquals(tag.Name))
        {
            continue;
        }

        if (candidate.IsEnd)
        {
            openCount--;
        }
        else
        {
            openCount++;
        }

        if (openCount == 0)
        {
            return candidate;
        }
    }

    return null;
}
/// <summary>
/// Finds a single start tag with any of the given names between two boundary tags.
/// </summary>
/// <param name="startAfterTag">Lower boundary forwarded to <c>FindTag</c>.</param>
/// <param name="stopBeforeTag">Upper boundary forwarded to <c>FindTag</c>.</param>
/// <param name="names">Tag names to match.</param>
/// <returns>The result of <c>FindTag</c> with <c>isEndTag</c> set to <c>false</c>.</returns>
public HTMLTag FindStartTag(HTMLTag startAfterTag, HTMLTag stopBeforeTag, params string[] names)
{
    const bool wantEndTag = false;
    return FindTag(wantEndTag, startAfterTag, stopBeforeTag, names);
}
/// <summary>
/// Finds the end tag matching <paramref name="tag"/> with no stop-before boundary.
/// </summary>
/// <param name="tag">Start tag to match.</param>
/// <returns>Whatever the two-argument overload returns when called with a null boundary.</returns>
public HTMLTag FindCorrespondingEndTag(HTMLTag tag)
{
    HTMLTag matchingEndTag = FindCorrespondingEndTag(tag, null);
    return matchingEndTag;
}
/// <summary>
/// Lazily yields the tags stored in <c>_tags</c> that lie strictly between two boundary tags.
/// </summary>
/// <param name="startAfterTag">When not null, enumeration starts at the tag after this one;
/// otherwise at index 0.</param>
/// <param name="stopBeforeTag">When not null, enumeration stops at the tag before this one;
/// otherwise at the last tag.</param>
/// <returns>The tags in list order within the computed index window.</returns>
public IEnumerable<HTMLTag> EnumerateTags(HTMLTag startAfterTag, HTMLTag stopBeforeTag)
{
    int index;
    if (startAfterTag != null)
    {
        index = GetTagIndex(startAfterTag) + 1;
    }
    else
    {
        index = 0;
    }

    int lastIndex;
    if (stopBeforeTag != null)
    {
        lastIndex = GetTagIndex(stopBeforeTag) - 1;
    }
    else
    {
        lastIndex = _tags.Count - 1;
    }

    while (index <= lastIndex)
    {
        yield return _tags[index];
        index++;
    }
}
/// <summary>
/// Builds a tag range from <paramref name="tag"/> to its corresponding end tag.
/// </summary>
/// <param name="tag">Start tag of the range.</param>
/// <param name="stopBeforeTag">Boundary forwarded to <c>FindCorrespondingEndTag</c>.</param>
/// <returns>
/// A new <see cref="HTMLTagRange"/>, or <c>null</c> when either <paramref name="tag"/> or the
/// located end tag is null.
/// </returns>
public HTMLTagRange CreateTagRange(HTMLTag tag, HTMLTag stopBeforeTag)
{
    HTMLTag closingTag = FindCorrespondingEndTag(tag, stopBeforeTag);
    if (tag == null || closingTag == null)
    {
        return null;
    }

    return new HTMLTagRange(tag, closingTag);
}
/// <summary>
/// Checks a tag's "class" attribute for the given class name.
/// </summary>
/// <param name="tag">Tag whose "class" attribute value is read.</param>
/// <param name="targetClassName">Class name to look for.</param>
/// <returns>
/// <c>false</c> when the tag has no "class" attribute value; otherwise the result of the
/// string-based <c>ClassAttributeValueHas</c> overload applied to that value.
/// </returns>
public static bool ClassAttributeValueHas(HTMLTag tag, string targetClassName)
{
    string classValue = tag.GetAttributeValue("class");
    if (classValue == null)
    {
        return false;
    }

    return ClassAttributeValueHas(classValue, targetClassName);
}
/// <summary>
/// Returns the first tag produced by <c>FindTags</c> for the given criteria.
/// </summary>
/// <param name="isEndTag">Whether end tags (true) or start tags (false) are wanted.</param>
/// <param name="startAfterTag">Lower boundary forwarded to <c>FindTags</c>.</param>
/// <param name="stopBeforeTag">Upper boundary forwarded to <c>FindTags</c>.</param>
/// <param name="names">Tag names to match.</param>
/// <returns>The first matching tag, or <c>null</c> when the sequence is empty.</returns>
public HTMLTag FindTag(bool isEndTag, HTMLTag startAfterTag, HTMLTag stopBeforeTag, params string[] names)
{
    // Only the first element is needed, so advance the enumerator once and stop.
    using (IEnumerator<HTMLTag> matches = FindTags(isEndTag, startAfterTag, stopBeforeTag, names).GetEnumerator())
    {
        if (matches.MoveNext())
        {
            return matches.Current;
        }
    }

    return null;
}
/// <summary>
/// Builds a tag range for <paramref name="tag"/> with no stop-before boundary.
/// </summary>
/// <param name="tag">Start tag of the range.</param>
/// <returns>Whatever the two-argument overload returns when called with a null boundary.</returns>
public HTMLTagRange CreateTagRange(HTMLTag tag)
{
    HTMLTagRange range = CreateTagRange(tag, null);
    return range;
}
/// <summary>
/// Lazily filters the tags between two boundary tags by kind and name.
/// </summary>
/// <param name="isEndTag">Required value of each tag's <c>IsEnd</c> flag.</param>
/// <param name="startAfterTag">Lower boundary forwarded to <c>EnumerateTags</c>.</param>
/// <param name="stopBeforeTag">Upper boundary forwarded to <c>EnumerateTags</c>.</param>
/// <param name="names">Names passed to <c>NameEqualsAny</c> for each candidate.</param>
/// <returns>The tags whose <c>IsEnd</c> equals <paramref name="isEndTag"/> and whose name matches.</returns>
public IEnumerable<HTMLTag> FindTags(bool isEndTag, HTMLTag startAfterTag, HTMLTag stopBeforeTag, params string[] names)
{
    foreach (HTMLTag candidate in EnumerateTags(startAfterTag, stopBeforeTag))
    {
        if (candidate.IsEnd != isEndTag)
        {
            continue;
        }
        if (!candidate.NameEqualsAny(names))
        {
            continue;
        }

        yield return candidate;
    }
}
/// <summary>
/// Finds a single end tag with any of the given names between two boundary tags.
/// </summary>
/// <param name="startAfterTag">Lower boundary forwarded to <c>FindTag</c>.</param>
/// <param name="stopBeforeTag">Upper boundary forwarded to <c>FindTag</c>.</param>
/// <param name="names">Tag names to match.</param>
/// <returns>The result of <c>FindTag</c> with <c>isEndTag</c> set to <c>true</c>.</returns>
public HTMLTag FindEndTag(HTMLTag startAfterTag, HTMLTag stopBeforeTag, params string[] names)
{
    const bool wantEndTag = true;
    return FindTag(wantEndTag, startAfterTag, stopBeforeTag, names);
}
/// <summary>
/// Returns the section of the preprocessed HTML that begins at the start tag's offset.
/// </summary>
/// <param name="startTag">Tag whose <c>Offset</c> marks the beginning of the section.</param>
/// <param name="endTag">Tag whose <c>EndOffset</c> marks the end; not read when
/// <paramref name="startTag"/> is self-closing.</param>
/// <returns>The text produced by <c>GetSection</c> for the computed bounds.</returns>
public string GetHTML(HTMLTag startTag, HTMLTag endTag)
{
    int sectionStart = startTag.Offset;

    int sectionEnd;
    if (startTag.IsSelfClosing)
    {
        // A self-closing tag is its own end, so the section stops where the tag itself stops.
        sectionEnd = startTag.EndOffset;
    }
    else
    {
        sectionEnd = endTag.EndOffset;
    }

    return GetSection(_preprocessedHTML, sectionStart, sectionEnd);
}
/// <summary>
/// Builds a tag range for <paramref name="tag"/> with no stop-before boundary.
/// </summary>
/// <param name="tag">Start tag of the range.</param>
/// <returns>Whatever the two-argument overload returns when called with a null boundary.</returns>
public HTMLTagRange CreateTagRange(HTMLTag tag)
{
    HTMLTagRange range = CreateTagRange(tag, null);
    return range;
}
/// <summary>
/// Returns the preprocessed HTML lying between a start tag and its end tag.
/// </summary>
/// <param name="startTag">Tag whose <c>EndOffset</c> marks the beginning of the section.</param>
/// <param name="endTag">Tag whose <c>Offset</c> marks the end; not read when
/// <paramref name="startTag"/> is self-closing.</param>
/// <returns>
/// An empty string for a self-closing start tag; otherwise the text produced by
/// <c>GetSection</c> for the span between the two tags.
/// </returns>
public string GetInnerHTML(HTMLTag startTag, HTMLTag endTag)
{
    if (startTag.IsSelfClosing)
    {
        return String.Empty;
    }

    return GetSection(_preprocessedHTML, startTag.EndOffset, endTag.Offset);
}
/// <summary>
/// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> up to
/// <paramref name="htmlEnd"/> and yields one <see cref="HTMLTag"/> for each start or end tag found.
/// </summary>
/// <remarks>
/// Comments and other "&lt;?", "&lt;/", "&lt;!" constructs that do not begin a tag name are
/// skipped without yielding anything. After a non-self-closing script, style, title or textarea
/// start tag, scanning jumps straight to that element's end tag. Enumeration ends silently
/// whenever one of the search helpers returns -1.
/// </remarks>
/// <param name="html">Text to scan.</param>
/// <param name="htmlStart">Index at which scanning begins.</param>
/// <param name="htmlEnd">Index bounding the scan; the loop runs while the position is below it.</param>
/// <returns>The parsed tags, in the order they are encountered.</returns>
private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
{
    while (htmlStart < htmlEnd)
    {
        int cursor = IndexOf(html, htmlStart, htmlEnd, '<');
        if (cursor == -1)
        {
            yield break;
        }

        HTMLTag current = new HTMLTag();
        current.Offset = cursor;
        htmlStart = cursor + 1;
        current.IsEnd = StartsWith(html, htmlStart, htmlEnd, '/');

        // For an end tag the name begins after the '/'.
        int nameStart = current.IsEnd ? (htmlStart + 1) : htmlStart;

        if (StartsWithLetter(html, nameStart, htmlEnd))
        {
            // Tag name
            htmlStart = nameStart;
            cursor = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
            if (cursor == -1)
            {
                yield break;
            }
            current.Name = GetSectionLower(html, htmlStart, cursor);
            htmlStart = cursor;

            // Attributes, up to and including the closing '>'
            while (true)
            {
                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                {
                    htmlStart++;
                }

                current.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                if (current.IsSelfClosing)
                {
                    htmlStart++;
                }

                if (StartsWith(html, htmlStart, htmlEnd, '>'))
                {
                    htmlStart++;
                    break;
                }

                if (current.IsSelfClosing)
                {
                    // The '/' was not followed by '>': it has been consumed, keep scanning.
                    continue;
                }

                HTMLAttribute attr = new HTMLAttribute();
                attr.Offset = htmlStart;

                // Attribute name (the search starts one character in, so the name is never empty)
                cursor = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                if (cursor == -1)
                {
                    yield break;
                }
                attr.Name = GetSectionLower(html, htmlStart, cursor);
                htmlStart = cursor;

                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                {
                    htmlStart++;
                }

                if (!StartsWith(html, htmlStart, htmlEnd, '='))
                {
                    attr.Value = String.Empty;
                }
                else
                {
                    // Attribute value
                    htmlStart++;
                    while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                    {
                        htmlStart++;
                    }

                    if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\''))
                    {
                        // Quoted: the value runs to the matching quote, which is then consumed.
                        char quote = html[htmlStart];
                        htmlStart++;
                        cursor = IndexOf(html, htmlStart, htmlEnd, quote);
                        if (cursor == -1)
                        {
                            yield break;
                        }
                        attr.Value = GetSection(html, htmlStart, cursor);
                        htmlStart = cursor + 1;
                    }
                    else
                    {
                        // Unquoted: the terminating character is left for the next iteration.
                        cursor = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                        if (cursor == -1)
                        {
                            yield break;
                        }
                        attr.Value = GetSection(html, htmlStart, cursor);
                        htmlStart = cursor;
                    }
                }

                attr.Length = htmlStart - attr.Offset;

                // Keep only the first occurrence of a given attribute name.
                if (current.GetAttribute(attr.Name) == null)
                {
                    current.Attributes.Add(attr);
                }
            }

            current.Length = htmlStart - current.Offset;

            yield return current;

            // Contents of these elements are treated as raw text: skip ahead to the end tag.
            if (!current.IsEnd && !current.IsSelfClosing && current.NameEqualsAny("script", "style", "title", "textarea"))
            {
                string closer = "/" + current.Name;
                while (true)
                {
                    cursor = IndexOf(html, htmlStart, htmlEnd, '<');
                    if (cursor == -1)
                    {
                        yield break;
                    }

                    int afterBracket = cursor + 1;
                    int afterCloser = afterBracket + closer.Length;
                    bool isCloser =
                        StartsWith(html, afterBracket, htmlEnd, closer, true) &&
                        (StartsWithWhiteSpace(html, afterCloser, htmlEnd) ||
                         StartsWithAny(html, afterCloser, htmlEnd, '/', '>'));

                    if (isCloser)
                    {
                        // Rewind onto the '<' so the outer loop parses the end tag normally.
                        htmlStart = cursor;
                        break;
                    }

                    htmlStart = afterBracket;
                }
            }
        }
        else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) && !StartsWith(html, htmlStart + 3, htmlEnd, '>'))
        {
            // Comment: skip to just past the first "-->" or "--!>"
            htmlStart += 3;
            while (true)
            {
                cursor = IndexOf(html, htmlStart, htmlEnd, '-');
                if (cursor == -1)
                {
                    yield break;
                }
                htmlStart = cursor + 1;

                if (StartsWith(html, htmlStart, htmlEnd, "->", false))
                {
                    htmlStart += 2;
                    break;
                }
                if (StartsWith(html, htmlStart, htmlEnd, "-!>", false))
                {
                    htmlStart += 3;
                    break;
                }
            }
        }
        else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!'))
        {
            // Bogus comment or DOCTYPE: skip to just past the next '>'
            htmlStart++;
            cursor = IndexOf(html, htmlStart, htmlEnd, '>');
            if (cursor == -1)
            {
                yield break;
            }
            htmlStart = cursor + 1;
        }
    }
}
/// <summary>
/// Lazily scans <paramref name="html"/> from <paramref name="htmlStart"/> up to
/// <paramref name="htmlEnd"/> and yields one <see cref="HTMLTag"/> for each start or end tag found.
/// </summary>
/// <remarks>
/// Comments and other "&lt;?", "&lt;/", "&lt;!" constructs that do not begin a tag name are
/// skipped without yielding anything. After a non-self-closing script, style, title or textarea
/// start tag, scanning jumps straight to that element's end tag. Enumeration ends silently
/// whenever one of the search helpers returns -1.
/// </remarks>
/// <param name="html">Text to scan.</param>
/// <param name="htmlStart">Index at which scanning begins.</param>
/// <param name="htmlEnd">Index bounding the scan; the loop runs while the position is below it.</param>
/// <returns>The parsed tags, in the order they are encountered.</returns>
private static IEnumerable<HTMLTag> ParseTags(string html, int htmlStart, int htmlEnd)
{
    while (htmlStart < htmlEnd)
    {
        int cursor = IndexOf(html, htmlStart, htmlEnd, '<');
        if (cursor == -1)
        {
            yield break;
        }

        HTMLTag current = new HTMLTag();
        current.Offset = cursor;
        htmlStart = cursor + 1;
        current.IsEnd = StartsWith(html, htmlStart, htmlEnd, '/');

        // For an end tag the name begins after the '/'.
        int nameStart = current.IsEnd ? (htmlStart + 1) : htmlStart;

        if (StartsWithLetter(html, nameStart, htmlEnd))
        {
            // Tag name
            htmlStart = nameStart;
            cursor = IndexOfAny(html, htmlStart, htmlEnd, true, '/', '>');
            if (cursor == -1)
            {
                yield break;
            }
            current.Name = GetSectionLower(html, htmlStart, cursor);
            htmlStart = cursor;

            // Attributes, up to and including the closing '>'
            while (true)
            {
                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                {
                    htmlStart++;
                }

                current.IsSelfClosing = StartsWith(html, htmlStart, htmlEnd, '/');
                if (current.IsSelfClosing)
                {
                    htmlStart++;
                }

                if (StartsWith(html, htmlStart, htmlEnd, '>'))
                {
                    htmlStart++;
                    break;
                }

                if (current.IsSelfClosing)
                {
                    // The '/' was not followed by '>': it has been consumed, keep scanning.
                    continue;
                }

                HTMLAttribute attr = new HTMLAttribute();
                attr.Offset = htmlStart;

                // Attribute name (the search starts one character in, so the name is never empty)
                cursor = IndexOfAny(html, htmlStart + 1, htmlEnd, true, '=', '/', '>');
                if (cursor == -1)
                {
                    yield break;
                }
                attr.Name = GetSectionLower(html, htmlStart, cursor);
                htmlStart = cursor;

                while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                {
                    htmlStart++;
                }

                if (!StartsWith(html, htmlStart, htmlEnd, '='))
                {
                    attr.Value = String.Empty;
                }
                else
                {
                    // Attribute value
                    htmlStart++;
                    while (StartsWithWhiteSpace(html, htmlStart, htmlEnd))
                    {
                        htmlStart++;
                    }

                    if (StartsWithAny(html, htmlStart, htmlEnd, '"', '\''))
                    {
                        // Quoted: the value runs to the matching quote, which is then consumed.
                        char quote = html[htmlStart];
                        htmlStart++;
                        cursor = IndexOf(html, htmlStart, htmlEnd, quote);
                        if (cursor == -1)
                        {
                            yield break;
                        }
                        attr.Value = GetSection(html, htmlStart, cursor);
                        htmlStart = cursor + 1;
                    }
                    else
                    {
                        // Unquoted: the terminating character is left for the next iteration.
                        cursor = IndexOfAny(html, htmlStart, htmlEnd, true, '>');
                        if (cursor == -1)
                        {
                            yield break;
                        }
                        attr.Value = GetSection(html, htmlStart, cursor);
                        htmlStart = cursor;
                    }
                }

                attr.Length = htmlStart - attr.Offset;

                // Keep only the first occurrence of a given attribute name.
                if (current.GetAttribute(attr.Name) == null)
                {
                    current.Attributes.Add(attr);
                }
            }

            current.Length = htmlStart - current.Offset;

            yield return current;

            // Contents of these elements are treated as raw text: skip ahead to the end tag.
            if (!current.IsEnd && !current.IsSelfClosing && current.NameEqualsAny("script", "style", "title", "textarea"))
            {
                string closer = "/" + current.Name;
                while (true)
                {
                    cursor = IndexOf(html, htmlStart, htmlEnd, '<');
                    if (cursor == -1)
                    {
                        yield break;
                    }

                    int afterBracket = cursor + 1;
                    int afterCloser = afterBracket + closer.Length;
                    bool isCloser =
                        StartsWith(html, afterBracket, htmlEnd, closer, true) &&
                        (StartsWithWhiteSpace(html, afterCloser, htmlEnd) ||
                         StartsWithAny(html, afterCloser, htmlEnd, '/', '>'));

                    if (isCloser)
                    {
                        // Rewind onto the '<' so the outer loop parses the end tag normally.
                        htmlStart = cursor;
                        break;
                    }

                    htmlStart = afterBracket;
                }
            }
        }
        else if (StartsWith(html, htmlStart, htmlEnd, "!--", false) && !StartsWith(html, htmlStart + 3, htmlEnd, '>'))
        {
            // Comment: skip to just past the first "-->" or "--!>"
            htmlStart += 3;
            while (true)
            {
                cursor = IndexOf(html, htmlStart, htmlEnd, '-');
                if (cursor == -1)
                {
                    yield break;
                }
                htmlStart = cursor + 1;

                if (StartsWith(html, htmlStart, htmlEnd, "->", false))
                {
                    htmlStart += 2;
                    break;
                }
                if (StartsWith(html, htmlStart, htmlEnd, "-!>", false))
                {
                    htmlStart += 3;
                    break;
                }
            }
        }
        else if (StartsWithAny(html, htmlStart, htmlEnd, '?', '/', '!'))
        {
            // Bogus comment or DOCTYPE: skip to just past the next '>'
            htmlStart++;
            cursor = IndexOf(html, htmlStart, htmlEnd, '>');
            if (cursor == -1)
            {
                yield break;
            }
            htmlStart = cursor + 1;
        }
    }
}
/// <summary>
/// Returns the preprocessed HTML lying between a start tag and its end tag.
/// </summary>
/// <param name="startTag">Tag whose <c>EndOffset</c> marks the beginning of the section.</param>
/// <param name="endTag">Tag whose <c>Offset</c> marks the end; not read when
/// <paramref name="startTag"/> is self-closing.</param>
/// <returns>
/// An empty string for a self-closing start tag; otherwise the text produced by
/// <c>GetSection</c> for the span between the two tags.
/// </returns>
public string GetInnerHTML(HTMLTag startTag, HTMLTag endTag)
{
    if (startTag.IsSelfClosing)
    {
        return String.Empty;
    }

    return GetSection(_preprocessedHTML, startTag.EndOffset, endTag.Offset);
}
/// <summary>
/// Looks up the list index recorded in <c>_offsetToIndex</c> for the given tag's offset.
/// </summary>
/// <param name="tag">Tag to locate; its <c>Offset</c> is the lookup key.</param>
/// <returns>The index stored for <paramref name="tag"/>'s offset.</returns>
/// <exception cref="ArgumentNullException"><paramref name="tag"/> is null.</exception>
/// <exception cref="ArgumentException">No index is recorded for the tag's offset.</exception>
private int GetTagIndex(HTMLTag tag)
{
    if (tag == null)
    {
        throw new ArgumentNullException("tag");
    }

    int index;
    if (!_offsetToIndex.TryGetValue(tag.Offset, out index))
    {
        // A specific exception type instead of the reserved base System.Exception; the message
        // is unchanged and existing catch (Exception) handlers still see it.
        throw new ArgumentException("Unable to locate the specified tag.");
    }

    return index;
}
/// <summary>
/// Decides whether a link tag wraps a thumbnail image.
/// </summary>
/// <param name="linkTag">Link tag whose tag range is searched.</param>
/// <returns>
/// <c>true</c> when the range created from <paramref name="linkTag"/> contains an "img" start tag
/// for which <c>HTMLParser.ClassAttributeValueHas</c> reports the "thumb" class; otherwise
/// <c>false</c>.
/// </returns>
protected override bool IsImage(HTMLTag linkTag)
{
    foreach (HTMLTag imageTag in _htmlParser.FindStartTags(_htmlParser.CreateTagRange(linkTag), "img"))
    {
        if (HTMLParser.ClassAttributeValueHas(imageTag, "thumb"))
        {
            return true;
        }
    }

    return false;
}