/// <summary>
/// Walks <paramref name="els"/> from the last slot toward the first, looking for the
/// last element that is neither a "p" tag nor blank text.
/// </summary>
/// <param name="els">
/// The elements to scan. Modified in place: every text element visited that is empty
/// or whitespace-only (after entity unescaping) is replaced with null.
/// </param>
/// <returns>
/// The index of the first element, counting from the end, that is not a tag named "p"
/// and not blank text; -1 if the scan runs off the front of the array.
/// </returns>
private static int FindCleanupIndexForParagraphTrim(Element[] els)
{
    int index = els.Length - 1;
    while (index >= 0)
    {
        Element current = els[index];
        Tag tag = current as Tag;
        if (tag != null)
        {
            // Tags named "p" are stepped over; any other tag ends the scan.
            if (!tag.NameEquals("p"))
            {
                return index;
            }
        }
        else if (current is Text)
        {
            string unescaped = HtmlUtils.UnEscapeEntities(current.RawText, HtmlUtils.UnEscapeMode.NonMarkupText);
            if (unescaped.Trim().Length != 0)
            {
                return index;
            }

            // Blank text: clear the slot and keep scanning.
            els[index] = null;
        }
        else
        {
            return index;
        }

        index--;
    }

    return index;
}
/// <summary>
/// Routes an element to the matching hook. A begin tag whose lower-cased name is a key
/// in <c>elements</c>, and which carries an attribute whose lower-cased name is contained
/// in the Hashtable stored under that key, is passed to OnMatchingAttr together with the
/// first such attribute. A script literal is passed to OnScriptLiteral. Everything else
/// is handled by the base implementation.
/// </summary>
/// <param name="el">The element being processed.</param>
/// <returns>The string produced by whichever handler was selected.</returns>
protected override string Replace(Element el)
{
    BeginTag beginTag = el as BeginTag;
    if (beginTag == null)
    {
        if (el is ScriptLiteral)
        {
            return OnScriptLiteral((ScriptLiteral)el);
        }
        return base.Replace(el);
    }

    string key = beginTag.Name.ToLowerInvariant();
    if (elements.ContainsKey(key))
    {
        Hashtable watchedAttrs = elements[key] as Hashtable;
        if (watchedAttrs != null)
        {
            foreach (Attr candidate in beginTag.Attributes)
            {
                if (watchedAttrs.Contains(candidate.Name.ToLowerInvariant()))
                {
                    return OnMatchingAttr(beginTag, candidate);
                }
            }
        }
    }

    return base.Replace(el);
}
/// <summary>
/// Tests whether an element is a "meta" begin tag whose "name" attribute value is
/// exactly "generator" and whose "content" attribute value equals "blogger" when
/// compared case-insensitively with the invariant culture.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>True when all of the conditions above hold; otherwise false.</returns>
public bool IsMatch(Element e)
{
    BeginTag meta = e as BeginTag;
    if (meta == null || !meta.NameEquals("meta"))
    {
        return false;
    }

    if (meta.GetAttributeValue("name") != "generator")
    {
        return false;
    }

    string content = meta.GetAttributeValue("content");
    return content != null
        && CaseInsensitiveComparer.DefaultInvariant.Compare("blogger", content) == 0;
}
/// <summary>
/// Costs an element as the number of characters in its string form.
/// </summary>
/// <param name="el">The element to measure.</param>
/// <returns>The length of <c>el.ToString()</c>.</returns>
public override int ElementCost(Element el)
{
    string rendered = el.ToString();
    return rendered.Length;
}
/// <summary>
/// Costs only text: a text element costs the number of characters it has after its
/// entities are unescaped; every other kind of element costs nothing.
/// </summary>
/// <param name="el">The element to measure.</param>
/// <returns>The unescaped character count for text elements; 0 otherwise.</returns>
public override int ElementCost(Element el)
{
    if (el is Text)
    {
        string decoded = HtmlUtils.UnEscapeEntities(el.ToString(), HtmlUtils.UnEscapeMode.NonMarkupText);
        return decoded.Length;
    }

    return 0;
}
/// <summary>
/// Parses the remainder of a begin tag whose opening has already been matched, and
/// builds the resulting <c>BeginTag</c>.
/// </summary>
/// <remarks>
/// Starting right after the "tagname" group, the loop repeatedly tries, in this order:
/// the end-of-begin-tag matcher (which finishes the tag), then the attribute-name
/// matcher (optionally followed by a quoted or unquoted value). When neither matches,
/// everything up to the next '&lt;' or '&gt;' (or the end of the data) is recorded as
/// extra residue and parsing stops; a terminating '&gt;' is consumed, a '&lt;' is not.
/// </remarks>
/// <param name="beginMatch">The match for the opening of the tag; must expose a "tagname" group.</param>
/// <param name="element">Receives the constructed begin tag.</param>
/// <param name="trailingEnd">
/// Receives a zero-length end tag when group 1 of the end-of-begin-tag match succeeded
/// and <c>supportTrailingEnd</c> is set; otherwise null.
/// </param>
/// <returns>The number of characters consumed, measured from the start of <paramref name="beginMatch"/>.</returns>
private int ParseBeginTag(Match beginMatch, out Element element, out EndTag trailingEnd)
{
    trailingEnd = null;

    Group nameGroup = beginMatch.Groups["tagname"];
    string tagName = nameGroup.Value;
    int cursor = nameGroup.Index + nameGroup.Length;

    ArrayList attrList = null;
    LazySubstring residue = null;
    bool complete = false;

    for (; ; )
    {
        // 1. Does the tag end here?
        Match closer = endBeginTagMatcher.Match(cursor);
        if (closer != null)
        {
            cursor += closer.Length;
            // NOTE(review): group 1 presumably captures a self-closing slash — confirm
            // against the endBeginTagMatcher pattern.
            if (closer.Groups[1].Success)
            {
                complete = true;
                if (supportTrailingEnd)
                {
                    trailingEnd = new EndTag(data, cursor, 0, tagName, true);
                }
            }
            break;
        }

        // 2. Otherwise, is there an attribute name here?
        Match nameMatch = attrNameMatcher.Match(cursor);
        if (nameMatch == null)
        {
            // 3. Neither: swallow unrecognized content up to the next angle bracket.
            int residueStart = cursor;
            int residueEnd = data.IndexOfAny(new char[] { '<', '>' }, cursor);
            if (residueEnd == -1)
            {
                residueEnd = data.Length;
                cursor = residueEnd;
            }
            else if (data[residueEnd] == '>')
            {
                cursor = residueEnd + 1;
            }
            else
            {
                Debug.Assert(data[residueEnd] == '<');
                cursor = residueEnd;
            }

            if (residueStart < residueEnd)
            {
                residue = new LazySubstring(data, residueStart, residueEnd - residueStart);
            }
            else
            {
                residue = null;
            }
            break;
        }

        cursor += nameMatch.Length;
        LazySubstring attrName = new LazySubstring(data, nameMatch.Groups[1].Index, nameMatch.Groups[1].Length);
        LazySubstring attrValue = null;

        Match quoted = quotedAttrValueMatcher.Match(cursor);
        if (quoted != null)
        {
            attrValue = new LazySubstring(data, quoted.Groups[2].Index, quoted.Groups[2].Length);
            cursor += quoted.Length;
        }
        else
        {
            Match unquoted = unquotedAttrValueMatcher.Match(cursor);
            if (unquoted != null)
            {
                attrValue = new LazySubstring(data, unquoted.Groups[1].Index, unquoted.Groups[1].Length);
                cursor += unquoted.Length;
            }
            // Neither form matched: the attribute simply has no value, which is fine.
        }

        if (attrList == null)
        {
            attrList = new ArrayList();
        }
        attrList.Add(new Attr(attrName, attrValue));
    }

    int consumed = cursor - beginMatch.Index;
    Attr[] attrArray = attrList == null ? null : (Attr[])attrList.ToArray(typeof(Attr));
    element = new BeginTag(data, beginMatch.Index, consumed, tagName, attrArray, complete, residue);
    return consumed;
}
/// <summary>
/// Default handling for an element: writes its string form to <c>_output</c>.
/// </summary>
/// <param name="el">The element being processed.</param>
protected override void DefaultAction(Element el)
{
    string markup = el.ToString();
    _output.Write(markup);
}
/// <summary>
/// Tests whether an element is a text element consisting solely of whitespace
/// characters (a zero-length text element qualifies as well).
/// </summary>
/// <param name="e">The element to inspect; its Offset and Length index into <c>html</c>.</param>
/// <returns>
/// True for a text element with no non-whitespace character in its span; false for
/// non-text elements and for text containing any other character.
/// </returns>
private bool IsWhitespaceOrZeroLengthText(Element e)
{
    if (e is Text)
    {
        int stop = e.Offset + e.Length;
        int index = e.Offset;
        while (index < stop)
        {
            if (!char.IsWhiteSpace(html[index]))
            {
                return false;
            }
            index++;
        }
        return true;
    }

    return false;
}
/// <summary>
/// Tests an element against each predicate in <c>predicates</c>, in order, stopping
/// at the first one that accepts it.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>True if any predicate matches; false if none do.</returns>
public bool IsMatch(Element e)
{
    bool anyMatched = false;
    foreach (IElementPredicate candidate in predicates)
    {
        if (candidate.IsMatch(e))
        {
            anyMatched = true;
            break;
        }
    }
    return anyMatched;
}
/// <summary>
/// Tests only the very next element from the parser against a predicate; it does not
/// search further ahead. The last match is cleared first and set again only when that
/// element matches.
/// </summary>
/// <param name="predicate">The predicate the next element must satisfy.</param>
/// <param name="ignoreWhitespace">
/// When true, text elements that are empty or whitespace-only are read and discarded
/// before the element to be tested is chosen.
/// </param>
/// <returns>This extractor, so calls can be chained.</returns>
public HtmlExtractor MatchNext(IElementPredicate predicate, bool ignoreWhitespace)
{
    lastMatch = null;

    Element candidate = parser.Next();
    while (candidate != null && ignoreWhitespace && IsWhitespaceOrZeroLengthText(candidate))
    {
        candidate = parser.Next();
    }

    if (candidate != null && predicate.IsMatch(candidate))
    {
        lastMatch = candidate;
    }

    return this;
}
/// <summary>
/// String-criterion form of MatchNext: clears the last match, parses
/// <paramref name="criterion"/> into a predicate, and delegates to the predicate overload.
/// </summary>
/// <param name="criterion">The criterion text, in the syntax accepted by Parse.</param>
/// <param name="ignoreWhitespace">Passed through unchanged to the predicate overload.</param>
/// <returns>This extractor, so calls can be chained.</returns>
public HtmlExtractor MatchNext(string criterion, bool ignoreWhitespace)
{
    lastMatch = null;
    IElementPredicate parsed = Parse(criterion);
    return MatchNext(parsed, ignoreWhitespace);
}
/// <summary>
/// String-criterion form of SeekWithin: clears the last match, parses both criteria
/// into predicates (the target first, then the boundary), and delegates to the
/// predicate overload.
/// </summary>
/// <param name="criterion">The criterion to seek for, in the syntax accepted by Parse.</param>
/// <param name="withinCriterion">The criterion that bounds the seek, in the same syntax.</param>
/// <returns>This extractor, so calls can be chained.</returns>
public HtmlExtractor SeekWithin(string criterion, string withinCriterion)
{
    lastMatch = null;
    IElementPredicate target = Parse(criterion);
    IElementPredicate boundary = Parse(withinCriterion);
    return SeekWithin(target, boundary);
}
/// <summary>
/// Seeks forward from the current position for the first element that satisfies
/// the criterion.
///
/// A failed seek leaves the parser at the end of the file, so every later seek
/// fails too until Reset() is called.
/// </summary>
/// <param name="criterion">
/// A begin tag, an end tag, a run of text, or a comment.
///
/// Examples of start tags:
///   &lt;a&gt;               (any anchor tag)
///   &lt;a name&gt;          (any anchor tag with at least one "name" attribute, with or without a value)
///   &lt;a name='title'&gt;  (any anchor tag with a name attribute whose value is "title")
///
/// Example of an end tag:
///   &lt;/a&gt;              (any end anchor tag)
///
/// Examples of invalid criteria:
///   &lt;a&gt;&lt;/a&gt;     (only one criterion is allowed per seek; chain Seek() calls if necessary)
///   foo                     (only begin tags and end tags are allowed)
///
/// NOTE(review): the first sentence lists text runs and comments as valid, while the
/// "foo" example says only begin and end tags are allowed — confirm against Parse().
///
/// TODO: Allow regular expression matching on attribute values, e.g. &lt;a class=/^heading.*$/&gt;
/// </param>
/// <returns>This extractor, so calls can be chained.</returns>
public HtmlExtractor Seek(string criterion)
{
    lastMatch = null;
    IElementPredicate target = Parse(criterion);
    return SeekWithin(target, null);
}
/// <summary>
/// Default handling for an element: emits its string form, unless output is currently
/// suspended (that is, <c>suspendTagDepth</c> is non-zero).
/// </summary>
/// <param name="el">The element being processed.</param>
protected override void DefaultAction(Element el)
{
    if (suspendTagDepth != 0)
    {
        return;
    }

    Emit(el.ToString());
}
/// <summary>
/// Seeks forward from the current position for the first element accepted by
/// <paramref name="predicate"/>, with no bounding predicate.
/// </summary>
/// <param name="predicate">The predicate that identifies the element to find.</param>
/// <returns>This extractor, so calls can be chained.</returns>
public HtmlExtractor Seek(IElementPredicate predicate)
{
    lastMatch = null;
    return SeekWithin(predicate, null);
}
/// <summary>
/// Tests whether an element is an end tag whose name equals <c>tagName</c>
/// (as judged by the tag's NameEquals).
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>True for an end tag with the expected name; otherwise false.</returns>
public bool IsMatch(Element e)
{
    EndTag endTag = e as EndTag;
    return endTag != null && endTag.NameEquals(tagName);
}
/// <summary>
/// Reads elements from the parser until one satisfies <paramref name="predicate"/>,
/// one satisfies <paramref name="withinPredicate"/>, or the parser runs out. Only the
/// first outcome records a match; the target predicate is checked before the boundary
/// predicate for each element.
/// </summary>
/// <param name="predicate">The predicate that identifies the element to find.</param>
/// <param name="withinPredicate">
/// Optional boundary: the seek stops without a match at the first element this
/// predicate accepts. May be null for an unbounded seek.
/// </param>
/// <returns>This extractor, so calls can be chained.</returns>
public HtmlExtractor SeekWithin(IElementPredicate predicate, IElementPredicate withinPredicate)
{
    lastMatch = null;

    for (Element current = parser.Next(); current != null; current = parser.Next())
    {
        if (predicate.IsMatch(current))
        {
            lastMatch = current;
            break;
        }

        if (withinPredicate != null && withinPredicate.IsMatch(current))
        {
            break;
        }
    }

    return this;
}
/// <summary>
/// Tests whether an element is a begin tag that satisfies this predicate's name and
/// attribute requirements.
/// </summary>
/// <remarks>
/// A null <c>tagName</c> accepts any tag name. Every entry in <c>attrs</c> must be
/// present on the tag; an entry with a non-null Value additionally requires the tag's
/// attribute value to be equal to it.
/// </remarks>
/// <param name="e">The element to test.</param>
/// <returns>True when the element meets all requirements; otherwise false.</returns>
public bool IsMatch(Element e)
{
    BeginTag candidate = e as BeginTag;
    if (candidate == null)
    {
        return false;
    }

    bool nameMismatch = tagName != null && !candidate.NameEquals(tagName);
    if (nameMismatch)
    {
        return false;
    }

    foreach (RequiredAttribute required in attrs)
    {
        int ignoredIndex; // required by the GetAttribute signature; not used here
        Attr actual = candidate.GetAttribute(required.Name, true, 0, out ignoredIndex);
        if (actual == null || (required.Value != null && required.Value != actual.Value))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Tests an element by invoking the wrapped <c>predicate</c> callback.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>Whatever the callback returns for <paramref name="e"/>.</returns>
public bool IsMatch(Element e)
{
    bool accepted = predicate(e);
    return accepted;
}
/// <summary>
/// Tests whether an element is a text element whose raw text equals <c>textToMatch</c>.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>True for a text element with exactly the expected raw text; otherwise false.</returns>
public bool IsMatch(Element e)
{
    Text textElement = e as Text;
    return textElement != null && textElement.RawText == textToMatch;
}
/// <summary>
/// Tries to parse one markup construct at the current position <c>pos</c>, testing
/// the matchers in a fixed order: comment, markup directive, end tag, begin tag.
/// </summary>
/// <param name="element">Receives the parsed element, or null when nothing matched.</param>
/// <param name="trailingEnd">
/// Receives whatever ParseBeginTag reports when a begin tag is parsed; null in every other case.
/// </param>
/// <returns>The length of the matched markup, or -1 when no matcher applies at <c>pos</c>.</returns>
private int ParseMarkup(out Element element, out EndTag trailingEnd)
{
    trailingEnd = null;

    // Ordering constraint: commentMatcher MUST be checked before directiveMatcher!
    Match commentHit = commentMatcher.Match(pos);
    if (commentHit != null)
    {
        element = new Comment(data, pos, commentHit.Length);
        return commentHit.Length;
    }

    Match directiveHit = directiveMatcher.Match(pos);
    if (directiveHit != null)
    {
        element = new MarkupDirective(data, pos, directiveHit.Length);
        return directiveHit.Length;
    }

    Match endHit = endMatcher.Match(pos);
    if (endHit != null)
    {
        element = new EndTag(data, pos, endHit.Length, endHit.Groups[1].Value);
        return endHit.Length;
    }

    Match beginHit = beginMatcher.Match(pos);
    if (beginHit == null)
    {
        element = null;
        return -1;
    }

    return ParseBeginTag(beginHit, out element, out trailingEnd);
}
/// <summary>
/// Tests whether an element is a comment whose raw text equals <c>textToMatch</c>.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>True for a comment with exactly the expected raw text; otherwise false.</returns>
public bool IsMatch(Element e)
{
    Comment commentElement = e as Comment;
    return commentElement != null && commentElement.RawText == textToMatch;
}
/// <summary>
/// Costs an element as the number of characters in the URL-encoded form of its
/// string representation.
/// </summary>
/// <param name="el">The element to measure.</param>
/// <returns>The length of <c>HttpUtility.UrlEncode(el.ToString())</c>.</returns>
public override int ElementCost(Element el)
{
    string encoded = HttpUtility.UrlEncode(el.ToString());
    return encoded.Length;
}
/// <summary>
/// Forwards the test to the wrapped <c>actualPredicate</c>.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>The wrapped predicate's verdict for <paramref name="e"/>.</returns>
public bool IsMatch(Element e)
{
    bool verdict = actualPredicate.IsMatch(e);
    return verdict;
}
/// <summary>
/// Determine the cost of an element.
/// </summary>
/// <param name="el">The element whose cost is being measured.</param>
/// <returns>
/// The cost of <paramref name="el"/> as an integer; what exactly is counted is
/// decided by the implementing class.
/// </returns>
public abstract int ElementCost(Element el);
/// <summary>
/// Tests an element by its runtime type against <c>type</c>.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>
/// With <c>allowSubtypes</c> set, true when <paramref name="e"/> is an instance of
/// <c>type</c> (subtypes included); otherwise true only when its runtime type is
/// exactly <c>type</c>.
/// </returns>
public bool IsMatch(Element e)
{
    return allowSubtypes
        ? type.IsInstanceOfType(e)
        : e.GetType().Equals(type);
}
/// <summary>
/// Produces the output text for an element. This base implementation returns the
/// element's raw text unchanged; subclasses override it to substitute other text.
/// </summary>
/// <param name="el">The element being processed.</param>
/// <returns>The raw text of <paramref name="el"/>.</returns>
protected virtual string Replace(Element el)
{
    string raw = el.RawText;
    return raw;
}
/// <summary>
/// Rewinds the extractor to the start of the HTML: the last match is cleared and a
/// new parser is created over the same HTML.
/// </summary>
/// <returns>
/// This extractor, so calls can be chained, for example:
///
/// if (ex.Seek(...).Success || ex.Reset().Seek(...).Success) { ... }
/// </returns>
public HtmlExtractor Reset()
{
    lastMatch = null;

    SimpleHtmlParser freshParser = new SimpleHtmlParser(html);
    parser = freshParser;

    return this;
}
/// <summary>
/// Rewrites the URL carried by anchor and image begin tags: the "href" of an "a" tag
/// or the "src" of an "img" tag is passed through ConvertUrl, and the tag's updated
/// string form is returned. The attribute on the element is modified in place.
/// Elements without such an attribute, or whose attribute has no value, are handled
/// by the base implementation.
/// </summary>
/// <param name="el">The element being processed.</param>
/// <returns>The rewritten tag text, or the base implementation's result.</returns>
protected override string Replace(Element el)
{
    BeginTag tag = el as BeginTag;
    if (tag != null)
    {
        // Pick the URL-bearing attribute for the tag kinds we rewrite.
        string urlAttrName = null;
        if (tag.NameEquals("a"))
        {
            urlAttrName = "href";
        }
        else if (tag.NameEquals("img"))
        {
            urlAttrName = "src";
        }

        if (urlAttrName != null)
        {
            Attr urlAttr = tag.GetAttribute(urlAttrName);
            if (urlAttr != null && urlAttr.Value != null)
            {
                urlAttr.Value = ConvertUrl(urlAttr.Value);
                return tag.ToString();
            }
        }
    }

    return base.Replace(el);
}
/// <summary>
/// Classifies an element as visible, invisible, or whitespace.
/// </summary>
/// <remarks>
/// Text is Whitespace when, after entity unescaping, it matches the blank-text pattern,
/// and Visible otherwise. A begin tag carrying any attribute is Visible. Otherwise a
/// fixed list of paragraph, heading and inline formatting tag names is Invisible, "BR"
/// is Whitespace, and every other tag is Visible. Elements that are neither tags nor
/// text are Visible.
/// </remarks>
/// <param name="el">The element to classify.</param>
/// <returns>The visibility classification for <paramref name="el"/>.</returns>
private static Visibility DetermineVisibility(Element el)
{
    Tag tag = el as Tag;
    if (tag == null)
    {
        if (!(el is Text))
        {
            return Visibility.Visible;
        }

        string decoded = HtmlUtils.UnEscapeEntities(el.RawText, HtmlUtils.UnEscapeMode.NonMarkupText);
        bool blank = Regex.IsMatch(decoded, @"^(\s| )*$", RegexOptions.ExplicitCapture);
        return blank ? Visibility.Whitespace : Visibility.Visible;
    }

    // Any attribute on a begin tag makes it significant, whatever the tag name.
    BeginTag beginTag = tag as BeginTag;
    if (beginTag != null && beginTag.Attributes.Length > 0)
    {
        return Visibility.Visible;
    }

    string upperName = tag.Name.ToUpperInvariant();
    switch (upperName)
    {
        case "P":
        case "BLOCKQUOTE":
        case "H1":
        case "H2":
        case "H3":
        case "H4":
        case "H5":
        case "H6":
        case "TT":
        case "I":
        case "B":
        case "U":
        case "S":
        case "STRIKE":
        case "BIG":
        case "SMALL":
        case "EM":
        case "STRONG":
        case "DFN":
        case "CODE":
        case "SAMP":
        case "KBD":
        case "VAR":
        case "CITE":
        case "ABBR":
        case "ACRONYM":
        case "SUB":
        case "SUP":
        case "Q":
        case "INS":
        case "DEL":
            return Visibility.Invisible;

        case "BR":
            return Visibility.Whitespace;

        // anything else is significant
        default:
            return Visibility.Visible;
    }
}