/// <summary>
/// Scans <paramref name="els"/> from the last element towards the first, looking for
/// the point where trailing paragraph tags and blank text stop.
/// Text elements whose unescaped content trims to nothing are replaced with null
/// in the array as they are passed over; tags named "p" are passed over untouched.
/// </summary>
/// <param name="els">The elements to scan. Blank trailing text entries are nulled in place.</param>
/// <returns>
/// The index of the first element (counting from the end) that is neither a "p" tag
/// nor blank text, or -1 when every element was passed over.
/// </returns>
private static int FindCleanupIndexForParagraphTrim(Element[] els)
        {
            int index = els.Length - 1;
            while (index >= 0)
            {
                Element current = els[index];
                Tag tag = current as Tag;

                if (tag != null)
                {
                    // Any tag other than "p" ends the trailing run.
                    if (!tag.NameEquals("p"))
                        return index;
                }
                else if (current is Text)
                {
                    string unescaped = HtmlUtils.UnEscapeEntities(current.RawText, HtmlUtils.UnEscapeMode.NonMarkupText);
                    if (unescaped.Trim().Length != 0)
                        return index;

                    // Blank text: drop it from the array and keep scanning.
                    els[index] = null;
                }
                else
                {
                    // Anything that is neither a tag nor text ends the trailing run.
                    return index;
                }

                index--;
            }

            // Every element was skipped; index is -1 here.
            return index;
        }
        /// <summary>
        /// Dispatches begin tags that carry a watched attribute to OnMatchingAttr and
        /// script literals to OnScriptLiteral; everything else is handled by the base class.
        /// </summary>
        /// <param name="el">The element being processed.</param>
        /// <returns>The replacement text for the element.</returns>
        protected override string Replace(Element el)
        {
            BeginTag tag = el as BeginTag;
            if (tag == null)
            {
                ScriptLiteral literal = el as ScriptLiteral;
                if (literal != null)
                    return OnScriptLiteral(literal);

                return base.Replace(el);
            }

            // The element table is keyed by lower-cased tag name.
            string lowerName = tag.Name.ToLowerInvariant();
            if (elements.ContainsKey(lowerName))
            {
                Hashtable watchedAttrs = elements[lowerName] as Hashtable;
                if (watchedAttrs != null)
                {
                    // The first attribute whose lower-cased name is in the table wins.
                    foreach (Attr candidate in tag.Attributes)
                    {
                        if (watchedAttrs.Contains(candidate.Name.ToLowerInvariant()))
                            return OnMatchingAttr(tag, candidate);
                    }
                }
            }

            return base.Replace(el);
        }
            /// <summary>
            /// Tests whether an element is a begin tag named "meta" whose "name" attribute
            /// is "generator" and whose "content" attribute is "blogger".
            /// Both attribute values are compared case-insensitively.
            /// </summary>
            /// <param name="e">The element to test.</param>
            /// <returns>True when the element is such a meta tag; otherwise false.</returns>
            public bool IsMatch(Element e)
            {
                BeginTag tag = e as BeginTag;
                if (tag == null)
                    return false;

                if (!tag.NameEquals("meta"))
                    return false;

                // HTML metadata names are case-insensitive, so name="Generator" must be
                // accepted just like name="generator".
                string metaName = tag.GetAttributeValue("name");
                if (metaName == null || CaseInsensitiveComparer.DefaultInvariant.Compare("generator", metaName) != 0)
                    return false;

                string generator = tag.GetAttributeValue("content");
                if (generator == null || CaseInsensitiveComparer.DefaultInvariant.Compare("blogger", generator) != 0)
                    return false;

                return true;
            }
 /// <summary>
 /// Costs an element at the number of characters in its string form.
 /// </summary>
 public override int ElementCost(Element el)
 {
     string rendered = el.ToString();
     return rendered.Length;
 }
 /// <summary>
 /// Costs a Text element at the length of its entity-unescaped string form;
 /// every other kind of element costs nothing.
 /// </summary>
 public override int ElementCost(Element el)
 {
     if (el is Text)
     {
         string decoded = HtmlUtils.UnEscapeEntities(el.ToString(), HtmlUtils.UnEscapeMode.NonMarkupText);
         return decoded.Length;
     }

     return 0;
 }
        /// <summary>
        /// Parses the rest of a begin tag, starting just after the tag name captured by
        /// <paramref name="beginMatch"/>, collecting attributes until the tag ends or
        /// something unparseable is found.
        /// </summary>
        /// <param name="beginMatch">The match for the start of the tag; its "tagname" group supplies the tag name.</param>
        /// <param name="element">Receives the BeginTag built from the parsed text.</param>
        /// <param name="trailingEnd">
        /// Receives a zero-length EndTag positioned right after the tag when group 1 of the
        /// end-of-tag match succeeded and supportTrailingEnd is set; otherwise null.
        /// </param>
        /// <returns>The number of characters consumed, measured from beginMatch.Index.</returns>
        private int ParseBeginTag(Match beginMatch, out Element element, out EndTag trailingEnd)
        {
            trailingEnd = null;

            Group tagNameGroup = beginMatch.Groups["tagname"];
            string tagName = tagNameGroup.Value;

            // Scanning resumes immediately after the tag name.
            int tagPos = tagNameGroup.Index + tagNameGroup.Length;

            ArrayList attributes = null;
            LazySubstring extraResidue = null;
            bool isComplete = false;

            while (true)
            {
                // First see whether the tag ends at the current position.
                Match match = endBeginTagMatcher.Match(tagPos);
                if (match != null)
                {
                    tagPos += match.Length;
                    // NOTE(review): group 1 presumably captures a self-closing slash;
                    // confirm against the endBeginTagMatcher pattern.
                    if (match.Groups[1].Success)
                    {
                        isComplete = true;
                        if (supportTrailingEnd)
                            trailingEnd = new EndTag(data, tagPos, 0, tagName, true);
                    }
                    break;
                }

                // Otherwise try to read an attribute name.
                match = attrNameMatcher.Match(tagPos);
                if (match == null)
                {
                    // Neither a tag end nor an attribute: skip ahead to the next '<' or '>'
                    // (or the end of the data) and keep the skipped text as residue.
                    int residueStart = tagPos;
                    int residueEnd;

                    residueEnd = tagPos = data.IndexOfAny(new char[] { '<', '>' }, tagPos);
                    if (tagPos == -1)
                    {
                        residueEnd = tagPos = data.Length;
                    }
                    else if (data[tagPos] == '>')
                    {
                        // The '>' is consumed as part of this tag but is not part of the residue.
                        tagPos++;
                    }
                    else
                    {
                        // A '<' is left unconsumed.
                        Debug.Assert(data[tagPos] == '<');
                    }

                    extraResidue = residueStart < residueEnd ? new LazySubstring(data, residueStart, residueEnd - residueStart) : null;
                    break;
                }
                else
                {
                    tagPos += match.Length;
                    LazySubstring attrName = new LazySubstring(data, match.Groups[1].Index, match.Groups[1].Length);
                    LazySubstring attrValue = null;
                    // A quoted value is tried first, then an unquoted one.
                    match = quotedAttrValueMatcher.Match(tagPos);
                    if (match != null)
                    {
                        attrValue = new LazySubstring(data, match.Groups[2].Index, match.Groups[2].Length);
                        tagPos += match.Length;
                    }
                    else
                    {
                        match = unquotedAttrValueMatcher.Match(tagPos);
                        if (match != null)
                        {
                            attrValue = new LazySubstring(data, match.Groups[1].Index, match.Groups[1].Length);
                            tagPos += match.Length;
                        }
                    }

                    // no attribute value; that's OK

                    // The attribute list is only allocated once an attribute is found.
                    if (attributes == null)
                        attributes = new ArrayList();
                    attributes.Add(new Attr(attrName, attrValue));
                }
            }

            int len = tagPos - beginMatch.Index;
            // A tag without attributes is constructed with a null attribute array.
            element = new BeginTag(data, beginMatch.Index, len, tagName, attributes == null ? null : (Attr[])attributes.ToArray(typeof(Attr)), isComplete, extraResidue);
            return len;
        }
 /// <summary>
 /// Writes the element's string form to the output unchanged.
 /// </summary>
 protected override void DefaultAction(Element el)
 {
     string rendered = el.ToString();
     _output.Write(rendered);
 }
 /// <summary>
 /// Tests whether an element is a Text element whose span in the HTML contains
 /// only whitespace characters (an empty span also qualifies).
 /// </summary>
 /// <param name="e">The element to test.</param>
 /// <returns>True for empty or all-whitespace text; false for anything else.</returns>
 private bool IsWhitespaceOrZeroLengthText(Element e)
 {
     if (e is Text)
     {
         int stop = e.Offset + e.Length;
         int index = e.Offset;
         while (index < stop)
         {
             // A single non-whitespace character disqualifies the element.
             if (!char.IsWhiteSpace(html[index]))
                 return false;
             index++;
         }
         return true;
     }

     return false;
 }
 /// <summary>
 /// Matches when at least one of the wrapped predicates matches the element.
 /// Predicates are tried in order and evaluation stops at the first match.
 /// </summary>
 public bool IsMatch(Element e)
 {
     bool anyMatched = false;
     foreach (IElementPredicate candidate in predicates)
     {
         if (candidate.IsMatch(e))
         {
             anyMatched = true;
             break;
         }
     }
     return anyMatched;
 }
        /// <summary>
        /// Reads the next element from the parser and records it as the last match
        /// only if it satisfies <paramref name="predicate"/>. Unlike a seek, no
        /// further elements are examined when that element does not match.
        /// </summary>
        /// <param name="predicate">The test applied to the next element.</param>
        /// <param name="ignoreWhitespace">
        /// When true, empty and whitespace-only text elements are skipped before the test is applied.
        /// </param>
        /// <returns>This extractor, so that calls can be chained.</returns>
        public HtmlExtractor MatchNext(IElementPredicate predicate, bool ignoreWhitespace)
        {
            lastMatch = null;

            Element candidate = parser.Next();
            while (candidate != null && ignoreWhitespace && IsWhitespaceOrZeroLengthText(candidate))
            {
                candidate = parser.Next();
            }

            // The parser ran out of elements: nothing to test.
            if (candidate == null)
                return this;

            if (predicate.IsMatch(candidate))
                lastMatch = candidate;

            return this;
        }
        /// <summary>
        /// Parses <paramref name="criterion"/> into a predicate and tests the next
        /// element against it via the predicate-based overload.
        /// </summary>
        /// <param name="criterion">The textual criterion to parse.</param>
        /// <param name="ignoreWhitespace">Passed through to the predicate-based overload.</param>
        /// <returns>This extractor, so that calls can be chained.</returns>
        public HtmlExtractor MatchNext(string criterion, bool ignoreWhitespace)
        {
            // Cleared before parsing, so the previous match is already gone if Parse throws.
            lastMatch = null;

            return MatchNext(Parse(criterion), ignoreWhitespace);
        }
        /// <summary>
        /// Parses both criteria into predicates and delegates to the predicate-based
        /// SeekWithin overload.
        /// </summary>
        /// <param name="criterion">The textual criterion for the element being sought.</param>
        /// <param name="withinCriterion">The textual criterion for the element that ends the search.</param>
        /// <returns>This extractor, so that calls can be chained.</returns>
        public HtmlExtractor SeekWithin(string criterion, string withinCriterion)
        {
            // Cleared before parsing, so the previous match is already gone if Parse throws.
            lastMatch = null;

            return SeekWithin(Parse(criterion), Parse(withinCriterion));
        }
        /// <summary>
        /// Seeks forward from the current position for the first element that
        /// satisfies the criterion.
        ///
        /// If the seek fails, the parser is left at the end of the file, so every
        /// later seek also fails until Reset() is called.
        /// </summary>
        /// <param name="criterion">
        /// Can be either a begin tag or end tag, or a run of text, or a comment.
        ///
        /// Examples of begin tags:
        /// &lt;a&gt; (any anchor tag)
        /// &lt;a name&gt; (any anchor tag that has at least one "name" attribute, with or without a value)
        /// &lt;a name='title'&gt; (any anchor tag that has a name attribute whose value is "title")
        ///
        /// Example of an end tag:
        /// &lt;/a&gt; (any end anchor tag)
        ///
        /// Examples of invalid criteria:
        /// &lt;a&gt;&lt;/a&gt; (only one criterion allowed per seek; chain Seek() calls if necessary)
        /// foo (only begin tags and end tags are allowed)
        ///
        /// NOTE(review): the "foo" example contradicts the first sentence, which says
        /// text and comments are accepted; confirm against Parse().
        ///
        /// TODO: Allow regular expression matching on attribute values, e.g. &lt;a class=/^heading.*$/&gt;
        /// </param>
        /// <returns>This extractor, so that calls can be chained.</returns>
        public HtmlExtractor Seek(string criterion)
        {
            // Cleared before parsing, so the previous match is already gone if Parse throws.
            lastMatch = null;

            // No "within" boundary: the search runs until a match or the end of input.
            return SeekWithin(Parse(criterion), null);
        }
 /// <summary>
 /// Emits the element's string form, but only while the suspend depth is zero.
 /// </summary>
 protected override void DefaultAction(Element el)
 {
     // Output is suppressed while suspendTagDepth is non-zero.
     if (suspendTagDepth != 0)
         return;

     Emit(el.ToString());
 }
        /// <summary>
        /// Seeks forward from the current position for the first element that
        /// satisfies <paramref name="predicate"/>, with no boundary on the search.
        /// </summary>
        /// <param name="predicate">The test that identifies the element being sought.</param>
        /// <returns>This extractor, so that calls can be chained.</returns>
        public HtmlExtractor Seek(IElementPredicate predicate)
        {
            lastMatch = null;

            return SeekWithin(predicate, null);
        }
 /// <summary>
 /// Matches end tags whose name equals the configured tag name.
 /// </summary>
 public bool IsMatch(Element e)
 {
     EndTag endTag = e as EndTag;
     return endTag != null && endTag.NameEquals(tagName);
 }
        /// <summary>
        /// Reads elements from the parser until one satisfies <paramref name="predicate"/>,
        /// one satisfies <paramref name="withinPredicate"/>, or the parser is exhausted.
        /// Only the first outcome records a match.
        /// </summary>
        /// <param name="predicate">The test that identifies the element being sought.</param>
        /// <param name="withinPredicate">
        /// The test that identifies an element at which the search is abandoned; may be null
        /// for an unbounded search. It is only consulted for elements that did not match
        /// <paramref name="predicate"/>.
        /// </param>
        /// <returns>This extractor, so that calls can be chained.</returns>
        public HtmlExtractor SeekWithin(IElementPredicate predicate, IElementPredicate withinPredicate)
        {
            lastMatch = null;

            for (Element current = parser.Next(); current != null; current = parser.Next())
            {
                if (predicate.IsMatch(current))
                {
                    lastMatch = current;
                    return this;
                }

                // Reached the boundary without finding a match: give up.
                if (withinPredicate != null && withinPredicate.IsMatch(current))
                    return this;
            }

            return this;
        }
        /// <summary>
        /// Matches begin tags that have the configured tag name (any name when
        /// tagName is null) and carry every required attribute. A required attribute
        /// with a null value only needs to be present; otherwise its value must be equal.
        /// </summary>
        /// <param name="e">The element to test.</param>
        /// <returns>True when all conditions hold; otherwise false.</returns>
        public bool IsMatch(Element e)
        {
            BeginTag tag = e as BeginTag;
            if (tag == null || (tagName != null && !tag.NameEquals(tagName)))
                return false;

            foreach (RequiredAttribute required in attrs)
            {
                int foundAt;
                Attr actual = tag.GetAttribute(required.Name, true, 0, out foundAt);

                // Missing attribute, or present with a value other than the required one.
                if (actual == null || (required.Value != null && required.Value != actual.Value))
                    return false;
            }

            return true;
        }
Example #19
0
 /// <summary>
 /// Matches whenever the wrapped predicate callback accepts the element.
 /// </summary>
 public bool IsMatch(Element e)
 {
     bool accepted = predicate(e);
     return accepted;
 }
 /// <summary>
 /// Matches Text elements whose raw text equals the configured text.
 /// </summary>
 public bool IsMatch(Element e)
 {
     Text candidate = e as Text;
     return candidate != null && candidate.RawText == textToMatch;
 }
        /// <summary>
        /// Tries to parse one piece of markup at the current position, attempting in
        /// order: comment, markup directive, end tag, begin tag.
        /// </summary>
        /// <param name="element">Receives the parsed element, or null when nothing matched.</param>
        /// <param name="trailingEnd">
        /// Receives whatever ParseBeginTag produces for a begin tag; null in every other case.
        /// </param>
        /// <returns>The length of the parsed markup, or -1 when nothing matched.</returns>
        private int ParseMarkup(out Element element, out EndTag trailingEnd)
        {
            trailingEnd = null;

            // commentMatcher MUST be checked before directiveMatcher!
            Match commentMatch = commentMatcher.Match(pos);
            if (commentMatch != null)
            {
                element = new Comment(data, pos, commentMatch.Length);
                return commentMatch.Length;
            }

            Match directiveMatch = directiveMatcher.Match(pos);
            if (directiveMatch != null)
            {
                element = new MarkupDirective(data, pos, directiveMatch.Length);
                return directiveMatch.Length;
            }

            Match endTagMatch = endMatcher.Match(pos);
            if (endTagMatch != null)
            {
                // Group 1 of the end-tag match supplies the tag name.
                element = new EndTag(data, pos, endTagMatch.Length, endTagMatch.Groups[1].Value);
                return endTagMatch.Length;
            }

            Match beginTagMatch = beginMatcher.Match(pos);
            if (beginTagMatch == null)
            {
                // Not recognizable markup.
                element = null;
                return -1;
            }

            return ParseBeginTag(beginTagMatch, out element, out trailingEnd);
        }
 /// <summary>
 /// Matches Comment elements whose raw text equals the configured text.
 /// </summary>
 public bool IsMatch(Element e)
 {
     Comment candidate = e as Comment;
     return candidate != null && candidate.RawText == textToMatch;
 }
 /// <summary>
 /// Costs an element at the length of its URL-encoded string form.
 /// </summary>
 public override int ElementCost(Element el)
 {
     string encoded = HttpUtility.UrlEncode(el.ToString());
     return encoded.Length;
 }
 /// <summary>
 /// Forwards the test to the wrapped predicate and returns its answer.
 /// </summary>
 public bool IsMatch(Element e)
 {
     bool matched = actualPredicate.IsMatch(e);
     return matched;
 }
 /// <summary>
 /// Determine the cost of an element.
 /// </summary>
 /// <param name="el">The element whose cost is being determined.</param>
 /// <returns>
 /// The cost of <paramref name="el"/> as an integer. How cost is measured is
 /// left to each implementation.
 /// </returns>
 public abstract int ElementCost(Element el);
 /// <summary>
 /// Matches elements by runtime type: any instance of the configured type when
 /// subtypes are allowed, otherwise only elements whose type is exactly that type.
 /// </summary>
 public bool IsMatch(Element e)
 {
     if (allowSubtypes)
         return type.IsInstanceOfType(e);

     // Exact type comparison.
     return e.GetType().Equals(type);
 }
 /// <summary>
 /// Default replacement: the element's raw text, unchanged. Subclasses override
 /// this to substitute different text.
 /// </summary>
 protected virtual string Replace(Element el)
 {
     string unchanged = el.RawText;
     return unchanged;
 }
 /// <summary>
 /// Rewinds the extractor to the beginning of the HTML by discarding the last
 /// match and creating a new parser over the same HTML.
 /// </summary>
 /// <returns>
 /// This extractor, so that calls can be chained, for example:
 /// <code>
 /// if (ex.Seek(...).Success || ex.Reset().Seek(...).Success) { ... }
 /// </code>
 /// </returns>
 public HtmlExtractor Reset()
 {
     // Forget the previous match before swapping in the new parser.
     lastMatch = null;

     SimpleHtmlParser freshParser = new SimpleHtmlParser(html);
     parser = freshParser;

     return this;
 }
 /// <summary>
 /// Rewrites the URL carried by anchor and image begin tags: the "href" of "a"
 /// tags and the "src" of "img" tags are passed through ConvertUrl, and the
 /// modified tag's string form is returned. Everything else, including such tags
 /// without a URL value, is handled by the base class.
 /// </summary>
 /// <param name="el">The element being processed.</param>
 /// <returns>The replacement text for the element.</returns>
 protected override string Replace(Element el)
 {
     BeginTag beginTag = el as BeginTag;
     if (beginTag != null)
     {
         // Pick the attribute that holds the URL for this kind of tag, if any.
         string urlAttrName = null;
         if (beginTag.NameEquals("a"))
             urlAttrName = "href";
         else if (beginTag.NameEquals("img"))
             urlAttrName = "src";

         if (urlAttrName != null)
         {
             Attr urlAttr = beginTag.GetAttribute(urlAttrName);
             if (urlAttr != null && urlAttr.Value != null)
             {
                 urlAttr.Value = ConvertUrl(urlAttr.Value);
                 return beginTag.ToString();
             }
         }
     }

     return base.Replace(el);
 }
        /// <summary>
        /// Classifies an element as visible, invisible or whitespace.
        /// Begin tags with at least one attribute are always visible. Otherwise a fixed
        /// list of block and inline formatting tags is invisible, "br" is whitespace and
        /// any other tag is visible. Text is whitespace when its unescaped content
        /// matches the blank-text pattern, visible otherwise. Any other kind of element
        /// is visible.
        /// </summary>
        /// <param name="el">The element to classify.</param>
        /// <returns>The visibility classification of the element.</returns>
        private static Visibility DetermineVisibility(Element el)
        {
            Tag tag = el as Tag;
            if (tag != null)
            {
                // Attributes make a begin tag significant regardless of its name.
                BeginTag beginTag = el as BeginTag;
                if (beginTag != null && beginTag.Attributes.Length > 0)
                    return Visibility.Visible;

                string upperName = tag.Name.ToUpperInvariant();
                switch (upperName)
                {
                    case "P":
                    case "BLOCKQUOTE":
                    case "H1":
                    case "H2":
                    case "H3":
                    case "H4":
                    case "H5":
                    case "H6":
                    case "TT":
                    case "I":
                    case "B":
                    case "U":
                    case "S":
                    case "STRIKE":
                    case "BIG":
                    case "SMALL":
                    case "EM":
                    case "STRONG":
                    case "DFN":
                    case "CODE":
                    case "SAMP":
                    case "KBD":
                    case "VAR":
                    case "CITE":
                    case "ABBR":
                    case "ACRONYM":
                    case "SUB":
                    case "SUP":
                    case "Q":
                    case "INS":
                    case "DEL":
                        return Visibility.Invisible;

                    case "BR":
                        return Visibility.Whitespace;

                    // anything else is significant
                    default:
                        return Visibility.Visible;
                }
            }

            if (el is Text)
            {
                string unescaped = HtmlUtils.UnEscapeEntities(el.RawText, HtmlUtils.UnEscapeMode.NonMarkupText);
                bool isBlank = Regex.IsMatch(unescaped, @"^(\s|&nbsp;)*$", RegexOptions.ExplicitCapture);
                return isBlank ? Visibility.Whitespace : Visibility.Visible;
            }

            return Visibility.Visible;
        }