Example #1
0
        /// <summary>
        /// Collects the distinct URIs found on <paramref name="page"/> whose string form ends with
        /// ".gif" (case-insensitive). Two passes are merged: one using the
        /// <c>_imgTagRegex</c>/<c>_srcAttributeRegex</c> pair and one using the
        /// <c>_aTagRegex</c>/<c>_hrefAttributeRegex</c> pair.
        /// </summary>
        /// <param name="page">The page handed to <c>ExtractSources</c> for both passes.</param>
        /// <returns>A list holding the union of both passes, without duplicates.</returns>
        private List<Uri> ExtractItemSources(Page page)
        {
            // Both passes apply the same ".gif" suffix filter.
            Func<string, bool> isGif =
                candidate => candidate.EndsWith(".gif", StringComparison.OrdinalIgnoreCase);

            HashSet<Uri> itemSources = ExtractSources(page, _imgTagRegex, _srcAttributeRegex, isGif);

            // Fold the anchor-based results into the same set so duplicates collapse.
            itemSources.UnionWith(ExtractSources(page, _aTagRegex, _hrefAttributeRegex, isGif));

            return new List<Uri>(itemSources);
        }
Example #2
0
 /// <summary>
 /// Collects the distinct URIs found on <paramref name="page"/> via the
 /// <c>_aTagRegex</c>/<c>_hrefAttributeRegex</c> pair, dropping every value that starts with
 /// one of the entries in <c>_excludingUriPaths</c> (case-insensitive, ordinal).
 /// </summary>
 /// <param name="page">The page handed to <c>ExtractSources</c>.</param>
 /// <returns>A list of the URIs that were not excluded.</returns>
 private List<Uri> ExtractPageSources(Page page)
 {
     // A candidate is kept only when no excluded prefix matches its beginning.
     Func<string, bool> isAllowed = candidate =>
         !_excludingUriPaths.Any(
             excludedPrefix => candidate.StartsWith(excludedPrefix, StringComparison.OrdinalIgnoreCase));

     HashSet<Uri> pageSources = ExtractSources(page, _aTagRegex, _hrefAttributeRegex, isAllowed);
     return new List<Uri>(pageSources);
 }
Example #3
0
        /// <summary>
        /// Scans <c>page.Content</c> for tags matched by <paramref name="tagRegex"/>, pulls one
        /// attribute value out of each tag with <paramref name="attributeRegex"/>, turns that value
        /// into an absolute URI and returns the distinct URIs that pass the optional filter.
        /// </summary>
        /// <param name="page">
        /// Page to scan. <c>page.Content</c> is the text searched and <c>page.Source</c> is the base
        /// URI against which non-absolute attribute values are resolved.
        /// </param>
        /// <param name="tagRegex">Regex whose matches are the tags to inspect.</param>
        /// <param name="attributeRegex">
        /// Regex applied to each matched tag. It must define exactly one capture group, which holds
        /// the attribute value; tags for which it does not match are skipped.
        /// </param>
        /// <param name="onCheckSource">
        /// Optional filter that receives the absolute URI string and returns <c>true</c> to keep it.
        /// Well-formed absolute values are passed through verbatim; resolved relative values are
        /// passed as <see cref="Uri.AbsoluteUri"/>. When <c>null</c>, every value is kept.
        /// </param>
        /// <returns>The set of absolute URIs extracted from the page.</returns>
        private HashSet<Uri> ExtractSources(Page page, Regex tagRegex, Regex attributeRegex, Func<string, bool> onCheckSource = null)
        {
            var sources = new HashSet<Uri>();
            foreach (Match tagMatch in tagRegex.Matches(page.Content))
            {
                Match attributeMatch = attributeRegex.Match(tagMatch.Value);

                // Group 1 is read below, so require a real match that exposes exactly that one group.
                if (!attributeMatch.Success || attributeMatch.Groups.Count != 2)
                {
                    continue;
                }

                string attributeValue = attributeMatch.Groups[1].Value;

                Uri resolved;
                if (Uri.IsWellFormedUriString(attributeValue, UriKind.Absolute))
                {
                    resolved = new Uri(attributeValue);
                }
                else
                {
                    // Resolve against the full page URI rather than gluing the value onto the
                    // authority: this handles page-relative paths ("img/a.gif", "../a.gif"),
                    // root-relative paths ("/a.gif"), protocol-relative links ("//cdn/a.gif") and
                    // absolute links that merely need escaping.
                    if (!Uri.TryCreate(page.Source, attributeValue, out resolved))
                    {
                        // The value cannot be turned into a URI at all; skip it instead of throwing.
                        continue;
                    }

                    attributeValue = resolved.AbsoluteUri;
                }

                if (onCheckSource != null && !onCheckSource(attributeValue))
                {
                    continue;
                }

                sources.Add(resolved);
            }

            return sources;
        }