/// <summary>
/// Collects the distinct ".gif" sources referenced by the given page, combining
/// the values found via <c>_imgTagRegex</c>/<c>_srcAttributeRegex</c> with those
/// found via <c>_aTagRegex</c>/<c>_hrefAttributeRegex</c>.
/// </summary>
/// <param name="page">The page whose content is scanned.</param>
/// <returns>A list of unique URIs whose string form ends with ".gif" (case-insensitive).</returns>
private List<Uri> ExtractItemSources(Page page)
{
    // Both scans accept a candidate only when it ends with ".gif", ignoring case.
    Func<string, bool> isGif =
        candidate => candidate.EndsWith(".gif", StringComparison.OrdinalIgnoreCase);

    HashSet<Uri> itemSources = ExtractSources(page, _imgTagRegex, _srcAttributeRegex, isGif);
    HashSet<Uri> linkedSources = ExtractSources(page, _aTagRegex, _hrefAttributeRegex, isGif);

    // Merge the second scan into the first; the set drops duplicates.
    itemSources.UnionWith(linkedSources);

    return new List<Uri>(itemSources);
}
/// <summary>
/// Collects the distinct link targets of the given page (matched via
/// <c>_aTagRegex</c>/<c>_hrefAttributeRegex</c>), leaving out every target that
/// starts with one of the entries in <c>_excludingUriPaths</c>.
/// </summary>
/// <param name="page">The page whose content is scanned.</param>
/// <returns>A list of unique URIs that match none of the excluded prefixes.</returns>
private List<Uri> ExtractPageSources(Page page)
{
    // A candidate is kept only when no excluded prefix matches it (case-insensitive).
    Func<string, bool> isNotExcluded = candidate =>
        !_excludingUriPaths.Any(
            prefix => candidate.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));

    HashSet<Uri> pageSources = ExtractSources(page, _aTagRegex, _hrefAttributeRegex, isNotExcluded);

    return pageSources.ToList();
}
/// <summary>
/// Scans the page content for tags matched by <paramref name="tagRegex"/>, pulls one
/// attribute value out of each tag with <paramref name="attributeRegex"/>, and returns
/// the distinct absolute URIs those values refer to.
/// </summary>
/// <param name="page">The page to scan; <c>page.Content</c> is searched and
/// <c>page.Source</c> is the base used to resolve non-absolute values.</param>
/// <param name="tagRegex">Regex matching the whole tag to inspect.</param>
/// <param name="attributeRegex">Regex applied to each matched tag; it must define exactly
/// one capture group, which holds the attribute value.</param>
/// <param name="onCheckSource">Optional filter that receives the absolute URI string;
/// returning <c>false</c> drops the candidate. When <c>null</c>, every candidate is kept.</param>
/// <returns>The set of unique URIs that were extracted and passed the filter.</returns>
private HashSet<Uri> ExtractSources(Page page, Regex tagRegex, Regex attributeRegex, Func<string, bool> onCheckSource = null)
{
    var sources = new HashSet<Uri>();

    foreach (Match tagMatch in tagRegex.Matches(page.Content))
    {
        Match attributeMatch = attributeRegex.Match(tagMatch.Value);

        // Skip tags that lack the attribute. The group-count check is kept so that only
        // a regex with exactly one capture group (group 0 + the value) is accepted.
        if (!attributeMatch.Success || attributeMatch.Groups.Count != 2)
        {
            continue;
        }

        string attributeValue = attributeMatch.Groups[1].Value;
        Uri source;

        if (Uri.IsWellFormedUriString(attributeValue, UriKind.Absolute))
        {
            // Already an absolute, well-formed URI: use it verbatim.
            if (!Uri.TryCreate(attributeValue, UriKind.Absolute, out source))
            {
                continue;
            }
        }
        else
        {
            // Resolve against the page's own URI rather than gluing the value onto the
            // authority: this handles "dir/file", "../file", "/file" and "//host/file"
            // correctly, and keeps an absolute-but-unescaped URL (e.g. one containing a
            // space) from being appended to this site's authority.
            if (!Uri.TryCreate(page.Source, attributeValue, out source))
            {
                // One unparseable link must not abort extraction for the whole page.
                continue;
            }

            attributeValue = source.AbsoluteUri;
        }

        if (onCheckSource != null && !onCheckSource(attributeValue))
        {
            continue;
        }

        sources.Add(source);
    }

    return sources;
}