private static bool IsApplicable(Regex[] filters, string name)
{
    // A name is applicable when no exclusion filter matches it; an empty
    // filter array matches nothing, so everything is then applicable
    // (which also makes an explicit Length check unnecessary).
    return !filters.Any(filter => filter.IsMatch(name));
}
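// Illustrative behaviour, with hypothetical filter patterns and names:
//   IsApplicable(new[] { new Regex("/admin/") }, "/docs/intro")  -> true  (no filter matches)
//   IsApplicable(new[] { new Regex("/admin/") }, "/admin/users") -> false (excluded)
//   IsApplicable(Array.Empty<Regex>(), "anything")               -> true  (no filters at all)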
/// <summary>
/// Get every WebPage.Internal on a web site (or part of a web site), visiting each internal
/// link just once, plus every external page (or other Url) linked from the web site as a
/// WebPage.External
/// </summary>
/// <remarks>
/// Use .OfType&lt;WebPage.Internal&gt;() on the result if you only want the internal pages
/// </remarks>
public static IEnumerable<WebPageWithDocument> GetAllPagesUnderWithDocument(
    Uri urlRoot, int delayBetweenPagesInSeconds, Regex[] excludedPaths)
{
    // Get the root page first, following any redirect so that later links resolve against the real root
    HttpWebResponse webResponse = FetchWebPageWithRetries(urlRoot);
    if (webResponse.ResponseUri.AbsoluteUri != urlRoot.AbsoluteUri)
    {
        Console.WriteLine("*** ROOT REQUEST WAS REDIRECTED TO: " + webResponse.ResponseUri + " ***");
        urlRoot = webResponse.ResponseUri;
    }

    var queue = new QueueWithDeduping<PendingCrawlItem>();
    var start = PendingCrawlItem.Factory(urlRoot: urlRoot, href: urlRoot, referrer: urlRoot);
    queue.Enqueue(start);

    while (queue.Count > 0)
    {
        // Pull an item off the queue, inspect it and deal with it
        var nextItem = queue.Dequeue();
        if (excludedPaths.Any(p => p.IsMatch(nextItem.Uri.AbsoluteUri)))
        {
            Console.WriteLine("Skipping " + nextItem.Uri + " because path is excluded");
            continue;
        }

        var result = ProcessNextItem(nextItem, queue);
        yield return result;

        // Delay between requests, but only on internal pages - external pages don't count.
        // Thread.Sleep takes milliseconds, so convert the seconds parameter explicitly.
        if (!(result.WebPage is WebPage.External))
            Thread.Sleep(TimeSpan.FromSeconds(delayBetweenPagesInSeconds));
    }
}
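// A minimal usage sketch. The method name ExampleCrawl, the site URL, and the
// exclusion patterns are all hypothetical; only GetAllPagesUnderWithDocument,
// WebPageWithDocument and the WebPage.Internal/External distinction come from
// the method above.
private static void ExampleCrawl()
{
    // Hypothetical exclusions: skip admin pages and PDF downloads
    var excluded = new[] { new Regex("/admin/"), new Regex(@"\.pdf$") };

    // Crawl lazily, waiting 2 seconds between internal page fetches
    foreach (var page in GetAllPagesUnderWithDocument(new Uri("https://example.com/"), 2, excluded))
    {
        if (page.WebPage is WebPage.External)
            continue; // only report pages on the site itself

        Console.WriteLine(page.WebPage);
    }
}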