/// <summary>
/// Decides whether <paramref name="name"/> passes a set of filter patterns.
/// </summary>
/// <param name="filters">Patterns to test against; an empty array accepts every name.</param>
/// <param name="name">The text each pattern is matched against.</param>
/// <returns>
/// <c>true</c> when <paramref name="filters"/> is empty or when none of its patterns
/// match <paramref name="name"/>; <c>false</c> as soon as any pattern matches.
/// </returns>
private static bool IsApplicable(Regex[] filters, string name)
{
    // Nothing to test against: everything is applicable.
    if (filters.Length == 0)
    {
        return true;
    }

    // A single matching pattern is enough to rule the name out.
    foreach (Regex pattern in filters)
    {
        if (pattern.IsMatch(name))
        {
            return false;
        }
    }

    return true;
}
Beispiel #2
0
        /// <summary>
        /// Crawls outward from <paramref name="urlRoot"/> and lazily yields one
        /// <see cref="WebPageWithDocument"/> for every queued item that is not excluded.
        /// </summary>
        /// <remarks>
        /// <para>
        /// This is an iterator: no request is made until the returned sequence is enumerated,
        /// and each further page is only processed when the caller asks for the next element.
        /// </para>
        /// <para>
        /// The intent is to return every internal page of the site once, plus every external
        /// page (or other Url) it links to. Link discovery and de-duplication are delegated to
        /// ProcessNextItem and QueueWithDeduping, which are defined elsewhere - confirm the
        /// exact guarantees there. Inspect each result's WebPage property to tell internal
        /// pages from external ones.
        /// </para>
        /// </remarks>
        /// <param name="urlRoot">
        /// Where the crawl starts. If the request for it is redirected, the redirect target
        /// is used as the root instead.
        /// </param>
        /// <param name="delayBetweenPagesInSeconds">
        /// Number of seconds the calling thread is blocked after each result whose WebPage is
        /// not a WebPage.External. Zero or negative values disable the pause.
        /// </param>
        /// <param name="excludedPaths">
        /// Patterns tested against the absolute Url of each queued item; an item matching any
        /// of them is skipped. May be null or empty to exclude nothing.
        /// </param>
        /// <returns>A lazily evaluated sequence of crawl results, in the order they are dequeued.</returns>
        public static IEnumerable<WebPageWithDocument> GetAllPagesUnderWithDocument(Uri urlRoot, int delayBetweenPagesInSeconds, Regex[] excludedPaths)
        {
            // The root is fetched here only to find out whether it gets redirected. The body is
            // never read, so dispose the response straight away instead of leaking the connection.
            using (HttpWebResponse webResponse = FetchWebPageWithRetries(urlRoot))
            {
                if (webResponse.ResponseUri.AbsoluteUri != urlRoot.AbsoluteUri)
                {
                    Console.WriteLine("*** ROOT REQUEST WAS REDIRECTED USING: " + webResponse.ResponseUri + "***");
                    urlRoot = webResponse.ResponseUri;
                }
            }

            var queue = new QueueWithDeduping<PendingCrawlItem>();

            // Seed the crawl with the (possibly redirected) root, acting as its own referrer.
            var start = PendingCrawlItem.Factory(urlRoot: urlRoot, href: urlRoot, referrer: urlRoot);
            queue.Enqueue(start);

            while (queue.Count > 0)
            {
                // Pull an item off the queue, inspect it and deal with it.
                var nextItem = queue.Dequeue();

                // A null filter array means "no exclusions" rather than a crash mid-crawl.
                if (excludedPaths != null && excludedPaths.Any(p => p.IsMatch(nextItem.Uri.AbsoluteUri)))
                {
                    Console.WriteLine("Skipping " + nextItem.Uri + " because path is excluded");
                    continue;
                }

                // The queue is handed to ProcessNextItem, presumably so it can enqueue newly found links.
                var result = ProcessNextItem(nextItem, queue);
                yield return result;

                // Pause only after non-external pages - external pages don't count.
                // Thread.Sleep(int) takes milliseconds, so convert the seconds value explicitly.
                if (delayBetweenPagesInSeconds > 0 && !(result.WebPage is WebPage.External))
                {
                    Thread.Sleep(TimeSpan.FromSeconds(delayBetweenPagesInSeconds));
                }
            }
        }