Example #1
        /// <summary>
        /// Build the sitemap tree structure using a top-down DFS approach, with a supervisor
        /// that controls concurrency and handles error recovery and retries.
        /// </summary>
        public CrawlingResult Crawl(CrawlerTaskInfo taskInfo)
        {
            // baseUrl is validated in the controller,
            // so it's safe to create a Uri instance from it.
            var root = new Uri(taskInfo.BaseUrl);
            // sitemap is the root of the hierarchy.
            var sitemap = new NodeInfo(root);
            var queue   = new ConcurrentQueue<NodeInfo>();

            queue.Enqueue(sitemap);
            var queueProcessor = new QueueProcessor(
                provider,
                logger,
                root,
                taskInfo.MaxDepth,
                queue,
                configuration.GetValue <int>("QueueProcessorConcurrencyLevel"),
                configuration.GetValue <int>("FailuresMaxRetries"),
                configuration.GetValue <int>("QueueIdleTime")
                );

            queueProcessor.Start();
            return new CrawlingResult {
                Sitemap = sitemap
            };
        }
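
The QueueProcessor referenced above is not part of this listing. As a rough sketch of the supervisor idea the summary describes, the fragment below drains a ConcurrentQueue<NodeInfo> with a fixed number of workers, re-enqueuing failed nodes up to a retry budget and stopping once the queue has stayed idle for a while. The method name, the tryCrawlNode delegate, and the idle handling are illustrative assumptions, not the actual QueueProcessor API; it relies on System.Collections.Concurrent and System.Threading.Tasks.

        // Illustrative sketch only: a bounded-concurrency drain of the shared work queue
        // with simple per-node retry bookkeeping. Not the actual QueueProcessor.
        private static void ProcessQueueSketch(
            ConcurrentQueue<NodeInfo> queue,
            Func<NodeInfo, bool> tryCrawlNode,   // hypothetical: returns false on failure
            int concurrencyLevel,
            int maxRetries,
            int idleTimeMs)
        {
            var retries = new ConcurrentDictionary<NodeInfo, int>();
            var workers = new List<Task>();
            for (var i = 0; i < concurrencyLevel; i++)
            {
                workers.Add(Task.Run(() =>
                {
                    var idleSince = DateTime.UtcNow;
                    while ((DateTime.UtcNow - idleSince).TotalMilliseconds < idleTimeMs)
                    {
                        if (!queue.TryDequeue(out var node))
                        {
                            // Nothing to do right now; back off briefly before checking again.
                            Thread.Sleep(50);
                            continue;
                        }
                        idleSince = DateTime.UtcNow;
                        if (!tryCrawlNode(node) &&
                            retries.AddOrUpdate(node, 1, (_, n) => n + 1) <= maxRetries)
                        {
                            // The crawl failed and the retry budget is not exhausted yet,
                            // so put the node back on the queue for another attempt.
                            queue.Enqueue(node);
                        }
                    }
                }));
            }
            Task.WaitAll(workers.ToArray());
        }

A SemaphoreSlim-based design would work just as well; the point is only that concurrency, retries, and idle shutdown live in one place, which is what keeps Crawl itself so small.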
Example #2
        /// <summary>
        /// Spawn tasks to crawl one level of the tree at a time. Maintain a set
        /// of previously found URLs to avoid infinite loops.
        /// </summary>
        public CrawlingResult Crawl(CrawlerTaskInfo taskInfo)
        {
            // baseUrl is validated in the controller,
            // so it's safe to create a Uri instance from it.
            var root   = new Uri(taskInfo.BaseUrl);
            var domain = new UriBuilder(root.Scheme, root.Host).Uri;
            // sitemap is the root of the hierarchy.
            var sitemap = new NodeInfo(root);
            var queue   = new List<NodeInfo> {
                sitemap
            };
            var paths = new HashSet<string> {
                root.AbsolutePath
            };
            var depth = 0;

            // A MaxDepth of zero means crawl all the way down the tree.
            while ((taskInfo.MaxDepth == 0 || ++depth <= taskInfo.MaxDepth) && queue.Count > 0)
            {
                // Spawn a thread (task) for each node in this level.
                var tasks = new List<Task<string>>();
                foreach (var node in queue)
                {
                    tasks.Add(Task.Run(() => CrawlNode(node)));
                }
                Task.WaitAll(tasks.ToArray());
                var next = new List<NodeInfo>();
                for (var j = 0; j < queue.Count; j++)
                {
                    if (tasks[j].Result != null)
                    {
                        var nodeUris     = Utils.GetAbsoluteUrisFromHtml(domain, tasks[j].Result);
                        var childrenUris = new HashSet<Uri>(nodeUris);
                        foreach (var uri in childrenUris)
                        {
                            var child = new NodeInfo(uri);
                            queue[j].AddChild(child);
                            // Add it to the next level only if it hasn't been
                            // seen yet and it is in the same domain.
                            if (domain.Host == uri.Host && !paths.Contains(uri.AbsolutePath))
                            {
                                paths.Add(uri.AbsolutePath);
                                next.Add(child);
                            }
                        }
                    }
                }
                queue = next;
            }
            return new CrawlingResult {
                Sitemap = sitemap
            };
        }
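
CrawlNode and Utils.GetAbsoluteUrisFromHtml are used above but not shown. The sketch below shows one plausible shape for them, assuming HttpClient for fetching, HtmlAgilityPack for link extraction, and a Uri property on NodeInfo; the bodies and names (CrawlNodeSketch, GetAbsoluteUrisFromHtmlSketch) are illustrative assumptions rather than the project's actual implementations.

        // Illustrative sketch only: fetch a node's HTML, returning null on failure so the
        // caller can skip it (matches the null check on tasks[j].Result above).
        private static readonly HttpClient httpClient = new HttpClient();

        private static string CrawlNodeSketch(NodeInfo node)
        {
            try
            {
                return httpClient.GetStringAsync(node.Uri).GetAwaiter().GetResult();
            }
            catch (HttpRequestException)
            {
                return null;
            }
        }

        // Illustrative sketch only: resolve every <a href> against the domain and keep the
        // ones that form valid absolute http(s) URIs.
        private static IEnumerable<Uri> GetAbsoluteUrisFromHtmlSketch(Uri domain, string html)
        {
            var document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(html);
            var anchors = document.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                yield break;
            }
            foreach (var anchor in anchors)
            {
                var href = anchor.GetAttributeValue("href", null);
                if (href != null &&
                    Uri.TryCreate(domain, href, out var uri) &&
                    (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
                {
                    yield return uri;
                }
            }
        }

Returning null on a failed fetch is what lets the loop above skip a dead link, which then simply stays in the sitemap as a leaf node with no children.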
Example #3
        /// <summary>
        /// Build the sitemap tree structure using a top-down DFS approach.
        /// Thread safety is at the class level.
        /// </summary>
        public CrawlingResult Crawl(CrawlerTaskInfo taskInfo)
        {
            // baseUrl is validated in the controller,
            // so it's safe to create a Uri instance from it.
            var root   = new Uri(taskInfo.BaseUrl);
            var domain = new UriBuilder(root.Scheme, root.Host).Uri;
            // sitemap is the root of the hierarchy.
            var sitemap = new NodeInfo(root);

            paths = new HashSet<string>()
            {
                root.AbsolutePath
            };
            DfsAsync(sitemap, 0, taskInfo.MaxDepth, domain);
            return new CrawlingResult {
                Sitemap = sitemap
            };
        }
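
DfsAsync itself is not included in this listing. Purely as an illustration of the top-down DFS the summary describes, the sketch below recurses into unvisited same-domain links, reusing the class-level paths set initialized in Crawl; the signature, the httpClient field, the Uri property on NodeInfo, and the Utils helper call are assumptions carried over from the earlier sketches.

        // Illustrative sketch only: recursively crawl one node, attach its children to the
        // sitemap, and descend into same-domain links that have not been visited yet.
        private async Task DfsAsyncSketch(NodeInfo node, int depth, int maxDepth, Uri domain)
        {
            // A maxDepth of zero means no limit, mirroring the level-by-level variant above.
            if (maxDepth != 0 && depth >= maxDepth)
            {
                return;
            }

            string html;
            try
            {
                html = await httpClient.GetStringAsync(node.Uri);
            }
            catch (HttpRequestException)
            {
                return;
            }

            foreach (var uri in new HashSet<Uri>(Utils.GetAbsoluteUrisFromHtml(domain, html)))
            {
                var child = new NodeInfo(uri);
                node.AddChild(child);
                // Descend only into unvisited, same-domain links. paths.Add returns false
                // for paths that were already recorded, which doubles as the visited check.
                if (domain.Host == uri.Host && paths.Add(uri.AbsolutePath))
                {
                    await DfsAsyncSketch(child, depth + 1, maxDepth, domain);
                }
            }
        }

Note that Crawl above calls DfsAsync without awaiting it, so the real implementation presumably waits for completion internally before the sitemap is returned; the sketch leaves that detail out.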