Example #1
        public IDictionary<string, List<string>> Crawl(string root, ICrawlPolicy policy = null)
        {
            var task = CrawlAsync(root, policy);

            task.Wait();
            return task.Result;
        }
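
A minimal usage sketch for the synchronous wrapper above. The hosting class name (`WebCrawler` here) and its constructor are assumptions; only `Crawl`/`CrawlAsync` appear in the example itself:

        // Hypothetical usage: `WebCrawler` is an assumed name for the type that
        // hosts Crawl/CrawlAsync.
        var crawler = new WebCrawler();
        var siteMap = crawler.Crawl("http://example.com/");

        foreach (var entry in siteMap)
        {
            // A null value marks a page that could not be fetched.
            Console.WriteLine($"{entry.Key}: {entry.Value?.Count ?? 0} links");
        }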
Example #2
        /// <summary>
        /// Returns a dictionary mapping each crawled URL to the URLs it links to.
        ///
        /// If a page was not fetched successfully, the value for its key will be null.
        /// </summary>
        /// <param name="root">The root URL at which to start the crawl</param>
        /// <param name="policy">Policy that dictates which URLs may be crawled</param>
        /// <returns>A dictionary mapping crawled URLs to the URLs they link to.</returns>
        public async Task<IDictionary<string, List<string>>> CrawlAsync(string root, ICrawlPolicy policy = null)
        {
            // FIXME: Use Uri throughout
            var rootUri = new Uri(root, UriKind.Absolute);

            // Require a bare site root: nothing beyond scheme and authority,
            // with no path, query, or fragment.
            if (rootUri.GetLeftPart(UriPartial.Authority) != root.TrimEnd('/') ||
                rootUri.AbsolutePath != "/")
            {
                throw new ArgumentException("Must provide a site-root url such as \"http://example.com/\".");
            }

            // Default policy: whitelist the root's domain (see WhiteListDomainPolicy)
            policy = policy ?? new WhiteListDomainPolicy(root);
            var results = new ConcurrentDictionary<string, List<string>>();

            var initialLinks = await GetLinks(root);

            if (initialLinks == null)
            {
                // Keep the TryAdd outside Debug.Assert: calls marked
                // [Conditional("DEBUG")] and their arguments are stripped from
                // Release builds, so the side effect would be lost.
                var added = results.TryAdd(root, null);
                Debug.Assert(added);
                return results;
            }

            var addedRoot = results.TryAdd(root, initialLinks);
            Debug.Assert(addedRoot);

            // No point spawning a bunch of threads for this
            if (initialLinks.Count == 0)
            {
                return results;
            }

            var remainingLinks = new ConcurrentBag<string>(initialLinks);

            // One worker task per configured job, all draining the shared bag
            var tasks = new Task[this.jobs];

            for (var i = 0; i < this.jobs; i++)
            {
                tasks[i] = Task.Run(async () =>
                {
                    string url;
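                    // A worker exits once the bag is momentarily empty; links that
                    // a still-running worker adds afterwards are processed by the
                    // workers that remain in the loop.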
                    while (!remainingLinks.IsEmpty)
                    {
                        // Skip if the policy rejects this url, or if another task
                        // has already claimed it (TryAdd fails when the key exists)
                        if (!remainingLinks.TryTake(out url) || !policy.Accept(url) || !results.TryAdd(url, null))
                        {
                            continue;
                        }

                        // Now we *should* be the only Task trying to get `url`
                        var links = await GetLinks(url);

                        if (links != null)
                        {
                            // Add all the links to the joblist
                            foreach (var link in links)
                            {
                                remainingLinks.Add(link);
                            }
                            var updated = results.TryUpdate(url, links, null);
                            Debug.Assert(updated);
                        }
                    }
                });
            }

            // Await rather than block: Task.WaitAll would tie up a thread inside
            // an async method
            await Task.WhenAll(tasks);

            return results;
        }
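
The example references `ICrawlPolicy` and `WhiteListDomainPolicy` without showing them. A minimal sketch of what they might look like, assuming the policy simply restricts the crawl to the root's scheme and authority (the interface shape and class internals are assumptions, not taken from the source):

        // Assumed shape of the policy abstraction used above.
        public interface ICrawlPolicy
        {
            bool Accept(string url);
        }

        // Hypothetical implementation: accept only urls with the same scheme and
        // authority as the root.
        public class WhiteListDomainPolicy : ICrawlPolicy
        {
            private readonly string authority;

            public WhiteListDomainPolicy(string root)
            {
                this.authority = new Uri(root, UriKind.Absolute).GetLeftPart(UriPartial.Authority);
            }

            public bool Accept(string url)
            {
                return Uri.TryCreate(url, UriKind.Absolute, out var uri) &&
                       uri.GetLeftPart(UriPartial.Authority) == this.authority;
            }
        }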