/// <summary>
        /// Builds the crawl result for <paramref name="url"/>: every link returned by
        /// <c>GetHtmlPageInnerUrlsAsync</c> becomes a key of <c>InnerCrawlResults</c>.
        /// </summary>
        /// <param name="url">Address handed to <c>GetHtmlPageInnerUrlsAsync</c>.</param>
        /// <param name="currentDepth">Depth of the caller; the links of this page are one level deeper.</param>
        /// <returns>
        /// A result whose entries hold a nested crawl result while the next depth is below
        /// <c>_maxDepth</c>, and <c>null</c> once that limit is reached.
        /// </returns>
        private async Task<CrawlResult> CrawlAsync(string url, int currentDepth)
        {
            int childDepth = currentDepth + 1;
            var pageResult = new CrawlResult();
            List<string> links = await GetHtmlPageInnerUrlsAsync(url);

            // A null link list yields an empty result.
            if (links == null)
            {
                return pageResult;
            }

            // Decided once per page: either every new link is followed, or none is.
            bool descend = childDepth < _maxDepth;

            foreach (string link in links)
            {
                // A link repeated in the list is recorded only once.
                if (pageResult.InnerCrawlResults.ContainsKey(link))
                {
                    continue;
                }

                CrawlResult child = descend ? await CrawlAsync(link, childDepth) : null;
                pageResult.InnerCrawlResults.Add(link, child);
            }

            return pageResult;
        }
Esempio n. 2
0
        /// <summary>
        /// Seeds the discovered-URL collection with the root URLs derived from
        /// <paramref name="url"/>, starts <c>_options.MaxConcurrency</c> workers running
        /// <c>ProcessCollectionAsync</c>, and waits for all of them to finish.
        /// </summary>
        /// <param name="url">Start URL; must not be null.</param>
        /// <param name="ct">Token passed to the collection and to every worker.</param>
        /// <returns>The crawl result shared by all workers.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public async Task<CrawlResult> RunAsync(string url, CancellationToken ct = default(CancellationToken))
        {
            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            EnsureHttpClient();

            var crawl = new CrawlResult
            {
                Urls = GetRootUrls(url).ToList()
            };

            // Every root URL is queued without a source document.
            foreach (var rootUrl in crawl.Urls)
            {
                var seed = new DiscoveredUrl {
                    Url = rootUrl
                };
                _discoveredUrls.Add(seed, ct);
            }

            var workerCount = _options.MaxConcurrency;
            var workers     = new Task[workerCount];

            for (var index = 0; index < workers.Length; index++)
            {
                // Unwrap so the stored task completes when the worker's async body does,
                // not merely when the delegate has returned its inner task.
                workers[index] = Task.Run <Task>(() => ProcessCollectionAsync(crawl, ct), ct).Unwrap();
            }

            await Task.WhenAll(workers).ConfigureAwait(false);

            return crawl;
        }
Esempio n. 3
0
        // Entry point: builds a PoliteWebCrawler whose only non-null constructor argument
        // is a SiteMapFinder (sixth position), subscribes to PageCrawlCompleted and crawls
        // the hard-coded sitemap address. The returned CrawlResult is not used further.
        static void Main(string[] args)
        {
            var crawler = new PoliteWebCrawler(null, null, null, null, null, new SiteMapFinder(), null, null, null);

            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

            var startUri = new Uri("http://tenders.rfpalertservices.com/sitemap/");
            CrawlResult crawlOutcome = crawler.Crawl(startUri);
        }
Esempio n. 4
0
        /// <summary>
        /// Tells whether <paramref name="url"/> is on the same host as at least one entry
        /// of <c>result.Urls</c>, as decided by <c>Utilities.IsSameHost</c>.
        /// </summary>
        /// <param name="result">Crawl result whose <c>Urls</c> are compared against.</param>
        /// <param name="url">Address to test.</param>
        /// <returns><c>true</c> on the first matching entry; <c>false</c> when none matches.</returns>
        private bool IsSameHost(CrawlResult result, string url)
        {
            var matched = false;

            foreach (var knownUrl in result.Urls)
            {
                if (!Utilities.IsSameHost(knownUrl, url))
                {
                    continue;
                }

                // One match is enough; the remaining entries are not inspected.
                matched = true;
                break;
            }

            return matched;
        }
        /// <summary>
        /// Crawls each root URL in turn, starting at <c>InitialDepth</c>, and collects the
        /// per-root results under <c>InnerCrawlResults</c>, keyed by root URL.
        /// </summary>
        /// <param name="rootUrls">Start addresses; a URL that appears more than once is crawled only once.</param>
        /// <returns>A result with one entry per distinct root URL.</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="rootUrls"/> is null. Because the method is async, the exception
        /// surfaces when the returned task is awaited.
        /// </exception>
        public async Task<CrawlResult> PerformCrawlingAsync(List<string> rootUrls)
        {
            // Fail with a descriptive exception instead of a NullReferenceException
            // from the foreach below.
            if (rootUrls == null)
            {
                throw new System.ArgumentNullException(nameof(rootUrls));
            }

            CrawlResult crawlResult = new CrawlResult();

            foreach (string rootUrl in rootUrls)
            {
                // Skip roots that were already crawled earlier in the list.
                if (!crawlResult.InnerCrawlResults.ContainsKey(rootUrl))
                {
                    crawlResult.InnerCrawlResults.Add(rootUrl, await CrawlAsync(rootUrl, InitialDepth));
                }
            }

            return crawlResult;
        }
Esempio n. 6
0
        /// <summary>
        /// Processes one discovered URL: skips it when <c>MustProcess</c> rejects it, records
        /// a reference when a matching document is already in <c>result.Documents</c>, and
        /// otherwise obtains the document through <c>GetAsync</c>, adds it to the result and
        /// raises <c>OnDocumentParsed</c>.
        /// </summary>
        /// <param name="result">Shared crawl result; <c>result.Documents</c> is used as the lock object guarding the list.</param>
        /// <param name="discoveredUrl">The URL to process, with its optional source document and excerpt.</param>
        /// <param name="ct">Token forwarded to <c>GetAsync</c>.</param>
        private async Task ProcessItemAsync(CrawlResult result, DiscoveredUrl discoveredUrl, CancellationToken ct)
        {
            // Test the domain, same domain as start url or external by 1 level
            if (!MustProcess(result, discoveredUrl))
            {
                return;
            }

            // Already processed
            Document existingDocument;

            lock (result.Documents)
            {
                existingDocument = result.Documents.FirstOrDefault(d => discoveredUrl.IsSame(d));
            }

            if (existingDocument != null)
            {
                AddReference(discoveredUrl, existingDocument);
                return;
            }

            var doc = await GetAsync(discoveredUrl, ct).ConfigureAwait(false);

            // The duplicate check and the insertion must happen under a single lock.
            // With two separate lock regions, two workers that fetched the same URL
            // could both pass the check and both add (and announce) the document.
            lock (result.Documents)
            {
                existingDocument = result.Documents.FirstOrDefault(d => doc.IsSame(d)); // Another thread has processed the same URL at the same time
                if (existingDocument != null)
                {
                    AddReference(discoveredUrl, existingDocument);
                    return;
                }

                // Attach the incoming reference before the document becomes visible
                // to other workers through result.Documents.
                if (discoveredUrl.SourceDocument != null)
                {
                    lock (doc.ReferencedBy)
                    {
                        doc.ReferencedBy.Add(new DocumentRef {
                            SourceDocument = discoveredUrl.SourceDocument, TargetDocument = doc, Excerpt = discoveredUrl.Excerpt
                        });
                    }
                }

                result.Documents.Add(doc);
            }

            // Raised outside the lock so subscribers do not run while the list is locked.
            OnDocumentParsed(doc);
        }
Esempio n. 7
0
        /// <summary>
        /// Worker loop: takes items from <c>_discoveredUrls</c> until the consuming
        /// enumeration ends and passes each one to <c>ProcessItemAsync</c>.
        /// </summary>
        /// <param name="result">Shared crawl result forwarded to <c>ProcessItemAsync</c>.</param>
        /// <param name="ct">Token checked before each item and passed to the enumeration.</param>
        private async Task ProcessCollectionAsync(CrawlResult result, CancellationToken ct)
        {
            foreach (var item in _discoveredUrls.GetConsumingEnumerable(ct))
            {
                ct.ThrowIfCancellationRequested();

                // Counts the workers that are currently inside ProcessItemAsync.
                Interlocked.Increment(ref _processingThreadCount);
                try
                {
                    await ProcessItemAsync(result, item, ct).ConfigureAwait(false);
                }
                finally
                {
                    // Use the value returned by the atomic decrement. Re-reading the
                    // field afterwards is an unsynchronized read that may observe a
                    // stale value or another worker's later update.
                    var stillProcessing = Interlocked.Decrement(ref _processingThreadCount);

                    // No worker is busy and nothing is queued: mark the collection as
                    // complete (presumably a BlockingCollection, so this ends every
                    // worker's enumeration; confirm against the field declaration).
                    if (stillProcessing == 0 && _discoveredUrls.Count == 0)
                    {
                        _discoveredUrls.CompleteAdding();
                    }
                }
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Decides whether a discovered URL should be processed.
        /// </summary>
        /// <param name="result">Crawl result whose root URLs define the "same host" test.</param>
        /// <param name="discoveredUrl">Candidate URL together with the document that linked to it.</param>
        /// <returns>
        /// <c>true</c> for root pages, redirects, URLs on a root host, URLs linked from a
        /// document on a root host, and URLs matching one of <c>_options.Includes</c>;
        /// otherwise <c>false</c>.
        /// </returns>
        private bool MustProcess(CrawlResult result, DiscoveredUrl discoveredUrl)
        {
            // Root pages (no source document) and redirect targets are always followed.
            if (discoveredUrl.SourceDocument == null || discoveredUrl.IsRedirect)
            {
                return true;
            }

            // Same domain, or an external link exactly one level away from it
            // (the linking document itself is on a root host).
            if (IsSameHost(result, discoveredUrl.Url) || IsSameHost(result, discoveredUrl.SourceDocument.Url))
            {
                return true;
            }

            // Everything else is accepted only through an explicit include pattern.
            if (_options.Includes == null)
            {
                return false;
            }

            foreach (var pattern in _options.Includes)
            {
                if (include_IsMatch(pattern, discoveredUrl))
                {
                    return true;
                }
            }

            return false;

            // Local helper: keeps the loop body a single readable condition.
            bool include_IsMatch(dynamic candidate, DiscoveredUrl target)
            {
                return candidate.IsMatch(target.Url);
            }
        }