/// <summary>
/// Fetches the inner URLs of <paramref name="url"/> and records each distinct one in a new
/// <see cref="CrawlResult"/>, recursing into it while the incremented depth is below <c>_maxDepth</c>.
/// </summary>
/// <param name="url">Page whose inner URLs are collected.</param>
/// <param name="currentDepth">Depth of the caller; this page is treated as one level deeper.</param>
/// <returns>
/// A result whose <c>InnerCrawlResults</c> maps every distinct inner URL to its own crawl result,
/// or to <c>null</c> when the depth limit prevents descending further.
/// </returns>
private async Task<CrawlResult> CrawlAsync(string url, int currentDepth)
{
    int depth = currentDepth + 1;
    var page = new CrawlResult();

    List<string> links = await GetHtmlPageInnerUrlsAsync(url);
    if (links == null)
    {
        // Nothing could be extracted for this page: return the empty result.
        return page;
    }

    // Decided once per page: either every link is followed or none is.
    bool descend = depth < _maxDepth;

    foreach (string link in links)
    {
        if (page.InnerCrawlResults.ContainsKey(link))
        {
            // The same link appears more than once on this page; keep the first entry.
            continue;
        }

        CrawlResult child = descend ? await CrawlAsync(link, depth) : null;
        page.InnerCrawlResults.Add(link, child);
    }

    return page;
}
/// <summary>
/// Seeds the discovered-URL collection with the root URLs derived from <paramref name="url"/>,
/// starts <c>_options.MaxConcurrency</c> workers running <c>ProcessCollectionAsync</c>, and
/// completes once all of them have finished.
/// </summary>
/// <param name="url">Start URL; must not be <c>null</c>.</param>
/// <param name="ct">Token passed to the queue operations and to every worker.</param>
/// <returns>The <see cref="CrawlResult"/> instance shared by all workers.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
public async Task<CrawlResult> RunAsync(string url, CancellationToken ct = default(CancellationToken))
{
    if (url == null)
    {
        throw new ArgumentNullException(nameof(url));
    }

    EnsureHttpClient();

    var result = new CrawlResult();
    result.Urls = GetRootUrls(url).ToList();
    foreach (var item in result.Urls)
    {
        _discoveredUrls.Add(new DiscoveredUrl { Url = item }, ct);
    }

    var maxConcurrency = _options.MaxConcurrency;

    // Task.Run(Func<Task>, CancellationToken) already returns a proxy for the inner task,
    // so there is no need to build Task<Task> instances and Unwrap() them afterwards.
    var tasks = new Task[maxConcurrency];
    for (var i = 0; i < maxConcurrency; i++)
    {
        tasks[i] = Task.Run(() => ProcessCollectionAsync(result, ct), ct);
    }

    // NOTE(review): with MaxConcurrency == 0 this completes immediately without crawling anything.
    await Task.WhenAll(tasks).ConfigureAwait(false);
    return result;
}
/// <summary>
/// Entry point: builds a <c>PoliteWebCrawler</c> that receives a <c>SiteMapFinder</c> as its sixth
/// constructor argument (all other arguments are <c>null</c>), subscribes to
/// <c>PageCrawlCompleted</c>, and crawls a fixed start URL.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var siteMapFinder = new SiteMapFinder();
    var politeCrawler = new PoliteWebCrawler(null, null, null, null, null, siteMapFinder, null, null, null);

    // Hook the per-page callback before the crawl starts.
    politeCrawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

    var startUri = new Uri("http://tenders.rfpalertservices.com/sitemap/");

    // The returned result is not inspected here.
    CrawlResult crawlResult = politeCrawler.Crawl(startUri);
}
/// <summary>
/// Tells whether <paramref name="url"/> has the same host as at least one entry of
/// <c>result.Urls</c>, as decided by <c>Utilities.IsSameHost</c>.
/// </summary>
/// <param name="result">Crawl result whose <c>Urls</c> are compared against.</param>
/// <param name="url">URL to test.</param>
/// <returns><c>true</c> on the first matching entry; <c>false</c> when none matches.</returns>
private bool IsSameHost(CrawlResult result, string url)
{
    var matched = false;

    foreach (var candidate in result.Urls)
    {
        if (!Utilities.IsSameHost(candidate, url))
        {
            continue;
        }

        // One match is enough; stop scanning.
        matched = true;
        break;
    }

    return matched;
}
/// <summary>
/// Crawls each distinct URL in <paramref name="rootUrls"/>, one after another, starting at
/// <c>InitialDepth</c>, and collects the per-URL results into a single <see cref="CrawlResult"/>.
/// </summary>
/// <param name="rootUrls">URLs to start crawling from; repeated entries are crawled only once.</param>
/// <returns>A result whose <c>InnerCrawlResults</c> maps each root URL to its crawl result.</returns>
public async Task<CrawlResult> PerformCrawlingAsync(List<string> rootUrls)
{
    var aggregate = new CrawlResult();

    foreach (string root in rootUrls)
    {
        if (aggregate.InnerCrawlResults.ContainsKey(root))
        {
            // Already crawled earlier in this list.
            continue;
        }

        CrawlResult subtree = await CrawlAsync(root, InitialDepth);
        aggregate.InnerCrawlResults.Add(root, subtree);
    }

    return aggregate;
}
/// <summary>
/// Processes one discovered URL: skips it when <c>MustProcess</c> rejects it, attaches a reference
/// to an already-known document when one matches, and otherwise fetches the document, records
/// where it was referenced from, adds it to <c>result.Documents</c> and calls <c>OnDocumentParsed</c>.
/// </summary>
/// <param name="result">Shared crawl result; <c>result.Documents</c> is accessed under a lock on itself.</param>
/// <param name="discoveredUrl">The URL to process, with its optional source document and excerpt.</param>
/// <param name="ct">Token passed to <c>GetAsync</c>.</param>
private async Task ProcessItemAsync(CrawlResult result, DiscoveredUrl discoveredUrl, CancellationToken ct)
{
    // Test the domain: same domain as the start URL, or external by one level.
    if (!MustProcess(result, discoveredUrl))
    {
        return;
    }

    // Already processed?
    Document existingDocument;
    lock (result.Documents)
    {
        existingDocument = result.Documents.FirstOrDefault(d => discoveredUrl.IsSame(d));
    }

    if (existingDocument != null)
    {
        AddReference(discoveredUrl, existingDocument);
        return;
    }

    var doc = await GetAsync(discoveredUrl, ct).ConfigureAwait(false);

    // The duplicate check and the insert must share one critical section. With separate
    // lock blocks, two workers that fetched the same document at the same time could both
    // pass the check and both add it to result.Documents.
    lock (result.Documents)
    {
        existingDocument = result.Documents.FirstOrDefault(d => doc.IsSame(d));

        // Another worker has processed the same URL at the same time.
        if (existingDocument != null)
        {
            AddReference(discoveredUrl, existingDocument);
            return;
        }

        // Record where the document was linked from before it becomes visible in the list.
        if (discoveredUrl.SourceDocument != null)
        {
            lock (doc.ReferencedBy)
            {
                doc.ReferencedBy.Add(new DocumentRef
                {
                    SourceDocument = discoveredUrl.SourceDocument,
                    TargetDocument = doc,
                    Excerpt = discoveredUrl.Excerpt
                });
            }
        }

        result.Documents.Add(doc);
    }

    // Called outside the lock so the callback does not run while result.Documents is held.
    OnDocumentParsed(doc);
}
/// <summary>
/// Worker loop: consumes items from <c>_discoveredUrls</c> and runs <c>ProcessItemAsync</c> on
/// each, keeping <c>_processingThreadCount</c> equal to the number of items currently in flight.
/// After an item finishes, if no item is in flight and the collection is empty, the collection
/// is marked as complete for adding.
/// </summary>
/// <param name="result">Crawl result forwarded to <c>ProcessItemAsync</c>.</param>
/// <param name="ct">Token checked before each item and passed to the consuming enumeration.</param>
private async Task ProcessCollectionAsync(CrawlResult result, CancellationToken ct)
{
    // presumably _discoveredUrls is a BlockingCollection, so this enumeration blocks until an
    // item is available or adding is completed — TODO confirm against the field declaration
    foreach (var item in _discoveredUrls.GetConsumingEnumerable(ct))
    {
        ct.ThrowIfCancellationRequested();

        // Count this item as in flight before any work starts.
        Interlocked.Increment(ref _processingThreadCount);
        try
        {
            await ProcessItemAsync(result, item, ct).ConfigureAwait(false);
        }
        finally
        {
            // Runs whether or not ProcessItemAsync threw, so the counter stays balanced.
            Interlocked.Decrement(ref _processingThreadCount);

            // NOTE(review): the counter is re-read here instead of using the value returned by
            // Interlocked.Decrement, and a worker takes an item from the collection before it
            // increments the counter — confirm CompleteAdding cannot fire while another worker
            // is about to process an item.
            if (_processingThreadCount == 0 && _discoveredUrls.Count == 0)
            {
                _discoveredUrls.CompleteAdding();
            }
        }
    }
}
/// <summary>
/// Decides whether a discovered URL should be processed.
/// </summary>
/// <param name="result">Crawl result whose URLs define the hosts being crawled.</param>
/// <param name="discoveredUrl">Candidate URL together with the document it was found in.</param>
/// <returns>
/// <c>true</c> for a root page, a redirect target, a URL on the same host, a URL found in a
/// document on the same host, or a URL matching one of <c>_options.Includes</c>;
/// otherwise <c>false</c>.
/// </returns>
private bool MustProcess(CrawlResult result, DiscoveredUrl discoveredUrl)
{
    // Root pages (no source document) and redirect targets are always followed.
    if (discoveredUrl.SourceDocument == null || discoveredUrl.IsRedirect)
    {
        return true;
    }

    // Same domain.
    if (IsSameHost(result, discoveredUrl.Url))
    {
        return true;
    }

    // External link, but found in a same-host document: external by one level.
    if (IsSameHost(result, discoveredUrl.SourceDocument.Url))
    {
        return true;
    }

    // Anything else is accepted only when an explicit include pattern matches.
    if (_options.Includes == null)
    {
        return false;
    }

    foreach (var pattern in _options.Includes)
    {
        if (pattern.IsMatch(discoveredUrl.Url))
        {
            return true;
        }
    }

    return false;
}