/// <summary>
/// Records a URL extracted from a crawled page: registers it as newly
/// discovered if it has not been seen before, and adds the crawling page
/// to the URL's referrer set.
/// </summary>
/// <param name="request">The crawl request whose response yielded the URL.</param>
/// <param name="url">The extracted URL.</param>
/// <returns><c>true</c> when this call discovered the URL for the first time; otherwise <c>false</c>.</returns>
private bool ProcessExtractedUrl(CrawlRequest request, Uri url)
{
    var newUrl = false;
    if (!this.discoveredUrls.TryGetValue(url, out var urlProperties))
    {
        urlProperties = new CrawledUrlPropertiesImpl(url);
        if (this.discoveredUrls.TryAdd(url, urlProperties))
        {
            newUrl = true;
            this.observer.OnNewUrl(url);
        }
        else
        {
            // Another worker registered the URL between TryGetValue and
            // TryAdd; use the entry that won the race.
            urlProperties = this.discoveredUrls[url];
        }
    }

    // Record the referring page; the value is unused (set semantics).
    urlProperties.referrers.TryAdd(request.Url, 0);
    return newUrl;
}
/// <summary>
/// Determines whether the given request passes every configured filter.
/// </summary>
/// <param name="crawlRequest">The candidate crawl request.</param>
/// <returns><c>true</c> when all filters accept the request; otherwise <c>false</c>.</returns>
public bool ShouldCrawl(CrawlRequest crawlRequest) =>
    this.filters.All(f => f.ShouldCrawl(crawlRequest));
/// <summary>
/// Performs a single crawl attempt for <paramref name="request"/>: issues an
/// HTTP GET with the configured user agent and custom headers, records the
/// response status, reports success or failure to the observer, and on
/// success extracts links from the response and queues follow-up requests.
/// </summary>
/// <param name="request">The URL to crawl, with its referrer and depth.</param>
/// <param name="cancellationToken">Cancels the HTTP call and stops link queuing.</param>
/// <param name="isLastTry">Whether this is the final attempt; forwarded to error reporting.</param>
/// <returns><c>true</c> on a successful response with links processed; otherwise <c>false</c>.</returns>
private async Task<bool> Crawl(CrawlRequest request, CancellationToken cancellationToken, bool isLastTry)
{
    this.observer.OnCrawling(request);
    var requestStopWatch = Stopwatch.StartNew();
    try
    {
        // FIX: HttpRequestMessage is IDisposable but was never disposed,
        // leaking request resources on every crawl (CA2000).
        using (var httpRequestMessage = new HttpRequestMessage
        {
            RequestUri = request.Url,
            Method = HttpMethod.Get,
        })
        {
            if (string.IsNullOrEmpty(this.userAgent) == false)
            {
                httpRequestMessage.Headers.TryAddWithoutValidation("User-Agent", this.userAgent);
            }

            foreach (var customHttpHeader in this.customHttpHeaders)
            {
                httpRequestMessage.Headers.Add(customHttpHeader.Key, customHttpHeader.Value);
            }

            using (var response = await this.httpClient.SendAsync(httpRequestMessage, cancellationToken))
            {
                // FIX: guard the lookup instead of using the indexer, which
                // throws KeyNotFoundException for a URL that was never
                // registered in discoveredUrls.
                if (this.discoveredUrls.TryGetValue(request.Url, out var urlProperties))
                {
                    urlProperties.status = response.StatusCode;
                }

                if (response.IsSuccessStatusCode)
                {
                    this.observer.OnCrawled(new CrawlResult(request.Url, response.StatusCode, request.Referrer, requestStopWatch.Elapsed));
                    var links = await this.linkExtractor.ExtractLinks(request, response.Content);
                    foreach (var url in links)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        if (this.ProcessExtractedUrl(request, url))
                        {
                            var crawlRequest = new CrawlRequest(url, request.Url, request.Depth + 1);
                            if (this.crawlRequestFilter.ShouldCrawl(crawlRequest) == false)
                            {
                                continue;
                            }

                            this.workQueue.Add(crawlRequest, cancellationToken);
                        }
                    }

                    return true;
                }
                else
                {
                    var crawlError = new CrawlError(request.Url, response.StatusCode, request.Referrer);
                    this.ReportError(isLastTry, crawlError);
                    return false;
                }
            }
        }
    }
    catch (OperationCanceledException ex)
    {
        if (cancellationToken.IsCancellationRequested == false)
        {
            // it means timeout but there is no easy way to find it out
            // https://github.com/dotnet/corefx/issues/20296
            var exception = new OperationCanceledException($"Task canceled for {request.Url} after {requestStopWatch.Elapsed} (timeout?).", ex);
            var crawlError = new CrawlError(request.Url, exception, request.Referrer);
            this.ReportError(isLastTry, crawlError);
        }

        // otherwise it means a request to stop processing new urls
        return false;
    }
    catch (Exception e)
    {
        var crawlError = new CrawlError(request.Url, e, request.Referrer);
        this.ReportError(isLastTry, crawlError);
        return false;
    }
}
/// <summary>
/// Writes the URL that is about to be crawled to the console.
/// </summary>
/// <param name="request">The request being started.</param>
public void OnCrawling(CrawlRequest request) =>
    Console.WriteLine($"CRAWLING: {request.Url}");